Segment Tree

class TreeNode:
    def __init__(self, val=None, parent=None, left=None, right=None, start=None, end=None):
        self.val = val
        self.parent = parent
        self.left = left
        self.right = right
        self.start = start
        self.end = end
    
    def __repr__(self):
        print('val {} start {} end {}'.format(self.val, self.start, self.end))
        
class SegmentTree:
    def __init__(self, values):
        self.build_tree(values)
    
    def build_tree(self, values):
        nodes = [TreeNode(val=val, start=i, end=i) for i, val in enumerate(values)]
        self.leaves = nodes
        while len(nodes) != 1:
            parents = []
            for l, r in zip_longest(nodes[::2], nodes[1::2], fillvalue=None):
                val = l.val if not r else l.val + r.val
                end = l.end if not r else r.end
                parent = TreeNode(val, left=l, right=r, start=l.start, end=end)
                l.parent = parent
                if r: r.parent = parent
                parents.append(parent)
            nodes = parents
        self.root = nodes[0]
        
    def update(self, index, val):
        leaf = self.leaves[index]
        diff = val - leaf.val
        node = leaf
        while node:
            node.val += diff
            node = node.parent              
            
    def query(self, left, right):
        l = self.leaves[left]
        r = self.leaves[right]
        if l == r: return l.val
        
        total = l.val + r.val
        while l.parent != r.parent:
            if l == l.parent.left:
                total += l.parent.right.val
            if r == r.parent.right:
                total += r.parent.left.val
            l = l.parent
            r = r.parent
        return total        

KMP

  • Motivation:
def is_substr(source, query):
    """return if query is a substring of the source"""
    pass
    for i in range(len(source)):
        if source[i:i+len(query)] == query: return True
    return False

O(MN) time where N = len(source), M = len(query). Redaundant work is that when search fails, we go back to next i.

source = aabdcaaabdcaaabcdba
query  = aabdcaaabcdb
                  | match process started from i = 0 we fail here

Do we need to go back to i+1 and start over? No

source = aabdcaaabdcaaabcdba
query  =       aabdcaaabcdb
               |
We should try to start from here. 

How do we know?

query = aabdca|aab|cdb
query =       |aab|dcaaabcdb

aab is the prefix but also the suffix for subtring of query aabdcaaab

If we know such information about the query string, we can move i back to where that suffix started.

  • Longest proper prefix that is also a suffix. LPS

Generate an array that contains the length of the LPS of each part of target.

def lps(p):
    m = len(p)
    lps = [0, 0]
    j = 0 
    for i in range(1, m): 
        while j and p[i] != p[j]: j = lps[j]
        if p[i] == p[j]: j += 1
        lps.append(j)
    return lps

query = "aabdcaaabcdb"
print(query)
print("".join(map(str, lps(query))))
aabdcaaabcdb
0010001223000

lps[i] is: if match at query location i failed (does not match source[j]), we can go back to lps[i] of query and try to match query[lps[i]] with the source[j].

Matching prodedure.

def match(s, p):
    n, m = len(s), len(p)
    lps = lps(p)
    ans = []
    j = 0
    for i in range(n):
        while j and s[i] != p[j]: j = lps[j]  # don't match, use lps table to jump back
        if s[i] == p[j]: j += 1
        if j == m:
            ans.append(i - m + 1)
            j = lps[j]
    return ans

match('abcdeabcdabcbacad', 'abc')

partial use:

def kmp_table(s):
    m = len(s)
    table, j = [0, 0], 0
    for i in range(1, m):
        while j > 0 and s[i] != s[j]:  j = table[j]
        if s[i] == s[j]:               j += 1
        table.append(j)    
    return table


class Solution:
    def shortestPalindrome(self, s: str) -> str:
        ss = s + "+" + s[::-1]
        tb = kmp_table(ss)
        k = tb[-1]  
        return s[k:][::-1] + s[:k] + s[k:]

Rabin-Karp Rolling Hash

  • Motivation:
# rolling hash a fixed size substring on DNA sequence. 
def update_code(code, c):
    code <<= 2
    code &= mask
    if c == 'A': code |= 0
    if c == 'C': code |= 1
    if c == 'G': code |= 2
    if c == 'T': code |= 3
    return code

Data Structure Crossing

  • HashSet + Deque
class Logger:
    def __init__(T, limit=limit):
        T.messages = set()
        T.queue = deque()
        T.limite = limite

    def shouldPrintMessage(T, timestamp: int, message: str) -> bool:
        while T.queue and timestamp - T.queue[0][1] >= T.limite:
            msg = T.queue[0][0]
            T.queue.popleft()
            T.messages.remove(msg)
        
        if message not in T.messages:
            T.messages.add(message)
            T.queue.append((message, timestamp))
            return True
        return False