# ____ ____ ____ ___ __ __ ____ ____ ____ _ _ ____ ___ # ( _ \(_ _)( ___)___ / __)( )( )( ___)( ___)(_ _)( \/ )( ___)/ __) # ) _ < _)(_ )__)(___)\__ \ )(__)( )__) )__) _)(_ ) ( )__) \__ \ # (____/(____)(__) (___/(______)(__) (__) (____)(_/\_)(____)(___/ # # Template by Leo Ackermann # ============================================================================== # C L A S S for Node # ============================================================================== class Node: def __init__(self): ''' Create a new node ''' # Whether the node is marked self.marked = False # A dictionnary to store auxiliary information at the level of node self.misc = {} # The tree structure is stored as a dictionnary self.children = {} def pretty_print(self, indent=''): ''' Pretty print the subtree of the node ''' children = list(self.children.items()) if len(children) == 0: # This is a leaf, or an empty root if self.marked: print(f'<{self.misc["starting_pos"]}>') else: print(f'━') elif len(children) == 1: # This is a non-branching node (label, child) = children[0] if self.marked: print(f'#──({label})──', end='') else: print(f'━──({label})──', end='') child.pretty_print(indent + " ") else: # This is a branching node (at least two children) # First child (label, child) = children[0] if self.marked: print(f'#──({label})──', end='') else: print(f'┬──({label})──', end='') child.pretty_print(indent + "│ ") # Mid children (possibly skipped) for i_children in range(1, len(children)-1): (label, child) = children[i_children] print(f'{indent}├──({label})──', end='') child.pretty_print(indent + "│ ") # Last child (label, child) = children[-1] print(f'{indent}└──({label})──', end='') child.pretty_print(indent + " ") def count_covered_leaves(self): ''' Count the number of leaves that are covered by the node ''' counter = 0 stack = [self] while stack != []: node = stack.pop() if node.marked == True: counter += 1 for c in node.children: stack.append(node.children[c]) return counter def retrieve_covered_locations(self): ''' Retrieve the starting positions of the leaves covered by the node ''' locations = [] stack = [self] while stack != []: node = stack.pop() if node.marked == True: locations.append(node.misc["starting_pos"]) for c in node.children: stack.append(node.children[c]) return locations # ============================================================================== # C L A S S for Trie # ============================================================================== class Trie: def __init__(self, words=[]): ''' Create a new trie ''' self.root = Node() for word in words: self.__insert(word) def pretty_print(self): ''' Display a trie ''' self.root.pretty_print() ### [ Private methods ] #################################################### def __insert(self, word): ''' Insert a word within a trie ''' node = self.root for c in word: if c not in node.children: node.children[c] = Node() node = node.children[c] node.marked = True # ============================================================================== # C L A S S for Suffix Trie # ============================================================================== class SuffixTrie: def __init__(self, seq): ''' Create a new suffix trie ''' self.root = Node() seq += '$' suffixes = [seq[i:] for i in range(len(seq))] for i in range(len(suffixes)): self.__insert(suffixes[i], i) self.preprocessed_for_count = False def pretty_print(self): ''' Display a suffix trie ''' self.root.pretty_print() def membership(self, pattern): ''' Whether a pattern appear in the sequence ''' node = self.root for c in pattern: if c not in node.children: return False node = node.children[c] return True def count(self, pattern): ''' How many times a pattern appears in the sequence Complexity: O(|P| + |ST|) ''' node = self.root for c in pattern: if c not in node.children: return 0 node = node.children[c] return node.count_covered_leaves() def count_fast(self, pattern): ''' How many times a pattern appears in the sequence Complexity: O(|P|) ''' if not self.preprocessed_for_count: raise Exception("The Suffix trie has not been preprocessed for count: please call beforehand") node = self.root for c in pattern: if c not in node.children: return 0 node = node.children[c] return node.misc["nb_covered_leaves"] def preprocess_leaf_covering(self): ''' Traverse the suffix trie to fill the field "nb_covered_leaves" of every node Complexity: O(|ST|) ''' # NOTE. The algorithm propagate the information from the leaves to the # the root. It starts by building an order that # ensures that children are processed before parents # Build the order with a DFS pre_order = [] stack = [self.root] while stack != []: node = stack.pop() pre_order.append(node) for c in node.children: stack.append(node.children[c]) # Propagate the covered leaves information bottom-up for node in reversed(pre_order): if node.marked: node.misc["nb_covered_leaves"] = 1 else: node.misc["nb_covered_leaves"] = sum(map(lambda x: x.misc["nb_covered_leaves"], node.children.values())) self.preprocessed_for_count = True def locate_all(self, pattern): ''' Where does a pattern appears within the sequence ''' node = self.root for c in pattern: if c not in node.children: return [] node = node.children[c] return node.retrieve_covered_locations() ### [ Private methods ] #################################################### def __insert(self, word, i): ''' Insert a word within a trie ''' node = self.root for c in word: if c not in node.children: node.children[c] = Node() node = node.children[c] node.marked = True node.misc["starting_pos"] = i # ============================================================================== # C L A S S for Suffix Tree # ============================================================================== class SuffixTree: def __init__(self, word): ''' Create a new suffix tree Complexity: O(|word|^2) ''' # The suffix tree datastructure contains two attributes: the tree, but # also the original word, from which one can recover the implicit label # on the branches. # Initially, the tree is taken as the suffix trie of the word. The # remaining part of the init function will compact it. self.word = word self.root = SuffixTrie(word).root # [COMPACTION STEP 1: merging non-branching paths] # # The trie is traversed in a top-down manner, using a DF-traversal. The # non-branching children are replaced by their upmost branching # (grand-)child. The nodes of the trie are annotated with their # . We also collect the order in which nodes are seen, for # the second compaction step. self.root.misc["trie_depth"] = 0 tree_stack = [self.root] top_bottom_order = [self.root] while tree_stack: tree_node = tree_stack.pop() top_bottom_order.append(tree_node) for (letter, tree_child) in tree_node.children.items(): branch_length = 1 # Follow the branch as long as non-branching while len(tree_child.children.keys()) == 1: tree_child = next(iter(tree_child.children.values())) branch_length += 1 # Update the tree, including depth information tree_node.children[letter] = tree_child tree_child.misc["trie_depth"] = tree_node.misc["trie_depth"] + branch_length tree_stack.append(tree_child) # [COMPACTION STEP 2: make label of the trie branches easy-to-recover] # # The field is filled, in a bottom-up manner for node in reversed(top_bottom_order): if node.marked: node.misc["some_covered_starting_pos"] = node.misc["starting_pos"] else: node.misc["some_covered_starting_pos"] = next(iter(node.children.values())).misc["some_covered_starting_pos"] def pretty_print(self): ''' Display a suffix tree ''' self.tree.root.pretty_print() def membership(self, pattern): ''' Whether a pattern appear in the sequence ''' node = self.root for i in range(len(pattern)): # Update by branching if i as reached 's depth if i >= node.misc["trie_depth"]: if pattern[i] not in node.children: return False node = node.children[pattern[i]] # Otherwise, check whether the i-th character is indeed in the tree elif self.word[node.misc["some_covered_starting_pos"] + i] != pattern[i]: return False return True