# __ __ ___ ___ __ __ ____ ____ ____ _ _ ____ ___ # /__\ ( ) / __) ___ / __)( )( )( ___)( ___)(_ _)( \/ )( ___)/ __) # /(__)\ )(__( (_-.(___)\__ \ )(__)( )__) )__) _)(_ ) ( )__) \__ \ # (__)(__)(____)\___/ (___/(______)(__) (__) (____)(_/\_)(____)(___/ # # Template by Leo Ackermann # Code by Leo Ackermann import time # ============================================================================== # C L A S S for Trie node # ============================================================================== class TrieNode: def __init__(self): # Whether the node is marked self.marked = False # A dictionnary to store auxiliary information at the level of node self.misc = {} # The tree structure is stored as a dictionnary self.children = {} def pretty_print(self, indent=''): ''' Pretty print the subtree of the node ''' children = list(self.children.items()) if len(children) == 0: # This is a leaf, marked by construction print(f'<{self.misc["starting_pos"]}>') elif len(children) == 1: # This is a non-branching node (label, child) = children[0] if self.marked: print(f'#──({label})──', end='') else: print(f'━──({label})──', end='') child.pretty_print(indent + " ") else: # This is a branching node (at least two children) # First child (label, child) = children[0] if self.marked: print(f'#──({label})──', end='') else: print(f'┬──({label})──', end='') child.pretty_print(indent + "│ ") # Mid children (possibly skipretty_printed) for i_children in range(1, len(children)-1): (label, child) = children[i_children] print(f'{indent}├──({label})──', end='') child.pretty_print(indent + "│ ") # Last child (label, child) = children[-1] print(f'{indent}└──({label})──', end='') child.pretty_print(indent + " ") def count_covered_leaves(self): counter = 0 stack = [self] while stack != []: node = stack.pop() if node.marked == True: counter += 1 for c in node.children: stack.append(node.children[c]) return counter def retrieve_covered_locations(self): locations = [] stack = [self] while stack != []: node = stack.pop() if node.marked == True: locations.append(node.misc["starting_pos"]) for c in node.children: stack.append(node.children[c]) return locations # ============================================================================== # C L A S S for Trie # ============================================================================== class Trie: # Create a new trie def __init__(self, words=[]): self.root = TrieNode() for word in words: self.insert(word) # Display a trie def pretty_print(self): self.root.pretty_print() # Insert a word within a trie def insert(self, word): node = self.root for c in word: if c not in node.children: node.children[c] = TrieNode() node = node.children[c] node.marked = True # ============================================================================== # C L A S S for Trie # ============================================================================== class SuffixTrie: # Create a new suffix trie def __init__(self, seq): self.root = TrieNode() seq += '$' suffixes = [seq[i:] for i in range(len(seq))] for i in range(len(suffixes)): self.__insert(suffixes[i], i) # Display a suffix trie def pretty_print(self): self.root.pretty_print() # Whether a pattern appear in the sequence def membership(self, pattern): node = self.root for c in pattern: if c not in node.children: return False node = node.children[c] return True # How many times a pattern appears in the sequence def count(self, pattern): node = self.root for c in pattern: if c not in node.children: return 0 node = node.children[c] return node.count_covered_leaves() # How many times a pattern appears in the sequence def count_using_preprocessing(self, pattern): node = self.root for c in pattern: if c not in node.children: return 0 node = node.children[c] return node.misc["nb_covered_leaves"] # Naive implementation O(|ST|^2) def preprocess_leaf_covering(self): stack = [self.root] while stack != []: node = stack.pop() node.misc["nb_covered_leaves"] = node.count_covered_leaves() for c in node.children: stack.append(node.children[c]) # Better implementation O(|ST|) def preprocess_leaf_covering__fast(self): pre_order = [] stack = [self.root] while stack != []: node = stack.pop() pre_order.append(node) for c in node.children: stack.append(node.children[c]) for node in reversed(pre_order): if node.marked: node.misc["nb_covered_leaves"] = 1 else: # NOTE. The order in which nodes are visited is such that children fields has been completed node.misc["nb_covered_leaves"] = sum(map(lambda x: x.misc["nb_covered_leaves"], node.children.values())) def locate_all(self, pattern): node = self.root for c in pattern: if c not in node.children: return [] node = node.children[c] return node.retrieve_covered_locations() # PRIVATE FUNCTIONS # Insert a word within a trie def __insert(self, word, i): node = self.root for c in word: if c not in node.children: node.children[c] = TrieNode() node = node.children[c] node.marked = True node.misc["starting_pos"] = i # ============================================================================== # U T I L I T I E S # ============================================================================== def open_fasta_as_string(filepath): ''' Retrieve the genome stored in a (simple) fasta file, as a string ''' with open(filepath) as fastafile: return ''.join(fastafile.read().splitlines()[1:]) # ============================================================================== # O T H E R S T U F F # ============================================================================== def compare_speeds(): genome = open_fasta_as_string("./sHDAg.fa") ST = SuffixTrie(genome) # Count method start = time.time() ST.count("ATC") print(f"--- {time.time() - start} seconds ---") # Count_fast method ST.preprocess_leaf_covering() start = time.time() ST.count_using_preprocessing("ATC") print(f"--- {time.time() - start} seconds ---")