# Template by Claire Lemaitre # Code by Leo Ackermann import random import time ################################################################################ class DynamicMatrix: ''' Stores a matrix |S|x|T| (|S|+1 lines and |T|+1columns), sequences S and T and the score system (match, mismatch, gap) Defines some global alignment functions ''' ## Constructor ############################################################# def __init__(self, S, T, match, mismatch, gap): ''' DynamicMatrix constructor ''' self.S=S self.T=T self.gap=gap self.match=match self.mismatch=mismatch # NOTE. Initializing the matrix with '_' allows for the Python code to # crash when a non-initialized cell is used in an integer comparison. self.matrix = [['_' for j in range(len(T)+1)] for i in range(len(S)+1)] ## Private methods ######################################################### def __score(self, char_a, char_b): ''' Score contribution of two aligned characters ''' if char_a == char_b: return self.match else: return self.mismatch def __initFull(self): ''' Fill first line and first column with gap costs (base cases) ''' for x in range(len(self.T)+1): self.matrix[0][x] = x * self.gap for y in range(len(self.S)+1): self.matrix[y][0] = y * self.gap def __initBanded(self, width): ''' Fill first line and first column with gap costs (base cases), but restrain to cells at distance at most from the diagonal ''' for x in range(width+1): self.matrix[0][x] = x * self.gap for y in range(width+1): self.matrix[y][0] = y * self.gap def __fillFull(self): ''' Fill the matrix using the recursive relation ''' for y in range(1, len(self.S)+1): for x in range(1, len(self.T)+1): cost_left = self.matrix[y][x-1] + self.gap cost_up = self.matrix[y-1][x] + self.gap cost_upleft = self.matrix[y-1][x-1] \ + self.__score(self.S[y-1], self.T[x-1]) self.matrix[y][x] = max(cost_left, cost_upleft, cost_up) def __fillBanded(self, width): ''' Fill the matrix using the recursive relation, but restrain to cells at distance at most from the diagonal ''' for y in range(1, len(self.S)+1): for x in range(max(y - width, 1), min(y + width + 1, len(self.T)+1)): # Banded celles always have an upleft-neighbor cost_upleft = self.matrix[y-1][x-1] \ + self.__score(self.S[y-1], self.T[x-1]) # If no up-neighbor, the cost of coming from the above is set to # infinite if x == min(y + width + 1, len(self.T)+1) - 1: cost_up = - float('inf') else: cost_up = self.matrix[y-1][x] + self.gap # If no left--neighbor, the cost of coming from the left is set # to infinite if x == max(y - width, 1): cost_left = - float('inf') else: cost_left = self.matrix[y][x-1] + self.gap self.matrix[y][x] = max(cost_left, cost_upleft, cost_up) def __computeGlobalAlignementFull(self): ''' Retrieve *an* alignment of maximal score. NOTE. This function assumes that the matrix has been filled recursively ''' algn_S = "" algn_T = "" y = len(self.S) x = len(self.T) while (x, y) != (0, 0): if self.matrix[y][x] == self.matrix[y-1][x] + self.gap: # up algn_S = self.S[y-1] + algn_S algn_T = '-' + algn_T y -= 1 elif self.matrix[y][x] == self.matrix[y][x-1] + self.gap: # left algn_S ='-' + algn_S algn_T = self.T[x-1] + algn_T x -= 1 elif self.matrix[y][x] == self.matrix[y-1][x-1] \ + self.__score(self.S[y-1], self.T[x-1]): # upleft algn_S = self.S[y-1] + algn_S algn_T = self.T[x-1] + algn_T x -= 1 y -= 1 return algn_S, algn_T def __computeIdArray(self, algn_S, algn_T): ''' Compute a boolean array that states whether characters from the alignements match, pairwise ''' return map(lambda x: x[0] == x[1], zip(algn_S, algn_T)) ## Public methods ########################################################## def printMatrix(self): ''' Display the matrix in a pretty way ''' width = 4 vide = " " line = f"{vide:>{2*width}}" for j in range(0,len(self.T)): line += f"{self.T[j]:>{width}}" print(line) line = f"{vide:>{width}}" for j in range(0,len(self.T)+1): line += f"{self.matrix[0][j]:>{width}}" print(line) for i in range(1,len(self.S)+1): line = f"{self.S[i-1]:>{width}}" for j in range(0,len(self.T)+1): line += f"{self.matrix[i][j]:>{width}}" print(line) def globalAlignementFull(self, displayResults=True): ''' Full pipeline for a global alignment, in O(max{|S|, |T|}^2) time ''' self.__initFull() self.__fillFull() if displayResults: algn_S, algn_T = self.__computeGlobalAlignementFull() id_array = list(self.__computeIdArray(algn_S, algn_T)) id_perc = sum(id_array) / len(algn_S) * 100 print() print(f"S: {algn_S}") print(" " + ''.join(map(lambda x: '|' if x else ' ', id_array))) print(f"T: {algn_T}") print("---" + "-"*len(algn_S)) print(f"score: {self.matrix[-1][-1]}") print(f"sim: {id_perc :.1f}% identity") print() def globalAlignementBanded(self, width, displayResults=True): ''' Full pipeline for a global alignment, in O(max{|S|, |T|} * width) time ''' # If the width is too small, matrix[-1][-1] would never be computed # Raise an error in this regime of parameters if width < abs(len(self.S) - len(self.T)): raise Exception("Band width is too low") self.__initBanded(width) self.__fillBanded(width) if displayResults: print() print(f"score: {self.matrix[-1][-1]}") print() ##[ End of DynamicMatrix ]###################################################### def runBanded(): dm = DynamicMatrix("GGATAGC", "AATGAATC", +2, -1, -2) dm.globalAlignementBanded(2) dm.printMatrix() def run(): dm = DynamicMatrix("GGATAGC", "AATGAATC", +2, -1, -2) dm.globalAlignementFull() dm.printMatrix() def random_dna_of_length(n): dna = "" letters = ['A', 'T', 'C', 'G'] for _ in range(n): dna += random.choice(letters) return dna def test_heuristics_on_random_data(): for dna_len in [500, 1000, 2000]: print(f"=== DNALEN: {dna_len} ===") print("Width\tScore\tTime") print("---------------------") seq_A = random_dna_of_length(dna_len) seq_B = random_dna_of_length(dna_len) for width in [1, 5, 10, 20, 40]: start_time = time.time() dm = DynamicMatrix(seq_A, seq_B, 2, -1, -2) dm.globalAlignementBanded(width, displayResults=False) print(f"{width}\t\t{dm.matrix[-1][-1]}\t\t{time.time()-start_time :.2f}s") print("---------------------") start_time = time.time() dm = DynamicMatrix(seq_A, seq_B, 2, -1, -2) dm.globalAlignementFull(displayResults=False) print(f"_\t\t{dm.matrix[-1][-1]}\t\t{time.time()-start_time :.2f}s") print()