|
| 1 | +''' |
| 2 | + NeedlemanWunsch.py |
| 3 | + This file implements the Needleman-Wunsch sequence alignment algorithm. The code |
| 4 | + is not mine, the credit goes completely to: |
| 5 | + https://wilkelab.org/classes/SDS348/2018_spring/labs/lab13-solution.html |
| 6 | +''' |
| 7 | + |
| 8 | +# Use these values to calculate scores |
| 9 | +gap_penalty = -1 |
| 10 | +match_award = 1 |
| 11 | +mismatch_penalty = -2 |
| 12 | + |
| 13 | +# A function for making a matrix of zeroes |
| 14 | + |
| 15 | + |
| 16 | +def zeros(rows, cols): |
| 17 | + # Define an empty list |
| 18 | + retval = [] |
| 19 | + # Set up the rows of the matrix |
| 20 | + for x in range(rows): |
| 21 | + # For each row, add an empty list |
| 22 | + retval.append([]) |
| 23 | + # Set up the columns in each row |
| 24 | + for y in range(cols): |
| 25 | + # Add a zero to each column in each row |
| 26 | + retval[-1].append(0) |
| 27 | + # Return the matrix of zeros |
| 28 | + return retval |
| 29 | + |
| 30 | +# A function for determining the score between any two bases in alignment |
| 31 | + |
| 32 | + |
| 33 | +def match_score(alpha, beta): |
| 34 | + if alpha == beta: |
| 35 | + return match_award |
| 36 | + elif alpha == '-' or beta == '-': |
| 37 | + return gap_penalty |
| 38 | + else: |
| 39 | + return mismatch_penalty |
| 40 | + |
| 41 | +# The function that actually fills out a matrix of scores |
| 42 | + |
| 43 | + |
| 44 | +def needleman_wunsch(seq1, seq2): |
| 45 | + |
| 46 | + # Store length of two sequences |
| 47 | + n = len(seq1) |
| 48 | + m = len(seq2) |
| 49 | + |
| 50 | + # Generate matrix of zeros to store scores |
| 51 | + score = zeros(m + 1, n + 1) |
| 52 | + |
| 53 | + # Calculate score table |
| 54 | + |
| 55 | + # Fill out first column |
| 56 | + for i in range(0, m + 1): |
| 57 | + score[i][0] = gap_penalty * i |
| 58 | + |
| 59 | + # Fill out first row |
| 60 | + for j in range(0, n + 1): |
| 61 | + score[0][j] = gap_penalty * j |
| 62 | + |
| 63 | + # Fill out all other values in the score matrix |
| 64 | + for i in range(1, m + 1): |
| 65 | + for j in range(1, n + 1): |
| 66 | + # Calculate the score by checking the top, left, and diagonal cells |
| 67 | + match = score[i - 1][j - 1] + match_score(seq1[j - 1], seq2[i - 1]) |
| 68 | + delete = score[i - 1][j] + gap_penalty |
| 69 | + insert = score[i][j - 1] + gap_penalty |
| 70 | + # Record the maximum score from the three possible scores |
| 71 | + # calculated above |
| 72 | + score[i][j] = max(match, delete, insert) |
| 73 | + |
| 74 | + # Traceback and compute the alignment |
| 75 | + |
| 76 | + # Create variables to store alignment |
| 77 | + align1 = "" |
| 78 | + align2 = "" |
| 79 | + |
| 80 | + # Start from the bottom right cell in matrix |
| 81 | + i = m |
| 82 | + j = n |
| 83 | + |
| 84 | + similarity = 0 |
| 85 | + |
| 86 | + # We'll use i and j to keep track of where we are in the matrix, just like |
| 87 | + # above |
| 88 | + while i > 0 and j > 0: # end touching the top or the left edge |
| 89 | + score_current = score[i][j] |
| 90 | + similarity += score_current |
| 91 | + score_diagonal = score[i - 1][j - 1] |
| 92 | + score_up = score[i][j - 1] |
| 93 | + score_left = score[i - 1][j] |
| 94 | + |
| 95 | + # Check to figure out which cell the current score was calculated from, |
| 96 | + # then update i and j to correspond to that cell. |
| 97 | + if score_current == score_diagonal + \ |
| 98 | + match_score(seq1[j - 1], seq2[i - 1]): |
| 99 | + align1 += seq1[j - 1] |
| 100 | + align2 += seq2[i - 1] |
| 101 | + i -= 1 |
| 102 | + j -= 1 |
| 103 | + elif score_current == score_up + gap_penalty: |
| 104 | + align1 += seq1[j - 1] |
| 105 | + align2 += '-' |
| 106 | + j -= 1 |
| 107 | + elif score_current == score_left + gap_penalty: |
| 108 | + align1 += '-' |
| 109 | + align2 += seq2[i - 1] |
| 110 | + i -= 1 |
| 111 | + |
| 112 | + # Finish tracing up to the top left cell |
| 113 | + while j > 0: |
| 114 | + align1 += seq1[j - 1] |
| 115 | + align2 += '-' |
| 116 | + j -= 1 |
| 117 | + while i > 0: |
| 118 | + align1 += '-' |
| 119 | + align2 += seq2[i - 1] |
| 120 | + i -= 1 |
| 121 | + |
| 122 | + # Since we traversed the score matrix from the bottom right, our two sequences will be reversed. |
| 123 | + # These two lines reverse the order of the characters in each sequence. |
| 124 | + align1 = align1[::-1] |
| 125 | + align2 = align2[::-1] |
| 126 | + |
| 127 | + min = mismatch_penalty * len(seq1) |
| 128 | + rangeValues = match_award * len(seq1) - min |
| 129 | + |
| 130 | + return(align1, align2, similarity) |
0 commit comments