Skip to content

Commit 4225ff6

Browse files
committed
landau vishkin
Former-commit-id: 5ff7df94e028bc7a25f2e3b4b3b4950bde951fcf
1 parent c150f8e commit 4225ff6

File tree

2 files changed

+278
-0
lines changed

2 files changed

+278
-0
lines changed

brisera/lv.py

+277
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
"""
2+
Basic implementation of Landau-Viskin for alignments
3+
"""
4+
5+
##########################################################################
6+
## Imports
7+
##########################################################################
8+
9+
import sys
10+
from numpy import zeros
11+
12+
##########################################################################
13+
## Module Constants
14+
##########################################################################
15+
16+
NO_ALIGNMENT = ( 0, 0,None,None,0)
17+
BAD_ALIGNMENT = (-1,-1,None,None,0)
18+
19+
class LandauVishkin(object):
20+
21+
def __init__(self):
22+
self.L = None
23+
self.B = None
24+
self.dist = None
25+
self.what = None
26+
27+
def configure(self, k):
28+
self.L = zeros((k*2+1, k+1), dtype=int)
29+
self.B = zeros((k*2+1, k+1), dtype=int)
30+
self.dist = zeros(k+1, dtype=int)
31+
self.what = zeros(k+1, dtype=int)
32+
33+
def kmismatch(self, text, pattern, k):
34+
self.configure(k)
35+
m = len(pattern)
36+
n = len(text)
37+
38+
if m == 0 or n == 0:
39+
return NO_ALIGNMENT
40+
41+
last = m if m < n else n
42+
mm = 0
43+
match = 0
44+
45+
for pos in xrange(last):
46+
if text[pos] != pattern[pos]:
47+
self.what[mm] = 0
48+
self.dist[mm] = match
49+
match = 0
50+
51+
mm += 1
52+
53+
if mm > k:
54+
return BAD_ALIGNMENT
55+
match += 1
56+
57+
self.dist[mm] = match
58+
self.what[mm] = 2
59+
60+
return (last, mm, self.dist, self.what, mm+1)
61+
62+
def kdifference(self, text, pattern, k):
63+
self.configure(k)
64+
m = len(pattern)
65+
n = len(text)
66+
67+
if m == 0 or n == 0:
68+
return NO_ALIGNMENT
69+
70+
# Compute via dynamic programming
71+
for e in xrange(k+1):
72+
for d in xrange(-e, e):
73+
row = -1
74+
75+
if e > 0:
76+
if abs(d) < e:
77+
up = self.L[k+d][e-1] + 1
78+
if up > row:
79+
row = up
80+
self.B[k+d][e] = 0
81+
82+
if d > -(e-1):
83+
left = self.L[k+d-1][e-1]
84+
if left > row:
85+
row = left
86+
self.B[k+d][e] = -1
87+
88+
if d < (e-1):
89+
right = self.L[k+d+1][e-1]+1
90+
if right > row:
91+
row = right
92+
self.B[k+d][e] = 1
93+
else:
94+
row = 0
95+
96+
while ((row < m) and (row+d < n) and (pattern[row] == text[row+d])):
97+
row += 1
98+
99+
self.L[k+d][e] = row
100+
101+
if (row+d == n) or (row == m):
102+
# reached the end of the pattern or text
103+
distlen = e+1
104+
105+
E = e
106+
D = d
107+
108+
self.what[E] = 2
109+
110+
while e >= 0:
111+
b = self.B[k+d][e]
112+
if e > 0:
113+
self.what[e-1] = b
114+
115+
self.dist[e] = self.L[k+d][e]
116+
117+
if e < E:
118+
self.dist[e+1] -= self.dist[e]
119+
120+
d += b
121+
e -= 1
122+
123+
return (row+D, E, self.dist, self.what, distlen)
124+
125+
# How did we get here?
126+
raise TypeError("Unexpected end of dynamic programming")
127+
128+
##########################################################################
129+
## Helper functions for alignments
130+
##########################################################################
131+
132+
def is_bazea_yates_seed(alignment, qlen, kmerlen):
133+
# Since an alignment may be recompute k+1 times for each of the k+1 seeds,
134+
# see if the current alignment is the leftmost alignment by checking for
135+
# differences in the proceeding chunks of the query
136+
137+
alignlen, differences, dist, what, distlen = alignment
138+
num_buckets = qlen / kmerlen
139+
lastbucket = -1
140+
distdelta = 0
141+
pos = 0
142+
143+
for idx in xrange(distlen):
144+
pos += dist[idx] + distdelta
145+
distdelta = 0
146+
147+
if what[idx] == 2:
148+
continue # end of string
149+
elif what[idx] == -1:
150+
# gap character occurs
151+
if pos % kmerlen == 0:
152+
continue # occurs between buckets, skip
153+
154+
bucket = pos / kmerlen
155+
if (bucket - lastbucket) > 1:
156+
return False
157+
lastbucket = bucket
158+
159+
return lastbucket == (num_buckets-1)
160+
161+
def str_alignment(alignment):
162+
alignlen, differences, dist, what, distlen = alignment
163+
astr = "%i;%i;" % (alignlen, differences)
164+
astr += ";".join("%i" % i for i in dist)
165+
astr += ";"
166+
astr += ";".join("%i" % i for i in what)
167+
astr += ";"
168+
return astr
169+
170+
def print_alignment(alignment, t, p):
171+
alignlen, differences, dist, what, distlen = alignment
172+
173+
if dist is None:
174+
sys.stdout.write("t: %s" % text)
175+
else:
176+
sys.stdout.write("a: ")
177+
nextstride = 0
178+
for idx in xrange(distlen):
179+
if what[idx] == 2:
180+
break
181+
stride = dist[idx] + nextstride
182+
for jdx in xrange(stride):
183+
sys.stdout.write(" ")
184+
sys.stdout.write("*")
185+
186+
nextstride = 0
187+
if what[idx] == 1 or what[idx] == 0:
188+
nextstride = -1
189+
sys.stdout.write("\n")
190+
191+
sys.stdout.write("t: ")
192+
pos = 0
193+
nextstride = 0
194+
for idx in xrange(distlen):
195+
stride = dist[idx] + nextstride
196+
197+
nextstride = -1 if what[idx] == 1 else 0
198+
199+
for jdx in xrange(stride):
200+
sys.stdout.write(t[pos])
201+
pos += 1
202+
203+
if what[idx] == 1:
204+
sys.stdout.write('-')
205+
elif what[idx] == -1:
206+
sys.stdout.write(t[pos])
207+
pos += 1
208+
209+
sys.stdout.write("\np: ")
210+
211+
if dist is None:
212+
sys.stdout.write(p)
213+
else:
214+
pos = 0
215+
for idx in xrange(distlen):
216+
for jdx in xrange(dist[idx]):
217+
sys.stdout.write(p[pos])
218+
pos += 1
219+
220+
if what[idx] == -1:
221+
sys.stdout.write("-")
222+
223+
sys.stdout.write("\n")
224+
225+
def check_bys(k, t, p, shouldbeseed, name, kmerlen):
226+
print "checking %s (%s)" % (name, shouldbeseed)
227+
228+
if len(p) != 10:
229+
raise ValueError("Read length incorrect: %i" % len(p))
230+
231+
lv = LandauVishkin()
232+
a = lv.kdifference(t,p,k)
233+
print_alignment(a,t,p)
234+
235+
if is_bazea_yates_seed(a, len(p), kmerlen) != shouldbeseed:
236+
print "should match!"
237+
238+
print
239+
240+
if __name__ == '__main__':
241+
k = 5
242+
lv = LandauVishkin()
243+
text = "TTTCTCAAACACCTATATTTTTTGT"
244+
pattern = "TTTCTCAAACACCTATATTTTTT"
245+
246+
align = lv.kdifference(text, pattern, k)
247+
print "ASCII: %s" % str_alignment(align)
248+
print_alignment(align, text, pattern)
249+
250+
251+
# Check BY seeds
252+
k = 3
253+
KMER_LEN = 5
254+
255+
t = "AACCGATTCCCAA"
256+
257+
check_bys(k, t, "AACCGATTCC", False, "exact", KMER_LEN);
258+
259+
check_bys(k, t, "ACCCGATTCC", False, "1-mismatch", KMER_LEN);
260+
check_bys(k, t, "ACCGATTCCC", False, "1-del", KMER_LEN);
261+
check_bys(k, t, "AAACCGATTC", False, "1-ins", KMER_LEN);
262+
263+
check_bys(k, t, "AACCGATCCC", False, "2-mismatch", KMER_LEN);
264+
check_bys(k, t, "AACCGTTCCC", False, "2-del", KMER_LEN);
265+
check_bys(k, t, "AACCGATTTC", False, "2-ins", KMER_LEN);
266+
267+
check_bys(k, t, "ACCCGATCCC", True, "1-mis, 2-mis", KMER_LEN);
268+
check_bys(k, t, "ATCCGATTCA", True, "1-mis, 2-del", KMER_LEN);
269+
check_bys(k, t, "ATCCGATTTC", True, "1-mis, 2-ins", KMER_LEN);
270+
271+
check_bys(k, t, "ACCGATACCC", True, "1-del, 2-mis", KMER_LEN);
272+
check_bys(k, t, "ACCGATCCCA", True, "1-del, 2-del", KMER_LEN);
273+
check_bys(k, t, "ACCGATTTCC", True, "1-del, 2-ins", KMER_LEN);
274+
275+
check_bys(k, t, "TACCGTTCCA", True, "1 del boundary", KMER_LEN);
276+
check_bys(k, t, "AACCGTCCCA", False, "1 del boundary", KMER_LEN);
277+
check_bys(k, t, "AACCGCCCAA", False, "1 del boundary", KMER_LEN);

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ PyYAML==3.11
22
avro==1.7.7
33
confire==0.2.0
44
nose==1.3.4
5+
numpy==1.9.1
56
python-dateutil==2.3
67
six==1.8.0
78
wsgiref==0.1.2

0 commit comments

Comments
 (0)