Skip to content

Commit f6b3a12

Browse files
committed
alignment with seed reduce
Former-commit-id: ef438849c031ae4282b9286c1d9eea8d3f682563
1 parent 4225ff6 commit f6b3a12

File tree

5 files changed

+69
-15
lines changed

5 files changed

+69
-15
lines changed

apps/brisera_align.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,21 @@
1414
##########################################################################
1515

1616
import sys
17+
import csv
1718
import brisera
19+
import StringIO
1820

1921
from brisera.utils import timeit
2022
from brisera.config import settings
2123
from pyspark import SparkConf, SparkContext
2224

25+
def write_records(records):
    """
    Formats every record of one partition as a single CSV line.

    Intended for ``rdd.mapPartitions(write_records).saveAsTextFile(path)``:
    ``records`` is the iterator over one partition and each record is a
    sequence of fields. Returns a list holding one CSV-formatted string per
    record, without a line terminator, because ``saveAsTextFile`` writes each
    returned element on a line of its own. An empty partition yields an
    empty list instead of a blank line.
    """
    try:
        from StringIO import StringIO   # Python 2, which this script targets
    except ImportError:
        from io import StringIO

    # Keep the csv default terminator so quoting rules stay the same, then
    # strip it from each row: the text-file writer adds the newline itself.
    terminator = "\r\n"
    buf = StringIO()
    writer = csv.writer(buf, lineterminator=terminator)

    lines = []
    for record in records:
        writer.writerow(record)
        lines.append(buf.getvalue()[:-len(terminator)])

        # Reuse the buffer for the next row
        buf.seek(0)
        buf.truncate(0)

    return lines
31+
2332
@timeit
2433
def run_brisera_alignment(sc, refpath, qrypath, outpath):
2534
"""
@@ -35,7 +44,7 @@ def run_brisera_alignment(sc, refpath, qrypath, outpath):
3544
fdelta = 0
3645

3746
# Write alignments to disk
38-
alignments.saveAsTextFile(outpath)
47+
alignments.mapPartitions(write_records).saveAsTextFile(outpath)
3948

4049
return alignments, adelta, fdelta
4150

@@ -64,4 +73,4 @@ def run_brisera_alignment(sc, refpath, qrypath, outpath):
6473
alignments, adelta, fdelta = result
6574
print "Total execution time: %0.3f seconds" % delta
6675
print " Alignment time: %0.3f seconds" % adelta
67-
print " Filtering time: %0.3f seconds" % fdelta
76+
# print " Filtering time: %0.3f seconds" % fdelta

brisera/align.py

+52-9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
## Imports
77
##########################################################################
88

9+
from brisera.lv import *
910
from brisera.utils import *
1011
from brisera.records import *
1112
from brisera.config import settings
@@ -91,9 +92,9 @@ def map_reference(self, arg):
9192
if self.redundancy > 1 and repseed(sequence, start, self.seed_len):
9293
for rdx in xrange(self.redundancy):
9394
r = rdx % self.redundancy
94-
yield (seed, (key, offset, is_last, leftstart, leftlen, rightstart, rightlen, r))
95+
yield (seed, (key, offset, is_last, sequence[leftstart:leftlen], sequence[rightstart:rightlen], r))
9596
else:
96-
yield (seed, (key, offset, is_last, leftstart, leftlen, rightstart, rightlen, 0))
97+
yield (seed, (key, offset, is_last, sequence[leftstart:leftlen], sequence[rightstart:rightlen], 0))
9798

9899
def map_queries(self, arg):
99100
"""
@@ -135,9 +136,43 @@ def map_queries(self, arg):
135136

136137
if self.redundancy > 1 and repseed(sequence, idx, self.seed_len):
137138
r = key % self.redundancy
138-
yield (seed, (key, idx, is_last, 0, idx, rightstart, rightlen, r))
139+
yield (seed, (key, idx, is_last, sequence[0:idx], sequence[rightstart:rightlen], r))
139140
else:
140-
yield (seed, (key, idx, is_last, 0, idx, rightstart, rightlen, 0))
141+
yield (seed, (key, idx, is_last, sequence[0:idx], sequence[rightstart:rightlen], 0))
142+
143+
def extend(self, arg):
    """
    Performs the extend portion of the BLAST algorithm.

    ``arg`` is one ``(seed, (query, reference))`` pair from joining the
    mapped queries with the mapped reference on their shared seed. Both
    records are read positionally: index 0 is the record key, index 1 the
    seed offset, index 3 the left flank and index 4 the right flank.

    Returns ``(reference key, refstart, refend, differences)`` when both
    flanks align, otherwise ``NO_ALIGNMENT``. The right flank is only given
    the difference budget (``self.k``) that the left flank did not use.
    """
    _seed, (query, reference) = arg
    refstart = reference[1]
    refend = refstart + self.seed_len
    differences = 0

    try:
        # Align left flanks
        if len(query[3]) != 0:
            left = LandauVishkin().kdifference(reference[3], query[3], self.k)
            # A first slot of -1 is treated as "no alignment found"
            if left[0] == -1:
                return NO_ALIGNMENT
            if not is_bazea_yates_seed(left, len(query[3]), self.seed_len):
                return NO_ALIGNMENT

            refstart -= left[0]
            differences = left[1]

        # Align right flanks
        if len(query[4]) != 0:
            # Use an instance, exactly as for the left flank
            right = LandauVishkin().kdifference(reference[4], query[4], self.k - differences)
            # Check the right-flank result itself; the left-flank result may
            # not even exist when the query has no left flank.
            if right[0] == -1:
                return NO_ALIGNMENT

            refend += right[0]
            differences += right[1]

        return reference[0], refstart, refend, differences
    except Exception:
        # Best effort: a record that cannot be extended is reported as
        # unaligned, but interpreter-exit signals are no longer swallowed.
        return NO_ALIGNMENT
141176

142177
##########################################################################
143178
## Spark Functionality
@@ -148,13 +183,21 @@ def align_all(sc, refpath, qrypath):
148183
"""
149184
Returns an RDD of alignments (no writes to disk)
150185
"""
151-
reference = sc.sequenceFile(refpath)
152-
queries = sc.sequenceFile(qrypath)
153-
alignment = MerAlignment()
186+
reference = sc.sequenceFile(refpath)
187+
queries = sc.sequenceFile(qrypath)
188+
alignment = MerAlignment()
154189

155190
# Perform mapping
156-
reference = reference.flatMap(alignment.map_reference)
157-
return reference
191+
reference = reference.flatMap(alignment.map_reference)
192+
queries = queries.flatMap(alignment.map_queries)
193+
194+
# Find shared seeds
195+
shared = queries.join(reference)
196+
197+
# Compute alignments with Landau-Vishkin
198+
alignments = shared.map(alignment.extend).filter(lambda a: a != NO_ALIGNMENT )
199+
200+
return alignments
158201

159202
if __name__ == '__main__':
160203
from brisera.convert import *

brisera/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class BriseraConfiguration(confire.Configuration):
4444
max_read_len = 36
4545
k = 3
4646
allow_diff = False
47-
filter_align = True
47+
filter_align = False
4848
block_size = 128
4949
redundancy = 16
5050

brisera/filter.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""
22
Filters the best alignments from the computed alignments.
3+
4+
Was unable to implement due to time constraints.
35
"""
46

57
##########################################################################

conf/brisera-example.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ min_read_len: 36 # the minimum length of the reads
1414
max_read_len: 36 # the maximum length of the reads
1515

1616

17-
k: 3 # number of mismatches to allow (bigger means more time)
18-
allow_diff: false # False is mismatches only, True indels as well
19-
fliter_align: true # False uses all alignments, True only report unambiguous best alignment
17+
k: 3 # number of mismatches to allow (bigger means more time)
18+
allow_diff: false # False is mismatches only, True indels as well
19+
fliter_align: false # False uses all alignments, True only report unambiguous best alignment
2020

2121

2222
block_size: 128 # number of qry and ref tuples to consider at a time in the reduce phase

0 commit comments

Comments
 (0)