Skip to content

Commit 806aeb8

Browse files
committed
binary conversion
Former-commit-id: 2af30771175e08c41465f6c6073a75cd1a584f28
1 parent 441ca34 commit 806aeb8

File tree

6 files changed

+197
-9
lines changed

6 files changed

+197
-9
lines changed

brisera/config.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,17 @@ class BriseraConfiguration(confire.Configuration):
1717
"""
1818
Meaningful defaults and required configurations.
1919
20-
debug: the app will print or log debug statements
21-
testing: the app will not overwrite important resources
20+
debug: the app will print or log debug statements
21+
testing: the app will not overwrite important resources
22+
max_chunk: the maximum chunk of a sequence for mapping
23+
overlap: the bp overlap to carry over in sequences
24+
min_read_len: the minimum length of the reads
25+
max_read_len: the maximum length of the reads
26+
k: number of mismatches to allow (bigger means more time)
27+
allow_diff: False is mismatches only, True indels as well
28+
filter_align: False uses all alignments, True only report unambiguous best alignment
29+
block_size: number of qry and ref tuples to consider at a time in the reduce phase
30+
redundancy: number of copies of low complexity seeds to use
2231
"""
2332

2433
CONF_PATHS = [
@@ -27,9 +36,25 @@ class BriseraConfiguration(confire.Configuration):
2736
os.path.abspath("conf/brisera.yaml"), # Local configuration
2837
]
2938

30-
debug = True
31-
testing = True
32-
39+
debug = True
40+
testing = True
41+
max_chunk = 65535
42+
overlap = 1024 # ensure this is longer than longest read
43+
min_read_len = 36
44+
max_read_len = 36
45+
k = 3
46+
allow_diff = False
47+
fliter_align = True
48+
block_size = 128
49+
redundancy = 16
50+
51+
@property
52+
def seed_len(self):
53+
return self.min_read_len / (self.k+1)
54+
55+
@property
56+
def flank_len(self):
57+
return self.max_read_len - self.seed_len + self.k
3358

3459
## Load settings immediately for import
3560
settings = BriseraConfiguration.load()

brisera/convert.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
Handles the conversion of a FASTA sequence into a sequence format
3+
"""
4+
5+
from brisera.utils import fasta
6+
from brisera.config import settings
7+
8+
class FastaChunker(object):
9+
10+
def __init__(self, path):
11+
self.path = path
12+
self.min_sequence_len = None
13+
self.max_sequence_len = None
14+
self.sequences = 0
15+
16+
def chunk(self, sequence):
17+
"""
18+
Chunks sequences and also records the min/max lengths.
19+
Yields a tuple as follows:
20+
0: the sequence chunk which is < settings.max_chunk
21+
1: the offset of the sequence from the original
22+
2: a boolean tag indicating a reference sequence (or last chunk)
23+
"""
24+
25+
length = len(sequence)
26+
27+
# Record the minimum and maximum sequence lengths
28+
if self.min_sequence_len is None or length < self.min_sequence_len:
29+
self.min_sequence_len = length
30+
31+
if self.max_sequence_len is None or length > self.max_sequence_len:
32+
self.max_sequence_len = length
33+
34+
# Alert if the sequence is large
35+
if length > 100:
36+
print "Large sequence discovered: %ibp" % length
37+
38+
offset = 0
39+
numchunks = 0
40+
41+
while offset < length:
42+
numchunks += 1
43+
end = min(offset+settings.max_chunk, length)
44+
45+
chunk = sequence[offset:end]
46+
yield (chunk, offset, end == length)
47+
48+
if end == length:
49+
offset = length
50+
else:
51+
offset = end - settings.overlap
52+
53+
def convert(self, writer):
54+
"""
55+
The main entry point, convert the FASTA file and output it by
56+
writing it to the given stream (the writer).
57+
"""
58+
59+
for idx, seq in self:
60+
for chunk in self.chunk(seq):
61+
record = str((idx, chunk))
62+
writer.write(record+"\n")
63+
break
64+
break
65+
66+
def __iter__(self):
67+
"""
68+
Iterates over all the sequences using fasta reader and emits the
69+
1-indexed idx, sequence for each (omiting the label). Note that the
70+
sequence will be completely uppercase.
71+
"""
72+
for label, sequence in fasta(self.path):
73+
self.sequences += 1 # Count the number of sequences
74+
yield (self.sequences, sequence.upper())
75+
76+
if __name__ == '__main__':
77+
import sys
78+
from brisera.utils import fixture
79+
path = fixture('100k.fa', 'cloudburst')
80+
chunker = FastaChunker(path)
81+
chunker.convert(sys.stdout)

brisera/records.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Utilities to help create and serialize records in binary format
3+
"""
4+
5+
DNA_BYTES = {
6+
'A': 0x00,
7+
'C': 0x01,
8+
'G': 0x02,
9+
'T': 0x04,
10+
'N': 0x08,
11+
'space': 0x0F,
12+
'hardstop': 0xFF,
13+
}
14+
15+
BYTES_DNA = dict((v, k) for (k,v) in DNA_BYTES.items())
16+
17+
def dna_from_seq(dna, pos, length):
18+
if length == 0:
19+
return ""
20+
21+
alen = 2 * length
22+
end = pos + length - 1
23+
24+
if (dna[end] & 0x0F) == DNA_BYTES['space']:
25+
alen -= 1
26+
27+
idx = 0
28+
arr = [""] * alen
29+
30+
while pos < end:
31+
arr[idx] = (dna[pos] & 0xF0) >> 4
32+
arr[idx+1] = (dna[pos] & 0x0F)
33+
34+
pos += 1
35+
idx += 2
36+
37+
arr[idx] = (dna[pos] & 0xF0) >> 4
38+
idx += 1
39+
40+
if (dna[pos] & 0x0F) != DNA_BYTES['space']:
41+
arr[idx] = dna[pos] & 0x0F
42+
43+
string = ""
44+
for b in arr:
45+
string += BYTES_DNA[b]
46+
47+
return string
48+
49+
def record_from_bytes(raw):
50+
51+
last_chunk = raw[0] == 1
52+
offset = (
53+
(raw[1] & 0xFF) << 24 |
54+
(raw[2] & 0xFF) << 16 |
55+
(raw[3] & 0xFF) << 8 |
56+
raw[4] & 0xFF
57+
)
58+
sequence = dna_from_seq(raw, 5, len(raw)-5)
59+
60+
return sequence, offset, last_chunk
61+
62+
if __name__ == '__main__':
63+
value = bytearray(b'\x01\x00\x00\x00\x00!\x14$AD@\x10B\x04DD"A@$$\x04"')
64+
print record_from_bytes(value)

conf/brisera-example.yaml

+20-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
# Example configuration file for the Brisera application
22
# Copy and paste this into a file named "brisera.yaml" and add your
3-
# settings
3+
# settings the defaults are configured to the CloudBurst sample.
44

5-
debug: true
6-
testing: false
5+
debug: true # the app will print or log debug statements
6+
testing: false # the app will not overwrite important resources
7+
8+
# the bp overlap to carry over in sequences
9+
# (ensure this is longer than longest read)
10+
overlap: 1024
11+
max_chunk: 65535 # the maximum length of a chunk in the mapping
12+
13+
min_read_len: 36 # the minimum length of the reads
14+
max_read_len: 36 # the maximum length of the reads
15+
16+
17+
k: 3 # number of mismatches to allow (bigger means more time)
18+
allow_diff: false # False is mismatches only, True indels as well
19+
fliter_align: true # False uses all alignments, True only report unambiguous best alignment
20+
21+
22+
block_size: 128 # number of qry and ref tuples to consider at a time in the reduce phase
23+
redundancy: 16 # number of copies of low complexity seeds to use

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
PyYAML==3.11
2+
avro==1.7.7
23
confire==0.2.0
34
nose==1.3.4
45
python-dateutil==2.3

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
"install_requires": requires,
3838
"classifiers": classifiers,
3939
"zip_safe": False,
40-
"scripts": [],
40+
"scripts": ["apps/convert_fasta.py"],
4141
}
4242

4343
setup(**config)

0 commit comments

Comments
 (0)