binary conversion

bbengfort · bbengfort · commit 806aeb88aeff · 2014-12-10T19:44:02.000-05:00
Former-commit-id: 2af30771175e08c41465f6c6073a75cd1a584f28
diff --git a/brisera/config.py b/brisera/config.py
@@ -17,8 +17,17 @@ class BriseraConfiguration(confire.Configuration):
     """
     Meaningful defaults and required configurations.
 
-    debug:    the app will print or log debug statements
-    testing:  the app will not overwrite important resources
+    debug:        the app will print or log debug statements
+    testing:      the app will not overwrite important resources
+    max_chunk:    the maximum chunk of a sequence for mapping
+    overlap:      the bp overlap to carry over in sequences
+    min_read_len: the minimum length of the reads
+    max_read_len: the maximum length of the reads
+    k:            number of mismatches to allow (bigger means more time)
+    allow_diff:   False allows mismatches only, True allows indels as well
+    filter_align: False uses all alignments, True reports only the unambiguous best alignment
+    block_size:   number of qry and ref tuples to consider at a time in the reduce phase
+    redundancy:   number of copies of low complexity seeds to use
     """
 
     CONF_PATHS = [
@@ -27,9 +36,25 @@ class BriseraConfiguration(confire.Configuration):
         os.path.abspath("conf/brisera.yaml"),     # Local configuration
     ]
 
-    debug    = True
-    testing  = True
-
+    debug        = True
+    testing      = True
+    max_chunk    = 65535
+    overlap      = 1024  # ensure this is longer than longest read
+    min_read_len = 36
+    max_read_len = 36
+    k            = 3
+    allow_diff   = False
+    fliter_align = True
+    block_size   = 128
+    redundancy   = 16
+
+    @property
+    def seed_len(self):
+        return self.min_read_len / (self.k+1)
+
+    @property
+    def flank_len(self):
+        return self.max_read_len - self.seed_len + self.k
 
 ## Load settings immediately for import
 settings = BriseraConfiguration.load()
diff --git a/brisera/convert.py b/brisera/convert.py
@@ -0,0 +1,81 @@
+"""
+Handles the conversion of a FASTA sequence into a sequence format
+"""
+
+from brisera.utils import fasta
+from brisera.config import settings
+
+class FastaChunker(object):
+
+    def __init__(self, path):
+        self.path = path
+        self.min_sequence_len = None
+        self.max_sequence_len = None
+        self.sequences = 0
+
+    def chunk(self, sequence):
+        """
+        Chunks sequences and also records the min/max lengths.
+        Yields a tuple as follows:
+            0: the sequence chunk, at most settings.max_chunk characters long
+            1: the offset of the chunk within the original sequence
+            2: a boolean tag, True only for the last chunk of the sequence
+        """
+
+        length   = len(sequence)
+
+        # Record the minimum and maximum sequence lengths
+        if self.min_sequence_len is None or length < self.min_sequence_len:
+            self.min_sequence_len = length
+
+        if self.max_sequence_len is None or length > self.max_sequence_len:
+            self.max_sequence_len = length
+
+        # Alert if the sequence is large
+        if length > 100:
+            print "Large sequence discovered: %ibp" % length
+
+        offset    = 0
+        numchunks = 0
+
+        while offset < length:
+            numchunks += 1
+            end = min(offset+settings.max_chunk, length)
+
+            chunk = sequence[offset:end]
+            yield (chunk, offset, end == length)
+
+            if end == length:
+                offset = length
+            else:
+                offset = end - settings.overlap
+
+    def convert(self, writer):
+        """
+        The main entry point, convert the FASTA file and output it by
+        writing it to the given stream (the writer).
+        """
+
+        for idx, seq in self:
+            for chunk in self.chunk(seq):
+                record = str((idx, chunk))
+                writer.write(record+"\n")
+                break
+            break
+
+    def __iter__(self):
+        """
+        Iterates over all the sequences using fasta reader and emits the
+        1-indexed idx, sequence for each (omitting the label). Note that the
+        sequence will be completely uppercase.
+        """
+        for label, sequence in fasta(self.path):
+            self.sequences += 1 # Count the number of sequences
+            yield (self.sequences, sequence.upper())
+
+if __name__ == '__main__':
+    import sys
+    from brisera.utils import fixture
+    path = fixture('100k.fa', 'cloudburst')
+    chunker = FastaChunker(path)
+    chunker.convert(sys.stdout)
diff --git a/brisera/records.py b/brisera/records.py
@@ -0,0 +1,64 @@
+"""
+Utilities to help create and serialize records in binary format
+"""
+
+DNA_BYTES = {
+    'A': 0x00,
+    'C': 0x01,
+    'G': 0x02,
+    'T': 0x04,
+    'N': 0x08,
+    'space': 0x0F,
+    'hardstop': 0xFF,
+}
+
+BYTES_DNA = dict((v, k) for (k,v) in DNA_BYTES.items())
+
+def dna_from_seq(dna, pos, length):
+    if length == 0:
+        return ""
+
+    alen = 2 * length
+    end  = pos + length - 1
+
+    if (dna[end] & 0x0F) == DNA_BYTES['space']:
+        alen -= 1
+
+    idx = 0
+    arr = [""] * alen
+
+    while pos < end:
+        arr[idx]   = (dna[pos] & 0xF0) >> 4
+        arr[idx+1] = (dna[pos] & 0x0F)
+
+        pos += 1
+        idx += 2
+
+    arr[idx] = (dna[pos] & 0xF0) >> 4
+    idx += 1
+
+    if (dna[pos] & 0x0F) != DNA_BYTES['space']:
+        arr[idx] = dna[pos] & 0x0F
+
+    string = ""
+    for b in arr:
+        string += BYTES_DNA[b]
+
+    return string
+
+def record_from_bytes(raw):
+
+    last_chunk = raw[0] == 1
+    offset = (
+        (raw[1] & 0xFF) << 24 |
+        (raw[2] & 0xFF) << 16 |
+        (raw[3] & 0xFF) << 8  |
+         raw[4] & 0xFF
+    )
+    sequence = dna_from_seq(raw, 5, len(raw)-5)
+
+    return sequence, offset, last_chunk
+
+if __name__ == '__main__':
+    value = bytearray(b'\x01\x00\x00\x00\x00!\x14$AD@\x10B\x04DD"A@$$\x04"')
+    print record_from_bytes(value)
diff --git a/conf/brisera-example.yaml b/conf/brisera-example.yaml
@@ -1,6 +1,23 @@
 # Example configuration file for the Brisera application
 # Copy and paste this into a file named "brisera.yaml" and add your
-# settings
+# settings; the defaults are configured for the CloudBurst sample.
 
-debug: true
-testing: false
+debug: true         # the app will print or log debug statements
+testing: false      # the app will not overwrite important resources
+
+# the bp overlap to carry over in sequences
+# (ensure this is longer than longest read)
+overlap: 1024
+max_chunk: 65535 # the maximum length of a chunk in the mapping
+
+min_read_len: 36 # the minimum length of the reads
+max_read_len: 36 # the maximum length of the reads
+
+
+k: 3               # number of mismatches to allow (bigger means more time)
+allow_diff: false  # False is mismatches only, True indels as well
+fliter_align: true # False uses all alignments, True only report unambiguous best alignment
+
+
+block_size: 128    # number of qry and ref tuples to consider at a time in the reduce phase
+redundancy: 16     # number of copies of low complexity seeds to use
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 PyYAML==3.11
+avro==1.7.7
 confire==0.2.0
 nose==1.3.4
 python-dateutil==2.3
diff --git a/setup.py b/setup.py
@@ -37,7 +37,7 @@
     "install_requires": requires,
     "classifiers": classifiers,
     "zip_safe": False,
-    "scripts": [],
+    "scripts": ["apps/convert_fasta.py"],
 }
 
 setup(**config)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`PyYAML==3.11`
	`2`	`+avro==1.7.7`
`2`	`3`	`confire==0.2.0`
`3`	`4`	`nose==1.3.4`
`4`	`5`	`python-dateutil==2.3`
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@`
`37`	`37`	`"install_requires": requires,`
`38`	`38`	`"classifiers": classifiers,`
`39`	`39`	`"zip_safe": False,`
`40`		`- "scripts": [],`
	`40`	`+ "scripts": ["apps/convert_fasta.py"],`
`41`	`41`	`}`
`42`	`42`
`43`	`43`	`setup(**config)`