Skip to content

Commit 72257a6

Browse files
authored
Merge pull request #29 from ShawHahnLab/release-0.3.0
Version 0.3.0
2 parents ea96c91 + 0adb1eb commit 72257a6

15 files changed

Lines changed: 1028 additions & 908 deletions

File tree

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ version: 2.1
22
jobs:
33
build:
44
machine:
5-
image: circleci/classic:latest
5+
image: ubuntu-2004:202201-02
66
steps:
77
- checkout:
88
path: igseq

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,27 @@
11
# Changelog
22

3+
## 0.3.0 - 2022-07-14
4+
5+
### Added
6+
7+
* Support for .afa (as FASTA) and .fq (as FASTQ) in input/output handling
8+
([#26])
9+
10+
### Changed
11+
12+
* Replaced Rhesus germline reference "bernat2021" with "kimdb" version 1.1,
13+
containing additions and corrections since the original publication, and
14+
added a CSV of germline reference information ([#28])
15+
16+
### Fixed
17+
18+
* vdj-match now skips references that are missing gene segments but were only
19+
included implicitly rather than directly named ([#25])
20+
21+
[#28]: https://github.com/ShawHahnLab/igseq/pull/28
22+
[#26]: https://github.com/ShawHahnLab/igseq/pull/26
23+
[#25]: https://github.com/ShawHahnLab/igseq/pull/25
24+
325
## 0.2.0 - 2022-02-15
426

527
### Added

conda/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html
2-
{% set version = "0.2.0" %}
2+
{% set version = "0.3.0" %}
33
{% set build = "0" %}
44

55
package:

igseq/data/germ.csv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Ref,DOI,URL,Notes
2+
rhesus/zhang2019,10.4049/jimmunol.1800342,https://doi.org/10.4049/jimmunol.1800342,IGHV and IGHJ only
3+
rhesus/kimdb,10.1016/j.immuni.2020.12.018,http://kimdb.gkhlab.se/datasets,IGH only; KIMDB v1.1 (updated since publication)
4+
rhesus/sonarramesh,10.3389/fimmu.2017.01407,https://github.com/scharch/SONAR/tree/master/germDB,Trimmed subset of full published sequences
5+
rhesus/imgt,,https://www.imgt.org/download/V-QUEST/IMGT_V-QUEST_reference_directory/Macaca_mulatta/IG,
6+
human/imgt,,https://www.imgt.org/download/V-QUEST/IMGT_V-QUEST_reference_directory/Homo_sapiens/IG,

igseq/data/germ/rhesus/bernat2021/IGH/IGHD.fasta renamed to igseq/data/germ/rhesus/kimdb/IGH/IGHD.fasta

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@ AGGATATTGTAGTGGTGGTGTCTGCTATGCC
3939
>IGHD2-9*01
4040
AGGATATTGTACTAGTACTACTTGCTATGCC
4141
>IGHD3-15*01
42-
GTATTACGAGGATGATTACGGTTACTATTACACCCACAGCGT
42+
GTATTACGAGGATGATTACGGTTACTATTACACC
4343
>IGHD3-16*01
44-
GTATTACTATAGTGGTAGTTATTACTACCACAGTGT
44+
GTATTACTATAGTGGTAGTTATTACTAC
4545
>IGHD3-17*01
4646
GTACTGGGGTGATTATTATGAC
4747
>IGHD3-18*01
48-
GTATTACTATGGTAGTGGTTATTACACCCACAGCGT
48+
GTATTACTATGGTAGTGGTTATTACACC
4949
>IGHD3-19*02_S4720
5050
GTACTGGAGTGATTATTATGAC
5151
>IGHD3-20*01
File renamed without changes.

igseq/data/germ/rhesus/bernat2021/IGH/IGHV.fasta renamed to igseq/data/germ/rhesus/kimdb/IGH/IGHV.fasta

Lines changed: 887 additions & 863 deletions
Large diffs are not rendered by default.

igseq/record.py

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,25 @@
1717
DEFAULT_DUMMY_QUAL = "I"
1818
DEFAULT_COLUMNS = {k: k for k in ["sequence_id", "sequence", "sequence_quality", "sequence_description"]}
1919

20+
# Mapping from file extensions to file format names used here
21+
FMT_EXT_MAP = {
22+
".csv": "csv",
23+
".tsv": "tsv",
24+
".tab": "tsv",
25+
".fa": "fa",
26+
".fasta": "fa",
27+
".afa": "fa",
28+
".fq": "fq",
29+
".fastq": "fq"}
30+
31+
# Mapping from file format names to functions that take file handles and give
32+
# format-specific record iterators.
33+
READERS = {
34+
"csv": lambda hndl: csv.DictReader(hndl),
35+
"tsv": lambda hndl: csv.DictReader(hndl, delimiter="\t"),
36+
"fa": lambda hndl: SeqIO.parse(hndl, "fasta"),
37+
"fq": lambda hndl: SeqIO.parse(hndl, "fastq")}
38+
2039
class RecordHandler:
2140
"""Abstract class for shared generic record reading and writing
2241
functionality. This behaves as a context manager to handle file opening
@@ -63,28 +82,7 @@ def close(self):
6382

6483
def infer_fmt(self, fmt=None):
6584
"""Guess file format from our path, with optional default."""
66-
try:
67-
path = Path(self.pathlike)
68-
except TypeError:
69-
path = Path(str(self.pathlike.name))
70-
fmt_inferred = None
71-
if path.suffix.lower() in [".csv"]:
72-
fmt_inferred = "csv"
73-
elif path.suffix.lower() in [".tsv", ".tab"]:
74-
fmt_inferred = "tsv"
75-
elif path.suffix.lower() in [".fa", ".fasta"]:
76-
fmt_inferred = "fa"
77-
elif path.suffix.lower() in [".fq", ".fastq"]:
78-
fmt_inferred = "fq"
79-
elif path.suffix.lower() == ".gz":
80-
if Path(path.stem).suffix.lower() in [".fa", ".fasta"]:
81-
fmt_inferred = "fagz"
82-
elif Path(path.stem).suffix.lower() in [".fq", ".fastq"]:
83-
fmt_inferred = "fqgz"
84-
elif Path(path.stem).suffix.lower() in [".csv"]:
85-
fmt_inferred = "csvgz"
86-
elif Path(path.stem).suffix.lower() in [".tsv", ".tab"]:
87-
fmt_inferred = "tsvgz"
85+
fmt_inferred = self._infer_fmt(self.pathlike)
8886
LOGGER.info("given format: %s", fmt)
8987
LOGGER.info("inferred format: %s", fmt_inferred)
9088
if not fmt:
@@ -95,6 +93,20 @@ def infer_fmt(self, fmt=None):
9593
fmt = fmt_inferred
9694
return fmt
9795

96+
@staticmethod
97+
def _infer_fmt(path):
98+
try:
99+
path = Path(path)
100+
except TypeError:
101+
path = Path(str(path.name))
102+
ext = path.suffix.lower()
103+
ext2 = Path(path.stem).suffix.lower()
104+
fmt2 = FMT_EXT_MAP.get(ext2)
105+
if ext == ".gz" and fmt2:
106+
return fmt2 + "gz"
107+
else:
108+
return FMT_EXT_MAP.get(ext)
109+
98110
def encode_record(self, record):
99111
"""Convert record dictionary into a SeqRecord object."""
100112
seq = record[self.colmap["sequence"]]
@@ -160,14 +172,7 @@ def open(self):
160172
else:
161173
LOGGER.info("reading from text")
162174
self.handle = open(self.pathlike, "rt", encoding="ascii")
163-
if self.fmt in ["csv", "csvgz"]:
164-
self.reader = csv.DictReader(self.handle)
165-
elif self.fmt in ["tsv", "tsvgz"]:
166-
self.reader = csv.DictReader(self.handle, delimiter="\t")
167-
elif self.fmt in ["fa", "fagz"]:
168-
self.reader = SeqIO.parse(self.handle, "fasta")
169-
elif self.fmt in ["fq", "fqgz"]:
170-
self.reader = SeqIO.parse(self.handle, "fastq")
175+
self.reader = READERS[self.fmt.removesuffix("gz")](self.handle)
171176

172177
def __iter__(self):
173178
return self

igseq/vdj_match.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,17 +36,34 @@ def vdj_match(ref_paths, query, output=None, showtxt=None, species=None, fmt_in=
3636
species_det = {s for s in species_det if s}
3737
organism = igblast.detect_organism(species_det, species)
3838

39-
vdj_files_grouped = vdj.group(
40-
attrs_list,
41-
lambda x: f"{x['species']}/{x['ref']}" if x["type"] == "internal" else x["input"])
39+
# Group references into sets with V/D/J. If a segment is missing (note
40+
# that D is always needed by igblast even for light chain) we'll remove the
41+
# whole reference with a warning *if* it was found internally from a
42+
# partial text match, but will fail outright if it was selected explicitly
43+
# or supplied from external files.
44+
def groupkey(row):
45+
return(f"{row['species']}/{row['ref']}" if row["type"] == "internal" else row["input"])
46+
vdj_files_grouped = vdj.group(attrs_list, groupkey)
47+
# This is sloppy because it'll just take whichever is the last matching row
48+
# (with its segment-specific stuff included) but it'll do for now
49+
orig_attrs = {groupkey(row): row for row in attrs_list}
50+
skips = set()
4251
for key, trio in vdj_files_grouped.items():
4352
LOGGER.info("detected V FASTA from %s: %d", key, len(trio["V"]))
4453
LOGGER.info("detected D FASTA from %s: %d", key, len(trio["D"]))
4554
LOGGER.info("detected J FASTA from %s: %d", key, len(trio["J"]))
4655
for segment, attrs_group in trio.items():
47-
if not attrs_group:
48-
LOGGER.critical("No FASTA for %s from %s", segment, key)
49-
raise util.IgSeqError("Missing VDJ input for database")
56+
if key not in skips and not attrs_group:
57+
# Only case where we'll just warn and continue is: it's an
58+
# internal ref, and it was found implicitly (not by name).
59+
if orig_attrs[key]["type"] == "internal" and orig_attrs[key]["input"] != key:
60+
LOGGER.warning("No FASTA for %s from %s; skipping", segment, key)
61+
skips.add(key)
62+
else:
63+
LOGGER.critical("No FASTA for %s from %s", segment, key)
64+
raise util.IgSeqError("Missing VDJ input for database")
65+
for key in skips:
66+
del vdj_files_grouped[key]
5067

5168
if not dry_run:
5269
results = []

igseq/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
See https://www.python.org/dev/peps/pep-0396/
55
"""
66

7-
__version__ = "0.2.0"
7+
__version__ = "0.3.0"

0 commit comments

Comments
 (0)