Switch tools to use SQLite database, add tests for common_ancestor_distance

pvanheus · pvanheus · commit 48345203d4e2 · 2020-05-02T22:06:10.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,3 @@
 /target/
 **/*.rs.bk
 Cargo.lock
-data/ncbi_taxonomy.sqlite
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "ncbitaxonomy"
 description = "Read NCBI Taxonomy Database from files and work with NCBI Taxonomy DB"
-version = "0.3.0"
+version = "1.0.0"
 authors = ["Peter van Heusden <pvh@sanbi.ac.za>"]
 license = "MIT"
 
diff --git a/README.md b/README.md
@@ -5,9 +5,10 @@
 This is a Rust crate (i.e. library) for working with a local copy of the 
 [NCBI Taxonomy database](https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/).
 The database can be downloaded (either `taxdump.zip` or `taxdump.tar.gz`) from the
-[NCBI Taxonomy FTP site](https://ftp.ncbi.nih.gov/pub/taxonomy/).
+[NCBI Taxonomy FTP site](https://ftp.ncbi.nih.gov/pub/taxonomy/) and reformatted into a SQLite database
+using the `taxonomy_util` utility's `to_sqlite` subcommand.
 
-Documentation for version 0.1.3 is available at [crates.io](https://docs.rs/ncbitaxonomy/0.1.3/ncbitaxonomy/struct.NcbiTaxonomy.html).
+Documentation is available at [crates.io](https://crates.io/crates/ncbitaxonomy).
 
 ### taxonomy_filter_refseq
 
@@ -18,42 +19,42 @@ are retained.
 
 ```bash
 $ taxonomy_filter_refseq --help
-taxonomy_filter_refseq 0.1.2
+taxonomy_filter_refseq 1.0.0
 Peter van Heusden <pvh@sanbi.axc.za>
 Filter NCBI RefSeq FASTA files by taxonomic lineage
 
 USAGE:
-    taxonomy_filter_refseq [OPTIONS] <INPUT_FASTA> <TAXONOMY_DIR> <ANCESTOR_NAME> [OUTPUT_FASTA]
+    taxonomy_filter_refseq [FLAGS] [OPTIONS] <INPUT_FASTA> <ANCESTOR_NAME> [OUTPUT_FASTA]
 
 FLAGS:
         --no_curated      Don't accept curated RNAs and proteins (NM_, NR_ and NP_ accessions)
         --no_predicted    Don't accept computationally predicted RNAs and proteins (XM_, XR_ and XP_ accessions)
-    -h, --help       Prints help information
-    -V, --version    Prints version information
+    -h, --help            Prints help information
+    -V, --version         Prints version information
 
 OPTIONS:
-    -t, --tax_prefix <TAXONOMY_FILENAME_PREFIX>    String to prepend to names of nodes.dmp and names.dmp
+    -d, --db <TAXDB_URL>    URL for SQLite taxonomy database
 
 ARGS:
     <INPUT_FASTA>      FASTA file with RefSeq sequences
-    <TAXONOMY_DIR>     Directory containing the NCBI taxonomy nodes.dmp and names.dmp files
     <ANCESTOR_NAME>    Name of ancestor to use as ancestor filter
     <OUTPUT_FASTA>     Output FASTA filename (or stdout if omitted)
-
 ```
 
 ### taxonomy_filter_fastq
 
 (new in version 0.2.0)
 
+
 ```bash
 $ taxonomy_filter_fastq --help
-taxonomy_filter_refseq 0.1.2
+taxonomy_filter_fastq 1.0.0
 Peter van Heusden <pvh@sanbi.axc.za>
-Filter NCBI RefSeq FASTA files by taxonomic lineage
+Filter FASTQ files whose reads have been classified by Centrifuge or Kraken2, only retaining reads in taxa descending
+from given ancestor
 
 USAGE:
-    taxonomy_filter_fastq [FLAGS] [OPTIONS] <INPUT_FASTQ> --ancestor_taxid <ANCESTOR_ID> --taxdir <TAXONOMY_DIR> --tax_report_filename <TAXONOMY_REPORT_FILENAME> <--centrifuge|--kraken2>
+    taxonomy_filter_fastq [FLAGS] [OPTIONS] <INPUT_FASTQ>... --ancestor_taxid <ANCESTOR_ID> --tax_report_filename <TAXONOMY_REPORT_FILENAME> <--centrifuge|--kraken2>
 
 FLAGS:
     -d, --output_dir    Directory to deposited filtered output files in
@@ -64,12 +65,39 @@ FLAGS:
 
 OPTIONS:
     -A, --ancestor_taxid <ANCESTOR_ID>                      Name of ancestor to use as ancestor filter
-    -T, --taxdir <TAXONOMY_DIR>
-            Directory containing the NCBI taxonomy nodes.dmp and names.dmp files
-
-    -t, --tax_prefix <TAXONOMY_FILENAME_PREFIX>             String to prepend to names of nodes.dmp and names.dmp
+    -d, --db <TAXDB_URL>                                    URL for SQLite taxonomy database
     -F, --tax_report_filename <TAXONOMY_REPORT_FILENAME>    Output from Kraken2 (default) or Centrifuge
 
 ARGS:
-    <INPUT_FASTQ>    FASTA file with RefSeq sequences
+    <INPUT_FASTQ>...    FASTA file with RefSeq sequences
 ```
+
+### taxonomy_util
+
+(new in 1.0.0)
+
+Utilities to convert NCBI taxonomy database files into a SQLite database (the input format used by the other tools).
+
+```bash
+taxonomy_util 1.0.0
+Peter van Heusden <pvh@sanbi.axc.za>
+Utilities for working with the NCBI taxonomy database
+
+USAGE:
+    taxonomy_util [OPTIONS] [SUBCOMMAND]
+
+FLAGS:
+    -h, --help       Prints help information
+    -V, --version    Prints version information
+
+OPTIONS:
+    -d, --db <TAXDB_URL>    URL for SQLite taxonomy database
+
+SUBCOMMANDS:
+    common_ancestor_distance    find the tree distance to the common ancestor between two taxa
+    get_id                      find taxonomy ID for name
+    get_lineage                 get lineage for name [unimplemented]
+    get_name                    find name for taxonomy ID
+    help                        Prints this message or the help of the given subcommand(s)
+    to_sqlite                   save taxonomy database loaded from files to SQLite database file
+```
diff --git a/data/ncbi_taxonomy.sqlite b/data/ncbi_taxonomy.sqlite
diff --git a/src/bin/taxonomy_filter_fastq.rs b/src/bin/taxonomy_filter_fastq.rs
@@ -19,7 +19,7 @@ use flate2::Compression;
 use flate2::read::GzDecoder;
 use flate2::write::GzEncoder;
 use seq_io::fastq::Record;
-use ncbitaxonomy::{NcbiTaxonomy, NcbiFileTaxonomy};
+use ncbitaxonomy::{NcbiTaxonomy, NcbiSqliteTaxonomy};
 
 enum FilterTool {
     Centrifuge,
@@ -35,24 +35,6 @@ impl fmt::Display for FilterTool {
     }
 }
 
-fn read_taxonomy(tax_prefix: &str, ncbi_taxonomy_path: &Path) -> NcbiFileTaxonomy {
-    let nodes_path = ncbi_taxonomy_path.join(tax_prefix.to_owned() + "nodes.dmp");
-    if !nodes_path.exists() {
-        eprintln!("NCBI Taxonomy {}nodes.dmp file not found in {}", tax_prefix, ncbi_taxonomy_path.to_str().unwrap());
-        process::exit(1);
-    }
-
-    let names_path = ncbi_taxonomy_path.join(tax_prefix.to_owned() + "names.dmp");
-    if !names_path.exists() {
-        eprintln!("NCBI Taxonomy {}names.dmp file not found in {}", tax_prefix, ncbi_taxonomy_path.to_str().unwrap());
-        process::exit(1);
-    }
-
-    ncbitaxonomy::NcbiFileTaxonomy::from_ncbi_files(
-        nodes_path.as_path().to_str().unwrap(),
-        names_path.as_path().to_str().unwrap()).expect("Failed to load NCBI Taxonomy")
-}
-
 fn filter_fastq(fastq_filename: &Path, tax_report_filename: &str,
                 taxonomy: &dyn NcbiTaxonomy,
                 output_dir: &Path, filter_tool: &FilterTool, ancestor_id: i32) {
@@ -162,12 +144,12 @@ fn filter_fastq(fastq_filename: &Path, tax_report_filename: &str,
 }
 
 pub fn main() {
-    let matches = clap_app!(taxonomy_filter_refseq =>
+    // TODO: write test suite
+    let matches = clap_app!(taxonomy_filter_fastq =>
         (version: ncbitaxonomy::VERSION)
         (author: "Peter van Heusden <pvh@sanbi.axc.za>")
-        (about: "Filter NCBI RefSeq FASTA files by taxonomic lineage")
-        (@arg TAXONOMY_FILENAME_PREFIX: -t --tax_prefix +takes_value "String to prepend to names of nodes.dmp and names.dmp")
-        (@arg TAXONOMY_DIR: -T --taxdir +takes_value +required "Directory containing the NCBI taxonomy nodes.dmp and names.dmp files")
+        (about: "Filter FASTQ files whose reads have been classified by Centrifuge or Kraken2, only retaining reads in taxa descending from given ancestor")
+        (@arg TAXDB_URL: -d --db +takes_value "URL for SQLite taxonomy database")
         (@arg ANCESTOR_ID: -A --ancestor_taxid +takes_value +required "Name of ancestor to use as ancestor filter")
         (@group filter_tool +required =>
             (@arg centrifuge: -C --centrifuge !required "Filter using report from Centrifuge")
@@ -178,13 +160,6 @@ pub fn main() {
         (@arg INPUT_FASTQ: ... +required "FASTA file with RefSeq sequences")
         ).get_matches();
 
-    let tax_prefix = match matches.value_of("TAXONOMY_FILENAME_PREFIX") {
-        Some(name) => name,
-        None => ""
-    }.to_string();
-
-    let ncbi_taxonomy_path = Path::new(matches.value_of("TAXONOMY_DIR").unwrap());
-
     let output_dir = match matches.value_of("OUTPUT_DIR") {
         Some(path) => Path::new(path),
         None => Path::new(".")
@@ -202,7 +177,9 @@ pub fn main() {
 
     let tax_report_filename = matches.value_of("TAXONOMY_REPORT_FILENAME").unwrap();
 
-    let taxonomy = read_taxonomy(&tax_prefix, ncbi_taxonomy_path);
+    let taxdb_url = if matches.is_present("TAXDB_URL") { Some(matches.value_of("TAXDB_URL").unwrap()) } else { None };
+
+    let taxonomy = NcbiSqliteTaxonomy::new(taxdb_url);
     if !taxonomy.contains_id(ancestor_id) {
         eprintln!("Taxonomy does not contain an ancestor with taxid {}", ancestor_id);
         process::exit(1);
diff --git a/src/bin/taxonomy_filter_refseq.rs b/src/bin/taxonomy_filter_refseq.rs
@@ -6,14 +6,13 @@ extern crate ncbitaxonomy;
 use std::cmp;
 use std::fs::File;
 use std::io;
-use std::path::Path;
 use std::process;
 use std::vec::Vec;
 
 use bio::io::fasta;
 use bio::utils::TextSlice;
 
-use ncbitaxonomy::{NcbiTaxonomy, NcbiFileTaxonomy};
+use ncbitaxonomy::{NcbiTaxonomy, NcbiSqliteTaxonomy};
 
 // wrap a TextSlice (a rust-bio name for a &[u8] i.e. byte array)
 // at a certain width (e.g. 80 to look like NCBI RefSeq)
@@ -35,15 +34,15 @@ fn wrap(seq: TextSlice, width: usize) -> Vec<u8> {
 }
 
 pub fn main() {
+    // TODO: use functions, write testing suite
     let matches = clap_app!(taxonomy_filter_refseq =>
         (version: ncbitaxonomy::VERSION)
         (author: "Peter van Heusden <pvh@sanbi.axc.za>")
         (about: "Filter NCBI RefSeq FASTA files by taxonomic lineage")
+        (@arg TAXDB_URL: -d --db +takes_value "URL for SQLite taxonomy database")
         (@arg NO_PREDICTED: --no_predicted "Don't accept computationally predicted RNAs and proteins (XM_, XR_ and XP_ accessions)")
         (@arg NO_CURATED: --no_curated "Don't accept curated RNAs and proteins (NM_, NR_ and NP_ accessions)")
-        (@arg TAXONOMY_FILENAME_PREFIX: -t --tax_prefix +takes_value "String to prepend to names of nodes.dmp and names.dmp")
         (@arg INPUT_FASTA: +required "FASTA file with RefSeq sequences")
-        (@arg TAXONOMY_DIR: +required "Directory containing the NCBI taxonomy nodes.dmp and names.dmp files")
         (@arg ANCESTOR_NAME: +required "Name of ancestor to use as ancestor filter")
         (@arg OUTPUT_FASTA: "Output FASTA filename (or stdout if omitted)")
         ).get_matches();
@@ -62,24 +61,8 @@ pub fn main() {
     let input_fasta = File::open(input_fasta_filename).unwrap_or_else(|_| panic!("Failed to open input FASTA file ({})", input_fasta_filename));
     let input_fasta_reader = fasta::Reader::new(input_fasta);
 
-    let ncbi_taxonomy_path = Path::new(matches.value_of("TAXONOMY_DIR").unwrap());
-
-    let tax_prefix = match matches.value_of("TAXONOMY_FILENAME_PREFIX") {
-        Some(name) => name,
-        None => ""
-    }.to_string();
-
-    let nodes_path = ncbi_taxonomy_path.join(tax_prefix.clone() + "nodes.dmp");
-    if ! nodes_path.exists() {
-        eprintln!("NCBI Taxonomy {}nodes.dmp file not found in {}", tax_prefix, ncbi_taxonomy_path.to_str().unwrap());
-        process::exit(1);
-    }
-
-    let names_path = ncbi_taxonomy_path.join(tax_prefix.clone() + "names.dmp");
-    if ! names_path.exists() {
-        eprintln!("NCBI Taxonomy {}names.dmp file not found in {}", tax_prefix, ncbi_taxonomy_path.to_str().unwrap());
-        process::exit(1);
-    }
+    let taxdb_url = if matches.is_present("TAXDB_URL") { Some(matches.value_of("TAXDB_URL").unwrap()) } else { None };
+    let taxonomy = NcbiSqliteTaxonomy::new(taxdb_url);
 
     // the use of Box here is inspired by:
     // https://stackoverflow.com/questions/26378842/how-do-i-overcome-match-arms-with-incompatible-types-for-structs-implementing-sa
@@ -93,10 +76,6 @@ pub fn main() {
 
     let ancestor_name = matches.value_of("ANCESTOR_NAME").unwrap();
 
-    let taxonomy = NcbiFileTaxonomy::from_ncbi_files(
-        nodes_path.as_path().to_str().unwrap(),
-        names_path.as_path().to_str().unwrap()).expect("Failed to load NCBI Taxonomy");
-
     if !taxonomy.contains_name(ancestor_name) {
         eprintln!("Taxonomy does not contain an ancestor named {}", ancestor_name);
         process::exit(1);
diff --git a/src/bin/taxonomy_util.rs b/src/bin/taxonomy_util.rs
@@ -8,8 +8,8 @@ use ncbitaxonomy::{NcbiTaxonomy, NcbiSqliteTaxonomy};
 
 fn common_ancestor_distance(taxonomy: &dyn NcbiTaxonomy, name1: &str, name2: &str, only_canonical: bool) {
     match taxonomy.get_distance_to_common_ancestor(name1, name2, only_canonical) {
-        Some(distance) => {
-            println!("{}", distance);
+        Some((distance, common_ancestor_name)) => {
+            println!("{}\t{}", distance, common_ancestor_name);
         },
         None => {
             eprintln!("no common ancestor found");
@@ -19,7 +19,9 @@ fn common_ancestor_distance(taxonomy: &dyn NcbiTaxonomy, name1: &str, name2: &st
 }
 
 pub fn main() {
-    let app_m = clap_app!(taxonomy_filter_refseq =>
+    // TODO:
+    // * write get_lineage - print lineage of taxon
+    let app_m = clap_app!(taxonomy_util =>
         (version: ncbitaxonomy::VERSION)
         (author: "Peter van Heusden <pvh@sanbi.axc.za>")
         (about: "Utilities for working with the NCBI taxonomy database")
@@ -30,16 +32,21 @@ pub fn main() {
             (@arg NAME1: +required "Name of first taxon")
             (@arg NAME2: +required "Name of second taxon")
         )
-        (@subcommand find_id =>
+        (@subcommand get_id =>
             (about: "find taxonomy ID for name")
             (@arg NAME: +required "Name of taxon")
         )
-        (@subcommand find_name =>
+        (@subcommand get_name =>
             (about: "find name for taxonomy ID")
             (@arg ID: +required "Taxonomy ID to look up")
         )
-        (@subcommand sqlite =>
-            (about: "sqlite testing")
+        (@subcommand get_lineage =>
+            (about: "get lineage for name [unimplemented]")
+            (@arg DELIMITER: --delimiter -D +takes_value "Delimiter for lineage string")
+            (@arg NAME: +required "Name of taxon")
+        )
+        (@subcommand to_sqlite =>
+            (about: "save taxonomy database loaded from files to SQLite database file")
             (@arg TAXONOMY_FILENAME_PREFIX: -t --tax_prefix +takes_value "String to prepend to names of nodes.dmp and names.dmp")
             (@arg TAXONOMY_DIR: "Directory containing the NCBI taxonomy nodes.dmp and names.dmp files")
         )
@@ -56,20 +63,25 @@ pub fn main() {
             let name2 = sub_m.value_of("NAME2").unwrap();
             common_ancestor_distance(&taxonomy, name1, name2, only_canonical);
         },
-        ("find_id", Some(sub_m)) => {
+        ("get_id", Some(sub_m)) => {
             let name = sub_m.value_of("NAME").unwrap();
             match taxonomy.get_id_by_name(name) {
                 Some(val) => println!("{}", val),
                 None => eprintln!("name {} not found in taxomomy", name)
             }
         },
-        ("find_name", Some(sub_m)) => {
+        ("get_name", Some(sub_m)) => {
             let taxid = (sub_m.value_of("ID").unwrap()).parse::<i32>().unwrap();
             match taxonomy.get_name_by_id(taxid) {
                 Some(val) => println!("{}", val),
                 None => eprintln!("id {} not found in taxonomy", taxid)
             }
         },
+        ("get_lineage", Some(sub_m)) => {
+            let _ = sub_m.value_of("DELIMITER").unwrap_or(";");
+            let _ = sub_m.value_of("NAME").unwrap();
+            // TODO: implement here (depends on expanding NcbiTaxonomy method signature)
+        }
         ("to_sqlite", Some(sub_m)) => {
             let ncbi_taxonomy_path = Path::new(sub_m.value_of("TAXONOMY_DIR").unwrap());
 
@@ -96,7 +108,7 @@ pub fn main() {
                 names_path.as_path().to_str().unwrap()).expect("Failed to load NCBI Taxonomy");
             eprintln!("taxonomy loaded");
 
-            taxonomy.save_to_sqlite().expect("failed to save taxonomy database to SQLite");
+            taxonomy.save_to_sqlite(taxdb_url).expect("failed to save taxonomy database to SQLite");
         },
         _ => {
             eprintln!("Unknown subcommand");
diff --git a/src/lib.rs b/src/lib.rs