diff --git a/.gitignore b/.gitignore index ea8c4bf..2a0038a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.idea \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 7630133..5baf7b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -44,8 +53,23 @@ dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", - "anstyle-wincon", + "anstyle-wincon 2.1.0", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 3.0.3", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] @@ -70,7 +94,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -80,7 +104,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", ] [[package]] @@ -98,6 +132,21 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +[[package]] +name = "backtrace" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "bit-vec" version = "0.6.3" @@ -179,12 +228,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.83" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" [[package]] name = "cfg-if" @@ -199,6 +245,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -207,12 +254,24 @@ version = "4.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56" dependencies = [ - "anstream", + "anstream 0.5.0", "anstyle", "clap_lex", "strsim", ] +[[package]] +name = "clap_derive" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.60", +] + [[package]] name = "clap_lex" version = "0.5.1" @@ -233,7 +292,7 @@ checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" dependencies = [ "is-terminal", "lazy_static", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -362,6 +421,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "either" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" + [[package]] name = "enum-iterator" version = "1.5.0" @@ -396,7 +461,7 @@ checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" dependencies = [ "errno-dragonfly", "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -417,7 +482,10 @@ dependencies = [ "colored", "compare", "csv", + "human-panic", "io", + "itertools", + "log", "noodles", "regex", "serde", @@ -566,6 +634,23 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + [[package]] name = "gmeta" version = "1.3.0" @@ -626,6 +711,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -638,6 +729,22 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "human-panic" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c5d0e9120f6bca6120d142c7ede1ba376dd6bf276d69dd3dbe6cbeb7824179" +dependencies = [ + "anstream 0.6.14", + "anstyle", + "backtrace", + "os_info", + "serde", + "serde_derive", + "toml", + "uuid", +] + [[package]] name = "impl-codec" version = "0.6.0" @@ -688,7 +795,22 @@ checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", "rustix", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", ] [[package]] @@ -788,6 +910,12 @@ version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + [[package]] name = "lzma-sys" version = "0.1.20" @@ -934,12 +1062,32 @@ dependencies = [ "noodles-csi", ] +[[package]] +name = "object" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "os_info" +version = "3.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae99c7fa6dd38c7cafe1ec085e804f8f555a2f8659b0dbe03f1f9963a9b51092" +dependencies = [ + "log", + "serde", + "windows-sys 0.52.0", +] + [[package]] name = "page_size" version = "0.6.0" @@ -968,7 +1116,7 @@ version = "3.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be30eaf4b0a9fba5336683b38de57bb86d179a35862ba6bfcf57625d006bde5b" dependencies = [ - "proc-macro-crate 2.0.2", + "proc-macro-crate 2.0.0", "proc-macro2", "quote", "syn 1.0.109", @@ -1016,11 +1164,10 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "2.0.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b00f26d3400549137f92511a46ac1cd8ce37cb5598a96d382381458b992a5d24" +checksum = "7e8366a6159044a37876a2b9817124296703c586a5c92e2c53751fa06d8d43e8" dependencies = [ - "toml_datetime", "toml_edit 0.20.2", ] @@ -1080,6 +1227,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustc_version" version = "0.4.0" @@ -1099,7 +1252,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -1158,6 +1311,15 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "serde_spanned" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +dependencies = [ + "serde", +] + [[package]] name = "serde_yaml" version = "0.9.25" @@ -1239,11 +1401,26 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.22.14", +] + [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +dependencies = [ + "serde", +] [[package]] name = "toml_edit" @@ -1267,6 +1444,18 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", +] + [[package]] name = "typenum" version = "1.17.0" @@ -1309,12 +1498,27 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "uuid" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +dependencies = [ + "getrandom", +] + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "winapi" version = "0.3.9" @@ -1343,7 +1547,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", ] [[package]] @@ -1352,13 +1565,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1367,42 +1596,90 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + [[package]] name = "winnow" version = "0.5.40" diff --git a/Cargo.toml b/Cargo.toml index 4c55605..a523174 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,13 +6,16 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -clap = { version = "4.4.4", features = ["cargo"] } +clap = { version = "4.4.4", features = ["cargo", "derive"] } colored = "2.0.4" compare = "0.1.0" csv = "1.3.0" io = "0.0.2" +itertools = "0.13.0" noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] } regex = "1.9.5" serde = { version = "1.0.188", features = ["derive"] } serde_yaml = "0.9.25" stacker = "0.1.15" +log = "0.4.21" +human-panic = "2.0.0" diff --git a/README.md b/README.md index 79c7001..6c478a2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # FastaManipulator +![img](https://github.com/Rust-Wellcome/FasMan/actions/workflows/release-repo.yml/badge.svg) + This is a re-write of the current fasta manipulation scripts I've written whilst at ToL, as well as adding some functionality needed for future projects. Currently, this program has the following arguments: diff --git a/src/cli/mod.rs b/src/cli/mod.rs new file mode 100644 index 0000000..2638ee9 --- /dev/null +++ b/src/cli/mod.rs @@ -0,0 +1,181 @@ +use clap::{Parser, Subcommand}; + +const SPLIT_OPTIONS: [&str; 5] = ["pep", "cds", "cdna", "rna", "other"]; + +// CLI for Fasta Processing +#[derive(Parser)] +#[command(version="v1.0.0", about, long_about = None)] +pub struct Cli { + // command is optional (TODO: Make this not optional) + // Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html#defaults + #[command(subcommand)] + pub command: Option, +} + +// Reference: https://docs.rs/clap/latest/clap/_derive/_tutorial/chapter_2/index.html +#[derive(Subcommand)] +pub enum Commands { + YamlValidator { + // Path to the TreeVal yaml file generated by the user + #[arg(short, long)] + yaml: String, + + // Print explainers as to why validation fails, if it does fail + #[arg(short = 'v', long)] + verbose: bool, + + // Output the log to file + #[arg(short = 'o', long, default_value_t=String::from("./"))] + output: String, + }, + + SplitByCount { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + + // The data type of the input data + #[arg(short = 'd', value_parser = clap::builder::PossibleValuesParser::new(SPLIT_OPTIONS))] + data_type: String, + + // Do we need to sanitise the headers of the input fasta + #[arg(short = 's', value_parser = clap::value_parser!(bool))] + sanitise: bool, + + // How many sequences per file + #[arg(short = 'c', value_parser = clap::value_parser!(u16))] + count: u16, + }, + + SplitBySize { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // Size in MB that a fasta file is to be chunked into + #[arg(short = 's', long = "mem-size")] + mem_size: u16, + + // The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + }, + + GenesetCSVS { + // The path to the top level directory of your geneset directory. + #[arg(short = 'd')] + geneset_dir: String, + + // Specify the clade folder to refresh + #[arg(short = 'c', default_value_t = String::from("ALL"))] + specifiy_clade: String, + }, + + MapHeaders { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta + #[arg(short = 'o', long, default_value_t = String::from("./"))] + output_directory: String, + + #[arg(short = 'r', default_value_t = String::from("FMMH"))] + replace_with: String, + }, + + ReMapHeaders { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta + #[arg(short = 'o', long, default_value_t = String::from("./new"))] + output_directory: String, + + // "The original mapped header field, a TSV of old-header, new-header + #[arg(short = 'm', default_value_t = String::from("FMMH"))] + map_file: String, + }, + + #[command(version, about="Profile an input fasta file and return various statistics", long_about = None)] + Profile { + // A path to a valid fasta file. + #[arg(short = 'f', long)] + fasta_file: String, + + // The input fasta file for profiling + #[arg(short = 'o', long, default_value_t = String::from("FasMan-out"))] + output_dir: String, + }, + + Curate { + // The input fasta file for re-organising + #[arg(short = 'f', long)] + fasta: String, + + // The TPF file used to re-organise the input fasta + #[arg(short = 't', long)] + tpf: String, + + // Size sort the output or leave as order in AGP + #[arg(short = 's')] + sort: bool, + + #[arg(short = 'o', default_value_t = String::from("new.fasta"))] + output: String, + + // Length that the N (gap) string should be. + #[arg(short, long, default_value_t = 200)] + n_length: usize, + }, + + Subset { + // A path to a valid fasta file for profiling. + #[arg(short = 'f', long)] + fasta_file: String, + + // Random subset of input file. Default skims the first X given percent + #[arg(short = 'r', long)] + random: bool, + + // Percentage of the original file entries that should be retained + #[arg(short = 'p', long, default_value_t = 50)] + percent: u16, + }, + + FilterFasta { + // A fasta file for processing. + #[arg(short = 'f', long)] + fasta: String, + + // The outfile naming + #[arg(short = 'o', default_value_t = String::from("FilteredFasta.fa"))] + output: String, + + #[arg(short = 'l', long = "filter_list")] + filter_list: String, + }, + + Mergehaps { + // The input fasta file for re-organising + #[arg(short = 'p', long)] + fasta_1: String, + + // The second input fasta file + #[arg(short = 's', long)] + fasta_2: String, + + // TA '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output + #[arg(short = 's', long, default_value_t = String::from("PRI/HAP"))] + naming: String, + + // Output file prefix + #[arg(short = 'o', default_value_t = String::from("merged"))] + output: String, + }, +} diff --git a/src/errors/file_error.rs b/src/errors/file_error.rs new file mode 100644 index 0000000..77ec0f8 --- /dev/null +++ b/src/errors/file_error.rs @@ -0,0 +1,27 @@ +use std::fmt::{self}; + +use std::io::Error; + +// Define our error types. These may be customized for our error handling cases. +// Now we will be able to write our own errors, defer to an underlying error +// implementation, or do something in between. +// Resource: https://doc.rust-lang.org/rust-by-example/error/multiple_error_types/define_error_type.html +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct FileError { + message: String, +} + +impl fmt::Display for FileError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Error in handling the file.") + } +} + +impl From for FileError { + fn from(error: Error) -> Self { + FileError { + message: format!("{}", error), + } + } +} diff --git a/src/errors/mod.rs b/src/errors/mod.rs new file mode 100644 index 0000000..6bf812b --- /dev/null +++ b/src/errors/mod.rs @@ -0,0 +1 @@ +pub mod file_error; diff --git a/src/exclude_seq.rs b/src/exclude_seq.rs deleted file mode 100644 index ab82c4e..0000000 --- a/src/exclude_seq.rs +++ /dev/null @@ -1,44 +0,0 @@ -pub mod exclude_seq_mod { - use clap::ArgMatches; - use noodles::fasta; - use std::error::Error; - use std::{fs, io::BufRead, str}; - - fn open_fasta<'a>( - exclusions: Vec<&str>, - fasta: &'a str, - out_file: &str, - ) -> std::result::Result<&'a str, Box> { - let reader: Result>, std::io::Error> = - fasta::reader::Builder.build_from_path(fasta); - let file = fs::OpenOptions::new() - .create(true) - .append(true) - .open(out_file)?; - let mut writer = fasta::Writer::new(file); - - match reader { - Ok(fasta) => { - let mut binding = fasta; - for result in binding.records() { - let record = result?; - if !exclusions.contains(&record.name()) { - writer.write_record(&record)?; - } else { - println!("Found record to exclude: {:?}", &record.name()); - } - } - Ok("Removed Exclusionary List") - } - Err(_) => Err("Error: Fasta is not valid check file!".into()), - } - } - - pub fn filter_fasta(arguments: std::option::Option<&ArgMatches>) { - let fasta = arguments.unwrap().get_one::("fasta").unwrap(); - let exclude = arguments.unwrap().get_one::("filter_list").unwrap(); - let outfile = arguments.unwrap().get_one::("output").unwrap(); - let list_to_exclude = exclude.split(',').collect::>(); - let _x = open_fasta(list_to_exclude, fasta, outfile); - } -} diff --git a/src/file_utils/file_utility.rs b/src/file_utils/file_utility.rs new file mode 100644 index 0000000..8a30eaa --- /dev/null +++ b/src/file_utils/file_utility.rs @@ -0,0 +1,136 @@ +use log::info; +use std::fs::File; +use std::io::{BufRead, BufReader}; + +use crate::errors::file_error::FileError; +use itertools::Itertools; + +#[allow(dead_code)] +struct Records { + items: Vec, +} + +#[allow(dead_code)] +impl Records { + pub fn size(&self) -> usize { + self.items.len() + } +} + +#[allow(dead_code)] +struct BatchFileReader {} + +#[allow(dead_code)] +pub trait DefaultReader { + fn default() -> Self; +} + +impl DefaultReader for BatchFileReader { + fn default() -> Self { + BatchFileReader {} + } +} + +#[allow(dead_code)] +impl BatchFileReader { + /* + * Reads a specific number of lines from a file from the top + */ + pub fn read_lines( + &mut self, + file_path: &str, + num_lines: usize, + ) -> Result, FileError> { + info!("Reading lines in file."); + let file = File::open(file_path); + + let result = match file { + Ok(file) => file, + Err(error) => { + info!("Error in file handler: {:?}", error); + return Err(error.into()); + } + }; + + let reader = BufReader::new(result); + let mut internal_buffer = Vec::::new(); + + // Error unwrapping: https://tinyurl.com/brt9fphk + // take() function https://tinyurl.com/6vx7m3k6 + for line in reader.lines().take(num_lines) { + let result = line.expect("Error in reading file"); // This will panic if errored + internal_buffer.push(result.clone()) + } + + Ok(Records { + items: internal_buffer, + }) + } + + /** + * Reads a file batch by batch, and applies a function Fn for each chunk + * Function pointers documentation: https://doc.rust-lang.org/book/ch19-05-advanced-functions-and-closures.html#function-pointers + * f is a closure pushed into the stack of read_file_by_batch that is similar to an anonymous function in Java/JavaScript/C# + * https://doc.rust-lang.org/book/ch13-01-closures.html#moving-captured-values-out-of-closures-and-the-fn-traits + * Note that f is not intended to mutate the captured Records value, and should not return anything (i.e., move the captured Record value out of the closure). + */ + pub fn read_file_by_batch( + &mut self, + file_path: &str, + batch_size: usize, + f: &dyn Fn(Records), + ) -> Result<(), FileError> { + info!("Reading file by chunk."); + + let file = File::open(file_path); + + let result = match file { + Ok(file) => file, + Err(error) => { + info!("Error in file handler: {:?}", error); + return Err(error.into()); + } + }; + + let reader = BufReader::new(result); + + // map_while() Creates an iterator that both yields elements based on a predicate and maps. + // https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while + for chunk in &reader.lines().map_while(Result::ok).chunks(batch_size) { + f(Records { + items: chunk.collect(), + }); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const TEST_FILE_PATH: &str = "test_data/synthetic/tiny.fa"; + + #[test] + fn read_lines() { + let mut batch_file_reader = BatchFileReader::default(); + let records = batch_file_reader.read_lines(TEST_FILE_PATH, 3).unwrap(); + assert_eq!(3, records.items.len()); + } + + // You can create the closure in one place and then call the closure elsewhere to evaluate it in a different context. + // Reference: https://doc.rust-lang.org/book/ch13-01-closures.html + fn assert_function(input: Records) { + assert!(input.size() <= 3); + } + + #[test] + fn read_file_batch() { + let mut batch_file_reader = BatchFileReader::default(); + batch_file_reader + .read_file_by_batch(TEST_FILE_PATH, 3, &assert_function) + .unwrap_or_else(|e| panic!("Error: {:?}", e)); + } +} diff --git a/src/file_utils/mod.rs b/src/file_utils/mod.rs new file mode 100644 index 0000000..79b7217 --- /dev/null +++ b/src/file_utils/mod.rs @@ -0,0 +1 @@ +pub mod file_utility; diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..36ac37b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,82 @@ +use clap::Parser; + +use cli::{Cli, Commands}; +use std::io::Error; + +// Reference: https://doc.rust-lang.org/book/ch07-02-defining-modules-to-control-scope-and-privacy.html +use crate::processors::exclude_seq::filter_fasta; +use crate::processors::map_headers::map_fasta_head; +use crate::processors::remap_head::remapping_head; +use crate::processors::split_by_count::split_file_by_count; +use crate::processors::split_by_size::split_file_by_size; +use crate::processors::tpf_fasta::curate_fasta; +use crate::processors::yaml_validator::validate_yaml; + +mod cli; +mod errors; +mod file_utils; +mod generics; + +mod processors; + +pub fn run() -> Result<(), Error> { + let cli = Cli::parse(); + + match &cli.command { + Some(Commands::YamlValidator { + yaml, + verbose, + output, + }) => validate_yaml(yaml, verbose, output), + Some(Commands::SplitByCount { + fasta_file, + output_directory, + data_type, + sanitise, + count, + }) => split_file_by_count(fasta_file, output_directory, data_type, sanitise, count), + Some(Commands::SplitBySize { + fasta_file, + mem_size, + output_directory, + }) => split_file_by_size(fasta_file, mem_size, output_directory), + Some(Commands::MapHeaders { + fasta_file, + output_directory, + replace_with, + }) => _ = map_fasta_head(fasta_file, output_directory, replace_with), + Some(Commands::ReMapHeaders { + fasta_file, + output_directory, + map_file, + }) => remapping_head(fasta_file, output_directory, map_file), + Some(Commands::Curate { + fasta, + tpf, + sort, + output, + n_length, + }) => curate_fasta(fasta, tpf, sort, output, n_length), + Some(Commands::FilterFasta { + fasta, + output, + filter_list, + }) => filter_fasta(fasta, output, filter_list), + Some(Commands::GenesetCSVS { .. }) => { + todo!() + } + Some(Commands::Profile { .. }) => { + todo!() + } + Some(Commands::Subset { .. }) => { + todo!() + } + Some(Commands::Mergehaps { .. }) => { + todo!() + } + None => { + println!("No command provided"); + } + } + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 9e947f9..6163c35 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,327 +1,16 @@ #![allow(non_snake_case)] -use clap::{command, Arg, Command}; -use colored::Colorize; -use std::env; -use std::io::Error; - -mod yaml_validator; -use crate::yaml_validator::yaml_validator_mod::validate_yaml; - -mod map_headers; -use crate::map_headers::mapping_headers::map_fasta_head; - -mod remap_head; -use crate::remap_head::remapping_headers::remapping_head; - -mod split_by_size; -use crate::split_by_size::split_by_size_mod::split_file_by_size; - -mod split_by_count; -use crate::split_by_count::split_by_count_mod::split_file_by_count; - -mod generics; -//use crate::generics::validate_fasta; - -mod tpf_fasta; -use crate::tpf_fasta::tpf_fasta_mod::curate_fasta; - -mod exclude_seq; -use crate::exclude_seq::exclude_seq_mod::filter_fasta; - -fn main() -> Result<(), Error> { - let split_options = ["pep", "cds", "cdna", "rna", "other"]; - let match_result = command!() - .about("A program for fasta manipulation and yaml validation ~ Used in TreeVal project") - .subcommand( - Command::new("validateyaml") - .about("Subcommand for validating the users TreeVal yaml file") - .arg( - Arg::new("yaml") - .required(true) - .help("Path to the TreeVal yaml file generated by the user") - ) - .arg( - Arg::new("verbose") - .short('v') - .value_parser(clap::value_parser!(bool)) - .default_value("false") - .help("Print explainers as to why validation fails, if it does fail") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("./") - .help("Output the log to file") - ) - ) - .subcommand( - Command::new("splitbycount") - .about("Subcommand for splitting fasta files by number of sequence-header pairs, e.g., 100 pairs per file") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory that files will be placed in | outfile will be formatted like {input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa") - ) - .arg( - Arg::new("data_type") - .short('d') - .value_parser(clap::builder::PossibleValuesParser::new(split_options)) - .help("The data type of the input data") - ) - .arg( - Arg::new("sanitise") - .short('s') - .value_parser(clap::value_parser!(bool)) - .help("Do we need to sanitise the headers of the input fasta") - ) - .arg( - Arg::new("count") - .short('c') - .value_parser(clap::value_parser!(u16)) - .help("How many sequences per file") - ) - ) - .subcommand( - Command::new("splitbysize") - .about("Subcommand for splitting fasta files by user given size (in MegaBytes) into n (fasta_size / user_given_size) files") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("mem-size") - .short('s') - .required(true) - .value_parser(clap::value_parser!(u16)) - .help("Size in MB that a fasta file is to be chunked into") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory that files will be placed in") - ) - ) - .subcommand( - Command::new("geneset_csvs") - .about("Subcommand to generate csv files that condense geneset directories generated by splitbycount/splitbysize. Mainly for use in TreeVal") - .arg( - Arg::new("geneset_dir") - .short('d') - .required(true) - .help("The path to the top level directory of your geneset directory.") - ) - .arg( - Arg::new("specifiy_clade") - .short('c') - .required(true) - .default_value("ALL") - .help("Specify the clade folder to refresh") - ) - ) - .subcommand( - Command::new("mapheaders") - .about("Subcommand for stripping out headers and replacing with a standardised automatic or user-given string, this also returns a dict of old:new headers") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./") - .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") - ) - .arg( - Arg::new("replace-with") - .short('r') - .default_value("FMMH") - .help("The new header format, appended with a numerical value. Without being set the new header will default to 'FMMH_{numberical}'") - ) - ) - .subcommand( - Command::new("remapheaders") - .about("Subcommand for stripping out previously mapped headers and replacing with the old headers") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("A path to a valid fasta file.") - ) - .arg( - Arg::new("output-directory") - .short('o') - .default_value("./new") - .help("The output directory which will contain the mapped-heads.txt as well as the *mapped.fasta") - ) - .arg( - Arg::new("map-file") - .short('m') - .required(true) - .help("The original mapped header field, a TSV of old-header, new-header") - ) - ) - .subcommand( - Command::new("profile") - .about("Profile an input fasta file and return various statistics") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("The input fasta file for profiling") - ) - .arg( - Arg::new("output-dir") - .short('o') - .default_value("FasMan-out") - .help("The input fasta file for profiling") - ) - ) - .subcommand( - Command::new("curate") - .about("Convert an tpf file and original fasta file into a fasta file - useful for curation") - .arg( - Arg::new("fasta") - .short('f') - .required(true) - .help("The input fasta file for re-organising") - ) - .arg( - Arg::new("tpf") - .short('t') - .required(true) - .help("The TPF file used to re-organise the input fasta") - ) - .arg( - Arg::new("sort") - .short('s') - .value_parser(clap::value_parser!(bool)) - .default_value("false") - .help("Size sort the output or leave as order in AGP") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("new.fasta") - .help("The output name of the new fasta file") - ) - .arg( - Arg::new("n_length") - .value_parser(clap::value_parser!(usize)) - .default_value("200") - .help("Length that the N (gap) string should be.") - ) - ) - .subcommand( - Command::new("subset") - .about("Subset a fasta file in a random manner by percentage of file") - .arg( - Arg::new("fasta-file") - .short('f') - .required(true) - .help("The input fasta file for profiling") - ) - .arg( - Arg::new("random") - .short('r') - .value_parser(clap::value_parser!(bool)) - .help("Random subset of input file. Default skims the first X given percent") - ) - .arg( - Arg::new("percent") - .short('p') - .value_parser(clap::value_parser!(u16)) - .default_value("50") - .help("Percentage of the original file entries that should be retained") - ) - ) - .subcommand( - Command::new("filterfasta") - .about("Filter a given list of sequences from fasta file") - .arg( - Arg::new("fasta") - .short('f') - .required(true) - .help("A fasta file for processing") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("FiilteredFasta.fa") - .help("The outfile naming") - ) - .arg( - Arg::new("filter_list") - .short('l') - .help("A string comma-separated list of sequence names to exclude from the final fasta") - ) - ) - .subcommand( - Command::new("mergehaps") - .about("Merge haplotypes / multi fasta files together") - .arg( - Arg::new("fasta-1") - .short('p') - .required(true) - .help("The input fasta file for re-organising") - ) - .arg( - Arg::new("fasta-2") - .short('s') - .required(true) - .help("The second input fasta file") - ) - .arg( - Arg::new("naming") - .short('s') - .default_value("PRI/HAP") - .help("A '/' separated list with an item per file, these are the namings of the new scaffolds in the merged output") - ) - .arg( - Arg::new("output") - .short('o') - .default_value("merged") - .help("Output file prefix") - ) - ) - .get_matches(); - - println! { - "{}\n{}\n{}\nRUNNING SUBCOMMAND: |\n-- {}\nRUNNING ON: |\n-- {}", - "WELCOME TO Fasta Manipulator".bold(), - "This has been made to help prep data for use in the Treeval and curationpretext pipelines".bold(), - "ONLY THE yamlvalidator IS SPECIFIC TO TREEVAL, THE OTHER COMMANDS CAN BE USED FOR ANY OTHER PURPOSE YOU WANT".purple(), - match_result.subcommand_name().unwrap(), - env::consts::OS - }; - - match match_result.subcommand_name() { - Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")), - Some("splitbycount") => { - split_file_by_count(match_result.subcommand_matches("splitbycount")) - } - Some("mapheaders") => { - _ = map_fasta_head(match_result.subcommand_matches("mapheaders")); - } - Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")), - Some("remapheaders") => remapping_head(match_result.subcommand_matches("remapheaders")), - Some("curate") => curate_fasta(match_result.subcommand_matches("curate")), - Some("filterfasta") => filter_fasta(match_result.subcommand_matches("filterfasta")), - _ => { - unreachable!() - } - }; - Ok(()) +use fasta_manipulation::run; +use human_panic::setup_panic; + +// https://doc.rust-lang.org/book/ch12-03-improving-error-handling-and-modularity.html#separation-of-concerns-for-binary-projects +fn main() { + // https://rust-cli.github.io/book/in-depth/human-communication.html + setup_panic!(); + if let Err(e) = run() { + eprintln!("Error: {}", e); + std::process::exit(1); + } else { + println!("Done!"); + } } diff --git a/src/map_headers.rs b/src/map_headers.rs deleted file mode 100644 index 2b066b0..0000000 --- a/src/map_headers.rs +++ /dev/null @@ -1,142 +0,0 @@ -pub mod mapping_headers { - - use clap::ArgMatches; - use colored::Colorize; - use std::error::Error; - use std::fmt; - use std::fs::File; - use std::io::{BufRead, BufReader, BufWriter, Write}; - use std::iter::Zip; - - use crate::generics::only_keys; - use crate::generics::validate_fasta; - - #[allow(dead_code)] - #[derive(Debug, Clone)] - struct EmptyVec; - impl Error for EmptyVec {} - - impl fmt::Display for EmptyVec { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Can't Display Empty Vec") - } - } - - #[allow(clippy::explicit_counter_loop)] - pub fn create_mapping( - name_vec: Vec, - new_name: &str, - ) -> Zip, std::vec::IntoIter> { - // Generate a new mapping for the Fasta - // - let mut new_heads: Vec = Vec::new(); - let mut head_counter: i32 = 0; - let name_vec_clone = name_vec.clone(); - - for _x in name_vec { - new_heads.push(format!("{}_{}", new_name, head_counter)); - head_counter += 1; - } - - let mapped_heads: Zip, std::vec::IntoIter> = - name_vec_clone.into_iter().zip(new_heads); - - mapped_heads - } - - pub fn save_mapping( - output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, - ) { - let f: File = File::create(output).expect("Unable to create file"); - let mut f: BufWriter = BufWriter::new(f); - for map_pair in mapped { - let line: String = format!("{}\t{}\n", map_pair.0, map_pair.1); - f.write_all(&line.into_bytes()) - .expect("Unable to write data"); - } - } - - #[allow(unused_mut)] - pub fn create_mapped_fasta( - input: &str, - output: &str, - mapped: Zip< - std::vec::IntoIter, - std::vec::IntoIter, - >, - ) { - let file_reader: File = File::open(input).expect("CAN'T OPEN FILE"); - let buff_reader: BufReader = BufReader::new(file_reader); - let mut new_fasta: File = File::create(output).unwrap(); - - for line in buff_reader.lines() { - let l: &str = &line.as_ref().unwrap()[..]; - if l.starts_with('>') { - let mut to_replace = l.replace('>', ""); - let mut mapped_heads: Zip, std::vec::IntoIter> = - mapped.clone(); - let mut map: Option<(String, String)> = - mapped_heads.find(|x: &(String, String)| x.0 == to_replace); - let mut new_head: String = map.expect("").1; - let fmt_head: String = format!(">{}\n", new_head); - let _ = new_fasta.write_all(&fmt_head.into_bytes()); - } else { - let mut seq = line.expect(""); - let fmt_seq = format!("{}\n", seq); - let _ = new_fasta.write_all(&fmt_seq.into_bytes()); - } - } - } - - pub fn map_fasta_head( - arguments: std::option::Option<&ArgMatches>, - ) -> Result<(), Box> { - let file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - let replacer: &String = arguments - .unwrap() - .get_one::("replace-with") - .unwrap(); - let output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - println!("Mapping headers for file: {}", file); - println!("Replace headers with string: {:?}", &replacer); - - match validate_fasta(file) { - Ok(names) => { - let new_names = Vec::from_iter(only_keys(names)); - - let new_map: Zip, std::vec::IntoIter> = - create_mapping(new_names, replacer); - - let map_to_save: Zip, std::vec::IntoIter> = - new_map.clone(); - let output_file = format!("{}mapped-heads.tsv", output); - - save_mapping(&output_file, map_to_save); - - let new_fasta: String = format!("{output}mapped.fasta"); - - create_mapped_fasta(file, &new_fasta, new_map); - - println!( - "{}\n{}\n\t{}\n\t{}", - "FASTA HAS BEEN MAPPED AND REWRITTEN".green(), - "FOUND HERE:".green(), - &new_fasta.green(), - &output_file.green() - ); - } - - Err(e) => panic!("Something is wrong with the file! | {}", e), - }; - - Ok(()) - } -} diff --git a/src/processors/exclude_seq.rs b/src/processors/exclude_seq.rs new file mode 100644 index 0000000..1b0b8c1 --- /dev/null +++ b/src/processors/exclude_seq.rs @@ -0,0 +1,48 @@ +use noodles::fasta; +use std::error::Error; +use std::{fs, io::BufRead, str}; + +fn open_fasta<'a>( + exclusions: Vec<&str>, + fasta: &'a str, + out_file: &str, +) -> std::result::Result<&'a str, Box> { + let reader: Result>, std::io::Error> = + fasta::reader::Builder.build_from_path(fasta); + let file = fs::OpenOptions::new() + .create(true) + .append(true) + .open(out_file)?; + let mut writer = fasta::Writer::new(file); + + match reader { + Ok(fasta) => { + let mut binding = fasta; + for result in binding.records() { + let record = result?; + if !exclusions.contains(&record.name()) { + writer.write_record(&record)?; + } else { + println!("Found record to exclude: {:?}", &record.name()); + } + } + Ok("Removed Exclusionary List") + } + Err(_) => Err("Error: Fasta is not valid check file!".into()), + } +} + +pub fn filter_fasta(fasta: &str, outfile: &str, exclude: &str) { + let list_to_exclude = exclude.split(',').collect::>(); + let _x = open_fasta(list_to_exclude, fasta, outfile); +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/map_headers.rs b/src/processors/map_headers.rs new file mode 100644 index 0000000..9c3390f --- /dev/null +++ b/src/processors/map_headers.rs @@ -0,0 +1,134 @@ +use colored::Colorize; +use std::error::Error; +use std::fmt; +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::iter::Zip; + +use crate::generics::only_keys; +use crate::generics::validate_fasta; + +#[allow(dead_code)] +#[derive(Debug, Clone)] +struct EmptyVec; +impl Error for EmptyVec {} + +impl fmt::Display for EmptyVec { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Can't Display Empty Vec") + } +} + +#[allow(clippy::explicit_counter_loop)] +pub fn create_mapping( + name_vec: Vec, + new_name: &str, +) -> Zip, std::vec::IntoIter> { + // Generate a new mapping for the Fasta + // + let mut new_heads: Vec = Vec::new(); + let mut head_counter: i32 = 0; + let name_vec_clone = name_vec.clone(); + + for _x in name_vec { + new_heads.push(format!("{}_{}", new_name, head_counter)); + head_counter += 1; + } + + let mapped_heads: Zip, std::vec::IntoIter> = + name_vec_clone.into_iter().zip(new_heads); + + mapped_heads +} + +pub fn save_mapping( + output: &str, + mapped: Zip, std::vec::IntoIter>, +) { + let f: File = File::create(output).expect("Unable to create file"); + let mut f: BufWriter = BufWriter::new(f); + for map_pair in mapped { + let line: String = format!("{}\t{}\n", map_pair.0, map_pair.1); + f.write_all(&line.into_bytes()) + .expect("Unable to write data"); + } +} + +#[allow(unused_mut)] +pub fn create_mapped_fasta( + input: &str, + output: &str, + mapped: Zip, std::vec::IntoIter>, +) { + let file_reader: File = File::open(input).expect("CAN'T OPEN FILE"); + let buff_reader: BufReader = BufReader::new(file_reader); + let mut new_fasta: File = File::create(output).unwrap(); + + for line in buff_reader.lines() { + let l: &str = &line.as_ref().unwrap()[..]; + if l.starts_with('>') { + let mut to_replace = l.replace('>', ""); + let mut mapped_heads: Zip, std::vec::IntoIter> = + mapped.clone(); + let mut map: Option<(String, String)> = + mapped_heads.find(|x: &(String, String)| x.0 == to_replace); + let mut new_head: String = map.expect("").1; + let fmt_head: String = format!(">{}\n", new_head); + let _ = new_fasta.write_all(&fmt_head.into_bytes()); + } else { + let mut seq = line.expect(""); + let fmt_seq = format!("{}\n", seq); + let _ = new_fasta.write_all(&fmt_seq.into_bytes()); + } + } +} + +pub fn map_fasta_head( + file: &String, + output: &String, + replacer: &String, +) -> Result<(), Box> { + println!("Mapping headers for file: {}", file); + println!("Replace headers with string: {:?}", &replacer); + + match validate_fasta(file) { + Ok(names) => { + let new_names = Vec::from_iter(only_keys(names)); + + let new_map: Zip, std::vec::IntoIter> = + create_mapping(new_names, replacer); + + let map_to_save: Zip, std::vec::IntoIter> = + new_map.clone(); + let output_file = format!("{}mapped-heads.tsv", output); + + save_mapping(&output_file, map_to_save); + + let new_fasta: String = format!("{output}mapped.fasta"); + + create_mapped_fasta(file, &new_fasta, new_map); + + println!( + "{}\n{}\n\t{}\n\t{}", + "FASTA HAS BEEN MAPPED AND REWRITTEN".green(), + "FOUND HERE:".green(), + &new_fasta.green(), + &output_file.green() + ); + } + + Err(e) => panic!("Something is wrong with the file! | {}", e), + }; + + Ok(()) +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/mod.rs b/src/processors/mod.rs new file mode 100644 index 0000000..dc0f572 --- /dev/null +++ b/src/processors/mod.rs @@ -0,0 +1,7 @@ +pub mod exclude_seq; +pub mod map_headers; +pub mod remap_head; +pub mod split_by_count; +pub mod split_by_size; +pub mod tpf_fasta; +pub mod yaml_validator; diff --git a/src/processors/remap_head.rs b/src/processors/remap_head.rs new file mode 100644 index 0000000..ca30d10 --- /dev/null +++ b/src/processors/remap_head.rs @@ -0,0 +1,74 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::iter::Zip; + +use colored::Colorize; + +use crate::generics::validate_fasta; +use crate::processors::map_headers::create_mapped_fasta; + +pub fn pull_map_from_tsv( + map_file: &str, +) -> Zip, std::vec::IntoIter> { + let file_reader: File = File::open(map_file).expect("CAN'T OPEN FILE"); + let buff_reader: BufReader = BufReader::new(file_reader); + + let mut old_head: Vec = Vec::new(); + let mut new_head: Vec = Vec::new(); + + for line in buff_reader.lines() { + match line { + Ok(string) => { + let mut old_new = string.split('\t'); + let x = old_new.next().unwrap(); + let y = old_new.next().unwrap(); + old_head.push(x.to_string()); + new_head.push(y.to_string()); + } + Err(_) => { + print!("") + } + }; + } + + let mapped_heads: Zip, std::vec::IntoIter> = + new_head.into_iter().zip(old_head); + + mapped_heads +} + +pub fn remapping_head(file: &String, output: &String, map_file: &String) { + println!("Mapping headers for file: {}", file); + println!("Replace headers with string: {}", map_file); + + match validate_fasta(file) { + Ok(_thing) => { + let new_map: Zip, std::vec::IntoIter> = + pull_map_from_tsv(map_file); + + let new_fasta: String = format!("{output}_OH.fasta"); + + create_mapped_fasta(file, &new_fasta, new_map); + + println!( + "{}\n{}\n\t{}\n", + "FASTA HAS BEEN RE-APPED AND REWRITTEN".green(), + "FOUND HERE:".green(), + &new_fasta.green() + ); + } + Err(_) => { + println!("NOT A VALID FASTA") + } + }; +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/split_by_count.rs b/src/processors/split_by_count.rs new file mode 100644 index 0000000..35ba1fe --- /dev/null +++ b/src/processors/split_by_count.rs @@ -0,0 +1,115 @@ +use crate::generics::sanitise_header; + +use compare::{natural, Compare}; +use noodles::fasta::{self, Record}; +use std::cmp::Ordering; +use std::fs::OpenOptions; +use std::{ + fs::{create_dir_all, File}, + io::BufReader, + path::Path, +}; + +#[allow(clippy::needless_return)] +fn fix_head(records: Record, sanitise: bool) -> Record { + if sanitise { + let header = sanitise_header(records.definition()); + let definition = fasta::record::Definition::new(header, None); + let seq = records.sequence().to_owned(); + return fasta::Record::new(definition, seq); + } else { + return records.to_owned(); + }; +} + +fn write_fasta(outdir: &String, fasta_record: &Vec) { + println!("{}", outdir); + + let _data_file = File::create(outdir); + let file = OpenOptions::new() + .append(true) + .open(outdir) + .expect("creation failed"); + + let mut writer = fasta::Writer::new(file); + for i in fasta_record { + writer.write_record(i).unwrap(); + } +} + +pub fn split_file_by_count( + fasta_file: &String, + output_directory: &String, + data_type: &String, + sanitise: &bool, + fasta_count: &u16, +) { + let path_obj = Path::new(fasta_file); + let grab_name = path_obj.file_name().unwrap(); + let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); + let actual_name = actual_list[0]; + + let new_outpath = format!("{}/{}/{}/", output_directory, actual_name, data_type); + create_dir_all(new_outpath.clone()).unwrap(); + println!( + "Fasta file for processing: {:?}\nNumber of records per file: {:?}", + fasta_file, fasta_count + ); + + let mut counter: u16 = 0; + let mut file_counter: u16 = 1; + + let file_name: Vec<&str> = actual_name.split('.').collect(); + + let mut reader = File::open(fasta_file) + .map(BufReader::new) + .map(fasta::Reader::new) + .unwrap(); + + let mut record_list: Vec = Vec::new(); + for result in reader.records() { + let record = result.unwrap(); + counter += 1; + + let final_rec = fix_head(record, *sanitise); + record_list.push(final_rec); + + let cmp = natural(); + let compared = cmp.compare(&counter, fasta_count); + if compared == Ordering::Equal { + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + + write_fasta(&full_outpath, &record_list); + file_counter += 1; + counter = 0; + record_list = Vec::new(); + } + } + + let full_outpath = format!( + "{}{}_f{}_c{}-a{}.fa", + new_outpath, + file_name[0], + file_counter, + &fasta_count, + &record_list.len() + ); + write_fasta(&full_outpath, &record_list); +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/split_by_size.rs b/src/processors/split_by_size.rs new file mode 100644 index 0000000..6445afd --- /dev/null +++ b/src/processors/split_by_size.rs @@ -0,0 +1,14 @@ +pub fn split_file_by_size(fasta_file: &String, mem_size: &u16, _output_directory: &str) { + println!("Fasta file for processing: {:?}", &fasta_file); + println!("Size to chunk fasta into: {:?}", mem_size); +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/tpf_fasta.rs b/src/processors/tpf_fasta.rs new file mode 100644 index 0000000..0453798 --- /dev/null +++ b/src/processors/tpf_fasta.rs @@ -0,0 +1,278 @@ +use std::fs::OpenOptions; +use std::io::Write; +use std::{fs::read_to_string, fs::File, str}; + +use noodles::core::Position; +use noodles::fasta; +use noodles::fasta::record::Sequence; +use noodles::fasta::repository::adapters::IndexedReader; + +use crate::generics::validate_fasta; + +#[derive(Debug, Clone, PartialEq, Eq)] +struct Tpf { + ori_scaffold: String, + start_coord: usize, + end_coord: usize, + new_scaffold: String, + orientation: String, +} + +impl std::fmt::Display for Tpf { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!( + fmt, + "\t{} -- {} -- {}", + self.ori_scaffold, self.start_coord, self.end_coord + ) + } +} + +#[derive(Debug, PartialEq, Eq)] +struct NewFasta { + tpf: Tpf, + sequence: String, +} + +#[derive(Debug)] +struct MyRecord { + name: String, + sequence: Vec, +} + +fn parse_tpf(path: &String) -> Vec { + let mut all_tpf: Vec = Vec::new(); + for line in read_to_string(path).unwrap().lines() { + if line.starts_with('?') { + let line_replaced = line.replace('\t', " "); + let line_list: Vec<&str> = line_replaced.split_whitespace().collect(); + let scaff_data: Vec<&str> = line_list[1].split(':').collect(); + let scaff_coords: Vec<&str> = scaff_data[1].split('-').collect(); + let data = Tpf { + ori_scaffold: scaff_data[0].to_owned(), + start_coord: scaff_coords[0].to_owned().parse::().unwrap(), + end_coord: scaff_coords[1].to_owned().parse::().unwrap(), + new_scaffold: line_list[2].to_owned().replace("RL", "SUPER"), + orientation: line_list[3].to_owned(), + }; + all_tpf.push(data); + } + } + all_tpf +} + +fn subset_vec_tpf<'a>(tpf: &'a Vec, fasta: (&std::string::String, &usize)) -> Vec<&'a Tpf> { + // + // Subset the Vec based on a search through the fasta + // + let mut subset_tpf: Vec<&Tpf> = Vec::new(); + for i in tpf { + if i.ori_scaffold == *fasta.0 { + subset_tpf.push(i) + } + } + subset_tpf +} + +fn check_orientation( + parsed: std::option::Option, + orientation: String, +) -> String { + if orientation == "MINUS" { + let start = Position::try_from(1).unwrap(); + let parse_orientation = parsed.unwrap(); + let compliment: Sequence = parse_orientation + .complement() + .collect::>() + .unwrap(); + let seq = compliment.get(start..).unwrap(); + str::from_utf8(seq).unwrap().chars().rev().collect() + } else { + let start = Position::try_from(1).unwrap(); + let parse_orientation = parsed.unwrap(); + let seq = parse_orientation.get(start..).unwrap(); + str::from_utf8(seq).unwrap().chars().collect() + } +} + +fn parse_seq( + sequence: std::option::Option, + tpf: Vec<&Tpf>, +) -> Vec { + let mut subset_tpf: Vec = Vec::new(); + // + // Take the input sequence and scaffold name + // Parse the input sequence based on the data contained in + // the TPF. Which is already a subset based on scaff name + // + + let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence () + for &i in &tpf { + let start = Position::try_from(i.start_coord).unwrap(); + let end = Position::try_from(i.end_coord).unwrap(); + //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap()); + let parsed = new_seq.slice(start..=end); + let the_sequence = check_orientation(parsed, i.orientation.to_owned()); + let data = NewFasta { + tpf: i.to_owned(), + sequence: the_sequence, + }; + subset_tpf.push(data); + } + subset_tpf +} + +fn get_uniques(tpf_list: &Vec) -> Vec { + let mut uniques: Vec = Vec::new(); + + for i in tpf_list { + if !uniques.contains(&i.new_scaffold) { + uniques.push(i.new_scaffold.to_owned()) + } + } + uniques +} + +fn save_to_fasta(fasta_data: Vec, tpf_data: Vec, output: &String, n_length: usize) { + // + // TPF is in the input TPF order, this will continue to be the case until + // the script is modified and the Tpf struct gets modified in place for some reason + // + let _data_file = File::create(output); + let mut file = OpenOptions::new() + .write(true) + .open(output) + .expect("creation failed"); + + let _debugger = File::create("debug.txt"); + let mut file2 = OpenOptions::new() + .write(true) + .open("debug.txt") + .expect("creation failed"); + + let uniques = get_uniques(&tpf_data); + + // This is inefficient as we are scanning through the fasta_data, uniques number of times + // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total. + let mut no_more: Vec = Vec::new(); + for x in uniques { + println!("NOW WRITING DATA FOR: {:?}", &x); + // X = "SUPER_1" + let stringy = format!(">{x}\n"); + file.write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + file2 + .write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + + let mut data: MyRecord = MyRecord { + name: "".to_string(), + sequence: Vec::new(), + }; + + no_more.push(x.to_owned()); + x.clone_into(&mut data.name); + for tpf in &tpf_data { + if tpf.new_scaffold == x { + for fasta in &fasta_data { + if fasta.tpf == *tpf { + let stringy = format!("\t{}\n", tpf); + file2 + .write_all(stringy.as_bytes()) + .expect("Unable to write to file"); + data.sequence.push(fasta.sequence.to_owned()); + } + } + } + } + + let line_len: usize = 60; + let fixed = data.sequence; + let n_string = "N".repeat(n_length); + let fixed2 = fixed.join(&n_string); //.join required a borrowed str + let fixed3 = fixed2 + .as_bytes() + .chunks(line_len) + .map(str::from_utf8) + .collect::, _>>() + .unwrap(); + + for i in fixed3 { + let formatted = i.to_owned() + "\n"; + file.write_all(formatted.as_bytes()).unwrap(); + } + println!("NO LONG SCANNING FOR: {:?}", &no_more) + } +} + +#[allow(clippy::needless_borrow)] +#[allow(clippy::let_and_return)] +pub fn curate_fasta( + fasta_file: &String, + tpf_file: &String, + _sort: &bool, + output: &String, + n_length: &usize, +) { + // + // Generate a curated fasta file based on the input TPF file + // which was generated by Pretext and the agp_to_tpf script. + // This new fasta file contains a new scaffold naming as well + // as pieced together sequences generated by the splitting of + // data in Pretext. + // + println!("LET'S GET CURATING THAT FASTA!"); + stacker::maybe_grow(32 * 1024, 1024 * 5120, || { + match validate_fasta(fasta_file) { + Ok(fasta_d) => { + let tpf_data = parse_tpf(&tpf_file); + //let _validated = varify_validity(&tpf_data, &fasta_d); + + // + // Start indexed reader of the input fasta + // if valid then use the data + // + let reader = fasta::indexed_reader::Builder::default().build_from_path(fasta_file); + let fasta_repo = match reader { + Ok(data) => { + let adapter = IndexedReader::new(data); + let repository = fasta::Repository::new(adapter); + repository + } + Err(_) => todo!(), + }; + + // + // For unique scaffold in the fasta file iter through and + // parse sequence for each line in the tpf + // The tpf will contain multiple enteries for each scaffold, minimum of one entry. + // + let mut new_fasta_data: Vec = Vec::new(); + for i in fasta_d { + let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1)); + let sequence = fasta_repo.get(&i.0).transpose(); + + match sequence { + Ok(data) => { + let subset_results = parse_seq(data, subset_tpf); + new_fasta_data.extend(subset_results); + } + Err(e) => panic!("{:?}", e), + }; + } + save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned()) + } + Err(e) => panic!("Something is wrong with the file! | {}", e), + } + }) +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/processors/yaml_validator.rs b/src/processors/yaml_validator.rs new file mode 100644 index 0000000..a0a595c --- /dev/null +++ b/src/processors/yaml_validator.rs @@ -0,0 +1,272 @@ +use std::fs::{self, File}; +use std::io::ErrorKind; +use std::path::PathBuf; + +use colored::Colorize; +use csv::Error; +use csv::ReaderBuilder; +use noodles::fasta; +use serde::{Deserialize, Serialize}; + +// Would be nice if there was a simple format_check +// use noodles::cram as cram; + +#[derive(Debug, Serialize, Deserialize)] +struct TreeValYaml { + assembly: Assembly, + reference_file: String, + assem_reads: AssemReads, + alignment: Alignment, + self_comp: SelfComp, + intron: Intron, + telomere: Telomere, + synteny: Synteny, + busco: Busco, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Assembly { + level: String, + sample_id: String, + latin_name: String, + class_t: String, + asm_version: u16, + geval_type: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct AssemReads { + pacbio: String, + hic: String, + supplementary: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Alignment { + data_dir: String, + common_name: String, + geneset: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct SelfComp { + motif_len: u16, + mummer_chunk: u16, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Intron { + size: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Telomere { + teloseq: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Synteny { + synteny_genome_path: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct Busco { + lineages_path: String, + lineage: String, +} + +// +// CSV STRUCT +// +//#[derive(Deserialize)] +//struct Record { +// org: String, +// type: String, +// data_file: String +//} + +pub fn validate_paths(path: &str, field_id: &str) { + match fs::metadata(path) { + Ok(_) => { + println!( + "{}{} \t{}\t{}", + ">-".green(), + &field_id.green(), + "| PATH EXISTS: ".green(), + path.green() + ); + match field_id { + "REFERENCE" => validate_fasta(path), + "GENESET-CSV" => { + _ = validate_csv(path); + } + "HIC" => {} + _ => println!("Error"), + } + } + Err(_) => println!( + "{}{} \t{}\t{}", + "<-".red().bold(), + &field_id.red().bold(), + "| CHECK YAML!:".red().bold(), + path + ), + } +} + +pub fn validate_fasta(path: &str) { + let reader = fasta::reader::Builder.build_from_path(path); + + let mut binding = reader.expect("NO VALID HEADER / SEQUENCE PAIRS"); + let result = binding.records(); + let counter = result.count(); + println!( + "{} {} {}", + ">- REFERENCE H/S PAIRS:".green(), + counter, + "H/S PAIRS".green() + ) +} + +pub fn validate_csv(path: &str) -> Result<(), Error> { + let file = File::open(path)?; + + let mut reader = ReaderBuilder::new() + .has_headers(true) + .delimiter(b',') + .from_reader(file); + + let record = reader.records().count(); + println!( + "{} {} {}", + ">-GENESET-RECORD-COUNT: >".green(), + record, + "<".green() + ); + + Ok(()) +} + +// +// FUNCTION: Check if pacbio has fasta.gz files, cram has cram and crai and synteny has fasta +// could make this much easier and consise by passing in a list of file types to check +// validatedata(path, [fa, fna, fasta]) +// +pub fn validate_data(path: &str, dtype: &str) { + match fs::read_dir(path) { + Err(e) if e.kind() == ErrorKind::NotFound => {} + Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()), + Ok(data_files) => { + if dtype == "pacbio" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "fasta.gz", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO PACBIO DATA FILES".red()) + } else { + println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); + } + } else if dtype == "hic" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "cram" || ex == "crai", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO HIC DATA FILES".red()) + } else { + println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); + } + } else if dtype == "synteny" { + let files: Vec = data_files + .filter_map(|f| f.ok()) + .filter(|d| match d.path().extension() { + None => false, + Some(ex) => ex == "fa" || ex == "fasta" || ex == "fna", + }) + .map(|f| f.path()) + .collect(); + + if files.is_empty() { + println!("{}", "<-NO SYNTENIC GENOMES".red()) + } else { + println!("{} {:?}", ">-YOUR GENOMES ARE:".green(), &files); + } + } + } + }; +} + +pub fn validate_yaml(file: &String, _verbose: &bool, _output: &str) { + println! {"Validating Yaml: {}", file.purple()}; + + let input = fs::File::open(file).expect("Unable to read from file"); + let contents: TreeValYaml = serde_yaml::from_reader(input).expect("Unable to read from file"); + + println!( + "RUNNING VALIDATE-YAML FOR SAMPLE: {}", + contents.assembly.sample_id.purple() + ); + + validate_paths(&contents.reference_file, "REFERENCE"); + validate_paths(&contents.alignment.data_dir, "GENESET"); + validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY"); + validate_paths(&contents.busco.lineages_path, "BUSCO"); + + validate_paths(&contents.assem_reads.pacbio, "PACBIO"); + validate_data(&contents.assem_reads.pacbio, "pacbio"); + + validate_paths(&contents.assem_reads.hic, "HIC"); + validate_data(&contents.assem_reads.hic, "hic"); + + println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue()); + let genesets = contents.alignment.geneset.split(','); + for set in genesets { + let gene_alignment_path = contents.alignment.data_dir.clone() + + &contents.assembly.class_t + + "/csv_data/" + + set + + "-data.csv"; + validate_paths(&gene_alignment_path, "GENESET-CSV"); + } + + println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue()); + let synteny_full = + contents.synteny.synteny_genome_path.clone() + &contents.assembly.class_t + "/"; + validate_paths(&synteny_full, "SYNTENY-FASTA"); + validate_data(&synteny_full, "synteny"); + + println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue()); + let busco_path = contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; + validate_paths(&busco_path, "BUSCO-DB"); + // NOW CHECK FOR FILES IN DIRECTORY? + + println!( + "{}\n{}\n{}\n{}\n{}", + "VALIDATION COMPLETE".purple().bold(), + "GENERAL INFORMATION:".purple().bold(), + "Check the log to see what failed".bold(), + "FULL : ONLY synteny fails are permitted".purple(), + "RAPID: geneset, busco and synteny fails are permitted".purple() + ); +} + +#[cfg(test)] +mod tests { + + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} diff --git a/src/remap_head.rs b/src/remap_head.rs deleted file mode 100644 index 83e20ef..0000000 --- a/src/remap_head.rs +++ /dev/null @@ -1,73 +0,0 @@ -pub mod remapping_headers { - use crate::map_headers; - use clap::ArgMatches; - use colored::Colorize; - use std::fs::File; - use std::io::{BufRead, BufReader}; - use std::iter::Zip; - - use crate::generics::validate_fasta; - - pub fn pull_map_from_tsv( - map_file: &str, - ) -> Zip, std::vec::IntoIter> { - let file_reader: File = File::open(map_file).expect("CAN'T OPEN FILE"); - let buff_reader: BufReader = BufReader::new(file_reader); - - let mut old_head: Vec = Vec::new(); - let mut new_head: Vec = Vec::new(); - - for line in buff_reader.lines() { - match line { - Ok(string) => { - let mut old_new = string.split('\t'); - let x = old_new.next().unwrap(); - let y = old_new.next().unwrap(); - old_head.push(x.to_string()); - new_head.push(y.to_string()); - } - Err(_) => { - print!("") - } - }; - } - - let mapped_heads: Zip, std::vec::IntoIter> = - new_head.into_iter().zip(old_head); - - mapped_heads - } - - pub fn remapping_head(arguments: std::option::Option<&ArgMatches>) { - let file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - let map_file: &String = arguments.unwrap().get_one::("map-file").unwrap(); - let output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - println!("Mapping headers for file: {}", file); - println!("Replace headers with string: {}", map_file); - - match validate_fasta(file) { - Ok(_thing) => { - let new_map: Zip, std::vec::IntoIter> = - pull_map_from_tsv(map_file); - - let new_fasta: String = format!("{output}_OH.fasta"); - - map_headers::mapping_headers::create_mapped_fasta(file, &new_fasta, new_map); - - println!( - "{}\n{}\n\t{}\n", - "FASTA HAS BEEN RE-APPED AND REWRITTEN".green(), - "FOUND HERE:".green(), - &new_fasta.green() - ); - } - Err(_) => { - println!("NOT A VALID FASTA") - } - }; - } -} diff --git a/src/split_by_count.rs b/src/split_by_count.rs deleted file mode 100644 index 1396f00..0000000 --- a/src/split_by_count.rs +++ /dev/null @@ -1,111 +0,0 @@ -pub mod split_by_count_mod { - use crate::generics::sanitise_header; - use clap::ArgMatches; - use compare::{natural, Compare}; - use noodles::fasta::{self, Record}; - use std::cmp::Ordering; - use std::fs::OpenOptions; - use std::{ - fs::{create_dir_all, File}, - io::BufReader, - path::Path, - }; - - #[allow(clippy::needless_return)] - fn fix_head(records: Record, sanitise: bool) -> Record { - if sanitise { - let header = sanitise_header(records.definition()); - let definition = fasta::record::Definition::new(header, None); - let seq = records.sequence().to_owned(); - return fasta::Record::new(definition, seq); - } else { - return records.to_owned(); - }; - } - - fn write_fasta(outdir: &String, fasta_record: &Vec) { - println!("{}", outdir); - - let _data_file = File::create(outdir); - let file = OpenOptions::new() - .append(true) - .open(outdir) - .expect("creation failed"); - - let mut writer = fasta::Writer::new(file); - for i in fasta_record { - writer.write_record(i).unwrap(); - } - } - - pub fn split_file_by_count(arguments: std::option::Option<&ArgMatches>) { - let sanitise: &bool = arguments.unwrap().get_one::("sanitise").unwrap(); - let fasta_file = arguments.unwrap().get_one::("fasta-file").unwrap(); - let path_obj = Path::new(fasta_file); - let grab_name = path_obj.file_name().unwrap(); - let actual_list: Vec<&str> = grab_name.to_str().unwrap().split('.').collect(); - let actual_name = actual_list[0]; - - let data_type = arguments.unwrap().get_one::("data_type").unwrap(); - - let outpath = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - - let new_outpath = format!("{}/{}/{}/", outpath, actual_name, data_type); - create_dir_all(new_outpath.clone()).unwrap(); - let fasta_count = arguments.unwrap().get_one::("count").unwrap(); - println!( - "Fasta file for processing: {:?}\nNumber of records per file: {:?}", - fasta_file, fasta_count - ); - - let mut counter: u16 = 0; - let mut file_counter: u16 = 1; - - let file_name: Vec<&str> = actual_name.split('.').collect(); - - let mut reader = File::open(fasta_file) - .map(BufReader::new) - .map(fasta::Reader::new) - .unwrap(); - - let mut record_list: Vec = Vec::new(); - for result in reader.records() { - let record = result.unwrap(); - counter += 1; - - let final_rec = fix_head(record, *sanitise); - record_list.push(final_rec); - - let cmp = natural(); - let compared = cmp.compare(&counter, fasta_count); - if compared == Ordering::Equal { - let full_outpath = format!( - "{}{}_f{}_c{}-a{}.fa", - new_outpath, - file_name[0], - file_counter, - &fasta_count, - &record_list.len() - ); - - write_fasta(&full_outpath, &record_list); - file_counter += 1; - counter = 0; - record_list = Vec::new(); - } - } - - let full_outpath = format!( - "{}{}_f{}_c{}-a{}.fa", - new_outpath, - file_name[0], - file_counter, - &fasta_count, - &record_list.len() - ); - write_fasta(&full_outpath, &record_list); - } -} diff --git a/src/split_by_size.rs b/src/split_by_size.rs deleted file mode 100644 index f1b4a7b..0000000 --- a/src/split_by_size.rs +++ /dev/null @@ -1,12 +0,0 @@ -pub mod split_by_size_mod { - use clap::ArgMatches; - - pub fn split_file_by_size(arguments: std::option::Option<&ArgMatches>) { - let fasta_file: &String = arguments.unwrap().get_one::("fasta-file").unwrap(); - println!("Fasta file for processing: {:?}", &fasta_file); - println!( - "Size to chunk fasta into: {:?}", - arguments.unwrap().get_one::("mem-size").unwrap() - ); - } -} diff --git a/src/tpf_fasta.rs b/src/tpf_fasta.rs deleted file mode 100644 index fc5ec7e..0000000 --- a/src/tpf_fasta.rs +++ /dev/null @@ -1,277 +0,0 @@ -pub mod tpf_fasta_mod { - use clap::ArgMatches; - use noodles::core::Position; - use noodles::fasta; - use noodles::fasta::record::Sequence; - use noodles::fasta::repository::adapters::IndexedReader; - use std::fs::OpenOptions; - use std::io::Write; - use std::{fs::read_to_string, fs::File, str}; - - use crate::generics::validate_fasta; - - #[derive(Debug, Clone, PartialEq, Eq)] - struct Tpf { - ori_scaffold: String, - start_coord: usize, - end_coord: usize, - new_scaffold: String, - orientation: String, - } - - impl std::fmt::Display for Tpf { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - write!( - fmt, - "\t{} -- {} -- {}", - self.ori_scaffold, self.start_coord, self.end_coord - ) - } - } - - #[derive(Debug, PartialEq, Eq)] - struct NewFasta { - tpf: Tpf, - sequence: String, - } - - #[derive(Debug)] - struct MyRecord { - name: String, - sequence: Vec, - } - - fn parse_tpf(path: &String) -> Vec { - let mut all_tpf: Vec = Vec::new(); - for line in read_to_string(path).unwrap().lines() { - if line.starts_with('?') { - let line_replaced = line.replace('\t', " "); - let line_list: Vec<&str> = line_replaced.split_whitespace().collect(); - let scaff_data: Vec<&str> = line_list[1].split(':').collect(); - let scaff_coords: Vec<&str> = scaff_data[1].split('-').collect(); - let data = Tpf { - ori_scaffold: scaff_data[0].to_owned(), - start_coord: scaff_coords[0].to_owned().parse::().unwrap(), - end_coord: scaff_coords[1].to_owned().parse::().unwrap(), - new_scaffold: line_list[2].to_owned().replace("RL", "SUPER"), - orientation: line_list[3].to_owned(), - }; - all_tpf.push(data); - } - } - all_tpf - } - - fn subset_vec_tpf<'a>( - tpf: &'a Vec, - fasta: (&std::string::String, &usize), - ) -> Vec<&'a Tpf> { - // - // Subset the Vec based on a search through the fasta - // - let mut subset_tpf: Vec<&Tpf> = Vec::new(); - for i in tpf { - if i.ori_scaffold == *fasta.0 { - subset_tpf.push(i) - } - } - subset_tpf - } - - fn check_orientation( - parsed: std::option::Option, - orientation: String, - ) -> String { - if orientation == "MINUS" { - let start = Position::try_from(1).unwrap(); - let parse_orientation = parsed.unwrap(); - let compliment: Sequence = parse_orientation - .complement() - .collect::>() - .unwrap(); - let seq = compliment.get(start..).unwrap(); - str::from_utf8(seq).unwrap().chars().rev().collect() - } else { - let start = Position::try_from(1).unwrap(); - let parse_orientation = parsed.unwrap(); - let seq = parse_orientation.get(start..).unwrap(); - str::from_utf8(seq).unwrap().chars().collect() - } - } - - fn parse_seq( - sequence: std::option::Option, - tpf: Vec<&Tpf>, - ) -> Vec { - let mut subset_tpf: Vec = Vec::new(); - // - // Take the input sequence and scaffold name - // Parse the input sequence based on the data contained in - // the TPF. Which is already a subset based on scaff name - // - - let new_seq = sequence.unwrap(); // Option(Sequence ()) -> Sequence () - for &i in &tpf { - let start = Position::try_from(i.start_coord).unwrap(); - let end = Position::try_from(i.end_coord).unwrap(); - //let region = Region::new(&i.new_scaffold, start.unwrap()..=end.unwrap()); - let parsed = new_seq.slice(start..=end); - let the_sequence = check_orientation(parsed, i.orientation.to_owned()); - let data = NewFasta { - tpf: i.to_owned(), - sequence: the_sequence, - }; - subset_tpf.push(data); - } - subset_tpf - } - - fn get_uniques(tpf_list: &Vec) -> Vec { - let mut uniques: Vec = Vec::new(); - - for i in tpf_list { - if !uniques.contains(&i.new_scaffold) { - uniques.push(i.new_scaffold.to_owned()) - } - } - uniques - } - - fn save_to_fasta( - fasta_data: Vec, - tpf_data: Vec, - output: &String, - n_length: usize, - ) { - // - // TPF is in the input TPF order, this will continue to be the case until - // the script is modified and the Tpf struct gets modified in place for some reason - // - let _data_file = File::create(output); - let mut file = OpenOptions::new() - .write(true) - .open(output) - .expect("creation failed"); - - let _debugger = File::create("debug.txt"); - let mut file2 = OpenOptions::new() - .write(true) - .open("debug.txt") - .expect("creation failed"); - - let uniques = get_uniques(&tpf_data); - - // This is inefficient as we are scanning through the fasta_data, uniques number of times - // If uniques is 10 long and fasta is 100, then this is 1000 scans through in total. - let mut no_more: Vec = Vec::new(); - for x in uniques { - println!("NOW WRITING DATA FOR: {:?}", &x); - // X = "SUPER_1" - let stringy = format!(">{x}\n"); - file.write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - file2 - .write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - - let mut data: MyRecord = MyRecord { - name: "".to_string(), - sequence: Vec::new(), - }; - - no_more.push(x.to_owned()); - x.clone_into(&mut data.name); - for tpf in &tpf_data { - if tpf.new_scaffold == x { - for fasta in &fasta_data { - if fasta.tpf == *tpf { - let stringy = format!("\t{}\n", tpf); - file2 - .write_all(stringy.as_bytes()) - .expect("Unable to write to file"); - data.sequence.push(fasta.sequence.to_owned()); - } - } - } - } - - let line_len: usize = 60; - let fixed = data.sequence; - let n_string = "N".repeat(n_length); - let fixed2 = fixed.join(&n_string); //.join required a borrowed str - let fixed3 = fixed2 - .as_bytes() - .chunks(line_len) - .map(str::from_utf8) - .collect::, _>>() - .unwrap(); - - for i in fixed3 { - let formatted = i.to_owned() + "\n"; - file.write_all(formatted.as_bytes()).unwrap(); - } - println!("NO LONG SCANNING FOR: {:?}", &no_more) - } - } - - #[allow(clippy::needless_borrow)] - #[allow(clippy::let_and_return)] - pub fn curate_fasta(arguments: std::option::Option<&ArgMatches>) { - // - // Generate a curated fasta file based on the input TPF file - // which was generated by Pretext and the agp_to_tpf script. - // This new fasta file contains a new scaffold naming as well - // as pieced together sequences generated by the splitting of - // data in Pretext. - // - let fasta_file: &String = arguments.unwrap().get_one::("fasta").unwrap(); - let tpf_file: &String = arguments.unwrap().get_one::("tpf").unwrap(); - let n_length: &usize = arguments.unwrap().get_one::("n_length").unwrap(); - let output: &String = arguments.unwrap().get_one::("output").unwrap(); - println!("LET'S GET CURATING THAT FASTA!"); - stacker::maybe_grow(32 * 1024, 1024 * 5120, || { - match validate_fasta(fasta_file) { - Ok(fasta_d) => { - let tpf_data = parse_tpf(&tpf_file); - //let _validated = varify_validity(&tpf_data, &fasta_d); - - // - // Start indexed reader of the input fasta - // if valid then use the data - // - let reader = - fasta::indexed_reader::Builder::default().build_from_path(fasta_file); - let fasta_repo = match reader { - Ok(data) => { - let adapter = IndexedReader::new(data); - let repository = fasta::Repository::new(adapter); - repository - } - Err(_) => todo!(), - }; - - // - // For unique scaffold in the fasta file iter through and - // parse sequence for each line in the tpf - // The tpf will contain multiple enteries for each scaffold, minimum of one entry. - // - let mut new_fasta_data: Vec = Vec::new(); - for i in fasta_d { - let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1)); - let sequence = fasta_repo.get(&i.0).transpose(); - - match sequence { - Ok(data) => { - let subset_results = parse_seq(data, subset_tpf); - new_fasta_data.extend(subset_results); - } - Err(e) => panic!("{:?}", e), - }; - } - save_to_fasta(new_fasta_data, tpf_data, output, n_length.to_owned()) - } - Err(e) => panic!("Something is wrong with the file! | {}", e), - } - }) - } -} diff --git a/src/yaml_validator.rs b/src/yaml_validator.rs deleted file mode 100644 index e23d121..0000000 --- a/src/yaml_validator.rs +++ /dev/null @@ -1,272 +0,0 @@ -pub mod yaml_validator_mod { - use clap::ArgMatches; - use colored::Colorize; - use csv::Error; - use csv::ReaderBuilder; - use noodles::fasta; - use serde::{Deserialize, Serialize}; - use std::fs::{self, File}; - use std::io::ErrorKind; - use std::path::PathBuf; - // Would be nice if there was a simple format_check - // use noodles::cram as cram; - - #[derive(Debug, Serialize, Deserialize)] - struct TreeValYaml { - assembly: Assembly, - reference_file: String, - assem_reads: AssemReads, - alignment: Alignment, - self_comp: SelfComp, - intron: Intron, - telomere: Telomere, - synteny: Synteny, - busco: Busco, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Assembly { - level: String, - sample_id: String, - latin_name: String, - classT: String, - asmVersion: u16, - gevalType: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct AssemReads { - pacbio: String, - hic: String, - supplementary: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Alignment { - data_dir: String, - common_name: String, - geneset: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct SelfComp { - motif_len: u16, - mummer_chunk: u16, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Intron { - size: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Telomere { - teloseq: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Synteny { - synteny_genome_path: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct Busco { - lineages_path: String, - lineage: String, - } - - // - // CSV STRUCT - // - //#[derive(Deserialize)] - //struct Record { - // org: String, - // type: String, - // data_file: String - //} - - pub fn validate_paths(path: &str, field_id: &str) { - match fs::metadata(path) { - Ok(_) => { - println!( - "{}{} \t{}\t{}", - ">-".green(), - &field_id.green(), - "| PATH EXISTS: ".green(), - path.green() - ); - match field_id { - "REFERENCE" => validate_fasta(path), - "GENESET-CSV" => { - _ = validate_csv(path); - } - "HIC" => {} - _ => println!("Error"), - } - } - Err(_) => println!( - "{}{} \t{}\t{}", - "<-".red().bold(), - &field_id.red().bold(), - "| CHECK YAML!:".red().bold(), - path - ), - } - } - - pub fn validate_fasta(path: &str) { - let reader = fasta::reader::Builder.build_from_path(path); - - let mut binding = reader.expect("NO VALID HEADER / SEQUENCE PAIRS"); - let result = binding.records(); - let counter = result.count(); - println!( - "{} {} {}", - ">- REFERENCE H/S PAIRS:".green(), - counter, - "H/S PAIRS".green() - ) - } - - pub fn validate_csv(path: &str) -> Result<(), Error> { - let file = File::open(path)?; - - let mut reader = ReaderBuilder::new() - .has_headers(true) - .delimiter(b',') - .from_reader(file); - - let record = reader.records().count(); - println!( - "{} {} {}", - ">-GENESET-RECORD-COUNT: >".green(), - record, - "<".green() - ); - - Ok(()) - } - - // - // FUNCTION: Check if pacbio has fasta.gz files, cram has cram and crai and synteny has fasta - // could make this much easier and consise by passing in a list of file types to check - // validatedata(path, [fa, fna, fasta]) - // - pub fn validate_data(path: &str, dtype: &str) { - match fs::read_dir(path) { - Err(e) if e.kind() == ErrorKind::NotFound => {} - Err(e) => panic!("{} {e}", "<-DIRECTORY PATH DOESN'T EXIST: ".red().bold()), - Ok(data_files) => { - if dtype == "pacbio" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "fasta.gz", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO PACBIO DATA FILES".red()) - } else { - println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); - } - } else if dtype == "hic" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "cram" || ex == "crai", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO HIC DATA FILES".red()) - } else { - println!("{} {:?}", ">-YOUR FILES ARE:".green(), &files); - } - } else if dtype == "synteny" { - let files: Vec = data_files - .filter_map(|f| f.ok()) - .filter(|d| match d.path().extension() { - None => false, - Some(ex) => ex == "fa" || ex == "fasta" || ex == "fna", - }) - .map(|f| f.path()) - .collect(); - - if files.is_empty() { - println!("{}", "<-NO SYNTENIC GENOMES".red()) - } else { - println!("{} {:?}", ">-YOUR GENOMES ARE:".green(), &files); - } - } - } - }; - } - - pub fn validate_yaml(arguments: std::option::Option<&ArgMatches>) { - let file = arguments.unwrap().get_one::("yaml").unwrap(); - let _output: &String = arguments - .unwrap() - .get_one::("output-directory") - .unwrap(); - let _verbose_flag: &bool = arguments.unwrap().get_one::("verbose").unwrap(); - - println! {"Validating Yaml: {}", file.purple()}; - - let input = fs::File::open(file).expect("Unable to read from file"); - let contents: TreeValYaml = - serde_yaml::from_reader(input).expect("Unable to read from file"); - - println!( - "RUNNING VALIDATE-YAML FOR SAMPLE: {}", - contents.assembly.sample_id.purple() - ); - - validate_paths(&contents.reference_file, "REFERENCE"); - validate_paths(&contents.alignment.data_dir, "GENESET"); - validate_paths(&contents.synteny.synteny_genome_path, "SYNTENY"); - validate_paths(&contents.busco.lineages_path, "BUSCO"); - - validate_paths(&contents.assem_reads.pacbio, "PACBIO"); - validate_data(&contents.assem_reads.pacbio, "pacbio"); - - validate_paths(&contents.assem_reads.hic, "HIC"); - validate_data(&contents.assem_reads.hic, "hic"); - - println!("{}", "CHECKING GENESET DIRECTORY RESOLVES".blue()); - let genesets = contents.alignment.geneset.split(','); - for set in genesets { - let gene_alignment_path = contents.alignment.data_dir.clone() - + &contents.assembly.classT - + "/csv_data/" - + set - + "-data.csv"; - validate_paths(&gene_alignment_path, "GENESET-CSV"); - } - - println!("{}", "CHECKING SYNTENY DIRECTORY RESOLVES".blue()); - let synteny_full = - contents.synteny.synteny_genome_path.clone() + &contents.assembly.classT + "/"; - validate_paths(&synteny_full, "SYNTENY-FASTA"); - validate_data(&synteny_full, "synteny"); - - println!("{}", "CHECKING BUSCO DIRECTORY RESOLVES".blue()); - let busco_path = - contents.busco.lineages_path.clone() + "/lineages/" + &contents.busco.lineage; - validate_paths(&busco_path, "BUSCO-DB"); - // NOW CHECK FOR FILES IN DIRECTORY? - - println!( - "{}\n{}\n{}\n{}\n{}", - "VALIDATION COMPLETE".purple().bold(), - "GENERAL INFORMATION:".purple().bold(), - "Check the log to see what failed".bold(), - "FULL : ONLY synteny fails are permitted".purple(), - "RAPID: geneset, busco and synteny fails are permitted".purple() - ); - } -}