diff --git a/pycistopic-lib/Cargo.lock b/pycistopic-lib/Cargo.lock new file mode 100644 index 0000000..17eeb5f --- /dev/null +++ b/pycistopic-lib/Cargo.lock @@ -0,0 +1,357 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bstr" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "flate2" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indoc" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" + +[[package]] +name = "libc" +version = "0.2.170" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" +dependencies = [ + "adler2", +] + +[[package]] +name = "noodles" +version = "0.93.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74eea88cacba7e16cb6aff1bc3e077f3f9e036e59b9107a8415bf9242d0c2d97" +dependencies = [ + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-tabix", +] + +[[package]] +name = "noodles-bgzf" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c10c4cb29950028823653395ebedfcaf5b652e504aacf816b510fb0268daa2eb" +dependencies = [ + "byteorder", + "bytes", + "crossbeam-channel", + "flate2", +] + +[[package]] +name = "noodles-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "962b13b79312f773a12ffcb0cdaccab6327f8343b6f440a888eff10c749d52b0" +dependencies = [ + "bstr", +] + +[[package]] +name = "noodles-csi" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45b402963653cacd5df474889a1cfb969e2f66469023836ad3b0a2c75e2b4e7c" +dependencies = [ + "bit-vec", + "bstr", + "byteorder", + "indexmap", + "noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-tabix" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46ee069d57770840554e3d8bd02e7415b3a2d972cdea7525bea02472032c234" +dependencies = [ + "byteorder", + "indexmap", + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pycistopic-lib" +version = "0.1.0" +dependencies = [ + "noodles", + "pyo3", +] + +[[package]] +name = "pyo3" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.213" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.213" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" diff --git a/pycistopic-lib/Cargo.toml b/pycistopic-lib/Cargo.toml new file mode 100644 index 0000000..4dda0b7 --- /dev/null +++ b/pycistopic-lib/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "pycistopic-lib" +version = "0.1.0" +edition = "2021" + +[lib] +# The name of the native library. This is the name which will be used in Python to import the +# library (i.e. `import string_sum`). If you change this, you must also change the name of the +# `#[pymodule]` in `src/lib.rs`. +name = "pycistopic_lib" +# "cdylib" is necessary to produce a shared library for Python to import from. +# +# Downstream Rust code (including code in `bin/`, `examples/`, and `tests/`) will not be able +# to `use string_sum;` unless the "rlib" or "lib" crate type is also included, e.g.: +# crate-type = ["cdylib", "rlib"] +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.23.5", features = ["extension-module"] } +noodles = { version = "0.93.0", features = ["tabix", "bgzf", "csi", "core"] } diff --git a/pycistopic-lib/src/custom_errors.rs b/pycistopic-lib/src/custom_errors.rs new file mode 100644 index 0000000..1653eb7 --- /dev/null +++ b/pycistopic-lib/src/custom_errors.rs @@ -0,0 +1,58 @@ +use pyo3::exceptions::{PyIOError, PyValueError}; +use std::fmt; +use pyo3::PyErr; + +#[derive(Debug)] +pub struct InvalidFragmentFileError { + pub filename: String, +} + +impl InvalidFragmentFileError { + pub fn new(filename: &str) -> InvalidFragmentFileError { + InvalidFragmentFileError{ + filename: filename.to_string() + } + } +} + +impl std::error::Error for InvalidFragmentFileError{} + +impl fmt::Display for InvalidFragmentFileError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: Invalid fragment file.", self.filename) + } +} + +impl From for PyErr { + fn from(err: InvalidFragmentFileError) -> PyErr { + PyIOError::new_err(err.to_string()) + } +} + +#[derive(Debug)] +pub struct ValueError { + pub message: String, +} + +impl ValueError { + pub fn new(message: String) -> ValueError { + ValueError { + message + } + } +} + +impl std::error::Error for ValueError{} + +impl fmt::Display for ValueError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.message) + } +} + +impl From for PyErr { + fn from(err: ValueError) -> PyErr { + PyValueError::new_err(err.to_string()) + } +} + diff --git a/pycistopic-lib/src/lib.rs b/pycistopic-lib/src/lib.rs new file mode 100644 index 0000000..ee315e2 --- /dev/null +++ b/pycistopic-lib/src/lib.rs @@ -0,0 +1,14 @@ +mod pseudobulk; +mod custom_errors; + +use pyo3::prelude::*; + + +/// A Python module implemented in Rust. The name of this function must match +/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to +/// import the module. +#[pymodule] +fn pycistopic_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(pseudobulk::split_fragment_files_by_cell_type, m)?)?; + Ok(()) +} diff --git a/pycistopic-lib/src/pseudobulk.rs b/pycistopic-lib/src/pseudobulk.rs new file mode 100644 index 0000000..280793e --- /dev/null +++ b/pycistopic-lib/src/pseudobulk.rs @@ -0,0 +1,294 @@ +use super::custom_errors; +use std::fmt; +use std::fs::File; +use std::path::Path; +use std::collections::BinaryHeap; +use std::collections::HashMap; +use std::cmp::Reverse; +use std::io::{BufRead, Write}; +use pyo3::prelude::*; +use std::thread; +use noodles::{tabix, bgzf}; +use noodles::csi::BinningIndex; +use noodles::core::{region::Interval, Position}; + +#[derive(Eq, PartialEq, Clone)] +struct GenomicRange { + chromosome: String, + start: usize, + end: usize, + cell_barcode: String, + score: Option, + file_index: usize, + file_name: String +} + +impl Ord for GenomicRange { + fn cmp(&self, other: &GenomicRange) -> std::cmp::Ordering { + self.start.cmp(&other.start) + .then(self.end.cmp(&other.end)) + } +} + +impl PartialOrd for GenomicRange { + fn partial_cmp(&self, other: &GenomicRange) -> Option { + Some(self.cmp(other)) + } +} + +impl GenomicRange { + fn new(line: &str, file_index: usize, filename: &str) -> Result { + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() < 4 { + return Err(custom_errors::InvalidFragmentFileError::new(filename)); + } + Ok(GenomicRange{ + chromosome: fields[0].to_string(), + start: fields[1].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?, + end: fields[2].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))?, + cell_barcode: fields[3].to_string(), + score: if fields.len() > 4 { + Some(fields[4].parse::() + .map_err(|_| custom_errors::InvalidFragmentFileError::new(filename))? + ) + } else {None}, + file_index, + file_name: filename.to_string() + }) + } +} + +impl fmt::Display for GenomicRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.score { + Some(score) => write!( + f, + "{}\t{}\t{}\t{}\t{}", + self.chromosome, self.start, self.end, self.cell_barcode, score + ), + None => write!( + f, + "{}\t{}\t{}\t{}", + self.chromosome, self.start, self.end, self.cell_barcode + ), + } + } +} + + +struct FragmentFileReader { + reader: bgzf::Reader, + index: Option, + fragment: Result, + buffer: String, + valid_cell_barcodes: Vec, + target_chrom: String, + file_index: usize, + file_name: String, + at_end_of_file: bool, +} + +impl FragmentFileReader { + fn new(fragment_file_path: &str, valid_cell_barcodes: Vec, target_chrom: String, file_index: usize) -> Result { + let reader = bgzf::Reader::new(File::open(Path::new(fragment_file_path))?); + let mut index: Option = None; + if Path::new(&format!("{}.tbi", fragment_file_path)).exists() { + println!("Index found: {}", format!("{}.tbi", fragment_file_path)); + index = Some(tabix::fs::read(format!("{}.tbi", fragment_file_path))?); + } + Ok(FragmentFileReader{ + reader, + index, + fragment: GenomicRange::new("", file_index, fragment_file_path), + buffer: String::new(), + valid_cell_barcodes, + target_chrom, + file_index, + file_name: fragment_file_path.to_string(), + at_end_of_file: false + }) + } + + fn at_chrom(&self) -> bool { + if let Ok(fragment) = &self.fragment{ + fragment.chromosome == self.target_chrom + } else { + false + } + } + + fn read_next(&mut self) -> Result<(), custom_errors::InvalidFragmentFileError>{ + self.buffer.clear(); + match self.reader.read_line(&mut self.buffer) { + Ok(bytes_read) => { + if bytes_read == 0 { + self.at_end_of_file = true + } + }, + Err(_) => { + return Err(custom_errors::InvalidFragmentFileError::new(&self.file_name)) + } + } + self.buffer = self.buffer.trim().to_string(); + self.fragment = GenomicRange::new(&self.buffer, self.file_index, &self.file_name); + Ok(()) + } + + fn skip_header(&mut self, pattern: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + self.buffer = pattern.to_string(); + while self.buffer.starts_with(pattern) && !self.at_end_of_file { + self.read_next()?; + } + Ok(()) + } + + fn skip_to_chromosome(&mut self, chromosome: &str) -> Result<(), custom_errors::InvalidFragmentFileError> { + let mut using_index = false; + if let Some(index) = &self.index { + if let Some(header) = index.header() { + let seq_names = header.reference_sequence_names(); + if let Some(chromosome_index) = seq_names.get_index_of(chromosome.as_bytes()) { + if let Ok(q) = index.query( + chromosome_index, + Interval::from(Position::MIN..) + ) { + let first_chunk = q[0]; + self.reader.seek(first_chunk.start()).map_err(|_| custom_errors::InvalidFragmentFileError::new(&self.file_name))?; + self.read_next()?; + using_index = true; + } + } + } + } + if !using_index { + // If no index, manually skip header lines. + // otherwise this is done automatically + self.skip_header("#")?; + } + while !self.at_end_of_file { + if let Ok(fragment) = &self.fragment { + if fragment.chromosome == chromosome { + return Ok(()); + } + }; + self.read_next()?; + } + Ok(()) + } + + fn get_next_valid_fragment(&mut self) -> Result, custom_errors::InvalidFragmentFileError> { + while !self.at_end_of_file { + // first check current fragment + // it could be that after skip_header and skip_to_chromosome we are already at a + // valid fragment. + if let Ok(fragment) = &self.fragment { + if self.valid_cell_barcodes.contains(&fragment.cell_barcode) { + let fragment = fragment.clone(); + if fragment.chromosome != self.target_chrom { + return Ok(None) + } + // read next fragment so next time we enter this function we have a new frag,et + self.read_next()?; + return Ok(Some(fragment)); + } + } + self.read_next()?; + } + Ok(None) + } +} + +fn split_fragments_by_cell_barcodes_for_chromosome( + fragment_file_paths: &[&str], + fragment_file_to_cell_barcode: &HashMap>, + chromosome: &str, + gz_output_file: &mut bgzf::Writer +) -> PyResult<()>{ + + // Open fragment files which are gzipped, and pos-sorted. + let mut readers: Vec = Vec::new(); + for (file_index, fragment_file_path) in fragment_file_paths.iter().enumerate() { + if let Some(cell_barcodes) = fragment_file_to_cell_barcode.get(&fragment_file_path.to_string()) { + readers.push( + FragmentFileReader::new( + fragment_file_path, + cell_barcodes.to_vec(), + chromosome.to_string(), + file_index)? + ); + } + } + + // A binary heap will be used to write fragments in order from different files + let mut heap = BinaryHeap::new(); + + // go to correct chromosome and push first valid fragment to binary heap + for reader in readers.iter_mut(){ + reader.skip_to_chromosome(chromosome)?; + if !reader.at_end_of_file && reader.at_chrom() { + if let Some(fragment) = reader.get_next_valid_fragment()? { + heap.push(Reverse(fragment)); + } + } + } + + while let Some(Reverse(fragment)) = heap.pop() { + gz_output_file.write_all(format!("{}\n", fragment).as_bytes())?; + // read from file that currently has the smallest genomic range + let reader = &mut readers[fragment.file_index]; + if !reader.at_end_of_file && reader.at_chrom() { + if let Some(fragment) = reader.get_next_valid_fragment()? { + heap.push(Reverse(fragment)); + } + } + } + Ok(()) +} + +#[pyfunction] +pub fn split_fragment_files_by_cell_type( + fragment_file_paths: Vec, + output_file_prefix: &str, + cell_type_to_fragment_file_to_cell_barcode: HashMap>>, + chromosomes: Vec +) -> PyResult<()> { + for cell_type in cell_type_to_fragment_file_to_cell_barcode.keys() { + let mut handles: Vec> = Vec::new(); + for chromosome in &chromosomes { + let output_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome); + let fragment_file_paths = fragment_file_paths.clone(); // Need to clone since threads take ownership + let fragment_file_to_cell_barcode = cell_type_to_fragment_file_to_cell_barcode + .get(cell_type) + .unwrap() + .clone(); + let file = File::create(output_file_name)?; + let chromosome = chromosome.clone(); + let handle = thread::spawn(move || { + let mut gz_output_file = bgzf::Writer::new(file); + split_fragments_by_cell_barcodes_for_chromosome( + &fragment_file_paths.iter().map(|p| p.as_str()).collect::>(), + &fragment_file_to_cell_barcode, + &chromosome, + &mut gz_output_file + ) + }); + handles.push(handle); + } + for handle in handles { + handle.join().expect("Thread panicked")?; + } + // concat all chromosomes + let output_file_name = format!("{}_{}.tsv.gz", output_file_prefix, cell_type); + let output_file = File::create(&output_file_name)?; + let mut writer = std::io::BufWriter::new(output_file); + for chromosome in &chromosomes { + let input_file_name = format!("{}_{}.{}.tsv.gz", output_file_prefix, cell_type, chromosome); + let mut input_file = File::open(&input_file_name)?; + std::io::copy(&mut input_file, &mut writer)?; + } + writer.flush()?; + } + Ok(()) +} diff --git a/pycistopic-lib/test/bcs_nextgem.txt b/pycistopic-lib/test/bcs_nextgem.txt new file mode 100644 index 0000000..2441cc2 --- /dev/null +++ b/pycistopic-lib/test/bcs_nextgem.txt @@ -0,0 +1,10 @@ +CCACGTTCAGTAACTC-1 +TTCGATTCATCTCTCG-1 +TTGCCCACAAGGTTCT-1 +TTTGGCCAGTACAGAT-1 +CTAACTTAGAAATGGG-1 +TGAGTCACACCACGAC-1 +CGAGTTATCAAGTTGC-1 +TGCATTTGTAACATAG-1 +TGCATTTGTAACATAG-1 +GAAGTCTTCTACTTTG-1 diff --git a/pycistopic-lib/test/bcs_v1.txt b/pycistopic-lib/test/bcs_v1.txt new file mode 100644 index 0000000..cc57001 --- /dev/null +++ b/pycistopic-lib/test/bcs_v1.txt @@ -0,0 +1,10 @@ +GGTGTCGTCAAGGCAG-1 +CTCAACCTCAGGTTTG-1 +GGTGTCGTCAAGGCAG-1 +AGACAAACACATCATG-1 +CATGCCTGTCCTTATT-1 +GGTGTCGTCAAGGCAG-1 +GCATTCCGTTAGAGAT-1 +CAGCTAATCTGATCCC-1 +GAGCATTAGCTTACCA-1 +GCATTCCGTTAGAGAT-1 diff --git a/pycistopic-lib/test/pseudobulk_test.py b/pycistopic-lib/test/pseudobulk_test.py new file mode 100644 index 0000000..19d4353 --- /dev/null +++ b/pycistopic-lib/test/pseudobulk_test.py @@ -0,0 +1,154 @@ +import pytest +import urllib.request +import urllib +import gzip +import os +import pathlib +import shutil + +from pycistopic_lib import split_fragment_files_by_cell_type + +TEST_DIRECTORY = pathlib.Path(__file__).parent.absolute() + +FRAGMENT_FILEs = [ + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz"), + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz") +] + +INDEX_FILEs= [ + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_nextgem_fragments.tsv.gz.tbi"), + TEST_DIRECTORY / pathlib.Path("atac_pbmc_500_v1_fragments.tsv.gz.tbi") +] + +@pytest.fixture +def fragment_file(tmp_path): + for f in FRAGMENT_FILEs: + shutil.copy(f, tmp_path) + return [tmp_path / f.name for f in FRAGMENT_FILEs] + +@pytest.fixture +def fragment_file_and_index(tmp_path): + for f in FRAGMENT_FILEs: + shutil.copy(f, tmp_path) + for f in INDEX_FILEs: + shutil.copy(f, tmp_path) + return [tmp_path / f.name for f in FRAGMENT_FILEs] + +def count_number_of_fragments_per_chrom_for_cbs(file_name: str, cbs: list[str]): + n_fragment_per_cb_per_chrom = { + cb: {} for cb in cbs + } + with gzip.open(file_name) as f: + for fragment in f: + chrom, _, _, cb = fragment.decode().split()[0:4] + if cb in cbs: + if chrom not in n_fragment_per_cb_per_chrom[cb]: + n_fragment_per_cb_per_chrom[cb][chrom] = 0 + n_fragment_per_cb_per_chrom[cb][chrom] += 1 + return n_fragment_per_cb_per_chrom + +def fragment_file_correct_order(file_name: str): + chrom_to_start = {} + with gzip.open(file_name) as f: + for l in f: + l = l.decode() + if l.startswith("#"): + continue + chrom, start = l.split()[0:2] + if chrom not in chrom_to_start: + chrom_to_start[chrom] = [] + chrom_to_start[chrom].append(int(start)) + def always_inc(l): + x = l[0] + for y in l[1:]: + if y < x: + return False + return True + return all([always_inc(chrom_to_start[chrom]) for chrom in chrom_to_start]) + +def test_pseudobulk_indexed(fragment_file_and_index, tmp_path): + fragment_files = fragment_file_and_index + bcs_1 = [] + bcs_2 = [] + with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f: + for l in f: + bcs_1.append(l.strip()) + + with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f: + for l in f: + bcs_2.append(l.strip()) + sample_to_cb = { + str(fragment_files[0]): bcs_1, + str(fragment_files[1]): bcs_2, + } + split_fragment_files_by_cell_type( + fragment_file_paths=[str(f) for f in fragment_files], + output_file_prefix=str(tmp_path) + "/", + cell_type_to_fragment_file_to_cell_barcode={ + "cell_type": sample_to_cb + }, + chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"] + ) + assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz")) + n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs( + tmp_path / "_cell_type.tsv.gz", + [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]] + ) + n_fragment_per_cb_per_chrom_inf = {} + for sample in fragment_files: + n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs( + sample, sample_to_cb[str(sample)]) + for cb in n_fragment_per_cb_per_chrom_sample: + if cb not in n_fragment_per_cb_per_chrom_inf: + n_fragment_per_cb_per_chrom_inf[cb] = {} + for chrom in n_fragment_per_cb_per_chrom_sample[cb]: + if chrom not in n_fragment_per_cb_per_chrom_inf[cb]: + n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0 + n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom] + for cb in n_fragment_per_cb_per_chrom_inf: + for chrom in n_fragment_per_cb_per_chrom_inf[cb]: + assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom]) + + +def test_pseudobulk_not_indexed(fragment_file, tmp_path): + fragment_files = fragment_file + bcs_1 = [] + bcs_2 = [] + with open(TEST_DIRECTORY / pathlib.Path("bcs_nextgem.txt")) as f: + for l in f: + bcs_1.append(l.strip()) + + with open(TEST_DIRECTORY / pathlib.Path("bcs_v1.txt")) as f: + for l in f: + bcs_2.append(l.strip()) + sample_to_cb = { + str(fragment_files[0]): bcs_1, + str(fragment_files[1]): bcs_2, + } + split_fragment_files_by_cell_type( + fragment_file_paths=[str(f) for f in fragment_files], + output_file_prefix=str(tmp_path) + "/", + cell_type_to_fragment_file_to_cell_barcode={ + "cell_type": sample_to_cb + }, + chromosomes=[*[f"chr{x + 1}" for x in range(22)], "chrY", "chrX"] + ) + assert(fragment_file_correct_order(tmp_path / "_cell_type.tsv.gz")) + n_fragment_per_cb_per_chrom_outf = count_number_of_fragments_per_chrom_for_cbs( + tmp_path / "_cell_type.tsv.gz", + [*list(sample_to_cb.values())[0], *list(sample_to_cb.values())[1]] + ) + n_fragment_per_cb_per_chrom_inf = {} + for sample in fragment_files: + n_fragment_per_cb_per_chrom_sample = count_number_of_fragments_per_chrom_for_cbs( + sample, sample_to_cb[str(sample)]) + for cb in n_fragment_per_cb_per_chrom_sample: + if cb not in n_fragment_per_cb_per_chrom_inf: + n_fragment_per_cb_per_chrom_inf[cb] = {} + for chrom in n_fragment_per_cb_per_chrom_sample[cb]: + if chrom not in n_fragment_per_cb_per_chrom_inf[cb]: + n_fragment_per_cb_per_chrom_inf[cb][chrom] = 0 + n_fragment_per_cb_per_chrom_inf[cb][chrom] += n_fragment_per_cb_per_chrom_sample[cb][chrom] + for cb in n_fragment_per_cb_per_chrom_inf: + for chrom in n_fragment_per_cb_per_chrom_inf[cb]: + assert(n_fragment_per_cb_per_chrom_inf[cb][chrom] == n_fragment_per_cb_per_chrom_outf[cb][chrom]) \ No newline at end of file