diff --git a/modules/nf-core/ribodetector/environment.yml b/modules/nf-core/ribodetector/environment.yml index 1eaa4e825550..3f9b0530fa3f 100644 --- a/modules/nf-core/ribodetector/environment.yml +++ b/modules/nf-core/ribodetector/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - "bioconda::ribodetector=0.3.1" + - "bioconda::ribodetector=0.3.2" diff --git a/modules/nf-core/ribodetector/main.nf b/modules/nf-core/ribodetector/main.nf index 7da61ba9fa04..6c1921ad3ebf 100644 --- a/modules/nf-core/ribodetector/main.nf +++ b/modules/nf-core/ribodetector/main.nf @@ -4,8 +4,8 @@ process RIBODETECTOR { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ribodetector:0.3.1--pyhdfd78af_0': - 'biocontainers/ribodetector:0.3.1--pyhdfd78af_0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/4d/4de8fe74d21198e6fc8218cb3209d929b3d7dab750678501b096b0ccc324307b/data' : + 'community.wave.seqera.io/library/ribodetector:0.3.2--cbe1c77fa14eeb53' }" input: tuple val(meta), path(fastq) @@ -14,7 +14,7 @@ process RIBODETECTOR { output: tuple val(meta), path("*.nonrna*.fastq.gz"), emit: fastq tuple val(meta), path("*.log") , emit: log - path "versions.yml" , emit: versions + tuple val("${task.process}"), val('ribodetector'), eval('ribodetector --version | sed "s/ribodetector //"'), emit: versions_ribodetector, topic: versions when: task.ext.when == null || task.ext.when @@ -35,11 +35,6 @@ process RIBODETECTOR { --log ${prefix}.log \\ ${ribodetector_mem} \\ ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ribodetector: \$(ribodetector --version | sed 's/ribodetector //g') - END_VERSIONS """ stub: @@ -50,12 +45,7 @@ process RIBODETECTOR { echo $args echo | gzip > ${prefix}.nonrna.1.fastq.gz - echo | gzip > ${prefix}.nonrna.2.fastq.gz + echo | gzip > ${prefix}.nonrna.2.fastq.gz 
touch ${prefix}.log - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ribodetector: \$(ribodetector --version | sed 's/ribodetector //g') - END_VERSIONS """ } diff --git a/modules/nf-core/ribodetector/meta.yml b/modules/nf-core/ribodetector/meta.yml index c16f25462abf..6599d45c8c5e 100644 --- a/modules/nf-core/ribodetector/meta.yml +++ b/modules/nf-core/ribodetector/meta.yml @@ -1,7 +1,6 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "ribodetector" -description: Accurate and rapid RiboRNA sequences Detector based on deep - learning +description: Accurate and rapid RiboRNA sequences Detector based on deep learning keywords: - RNA - RNAseq @@ -16,10 +15,10 @@ keywords: tools: - ribodetector: description: Accurate and rapid RiboRNA sequences detector based on deep learning. - RiboDetector uses a deep learning approach to identify rRNA sequences in - ribosome profiling (Ribo-seq) data. It can be used to filter out rRNA reads - from Ribo-seq datasets, improving the quality of downstream analyses. As of version - 0.3.1, Ribodetector doesn't support setting a random seed, so results may not be fully + RiboDetector uses a deep learning approach to identify rRNA sequences in ribosome + profiling (Ribo-seq) data. It can be used to filter out rRNA reads from Ribo-seq + datasets, improving the quality of downstream analyses. As of version 0.3.1, + Ribodetector doesn't support setting a random seed, so results may not be fully deterministic across runs. 
homepage: "https://github.com/hzi-bifo/RiboDetector" documentation: "https://github.com/hzi-bifo/RiboDetector" @@ -67,13 +66,27 @@ output: description: Log file from RiboDetector pattern: "*.log" ontologies: [] + versions_ribodetector: + - - ${task.process}: + type: string + description: Name of the process + - ribodetector: + type: string + description: Name of the tool + - ribodetector --version | sed "s/ribodetector //": + type: string + description: Version of ribodetector used +topics: versions: - - versions.yml: - type: file - description: File containing software versions - pattern: versions.yml - ontologies: - - edam: http://edamontology.org/format_3750 # YAML + - - ${task.process}: + type: string + description: Name of the process + - ribodetector: + type: string + description: Name of the tool + - ribodetector --version | sed "s/ribodetector //": + type: string + description: Version of ribodetector used authors: - "@maxibor" maintainers: diff --git a/modules/nf-core/ribodetector/tests/main.nf.test b/modules/nf-core/ribodetector/tests/main.nf.test index 24668437ee19..afe568085d08 100644 --- a/modules/nf-core/ribodetector/tests/main.nf.test +++ b/modules/nf-core/ribodetector/tests/main.nf.test @@ -29,8 +29,8 @@ nextflow_process { { assert process.success }, { assert process.out.fastq }, { assert process.out.log }, - { assert path(process.out.log[0][1]).getText().contains("Writing output non-rRNA sequences") }, - { assert snapshot(process.out.versions).match() } + { assert path(process.out.log[0][1]).getText().contains("Writing output non-rRNA sequences") } + // Note: versions collected via topic, not snapshotted ) } diff --git a/modules/nf-core/ribodetector/tests/main.nf.test.snap b/modules/nf-core/ribodetector/tests/main.nf.test.snap index df54066bafb5..5a7166c91c62 100644 --- a/modules/nf-core/ribodetector/tests/main.nf.test.snap +++ b/modules/nf-core/ribodetector/tests/main.nf.test.snap @@ -1,16 +1,4 @@ { - "ribodetector - rnaseq PE input": { - "content": 
[ - [ - "versions.yml:md5,f98df8f0eaa704e4db74785adc9cc791" - ] - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" - }, - "timestamp": "2025-11-07T13:20:15.909875" - }, "ribodetector - stub rnaseq PE input": { "content": [ { @@ -36,7 +24,11 @@ ] ], "2": [ - "versions.yml:md5,f98df8f0eaa704e4db74785adc9cc791" + [ + "RIBODETECTOR", + "ribodetector", + "0.3.2" + ] ], "fastq": [ [ @@ -59,8 +51,12 @@ "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "versions": [ - "versions.yml:md5,f98df8f0eaa704e4db74785adc9cc791" + "versions_ribodetector": [ + [ + "RIBODETECTOR", + "ribodetector", + "0.3.2" + ] ] } ], @@ -68,6 +64,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.0" }, - "timestamp": "2025-11-07T13:20:26.026547" + "timestamp": "2025-11-29T20:07:13.509994907" } } \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf index 29e25b72f30a..db839531b7cc 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf @@ -1,13 +1,10 @@ include { BBMAP_BBSPLIT } from '../../../modules/nf-core/bbmap/bbsplit' include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' -include { RIBODETECTOR } from '../../../modules/nf-core/ribodetector/main' -include { SEQKIT_STATS } from '../../../modules/nf-core/seqkit/stats/main' -include { SORTMERNA } from '../../../modules/nf-core/sortmerna/main' -include { SORTMERNA as SORTMERNA_INDEX } from '../../../modules/nf-core/sortmerna/main' include { FQ_LINT } from '../../../modules/nf-core/fq/lint/main' include { FQ_LINT as FQ_LINT_AFTER_TRIMMING } from '../../../modules/nf-core/fq/lint/main' include { FQ_LINT as FQ_LINT_AFTER_BBSPLIT } from '../../../modules/nf-core/fq/lint/main' include { FQ_LINT as FQ_LINT_AFTER_RIBO_REMOVAL } from '../../../modules/nf-core/fq/lint/main' +include { FASTQ_REMOVE_RRNA } from 
'../fastq_remove_rrna' include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../fastq_subsample_fq_salmon' include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../fastq_fastqc_umitools_trimgalore' include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../fastq_fastqc_umitools_fastp' @@ -84,29 +81,6 @@ def multiqcTsvFromList(tsv_data, header) { return tsv_string } -// -// Function that parses seqkit stats TSV output to extract the mean read length -// for use with RiboDetector's -l parameter -// -def getReadLengthFromSeqkitStats(stats_file) { - def lines = stats_file.text.readLines() - if (lines.size() < 2) { - return 100 // Default fallback - } - - def header = lines[0].split('\t') - def avgLenIdx = header.findIndexOf { it == 'avg_len' } - if (avgLenIdx < 0) { - return 100 // Default fallback if column not found - } - - // Calculate mean avg_len across all files in the stats output - def avgLens = lines[1..-1].collect { it.split('\t')[avgLenIdx] as float } - def meanAvgLen = avgLens.sum() / avgLens.size() - - return Math.round(meanAvgLen) as int -} - workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { take: // Input channels @@ -116,8 +90,9 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { ch_gtf // channel: /path/to/genome.gtf ch_salmon_index // channel: /path/to/salmon/index/ (optional) ch_sortmerna_index // channel: /path/to/sortmerna/index/ (optional) + ch_bowtie2_index // channel: /path/to/bowtie2/index/ (optional) ch_bbsplit_index // channel: /path/to/bbsplit/index/ (optional) - ch_rrna_fastas // channel: one or more fasta files containing rrna sequences to be passed to SortMeRNA (optional) + ch_rrna_fastas // channel: one or more fasta files containing rrna sequences to be passed to SortMeRNA/Bowtie2 (optional) // Skip options skip_bbsplit // boolean: Skip BBSplit for removal of non-reference genome reads. 
@@ -129,6 +104,7 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { // Index generation make_salmon_index // boolean: Whether to create salmon index before running salmon quant make_sortmerna_index // boolean: Whether to create a sortmerna index before running sortmerna + make_bowtie2_index // boolean: Whether to create a bowtie2 index before running bowtie2 // Trimming options trimmer // string (enum): 'fastp' or 'trimgalore' @@ -138,7 +114,7 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { // rRNA removal options remove_ribo_rna // boolean: true/false: whether to remove rRNA - ribo_removal_tool // string (enum): 'sortmerna' or 'ribodetector' + ribo_removal_tool // string (enum): 'sortmerna', 'ribodetector', or 'bowtie2' // UMI options with_umi // boolean: true/false: Enable UMI-based read deduplication. @@ -294,64 +270,22 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { } // - // MODULE: Remove ribosomal RNA reads + // SUBWORKFLOW: Remove ribosomal RNA reads // if (remove_ribo_rna) { - if (ribo_removal_tool == 'sortmerna') { - ch_sortmerna_fastas = ch_rrna_fastas - .collect() - .map { [[id: 'rrna_refs'], it] } - - if (make_sortmerna_index) { - SORTMERNA_INDEX( - [[], []], - ch_sortmerna_fastas, - [[], []], - ) - ch_sortmerna_index = SORTMERNA_INDEX.out.index.first() - } - - SORTMERNA( - ch_filtered_reads, - ch_sortmerna_fastas, - ch_sortmerna_index, - ) - - SORTMERNA.out.reads.set { ch_filtered_reads } - - ch_multiqc_files = ch_multiqc_files.mix(SORTMERNA.out.log) - - ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) - } - else if (ribo_removal_tool == 'ribodetector') { - // Run seqkit stats to determine average read length - SEQKIT_STATS( - ch_filtered_reads - ) - - ch_versions = ch_versions.mix(SEQKIT_STATS.out.versions.first()) - - // Join stats with reads and calculate read length for RiboDetector - ch_filtered_reads - .join(SEQKIT_STATS.out.stats) - .multiMap { meta, reads, stats -> - def readLength = getReadLengthFromSeqkitStats(stats) - 
reads: [meta, reads] - length: readLength - } - .set { ch_reads_with_length } - - RIBODETECTOR( - ch_reads_with_length.reads, - ch_reads_with_length.length, - ) - - RIBODETECTOR.out.fastq.set { ch_filtered_reads } - - ch_multiqc_files = ch_multiqc_files.mix(RIBODETECTOR.out.log) + FASTQ_REMOVE_RRNA( + ch_filtered_reads, + ch_rrna_fastas, + ch_sortmerna_index, + ch_bowtie2_index, + ribo_removal_tool, + make_sortmerna_index, + make_bowtie2_index, + ) - ch_versions = ch_versions.mix(RIBODETECTOR.out.versions.first()) - } + ch_filtered_reads = FASTQ_REMOVE_RRNA.out.reads + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_REMOVE_RRNA.out.multiqc_files) + ch_versions = ch_versions.mix(FASTQ_REMOVE_RRNA.out.versions) if (!skip_linting) { FQ_LINT_AFTER_RIBO_REMOVAL( diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml index e2e54be04913..644a567a5c1d 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml @@ -9,14 +9,9 @@ keywords: - strandedness components: - bbmap/bbsplit - - samtools/sort - - samtools/index - - cat - cat/fastq - fq/lint - - ribodetector - - seqkit/stats - - sortmerna + - fastq_remove_rrna - fastq_subsample_fq_salmon - fastq_fastqc_umitools_trimgalore - fastq_fastqc_umitools_fastp @@ -79,6 +74,15 @@ input: - index: type: directory description: SortMeRNA index directory + - ch_bowtie2_index: + description: Directory containing bowtie2 index for rRNA removal + structure: + - meta: + type: map + description: Metadata for the Bowtie2 index + - index: + type: directory + description: Bowtie2 index directory - ch_bbsplit_index: description: Path to directory or tar.gz archive for pre-built BBSplit index structure: @@ -90,7 +94,7 @@ input: description: BBSplit index directory or tar.gz archive pattern: "{*,*.tar.gz}" - ch_rrna_fastas: - description: Channel containing one or 
more FASTA files containing rRNA sequences for use with SortMeRNA + description: Channel containing one or more FASTA files containing rRNA sequences for use with SortMeRNA or Bowtie2 structure: - meta: type: map @@ -120,6 +124,9 @@ input: - make_sortmerna_index: type: boolean description: Whether to create sortmerna index before running sortmerna + - make_bowtie2_index: + type: boolean + description: Whether to create bowtie2 index before running bowtie2 for rRNA removal - trimmer: type: string description: Specifies the trimming tool to use @@ -140,7 +147,7 @@ input: - ribo_removal_tool: type: string description: Specifies the rRNA removal tool to use - enum: ["sortmerna", "ribodetector"] + enum: ["sortmerna", "ribodetector", "bowtie2"] - with_umi: type: boolean description: Enable UMI-based read deduplication diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test index 73e71bc14f0d..3196ed0f18c3 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test @@ -11,13 +11,11 @@ nextflow_workflow { tag "subworkflows/fastq_qc_trim_filter_setstrandedness" tag "bbmap/bbsplit" - tag "cat" tag "cat/fastq" tag "fastqc" tag "fq/lint" - tag "sortmerna" - tag "ribodetector" - tag "seqkit/stats" + + tag "subworkflows/fastq_remove_rrna" tag "subworkflows/fastq_fastqc_umitools_trimgalore" tag "subworkflows/fastq_fastqc_umitools_fastp" tag "subworkflows/fastq_subsample_fq_salmon" @@ -58,37 +56,39 @@ nextflow_workflow { input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf input[4] = [] // ch_salmon_index input[5] = [] // ch_sortmerna_index - input[6] = [] // ch_bbsplit_index - input[7] = 
Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[6] = [] // ch_bowtie2_index + input[7] = [] // ch_bbsplit_index + input[8] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas // Skip options - input[8] = true // skip_bbsplit - input[9] = false // skip_fastqc - input[10] = false // skip_trimming - input[11] = true // skip_umi_extract - input[12] = false // skip_linting + input[9] = true // skip_bbsplit + input[10] = false // skip_fastqc + input[11] = false // skip_trimming + input[12] = true // skip_umi_extract + input[13] = false // skip_linting // Index generation - input[13] = true // make_salmon_index - input[14] = true // make_sortmerna_index + input[14] = true // make_salmon_index + input[15] = true // make_sortmerna_index + input[16] = false // make_bowtie2_index // Trimming options - input[15] = 'fastp' // trimmer - input[16] = 10 // min_trimmed_reads - input[17] = true // save_trimmed - input[18] = true // fastp_merge + input[17] = 'fastp' // trimmer + input[18] = 10 // min_trimmed_reads + input[19] = true // save_trimmed + input[20] = true // fastp_merge // rRNA removal options - input[19] = true // remove_ribo_rna - input[20] = 'sortmerna' // ribo_removal_tool + input[21] = true // remove_ribo_rna + input[22] = 'sortmerna' // ribo_removal_tool // UMI options - input[21] = false // with_umi - input[22] = 0 // umi_discard_read + input[23] = false // with_umi + input[24] = 0 // umi_discard_read // Strandedness thresholds - input[23] = 0.8 // stranded_threshold - input[24] = 0.1 // unstranded_threshold + input[25] = 0.8 // stranded_threshold + input[26] = 0.1 // unstranded_threshold """ } } @@ -140,37 +140,39 @@ nextflow_workflow { input[3] = Channel.of(file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf input[4] = [] // ch_salmon_index input[5] = [] // ch_sortmerna_index - input[6] = [] // ch_bbsplit_index - input[7] = [] // ch_rrna_fastas (not needed for ribodetector) + input[6] = [] // ch_bowtie2_index + input[7] = [] // ch_bbsplit_index + input[8] = [] // ch_rrna_fastas (not needed for ribodetector) // Skip options - input[8] = true // skip_bbsplit - input[9] = false // skip_fastqc - input[10] = false // skip_trimming - input[11] = true // skip_umi_extract - input[12] = false // skip_linting + input[9] = true // skip_bbsplit + input[10] = false // skip_fastqc + input[11] = false // skip_trimming + input[12] = true // skip_umi_extract + input[13] = false // skip_linting // Index generation - input[13] = true // make_salmon_index - input[14] = false // make_sortmerna_index (not needed for ribodetector) + input[14] = true // make_salmon_index + input[15] = false // make_sortmerna_index (not needed for ribodetector) + input[16] = false // make_bowtie2_index // Trimming options - input[15] = 'fastp' // trimmer - input[16] = 10 // min_trimmed_reads - input[17] = true // save_trimmed - input[18] = true // fastp_merge + input[17] = 'fastp' // trimmer + input[18] = 10 // min_trimmed_reads + input[19] = true // save_trimmed + input[20] = true // fastp_merge // rRNA removal options - input[19] = true // remove_ribo_rna - input[20] = 'ribodetector' // ribo_removal_tool + input[21] = true // remove_ribo_rna + input[22] = 'ribodetector' // ribo_removal_tool // UMI options - input[21] = false // with_umi - input[22] = 0 // umi_discard_read + input[23] = false // with_umi + input[24] = 0 // umi_discard_read // Strandedness thresholds - input[23] = 0.8 // stranded_threshold - input[24] = 0.1 // unstranded_threshold + input[25] = 0.8 // stranded_threshold + input[26] = 0.1 // unstranded_threshold """ } } @@ -224,37 +226,39 @@ nextflow_workflow { input[3] = Channel.of(file(params.modules_testdata_base_path 
+ 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf input[4] = [] // ch_salmon_index input[5] = [] // ch_sortmerna_index - input[6] = [] // ch_bbsplit_index - input[7] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[6] = [] // ch_bowtie2_index + input[7] = [] // ch_bbsplit_index + input[8] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas // Skip options - input[8] = true // skip_bbsplit - input[9] = false // skip_fastqc - input[10] = false // skip_trimming - input[11] = true // skip_umi_extract - input[12] = false // skip_linting + input[9] = true // skip_bbsplit + input[10] = false // skip_fastqc + input[11] = false // skip_trimming + input[12] = true // skip_umi_extract + input[13] = false // skip_linting // Index generation - input[13] = true // make_salmon_index - input[14] = true // make_sortmerna_index + input[14] = true // make_salmon_index + input[15] = true // make_sortmerna_index + input[16] = false // make_bowtie2_index // Trimming options - input[15] = 'trimgalore' // trimmer - input[16] = 10 // min_trimmed_reads - input[17] = true // save_trimmed - input[18] = true // fastp_merge + input[17] = 'trimgalore' // trimmer + input[18] = 10 // min_trimmed_reads + input[19] = true // save_trimmed + input[20] = true // fastp_merge // rRNA removal options - input[19] = true // remove_ribo_rna - input[20] = 'sortmerna' // ribo_removal_tool + input[21] = true // remove_ribo_rna + input[22] = 'sortmerna' // ribo_removal_tool // UMI options - input[21] = false // with_umi - input[22] = 0 // umi_discard_read + input[23] = false // with_umi + input[24] = 0 // umi_discard_read // Strandedness thresholds - input[23] = 0.8 // stranded_threshold - input[24] = 0.1 // unstranded_threshold + input[25] = 
0.8 // stranded_threshold + input[26] = 0.1 // unstranded_threshold """ } } @@ -294,4 +298,198 @@ nextflow_workflow { } } + test("homo_sapiens paired-end [fastq] fastp bowtie2") { + + when { + workflow { + """ + // Input channels + input[0] = CAT_FASTQ.out.reads + input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // ch_fasta + input[2] = Channel.of(file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/transcriptome.fasta", checkIfExists: true)) // ch_transcript_fasta + input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf + input[4] = [] // ch_salmon_index + input[5] = [] // ch_sortmerna_index + input[6] = [] // ch_bowtie2_index + input[7] = [] // ch_bbsplit_index + input[8] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + + // Skip options + input[9] = true // skip_bbsplit + input[10] = false // skip_fastqc + input[11] = false // skip_trimming + input[12] = true // skip_umi_extract + input[13] = false // skip_linting + + // Index generation + input[14] = true // make_salmon_index + input[15] = false // make_sortmerna_index (not needed for bowtie2) + input[16] = true // make_bowtie2_index + + // Trimming options + input[17] = 'fastp' // trimmer + input[18] = 10 // min_trimmed_reads + input[19] = true // save_trimmed + input[20] = true // fastp_merge + + // rRNA removal options + input[21] = true // remove_ribo_rna + input[22] = 'bowtie2' // ribo_removal_tool + + // UMI options + input[23] = false // with_umi + input[24] = 0 // umi_discard_read + + // Strandedness thresholds + input[25] = 0.8 // stranded_threshold + input[26] = 0.1 // unstranded_threshold + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = 
path(workflow.out.reads[0][1][1]).linesGzip + + // First part of each fq lint report line is a timestamp, remove it before snapshotting + def processed_ribo_removal_lint_report = path(workflow.out.lint_log.find { entry -> entry[1].contains('fq_lint_after_ribo_removal')}?.getAt(1)) + .getText() + .readLines() + .collect { line -> line.split(' ', 2)[1] } // Split by the first space and take everything after it + .join('\n') // Join the processed lines back into a single text block + + // Parse bowtie2 log for alignment stats - count mates that aligned to rRNA + def bowtie2Log = path(workflow.out.multiqc_files.find { it.toString().endsWith('.bowtie2.log') }) + .getText() + def exactMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned exactly 1 time/) + def multiMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned >1 times/) + def exactCount = exactMatch ? exactMatch[0][1].toInteger() : 0 + def multiCount = multiMatch ? multiMatch[0][1].toInteger() : 0 + def bowtie2RrnaCount = exactCount + multiCount + + // Input: 4159 original + 10 synthetic rRNA = 4169 read pairs + // After fastp merge: 1137 pairs (3022 merged reads) + // Bowtie2 aligns 17 individual mates to rRNA (vs SortMeRNA's 20 - less sensitive) + // These 17 mates come from 10 pairs: 7 pairs had both mates align, 3 pairs had one mate align + // Using samtools -f 12 removes any pair where EITHER mate aligned, leaving 1127 pairs + // (same result as SortMeRNA despite detecting fewer individual mates) + assertAll( + { assert workflow.success }, + { assert pelines1.size() == 4508 }, // 1127 pairs × 4 lines/read + { assert pelines2.size() == 4508 }, + { assert workflow.out.trim_read_count[0][1] == 3022 }, + { assert bowtie2RrnaCount == 17 }, // 17 mates aligned to rRNA reference + { assert snapshot( + pelines1.join('\n').md5(), + pelines2.join('\n').md5(), + processed_ribo_removal_lint_report.md5() + ).match() } + ) + } + } + + test("homo_sapiens single-end [fastq] fastp bowtie2") { + + setup { + run("CAT_FASTQ", 
alias: "CAT_FASTQ_SE") { + script "../../../../modules/nf-core/cat/fastq/main.nf" + process { + """ + // Single-end test data with synthetic rRNA reads + input[0] = Channel.of([ + [ id:'test_se', single_end:true, strandedness:'auto' ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/fastq/rrna_reads_1.fastq.gz', checkIfExists: true) + ] + ]) + """ + } + } + } + + when { + workflow { + """ + // Input channels - wrap single file in list to match expected format + input[0] = CAT_FASTQ_SE.out.reads.map { meta, reads -> [meta, reads instanceof List ? reads : [reads]] } + input[1] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true)) // ch_fasta + input[2] = Channel.of(file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/transcriptome.fasta", checkIfExists: true)) // ch_transcript_fasta + input[3] = Channel.of(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)) // ch_gtf + input[4] = [] // ch_salmon_index + input[5] = [] // ch_sortmerna_index + input[6] = [] // ch_bowtie2_index + input[7] = [] // ch_bbsplit_index + input[8] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + + // Skip options + input[9] = true // skip_bbsplit + input[10] = false // skip_fastqc + input[11] = false // skip_trimming + input[12] = true // skip_umi_extract + input[13] = false // skip_linting + + // Index generation + input[14] = true // make_salmon_index + input[15] = false // make_sortmerna_index (not needed for bowtie2) + input[16] = true // make_bowtie2_index + + // Trimming options + input[17] = 'fastp' // trimmer + input[18] = 10 // min_trimmed_reads + input[19] = true // save_trimmed + input[20] = 
false // fastp_merge (not applicable for single-end) + + // rRNA removal options + input[21] = true // remove_ribo_rna + input[22] = 'bowtie2' // ribo_removal_tool + + // UMI options + input[23] = false // with_umi + input[24] = 0 // umi_discard_read + + // Strandedness thresholds + input[25] = 0.8 // stranded_threshold + input[26] = 0.1 // unstranded_threshold + """ + } + } + + then { + // For single-end, reads output is a single file, not a list + def selines = path(workflow.out.reads[0][1]).linesGzip + + // First part of each fq lint report line is a timestamp, remove it before snapshotting + def processed_ribo_removal_lint_report = path(workflow.out.lint_log.find { entry -> entry[1].contains('fq_lint_after_ribo_removal')}?.getAt(1)) + .getText() + .readLines() + .collect { line -> line.split(' ', 2)[1] } + .join('\n') + + // Parse bowtie2 log for alignment stats + def bowtie2Log = path(workflow.out.multiqc_files.find { it.toString().endsWith('.bowtie2.log') }) + .getText() + def exactMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned exactly 1 time/) + def multiMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned >1 times/) + def exactCount = exactMatch ? exactMatch[0][1].toInteger() : 0 + def multiCount = multiMatch ? 
multiMatch[0][1].toInteger() : 0 + def bowtie2RrnaCount = exactCount + multiCount + + // Input: 4159 original reads + 10 synthetic rRNA = 4169 reads + // After fastp trimming: 4162 reads (7 removed by quality filtering) + // Bowtie2 aligns 10 reads to rRNA (all with multiple alignments) + // Using --un-gz outputs unmapped reads directly: 4152 reads remain + assertAll( + { assert workflow.success }, + { assert selines.size() == 16608 }, // 4152 reads × 4 lines/read + { assert workflow.out.trim_read_count[0][1] == 4162 }, + { assert bowtie2RrnaCount == 10 }, // 10 reads aligned to rRNA reference + { assert snapshot( + selines.join('\n').md5(), + processed_ribo_removal_lint_report.md5() + ).match() } + ) + } + } + } diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap index 635f719f8d54..548176fa9b27 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap @@ -34,5 +34,28 @@ "nextflow": "25.10.0" }, "timestamp": "2025-11-27T09:46:26.078373852" + }, + "homo_sapiens paired-end [fastq] fastp bowtie2": { + "content": [ + "e4b3f501156cf176093c712e7a1bc0e1", + "f79feea1f5d509b4bcfd0a637b5d8558", + "0a6ee69d1a42f5e38ad2d4dfe9faf5a6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T10:50:23.556456" + }, + "homo_sapiens single-end [fastq] fastp bowtie2": { + "content": [ + "b423f619ae31c22b2bf99bdcb89bf852", + "5c1e74518dd70e4f1506b4f64da7b5f3" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T11:03:31.491281" } } \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config index 
f63142fefe27..db3096f362cc 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config @@ -38,5 +38,21 @@ process { } withName: 'RIBODETECTOR' { ext.prefix = { "${meta.id}.ribodetector" } + ext.args = '--seed 1' + } + withName: 'BOWTIE2_ALIGN' { + ext.prefix = { "${meta.id}.bowtie2_rrna" } + ext.args = '--very-sensitive-local --seed 1 --reorder' + } + withName: 'BOWTIE2_ALIGN_PE' { + ext.prefix = { "${meta.id}.bowtie2_rrna" } + ext.args = '--very-sensitive-local --seed 1 --reorder' + } + withName: 'SAMTOOLS_VIEW_BOWTIE2' { + ext.prefix = { "${meta.id}.bowtie2_unmapped" } + ext.args = '-f 12' // Keep only pairs where BOTH mates are unmapped + } + withName: 'SAMTOOLS_FASTQ_BOWTIE2' { + ext.prefix = { "${meta.id}.bowtie2_filtered" } } } diff --git a/subworkflows/nf-core/fastq_remove_rrna/main.nf b/subworkflows/nf-core/fastq_remove_rrna/main.nf new file mode 100644 index 000000000000..03385275f572 --- /dev/null +++ b/subworkflows/nf-core/fastq_remove_rrna/main.nf @@ -0,0 +1,185 @@ +include { BOWTIE2_ALIGN } from '../../../modules/nf-core/bowtie2/align/main' +include { BOWTIE2_ALIGN as BOWTIE2_ALIGN_PE } from '../../../modules/nf-core/bowtie2/align/main' +include { BOWTIE2_BUILD } from '../../../modules/nf-core/bowtie2/build/main' +include { RIBODETECTOR } from '../../../modules/nf-core/ribodetector/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_BOWTIE2 } from '../../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_BOWTIE2 } from '../../../modules/nf-core/samtools/view/main' +include { SEQKIT_STATS } from '../../../modules/nf-core/seqkit/stats/main' +include { SORTMERNA } from '../../../modules/nf-core/sortmerna/main' +include { SORTMERNA as SORTMERNA_INDEX } from '../../../modules/nf-core/sortmerna/main' + +// +// Function that parses seqkit stats TSV output to extract the mean read length +// for use with 
RiboDetector's -l parameter +// +def getReadLengthFromSeqkitStats(stats_file) { + def lines = stats_file.text.readLines().findAll { it.trim() } // Drop blank lines so they are never parsed as data rows + if (lines.size() < 2) { + return 100 // Default fallback: no header or no data rows + } + + def header = lines[0].split('\t') + def avgLenIdx = header.findIndexOf { it == 'avg_len' } + if (avgLenIdx < 0) { + return 100 // Default fallback if column not found + } + + // Calculate mean avg_len across all files in the stats output, skipping rows too short to hold the column + def avgLens = lines[1..-1].collect { it.split('\t') }.findAll { it.size() > avgLenIdx }.collect { it[avgLenIdx] as float } + def meanAvgLen = avgLens ? avgLens.sum() / avgLens.size() : 100.0d // Same fallback when no usable data row remains + + return Math.round(meanAvgLen) as int +} + +workflow FASTQ_REMOVE_RRNA { + take: + ch_reads // channel: [ val(meta), [ reads ] ] + ch_rrna_fastas // channel: one or more fasta files containing rrna sequences + ch_sortmerna_index // channel: /path/to/sortmerna/index/ (optional) + ch_bowtie2_index // channel: /path/to/bowtie2/index/ (optional) + ribo_removal_tool // string (enum): 'sortmerna', 'ribodetector', or 'bowtie2' + make_sortmerna_index // boolean: Whether to create a sortmerna index before running sortmerna + make_bowtie2_index // boolean: Whether to create a bowtie2 index before running bowtie2 + + main: + + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_filtered_reads = ch_reads + + if (ribo_removal_tool == 'sortmerna') { + ch_sortmerna_fastas = ch_rrna_fastas + .collect() + .map { [[id: 'rrna_refs'], it] } + + if (make_sortmerna_index) { + SORTMERNA_INDEX( + [[], []], + ch_sortmerna_fastas, + [[], []], + ) + ch_sortmerna_index = SORTMERNA_INDEX.out.index.first() + } + + SORTMERNA( + ch_filtered_reads, + ch_sortmerna_fastas, + ch_sortmerna_index, + ) + + ch_filtered_reads = SORTMERNA.out.reads + ch_multiqc_files = ch_multiqc_files.mix(SORTMERNA.out.log) + ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) + } + else if (ribo_removal_tool == 'ribodetector') { + // Run seqkit stats to determine average read length + SEQKIT_STATS( + ch_filtered_reads + ) + + ch_versions =
ch_versions.mix(SEQKIT_STATS.out.versions.first()) + + // Join stats with reads and calculate read length for RiboDetector + ch_filtered_reads + .join(SEQKIT_STATS.out.stats) + .multiMap { meta, reads, stats -> + def readLength = getReadLengthFromSeqkitStats(stats) + reads: [meta, reads] + length: readLength + } + .set { ch_reads_with_length } + + RIBODETECTOR( + ch_reads_with_length.reads, + ch_reads_with_length.length, + ) + + ch_filtered_reads = RIBODETECTOR.out.fastq + ch_multiqc_files = ch_multiqc_files.mix(RIBODETECTOR.out.log) + // Note: ribodetector versions collected via topic + } + else if (ribo_removal_tool == 'bowtie2') { + if (make_bowtie2_index) { + // Collect all fastas into a single file for index building + // Convert U to T on sequence lines only (rRNA references may be RNA but reads are DNA); '>' header lines are kept verbatim so IDs such as accession U13369 are not rewritten + ch_rrna_fastas + .collectFile(name: 'rrna_combined.fasta', newLine: true) + .map { fasta -> + def content = fasta.text.readLines().collect { line -> line.startsWith('>') ? line : line.replaceAll('U', 'T').replaceAll('u', 't') }.join('\n').concat('\n') + def convertedFasta = file("${fasta.parent}/rrna_combined_dna.fasta") + convertedFasta.text = content + [[id: 'rrna_refs'], convertedFasta] + } + .set { ch_combined_fasta } + + BOWTIE2_BUILD( + ch_combined_fasta + ) + ch_bowtie2_index = BOWTIE2_BUILD.out.index.first() + ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions.first()) + } + + // Branch reads by single-end vs paired-end for different filtering strategies + ch_filtered_reads + .branch { meta, reads -> + single_end: meta.single_end + paired_end: !meta.single_end + } + .set { ch_reads_for_bowtie2 } + + // For single-end reads: bowtie2's --un-gz works correctly + // save_unaligned=true outputs unmapped reads directly + BOWTIE2_ALIGN( + ch_reads_for_bowtie2.single_end, + ch_bowtie2_index, + [[], []], // No reference fasta needed + true, // save_unaligned - for single-end this works correctly + false, // sort_bam - not needed + ) + + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_ALIGN.out.log) + ch_versions =
ch_versions.mix(BOWTIE2_ALIGN.out.versions) + + // For paired-end reads: bowtie2's --un-conc-gz outputs pairs that didn't + // align concordantly, which INCLUDES pairs where one mate aligned. + // We need to filter via samtools to get pairs where BOTH mates are unmapped. + BOWTIE2_ALIGN_PE( + ch_reads_for_bowtie2.paired_end, + ch_bowtie2_index, + [[], []], // No reference fasta needed for BAM output + false, // save_unaligned - we'll extract from BAM instead + false, // sort_bam - not needed + ) + + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_ALIGN_PE.out.log) + ch_versions = ch_versions.mix(BOWTIE2_ALIGN_PE.out.versions) + + // Filter BAM for read pairs where BOTH mates are unmapped (flag 12 = 4 + 8) + // This removes any pair where at least one mate aligned to rRNA + SAMTOOLS_VIEW_BOWTIE2( + BOWTIE2_ALIGN_PE.out.bam.map { meta, bam -> [meta, bam, []] }, + [[], []], // No reference fasta + [], // No qname file + [] // No index format + ) + // Note: samtools/view versions collected via topic + + // Convert filtered BAM back to paired FASTQ + SAMTOOLS_FASTQ_BOWTIE2( + SAMTOOLS_VIEW_BOWTIE2.out.bam, + false // not interleaved + ) + + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_BOWTIE2.out.versions) + + // Combine single-end and paired-end results + BOWTIE2_ALIGN.out.fastq + .mix(SAMTOOLS_FASTQ_BOWTIE2.out.fastq) + .set { ch_filtered_reads } + } + + emit: + reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + multiqc_files = ch_multiqc_files // channel: [ val(meta), [ log files ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_remove_rrna/meta.yml b/subworkflows/nf-core/fastq_remove_rrna/meta.yml new file mode 100644 index 000000000000..f8ea0cc3b59c --- /dev/null +++ b/subworkflows/nf-core/fastq_remove_rrna/meta.yml @@ -0,0 +1,112 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fastq_remove_rrna" +description: Remove 
ribosomal RNA reads from FASTQ files using SortMeRNA, RiboDetector, or Bowtie2 +keywords: + - fastq + - rrna + - ribosomal + - filter + - sortmerna + - ribodetector + - bowtie2 +components: + - bowtie2/align + - bowtie2/build + - ribodetector + - samtools/fastq + - samtools/view + - seqkit/stats + - sortmerna +input: + - ch_reads: + type: file + description: | + List of FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + structure: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FastQ files + pattern: "*.{fq,fastq}{,.gz}" + - ch_rrna_fastas: + type: file + description: | + Channel containing one or more FASTA files with rRNA sequences for use with SortMeRNA or Bowtie2. + Not required for RiboDetector which uses built-in models. + structure: + - fasta: + type: file + description: rRNA reference fasta files + pattern: "*.{fa,fasta}{,.gz}" + - ch_sortmerna_index: + type: directory + description: | + Pre-built SortMeRNA index directory. Optional - can be built on-the-fly if make_sortmerna_index is true. + structure: + - meta: + type: map + description: Metadata for the SortMeRNA index + - index: + type: directory + description: SortMeRNA index directory + - ch_bowtie2_index: + type: directory + description: | + Pre-built Bowtie2 index directory. Optional - can be built on-the-fly if make_bowtie2_index is true. 
+ structure: + - meta: + type: map + description: Metadata for the Bowtie2 index + - index: + type: directory + description: Bowtie2 index directory + - ribo_removal_tool: + type: string + description: Specifies the rRNA removal tool to use + enum: ["sortmerna", "ribodetector", "bowtie2"] + - make_sortmerna_index: + type: boolean + description: Whether to create SortMeRNA index before running SortMeRNA + - make_bowtie2_index: + type: boolean + description: Whether to create Bowtie2 index before running Bowtie2 for rRNA removal +output: + - reads: + type: file + description: | + FASTQ files with rRNA reads removed. + structure: + - meta: + type: map + description: Groovy Map containing sample information + - reads: + type: file + description: Filtered FastQ files + pattern: "*.{fq,fastq}{,.gz}" + - multiqc_files: + type: file + description: | + Log files from the rRNA removal tool, compatible with MultiQC. + structure: + - meta: + type: map + description: Metadata for the log files + - log: + type: file + description: Tool-specific log files + pattern: "*.log" + - versions: + type: file + description: | + File containing software versions + structure: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test b/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test new file mode 100644 index 000000000000..26959efa5478 --- /dev/null +++ b/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test @@ -0,0 +1,298 @@ + +nextflow_workflow { + + name "Test Subworkflow FASTQ_REMOVE_RRNA" + script "../main.nf" + workflow "FASTQ_REMOVE_RRNA" + config "./nextflow.config" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_remove_rrna" + + tag "cat/fastq" + tag "bowtie2/align" + tag "bowtie2/build" + tag "ribodetector" + tag "samtools/view" + tag "samtools/fastq" + tag 
"seqkit/stats" + tag "sortmerna" + + // Global setup: Create test data with synthetic rRNA reads using CAT_FASTQ + // The rRNA reads (from generic/fastq/) are 5.8S rRNA sequences that will be detected + // by SortMeRNA (alignment-based, 100% detection), RiboDetector (ML-based, ~70% detection), + // and Bowtie2 (alignment-based) + setup { + run("CAT_FASTQ") { + script "../../../../modules/nf-core/cat/fastq/main.nf" + process { + """ + // CAT_FASTQ input: alternating R1/R2 files [r1_a, r2_a, r1_b, r2_b, ...] + input[0] = Channel.of([ + [ id:'test', single_end:false ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/fastq/rrna_reads_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/fastq/rrna_reads_2.fastq.gz', checkIfExists: true) + ] + ]) + """ + } + } + } + + test("homo_sapiens paired-end [fastq] sortmerna") { + + when { + workflow { + """ + input[0] = CAT_FASTQ.out.reads // ch_reads + input[1] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[2] = [] // ch_sortmerna_index + input[3] = [] // ch_bowtie2_index + input[4] = 'sortmerna' // ribo_removal_tool + input[5] = true // make_sortmerna_index + input[6] = false // make_bowtie2_index + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + + // Parse sortmerna log for rRNA detection stats + def sortmernaLog = path(workflow.out.multiqc_files.find { entry -> entry[1].toString().endsWith('.sortmerna.log') }?.getAt(1)) + .getText() + def sortmernaRrnaMatch = (sortmernaLog =~ /Total reads passing E-value 
threshold = (\d+)/) + def sortmernaRrnaCount = sortmernaRrnaMatch ? sortmernaRrnaMatch[0][1].toInteger() : -1 + + // Input: 4159 original + 10 synthetic rRNA = 4169 read pairs + // SortMeRNA removes all 10 synthetic rRNA pairs (20 individual reads), leaving 4159 pairs + assertAll( + { assert workflow.success }, + { assert pelines1.size() == 16636 }, // 4159 pairs × 4 lines/read + { assert pelines2.size() == 16636 }, + { assert sortmernaRrnaCount == 20 }, // 10 pairs = 20 individual reads (100% detection) + { assert snapshot( + pelines1.join('\n').md5(), + pelines2.join('\n').md5() + ).match() } + ) + } + } + + test("homo_sapiens paired-end [fastq] ribodetector") { + + when { + workflow { + """ + input[0] = CAT_FASTQ.out.reads // ch_reads + input[1] = [] // ch_rrna_fastas (not needed for ribodetector) + input[2] = [] // ch_sortmerna_index + input[3] = [] // ch_bowtie2_index + input[4] = 'ribodetector' // ribo_removal_tool + input[5] = false // make_sortmerna_index + input[6] = false // make_bowtie2_index + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + + // Parse ribodetector log for rRNA detection stats + // Note: ribodetector log contains ANSI color codes that must be stripped before regex matching + def ribodetectorLog = path(workflow.out.multiqc_files.find { entry -> entry[1].toString().endsWith('.ribodetector.log') }?.getAt(1)) + .getText() + .replaceAll(/\u001b\[[0-9;]*m/, '') // Strip ANSI escape codes + def ribodetectorRrnaMatch = (ribodetectorLog =~ /Detected (\d+) rRNA sequences/) + def ribodetectorRrnaCount = ribodetectorRrnaMatch ? 
ribodetectorRrnaMatch[0][1].toInteger() : -1 + + // Sort FASTQ reads before MD5 to handle ribodetector's non-deterministic output order + // (multiprocessing causes variable read ordering even with --seed set) + def sortedLines1 = pelines1.collate(4).sort { it[0] }.flatten() + def sortedLines2 = pelines2.collate(4).sort { it[0] }.flatten() + + // Input: 4159 original + 10 synthetic rRNA = 4169 read pairs + // RiboDetector removes 7 of 10 synthetic rRNA pairs (70% detection), leaving 4162 pairs + assertAll( + { assert workflow.success }, + { assert pelines1.size() == 16648 }, // 4162 pairs × 4 lines/read + { assert pelines2.size() == 16648 }, + { assert ribodetectorRrnaCount == 7 }, // 7 pairs detected (70% - ML model misses some) + { assert snapshot( + sortedLines1.join('\n').md5(), + sortedLines2.join('\n').md5() + ).match() } + ) + } + } + + test("homo_sapiens paired-end [fastq] bowtie2") { + + when { + workflow { + """ + input[0] = CAT_FASTQ.out.reads // ch_reads + input[1] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[2] = [] // ch_sortmerna_index + input[3] = [] // ch_bowtie2_index + input[4] = 'bowtie2' // ribo_removal_tool + input[5] = false // make_sortmerna_index + input[6] = true // make_bowtie2_index + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + + // Parse bowtie2 log for alignment stats - count mates that aligned to rRNA + def bowtie2Log = path(workflow.out.multiqc_files.find { entry -> entry[1].toString().endsWith('.bowtie2.log') }?.getAt(1)) + .getText() + def exactMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned exactly 1 time/) + def multiMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned >1 times/) + def exactCount = exactMatch ? exactMatch[0][1].toInteger() : 0 + def multiCount = multiMatch ? 
multiMatch[0][1].toInteger() : 0 + def bowtie2RrnaCount = exactCount + multiCount + + // Input: 4159 original + 10 synthetic rRNA = 4169 read pairs + // Bowtie2 aligns 17 individual mates to rRNA + // Using samtools -f 12 keeps only pairs where BOTH mates are unmapped, leaving 4159 pairs + assertAll( + { assert workflow.success }, + { assert pelines1.size() == 16636 }, // 4159 pairs × 4 lines/read + { assert pelines2.size() == 16636 }, + { assert bowtie2RrnaCount == 17 }, // 17 mates aligned to rRNA reference + { assert snapshot( + pelines1.join('\n').md5(), + pelines2.join('\n').md5() + ).match() } + ) + } + } + + test("homo_sapiens single-end [fastq] bowtie2") { + + setup { + run("CAT_FASTQ", alias: "CAT_FASTQ_SE") { + script "../../../../modules/nf-core/cat/fastq/main.nf" + process { + """ + // Single-end test data with synthetic rRNA reads + input[0] = Channel.of([ + [ id:'test_se', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/fastq/rrna_reads_1.fastq.gz', checkIfExists: true) + ] + ]) + """ + } + } + } + + when { + workflow { + """ + input[0] = CAT_FASTQ_SE.out.reads.map { meta, reads -> [meta, reads instanceof List ? 
reads : [reads]] } + input[1] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[2] = [] // ch_sortmerna_index + input[3] = [] // ch_bowtie2_index + input[4] = 'bowtie2' // ribo_removal_tool + input[5] = false // make_sortmerna_index + input[6] = true // make_bowtie2_index + """ + } + } + + then { + // For single-end, reads output is a single file, not a list + def selines = path(workflow.out.reads[0][1]).linesGzip + + // Parse bowtie2 log for alignment stats + def bowtie2Log = path(workflow.out.multiqc_files.find { entry -> entry[1].toString().endsWith('.bowtie2.log') }?.getAt(1)) + .getText() + def exactMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned exactly 1 time/) + def multiMatch = (bowtie2Log =~ /(\d+) \(\d+\.\d+%\) aligned >1 times/) + def exactCount = exactMatch ? exactMatch[0][1].toInteger() : 0 + def multiCount = multiMatch ? multiMatch[0][1].toInteger() : 0 + def bowtie2RrnaCount = exactCount + multiCount + + // Input: 4159 original reads + 10 synthetic rRNA = 4169 reads + // Bowtie2 aligns 10 reads to rRNA + // Using --un-gz outputs unmapped reads directly: 4159 reads remain + assertAll( + { assert workflow.success }, + { assert selines.size() == 16636 }, // 4159 reads × 4 lines/read + { assert bowtie2RrnaCount == 10 }, // 10 reads aligned to rRNA reference + { assert snapshot( + selines.join('\n').md5() + ).match() } + ) + } + } + + test("homo_sapiens single-end [fastq] sortmerna") { + + setup { + run("CAT_FASTQ", alias: "CAT_FASTQ_SE") { + script "../../../../modules/nf-core/cat/fastq/main.nf" + process { + """ + // Single-end test data with synthetic rRNA reads + input[0] = Channel.of([ + [ id:'test_se', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'generic/fastq/rrna_reads_1.fastq.gz', checkIfExists: true) + ] + ]) + """ + } + } + } + + when { + workflow { + """ + input[0] = CAT_FASTQ_SE.out.reads.map { meta, reads -> [meta, reads instanceof List ? reads : [reads]] } + input[1] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[2] = [] // ch_sortmerna_index + input[3] = [] // ch_bowtie2_index + input[4] = 'sortmerna' // ribo_removal_tool + input[5] = true // make_sortmerna_index + input[6] = false // make_bowtie2_index + """ + } + } + + then { + // For single-end, reads output is a single file, not a list + def selines = path(workflow.out.reads[0][1]).linesGzip + + // Parse sortmerna log for rRNA detection stats + def sortmernaLog = path(workflow.out.multiqc_files.find { entry -> entry[1].toString().endsWith('.sortmerna.log') }?.getAt(1)) + .getText() + def sortmernaRrnaMatch = (sortmernaLog =~ /Total reads passing E-value threshold = (\d+)/) + def sortmernaRrnaCount = sortmernaRrnaMatch ? 
sortmernaRrnaMatch[0][1].toInteger() : -1 + + // Input: 4159 original reads + 10 synthetic rRNA = 4169 reads + // SortMeRNA removes all 10 synthetic rRNA reads (100% detection), leaving 4159 reads + assertAll( + { assert workflow.success }, + { assert selines.size() == 16636 }, // 4159 reads × 4 lines/read + { assert sortmernaRrnaCount == 10 }, // 10 reads detected (100% detection) + { assert snapshot( + selines.join('\n').md5() + ).match() } + ) + } + } + +} diff --git a/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test.snap new file mode 100644 index 000000000000..f9a5fc3feeff --- /dev/null +++ b/subworkflows/nf-core/fastq_remove_rrna/tests/main.nf.test.snap @@ -0,0 +1,55 @@ +{ + "homo_sapiens single-end [fastq] bowtie2": { + "content": [ + "bdea4e3bbdbb7c301ff578b9d8976fb6" + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T11:51:37.291482561" + }, + "homo_sapiens single-end [fastq] sortmerna": { + "content": [ + "bdea4e3bbdbb7c301ff578b9d8976fb6" + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T11:51:50.835424985" + }, + "homo_sapiens paired-end [fastq] sortmerna": { + "content": [ + "bdea4e3bbdbb7c301ff578b9d8976fb6", + "1b83618177abebeb38c29d2258efdd4f" + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T11:49:53.980824473" + }, + "homo_sapiens paired-end [fastq] ribodetector": { + "content": [ + "ec0260bcdeef6af8a9b6d470eafb5603", + "a0ef8564218df5741dee81f03db13600" + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-29T18:27:41.89909077" + }, + "homo_sapiens paired-end [fastq] bowtie2": { + "content": [ + "4ef4e259208497288aaefaa88770975d", + "63198a2af4a4a8fb949b01a8b2c4cb7c" + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-28T11:51:24.742227098" + } +} \ No newline at 
end of file diff --git a/subworkflows/nf-core/fastq_remove_rrna/tests/nextflow.config b/subworkflows/nf-core/fastq_remove_rrna/tests/nextflow.config new file mode 100644 index 000000000000..4427330c19e4 --- /dev/null +++ b/subworkflows/nf-core/fastq_remove_rrna/tests/nextflow.config @@ -0,0 +1,32 @@ +// +// rRNA removal subworkflow options +// + +process { + + withName: 'SORTMERNA' { + ext.args = '--index 0' + } + withName: 'SORTMERNA_INDEX' { + ext.args = '--index 1' + } + withName: 'RIBODETECTOR' { + ext.prefix = { "${meta.id}.ribodetector" } + ext.args = '--seed 1' + } + withName: 'BOWTIE2_ALIGN' { + ext.prefix = { "${meta.id}.bowtie2_rrna" } + ext.args = '--very-sensitive-local --seed 1 --reorder' + } + withName: 'BOWTIE2_ALIGN_PE' { + ext.prefix = { "${meta.id}.bowtie2_rrna" } + ext.args = '--very-sensitive-local --seed 1 --reorder' + } + withName: 'SAMTOOLS_VIEW_BOWTIE2' { + ext.prefix = { "${meta.id}.bowtie2_unmapped" } + ext.args = '-f 12' // Keep only pairs where BOTH mates are unmapped + } + withName: 'SAMTOOLS_FASTQ_BOWTIE2' { + ext.prefix = { "${meta.id}.bowtie2_filtered" } + } +}