From 79c735bb53ef995cff9d8a502bd2d828f222b846 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Wed, 29 Apr 2026 11:13:47 +0100 Subject: [PATCH 1/6] add READSUBMIT workflow --- README.md | 39 ++++- assets/schema_input_reads.json | 127 ++++++++++++++ conf/test_reads_paired.config | 33 ++++ docs/output.md | 16 +- docs/usage.md | 52 +++++- main.nf | 13 +- modules/local/create_reads_manifest/main.nf | 56 ++++++ nextflow.config | 2 +- nextflow_schema.json | 6 +- tests/reads_paired_end.nf.test | 39 +++++ workflows/readsubmit.nf | 178 ++++++++++++++++++++ 11 files changed, 548 insertions(+), 13 deletions(-) create mode 100644 assets/schema_input_reads.json create mode 100644 conf/test_reads_paired.config create mode 100644 modules/local/create_reads_manifest/main.nf create mode 100644 tests/reads_paired_end.nf.test create mode 100644 workflows/readsubmit.nf diff --git a/README.md b/README.md index 30e52dc..20ba40d 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,12 @@ ## Introduction **nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home). 
-Currently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure: +Currently, the pipeline supports four submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure: - `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow - `bins` for bins submission with `GENOMESUBMIT` workflow - `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow +- `reads` for raw sequencing reads submission with `READSUBMIT` workflow ![seqsubmit workflow diagram](assets/seqsubmit_schema.png) @@ -123,6 +124,38 @@ assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 > [!IMPORTANT] > **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. +### `reads` mode (`READSUBMIT`) + +The input must follow `assets/schema_input_reads.json`. + +Required columns: + +- `sample` +- `sample_accession` +- `fastq_1` +- `fastq_2` (column must be present; leave the value empty for single-end reads) +- `platform` +- `instrument` +- `library_source` +- `library_selection` +- `library_strategy` + +Optional columns: + +- `insert_size` +- `library_name` +- `description` + +Example `samplesheet_reads.csv`: + +```csv +sample,sample_accession,fastq_1,fastq_2,platform,instrument,library_source,library_selection,library_strategy,insert_size,library_name,description +illumina_run_001,SAMEA1234567,data/reads_R1.fastq.gz,data/reads_R2.fastq.gz,ILLUMINA,Illumina HiSeq 2000,GENOMIC,RANDOM,WGS,500,HiSeq_library_001,Illumina sequencing of sample XYZ +``` + +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown.
+ ## Usage > [!NOTE] @@ -142,7 +175,7 @@ The `mags`/`bins` workflow requires databases for completeness/contamination est | Parameter | Description | | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------- | -| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` | +| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies, reads]` | | `--input` | Path to the samplesheet describing the data to be submitted | | `--outdir` | Path to the output directory for pipeline results | | `--submission_study` OR `--study_metadata` | ENA study accession (PRJ/ERP) to submit the data to OR metadata file in JSON/TSV/CSV format to register new study | @@ -161,7 +194,7 @@ General command template: ```bash nextflow run nf-core/seqsubmit \ -profile \ - --mode \ + --mode \ --input \ --centre_name \ --submission_study \ diff --git a/assets/schema_input_reads.json b/assets/schema_input_reads.json new file mode 100644 index 0000000..81c0826 --- /dev/null +++ b/assets/schema_input_reads.json @@ -0,0 +1,127 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/seqsubmit/main/assets/schema_input_reads.json", + "title": "nf-core/seqsubmit pipeline - params.input schema", + "description": "Schema for the sample sheet provided with params.input if params.mode is set to 'reads'", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample must be provided and cannot contain spaces", + "meta": ["id"], + "description": "Unique experiment/run name" + }, + "sample_accession": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample accession must be provided and cannot contain spaces", + "description": "ENA sample accession of the sample used to generate 
the reads" + }, + "fastq_1": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$", + "errorMessage": "FASTQ file must have extension '.fq' or '.fastq' (optionally gzipped)", + "description": "Forward reads FASTQ file (single-end or paired-end)" + }, + "fastq_2": { + "anyOf": [ + { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "FASTQ file for reverse reads must have extension '.fq' or '.fastq' (optionally gzipped)", + "description": "Reverse reads FASTQ file if paired-end. Leave empty for single-end reads" + }, + "platform": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Platform must be provided and cannot contain spaces", + "description": "Sequencing platform (e.g., ILLUMINA, PACBIO_SMRT, OXFORD_NANOPORE, ION_TORRENT)" + }, + "instrument": { + "type": "string", + "pattern": "^[^\\n]+$", + "errorMessage": "Instrument must be provided and cannot span multiple lines", + "description": "Sequencer model (e.g., 'Illumina HiSeq 2000', 'PacBio Sequel')" + }, + "library_source": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Library source must be provided and cannot contain spaces", + "description": "Library source (GENOMIC, METAGENOMIC, TRANSCRIPTOMIC, etc.)" + }, + "library_selection": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Library selection must be provided and cannot contain spaces", + "description": "Library selection (RANDOM, PCR, cDNA, etc.)" + }, + "library_strategy": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Library strategy must be provided and cannot contain spaces", + "description": "Library strategy (WGS, RNA-Seq, AMPLICON, etc.)" + }, + "insert_size": { + "anyOf": [ + { + "type": "number", + "minimum": 0 + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "Insert size must 
be a non-negative number or empty", + "description": "Fragment/insert size for paired-end reads (optional)" + }, + "library_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "description": "Descriptive library name (optional)" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "description": "Free-text description of the experiment (optional)" + } + }, + "required": [ + "sample", + "sample_accession", + "fastq_1", + "platform", + "instrument", + "library_source", + "library_selection", + "library_strategy" + ] + } +} diff --git a/conf/test_reads_paired.config b/conf/test_reads_paired.config new file mode 100644 index 0000000..eaa2b70 --- /dev/null +++ b/conf/test_reads_paired.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test.
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test_reads_paired, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode reads paired_end profile' + config_profile_description = 'Single-case reads test with paired-end reads' + + // Input data + // TODO: prepare test data and add to repo, update path here + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/reads_paired.csv' + outdir = 'test_output' + + mode = "reads" + submission_study = "PRJEB98843" + +} diff --git a/docs/output.md b/docs/output.md index 962e786..c1b1963 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ The directories listed below will be created in the results directory (set with ## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and performs automated submission of sequence data to ENA. Exact steps and generated outputs depend on the data type and `--mode` executed (`mags`, `bins` or `metagenomic_assemblies`). +The pipeline is built using [Nextflow](https://www.nextflow.io/) and performs automated submission of sequence data to ENA. Exact steps and generated outputs depend on the data type and `--mode` executed (`mags`, `bins`, `metagenomic_assemblies` or `reads`). ## `mags` and `bins` outputs @@ -50,6 +50,20 @@ When `--mode metagenomic_assemblies` is used, results are written under `metagen Assembly study registration, manifest generation, and Webin-CLI submission are executed by the workflow, but their intermediate outputs are not currently published into `--outdir` by the pipeline. +## `reads` outputs + +When `--mode reads` is used, results are written under `reads/`. + +
+Output files + +- `reads/` + - `upload/assigned_accessions.tsv`: run accessions assigned to submitted reads. + +
+ +Manifest generation and Webin-CLI submission are executed by the workflow, but their intermediate outputs are not currently published into `--outdir` by the pipeline. + ## Common outputs ### MultiQC diff --git a/docs/usage.md b/docs/usage.md index bf87eaa..e379696 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,12 +6,13 @@ ## Introduction -`nf-core/seqsubmit` is a Nextflow pipeline for submitting metagenomic assemblies, MAGs, and bins to ENA. +`nf-core/seqsubmit` is a Nextflow pipeline for submitting metagenomic assemblies, MAGs, bins, and raw reads to ENA. -The pipeline supports two workflow paths: +The pipeline supports three workflow paths: - `GENOMESUBMIT` for `--mode mags` and `--mode bins` - `ASSEMBLYSUBMIT` for `--mode metagenomic_assemblies` +- `READSUBMIT` for `--mode reads` ## Before you start @@ -105,6 +106,36 @@ assembly_002,data/assembly_002.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 An example file is available at [assets/samplesheet_assembly.csv](../assets/samplesheet_assembly.csv). +### `reads` mode (`READSUBMIT`) + +Use this samplesheet structure for raw sequencing reads submission. The input format follows [assets/schema_input_reads.json](../assets/schema_input_reads.json). + +Example: + +```csv title="samplesheet_reads.csv" +sample,sample_accession,fastq_1,fastq_2,platform,instrument,library_source,library_selection,library_strategy,insert_size,library_name,description +illumina_run_001,SAMEA1234567,data/reads_R1.fastq.gz,data/reads_R2.fastq.gz,ILLUMINA,Illumina HiSeq 2000,GENOMIC,RANDOM,WGS,500,HiSeq_library_001,Illumina sequencing of sample XYZ +pacbio_run_001,SAMEA7654321,data/pacbio_reads.fastq.gz,,PACBIO_SMRT,PacBio Sequel,GENOMIC,RANDOM,WGS,,PacBio_library_002,Long-read sequencing +``` + +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. 
+ +| Column | Type | Required | Description | +| ------------------- | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | str | Yes | Unique identifier of this particular data entry. Used as an experiment name. | +| `sample_accession` | str | Yes | ENA sample accession (starting with SAMEA) of the sample used to generate raw reads. | +| `fastq_1` | file path | Yes | Path to forward reads in FASTQ format (optionally gzipped). | +| `fastq_2` | file path | No | Path to reverse reads for paired-end data. Leave empty for single-end reads. | +| `platform` | str | Yes | Sequencing platform. Supported values: `ILLUMINA`, `PACBIO_SMRT`, `OXFORD_NANOPORE`, `ION_TORRENT`, `CAPILLARY`, `DNBSEQ`, `ELEMENT`, `GENAPSYS`, `GENEMIND`, `HELICOS`, `LS454`, `BGISEQ`, `ULTIMA`, `VELA_DIAGNOSTICS`. See [ENA documentation](https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#metadata-validation) for complete list. | +| `instrument` | str | Yes | Sequencer model, e.g. "Illumina HiSeq 2000", "PacBio Sequel", "MinION". | +| `library_source` | str | Yes | Library source type. Options: `GENOMIC`, `METAGENOMIC`, `TRANSCRIPTOMIC`, `METAGENOMIC SINGLE CELL`, `TRANSCRIPTOMIC SINGLE CELL`, `SYNTHETIC`, `VIRAL RNA`, `OTHER`. | +| `library_selection` | str | Yes | Library selection method. Options: `RANDOM`, `PCR`, `RANDOM PCR`, `RT-PCR`, `MF`, `cDNA`, `cDNA_randomPriming`, `cDNA_oligo_dT`, `PolyA`, `Inverse rRNA`, `ChIP`, `MNase`, `DNase`, `Hybrid Selection`, etc. See [ENA documentation](https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#metadata-validation) for complete list. 
| +| `library_strategy` | str | Yes | Library strategy. Options: `WGS`, `WGA`, `WXS`, `RNA-Seq`, `miRNA-Seq`, `ncRNA-Seq`, `EST`, `Hi-C`, `ATAC-seq`, `WCS`, `RAD-Seq`, `CLONE`, `AMPLICON`, `POOLCLONE`, etc. See [ENA documentation](https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#metadata-validation) for complete list. | +| `insert_size` | number | No | Fragment/insert size for paired-end reads (e.g., 500 for 500 bp inserts). Leave empty if not applicable. | +| `library_name` | str | No | Descriptive library name (optional). | +| `description` | str | No | Free-text description of the experiment (optional). | + ## Submission study All data submitted through this pipeline must be associated with an ENA study (project). You have two options: @@ -198,7 +229,7 @@ General command template: ```bash nextflow run nf-core/seqsubmit \ -profile \ - --mode \ + --mode \ --input \ --centre_name \ --submission_study \ @@ -209,7 +240,7 @@ Key parameters: | Parameter | Description | | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `--mode` | Submission type. Supported values are `mags`, `bins`, and `metagenomic_assemblies`. | +| `--mode` | Submission type. Supported values are `mags`, `bins`, `metagenomic_assemblies`, and `reads`. | | `--input` | Path to the samplesheet describing the data to submit. | | `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to. For metagenomic assemblies, this is the paper's ENA Assembly Project accession. | | `--centre_name` | Name of the submitter's organisation.
| @@ -245,6 +276,19 @@ nextflow run nf-core/seqsubmit \ --outdir results/validate_assemblies ``` +Test example for `reads` run with docker: + +```bash +nextflow run nf-core/seqsubmit \ + -profile docker \ + --mode reads \ + --input samplesheet_reads.csv \ + --submission_study \ + --webincli_mode submit \ + --test_upload true \ + --outdir results/validate_reads +``` + If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. diff --git a/main.nf b/main.nf index 7824a5f..fe6b96d 100644 --- a/main.nf +++ b/main.nf @@ -17,6 +17,7 @@ include { GENOMESUBMIT } from './workflows/genomesubmit' include { ASSEMBLYSUBMIT } from './workflows/assemblysubmit' +include { READSUBMIT } from './workflows/readsubmit' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline' /* @@ -38,7 +39,7 @@ workflow NFCORE_SEQSUBMIT { // // WORKFLOW: Run pipeline // - // Depending on the input type (mags/bins or metagenomic_assemblies), one or the another workflow will be triggered + // Depending on the input type (mags/bins, metagenomic_assemblies, or reads), one or another workflow will be triggered if (params.mode == "mags" || params.mode == "bins") { GENOMESUBMIT ( samplesheet, @@ -69,6 +70,16 @@ workflow NFCORE_SEQSUBMIT { params.webincli_mode ) ch_multiqc_report = ASSEMBLYSUBMIT.out.multiqc_report + } else if (params.mode == "reads") { + READSUBMIT ( + samplesheet, + params.submission_study, + params.study_metadata, + params.test_upload, + params.webin_cli_version, + params.webincli_mode + ) + ch_multiqc_report = READSUBMIT.out.multiqc_report } diff --git a/modules/local/create_reads_manifest/main.nf b/modules/local/create_reads_manifest/main.nf new file mode 100644 index 
0000000..e4dba77 --- /dev/null +++ b/modules/local/create_reads_manifest/main.nf @@ -0,0 +1,56 @@ +process CREATE_READS_MANIFEST { + tag "$meta.id" + label 'process_single' + + container "docker://alpine:latest" + + input: + tuple val(meta), path(fastq_files) + val(study_accession) + val(test_upload) + + output: + tuple val(meta), path("${meta.id}.manifest"), emit: manifest + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def fastq_list = fastq_files instanceof List ? fastq_files : [fastq_files] + def fastq_entries = fastq_list.collect { "FASTQ\t${it.name}" }.join('\n') + def insert_size_line = meta.insert_size ? "INSERT_SIZE\t${meta.insert_size}\n" : "" + def library_name_line = meta.library_name ? "LIBRARY_NAME\t${meta.library_name}\n" : "" + def description_line = meta.description ? "DESCRIPTION\t${meta.description}\n" : "" + + """ + cat > ${meta.id}.manifest <<'EOF' +STUDY ${study_accession} +SAMPLE ${meta.sample_accession} +NAME ${meta.id} +PLATFORM ${meta.platform} +INSTRUMENT ${meta.instrument} +LIBRARY_SOURCE ${meta.library_source} +LIBRARY_SELECTION ${meta.library_selection} +LIBRARY_STRATEGY ${meta.library_strategy} +${insert_size_line}${library_name_line}${description_line}${fastq_entries} +EOF + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$(bash --version | grep "GNU bash" | sed 's/GNU bash, version //; s/ (.*//' )) + END_VERSIONS + """ + + stub: + """ + touch ${meta.id}.manifest + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: 5.1.0 + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 9f5220b..ebe7923 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params { // Input options input = null - mode = null // {mags, bins, metagenomic_assemblies} + mode = null // {mags, bins, metagenomic_assemblies, reads} study_metadata = null 
submission_study = null diff --git a/nextflow_schema.json b/nextflow_schema.json index b5089a8..fc42ff3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,7 +18,7 @@ "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file describing the data to be submitted. Format depends on the pipeline mode (mags/bins/metagenomic_assemblies).", + "description": "Path to comma-separated file describing the data to be submitted. Format depends on the pipeline mode (mags/bins/metagenomic_assemblies/reads).", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with a set of columns depending on the type of data being submitted. See [usage docs](https://nf-co.re/seqsubmit/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, @@ -283,8 +283,8 @@ "type": "string", "default": null, "description": "Type of upload", - "help_text": "Different types of data require specific upload steps. That mode controls what upload workflow to run depending on type of data (mags/bins/metagenomic_assemblies)", - "enum": ["mags", "bins", "metagenomic_assemblies"] + "help_text": "Different types of data require specific upload steps. 
That mode controls what upload workflow to run depending on type of data (mags/bins/metagenomic_assemblies/reads)", + "enum": ["mags", "bins", "metagenomic_assemblies", "reads"] }, "test_upload": { "type": "boolean", diff --git a/tests/reads_paired_end.nf.test b/tests/reads_paired_end.nf.test new file mode 100644 index 0000000..0f26a5d --- /dev/null +++ b/tests/reads_paired_end.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test reads submission workflow stub - paired_end" + script "../main.nf" + tag "pipeline" + tag "mode_reads" + tag "test_reads_paired_end" + profile "test_reads_paired" + + test("-profile test_reads_paired") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/workflows/readsubmit.nf b/workflows/readsubmit.nf new file mode 100644 index 0000000..eb28f8e --- /dev/null +++ b/workflows/readsubmit.nf @@ -0,0 +1,178 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { CREATE_READS_MANIFEST } from '../modules/local/create_reads_manifest/main' +include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_cli_wrapper' +include { ENA_WEBIN_CLI_DOWNLOAD } from '../modules/local/ena_webin_cli_download' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' + +include { FIND_CONCATENATE as CONCAT_ACCESSIONS } from '../modules/nf-core/find/concatenate/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' + +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN THE WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow READSUBMIT { + + take: + ch_samplesheet // channel: samplesheet read in from --input + submission_study // val: accession of the study to submit to (optional) + study_metadata // val: path to study metadata file for study creation (used if no submission_study provided) + test_upload // val: true for test upload mode + webin_cli_version // val: WebinCLI tool version to download and use for submission + webincli_mode // val: either 'validate' or 'submit' to specify WebinCLI mode of operation + + main: + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() + + // Create reads channel with proper metadata structure + reads_ch = ch_samplesheet + .map { row -> + def meta = [ + id: row[0].id, + sample_accession: row[1], + single_end: row[3] ? 
false : true, + platform: row[4], + instrument: row[5], + library_source: row[6], + library_selection: row[7], + library_strategy: row[8], + insert_size: row[9] ?: null, + library_name: row[10] ?: null, + description: row[11] ?: null + ] + + if (row[3] && row[3] != "") { + // If paired end reads + [meta, [file(row[2]), file(row[3])]] + } else { + // If single end + [meta, file(row[2])] + } + } + + def study_accession_ch + if (submission_study) { + // Use provided study accession directly + study_accession_ch = channel.of(submission_study) + } else { + // Register a new study using the study metadata file + REGISTERSTUDY( + channel.of([[id: "study"], file(study_metadata)]), + test_upload + ) + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + } + } + + // Generate reads manifest files + CREATE_READS_MANIFEST( + reads_ch, + study_accession_ch, + test_upload + ) + ch_versions = ch_versions.mix(CREATE_READS_MANIFEST.out.versions) + + ENA_WEBIN_CLI_DOWNLOAD ( + webin_cli_version + ) + + // Prepare input for submission with manifest and fastq files + submission_input = reads_ch.join(CREATE_READS_MANIFEST.out.manifest) + .map { meta, fastq, manifest -> + [meta, fastq, manifest] + } + + SUBMIT ( + submission_input, + ENA_WEBIN_CLI_DOWNLOAD.out.webin_cli_jar, + test_upload, + webincli_mode + ) + ch_versions = ch_versions.mix(SUBMIT.out.versions) + + // Concatenate accessions into single file to publish + CONCAT_ACCESSIONS ( + SUBMIT.out.accessions.map { _meta, file -> file }.collect().map { files -> [ [id: "assigned_accessions"], files ] }, + 'true' // skip_header - we want to keep the header from the first file and skip it for the rest + ) + + // + // Collate and save software versions + // + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${params.outdir}/pipeline_info", + name: 
'nf_core_' + 'seqsubmit_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ).set { ch_collated_versions } + + + // + // MODULE: MultiQC + // + ch_multiqc_config = channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_custom_config = params.multiqc_config ? + channel.fromPath(params.multiqc_config, checkIfExists: true) : + channel.empty() + ch_multiqc_logo = params.multiqc_logo ? + channel.fromPath(params.multiqc_logo, checkIfExists: true) : + channel.empty() + + summary_params = paramsSummaryMap( + workflow, parameters_schema: "nextflow_schema.json") + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + ch_multiqc_files = ch_multiqc_files.mix( + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? + file(params.multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + ch_methods_description = channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description)) + + ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) + ch_multiqc_files = ch_multiqc_files.mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList(), + [], + [] + ) + + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] + +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ From 064835243e828487e167ec33b874c21c23004179 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Tue, 12 May 2026 15:23:35 +0100 
Subject: [PATCH 2/6] delete incorrect config for reads test --- conf/test_reads_paired.config | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 conf/test_reads_paired.config diff --git a/conf/test_reads_paired.config b/conf/test_reads_paired.config deleted file mode 100644 index eaa2b70..0000000 --- a/conf/test_reads_paired.config +++ /dev/null @@ -1,33 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - - Use as follows: - nextflow run nf-core/seqsubmit -profile test_reads_paired, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 2, - memory: '8.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test --mode reads paired_end profile' - config_profile_description = 'Single-case reads test with paired-end reads' - - // Input data - // TODO: prepare test data and add to repo, update path here - input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/reads_paired.csv' - outdir = 'test_output' - - mode = "reads" - submission_study = "PRJEB98843" - -} From 8cfcfd750b20d14f82b63d2fab97782506562f9c Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Tue, 12 May 2026 16:06:26 +0100 Subject: [PATCH 3/6] multiple fixes Co-authored-by: Copilot --- conf/test_reads_paired.config | 34 +++++++++++++++++++ modules/local/create_reads_manifest/main.nf | 15 ++------ nextflow.config | 1 + .../utils_nfcore_seqsubmit_pipeline/main.nf | 7 ++-- workflows/readsubmit.nf | 1 - 5 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 conf/test_reads_paired.config diff --git a/conf/test_reads_paired.config b/conf/test_reads_paired.config new file 
mode 100644 index 0000000..59c6691 --- /dev/null +++ b/conf/test_reads_paired.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/seqsubmit -profile test_reads_paired, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode reads profile' + config_profile_description = 'Minimal test profile for reads submission' + + // Input data + input = "${projectDir}/assets/samplesheet_reads.csv" + outdir = 'test_output' + + mode = "reads" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true +} diff --git a/modules/local/create_reads_manifest/main.nf b/modules/local/create_reads_manifest/main.nf index e4dba77..5cf8e08 100644 --- a/modules/local/create_reads_manifest/main.nf +++ b/modules/local/create_reads_manifest/main.nf @@ -2,7 +2,9 @@ process CREATE_READS_MANIFEST { tag "$meta.id" label 'process_single' - container "docker://alpine:latest" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'oras://community.wave.seqera.io/library/bash:5.2.37--06dbc4169cb39ae0' : + 'community.wave.seqera.io/library/bash:5.2.37--ae00789afb795adf' }" input: tuple val(meta), path(fastq_files) @@ -11,7 +13,6 @@ process CREATE_READS_MANIFEST { output: tuple val(meta), path("${meta.id}.manifest"), emit: manifest - path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -37,20 +38,10 @@ LIBRARY_SELECTION ${meta.library_selection} LIBRARY_STRATEGY ${meta.library_strategy} ${insert_size_line}${library_name_line}${description_line}${fastq_entries} EOF - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bash: \$(echo \$(bash --version | grep "GNU bash" | sed 's/GNU bash, version //; s/ (.*//' )) - END_VERSIONS """ stub: """ touch ${meta.id}.manifest - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bash: 5.1.0 - END_VERSIONS """ } diff --git a/nextflow.config b/nextflow.config index ebe7923..d785caa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -196,6 +196,7 @@ profiles { test_assembly_no_coverage_single_reads { includeConfig 'conf/test_assembly_no_coverage_single_reads.config' } test_assembly_no_coverage_paired_reads { includeConfig 'conf/test_assembly_no_coverage_paired_reads.config' } test_assembly_one_contig { includeConfig 'conf/test_assembly_one_contig.config' } + test_reads_paired { includeConfig 'conf/test_reads_paired.config' } } // Load nf-core custom profiles from different institutions diff --git a/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf b/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf index 8a19c8a..641ed1c 100644 --- a/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf @@ -32,7 +32,7 @@ workflow PIPELINE_INITIALISATION { nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet - mode 
// string: Type of input data (mags, bins, metagenomic_assemblies) + mode // string: Type of input data (mags, bins, metagenomic_assemblies, reads) help // boolean: Display help message and exit help_full // boolean: Show the full help message show_hidden // boolean: Show hidden parameters in the help message @@ -102,8 +102,11 @@ workflow PIPELINE_INITIALISATION { } else if ( mode == "metagenomic_assemblies" ) { ch_samplesheet = channel .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input_assembly.json")) + } else if ( mode == "reads" ) { + ch_samplesheet = channel + .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input_reads.json")) } else { - error("No input was found. Please, point to the location of your samplesheet using --input_genome or --input_assembly") + error("Unknown mode specified: '${mode}'. Supported modes are 'mags', 'bins', 'metagenomic_assemblies', and 'reads'.") } emit: diff --git a/workflows/readsubmit.nf b/workflows/readsubmit.nf index eb28f8e..c75d1fe 100644 --- a/workflows/readsubmit.nf +++ b/workflows/readsubmit.nf @@ -87,7 +87,6 @@ workflow READSUBMIT { study_accession_ch, test_upload ) - ch_versions = ch_versions.mix(CREATE_READS_MANIFEST.out.versions) ENA_WEBIN_CLI_DOWNLOAD ( webin_cli_version From 651ca32e04f7c9d6f8c97d7b6964d47b355cac03 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Tue, 12 May 2026 16:24:41 +0100 Subject: [PATCH 4/6] update multiqc in READSUBMIT Co-authored-by: Copilot --- docs/output.md | 2 +- main.nf | 4 ++ workflows/readsubmit.nf | 101 ++++++++++++++++++++-------------------- 3 files changed, 56 insertions(+), 51 deletions(-) diff --git a/docs/output.md b/docs/output.md index 6199a93..cd1547e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -67,7 +67,7 @@ When `--mode reads` is used, results are written under `reads/`. Output files - `reads/` - - `upload/assigned_accessions.tsv`: run accessions assigned to submitted reads. 
+ - `upload/reads_accessions.tsv`: run accessions assigned to submitted reads. diff --git a/main.nf b/main.nf index 2ad7561..8f068b3 100644 --- a/main.nf +++ b/main.nf @@ -81,6 +81,10 @@ workflow NFCORE_SEQSUBMIT { } else if (params.mode == "reads") { READSUBMIT ( samplesheet, + params.multiqc_config, + params.multiqc_logo, + params.multiqc_methods_description, + params.outdir, params.submission_study, params.study_metadata, params.test_upload, diff --git a/workflows/readsubmit.nf b/workflows/readsubmit.nf index c75d1fe..c69ae3b 100644 --- a/workflows/readsubmit.nf +++ b/workflows/readsubmit.nf @@ -6,7 +6,6 @@ include { CREATE_READS_MANIFEST } from '../modules/local/create_reads_manifest/main' include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_cli_wrapper' -include { ENA_WEBIN_CLI_DOWNLOAD } from '../modules/local/ena_webin_cli_download' include { REGISTERSTUDY } from '../modules/local/registerstudy/main' include { FIND_CONCATENATE as CONCAT_ACCESSIONS } from '../modules/nf-core/find/concatenate/main' @@ -27,6 +26,10 @@ workflow READSUBMIT { take: ch_samplesheet // channel: samplesheet read in from --input + multiqc_config + multiqc_logo + multiqc_methods_description + outdir submission_study // val: accession of the study to submit to (optional) study_metadata // val: path to study metadata file for study creation (used if no submission_study provided) test_upload // val: true for test upload mode @@ -34,8 +37,8 @@ workflow READSUBMIT { webincli_mode // val: either 'validate' or 'submit' to specify WebinCLI mode of operation main: - ch_versions = channel.empty() - ch_multiqc_files = channel.empty() + def ch_versions = channel.empty() + def ch_multiqc_files = channel.empty() // Create reads channel with proper metadata structure reads_ch = ch_samplesheet @@ -88,10 +91,6 @@ workflow READSUBMIT { test_upload ) - ENA_WEBIN_CLI_DOWNLOAD ( - webin_cli_version - ) - // Prepare input for submission with manifest and fastq files submission_input = 
reads_ch.join(CREATE_READS_MANIFEST.out.manifest) .map { meta, fastq, manifest -> @@ -100,7 +99,6 @@ workflow READSUBMIT { SUBMIT ( submission_input, - ENA_WEBIN_CLI_DOWNLOAD.out.webin_cli_jar, test_upload, webincli_mode ) @@ -108,66 +106,69 @@ workflow READSUBMIT { // Concatenate accessions into single file to publish CONCAT_ACCESSIONS ( - SUBMIT.out.accessions.map { _meta, file -> file }.collect().map { files -> [ [id: "assigned_accessions"], files ] }, + SUBMIT.out.accessions.map { _meta, file -> file }.collect().map { files -> [ [id: "reads_accessions"], files ] }, 'true' // skip_header - we want to keep the header from the first file and skip it for the rest ) // // Collate and save software versions // - softwareVersionsToYAML(ch_versions) + def topic_versions = channel.topic("versions") + .distinct() + .branch { entry -> + versions_file: entry instanceof Path + versions_tuple: true + } + + def topic_versions_string = topic_versions.versions_tuple + .map { process, tool, version -> + [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] + } + .groupTuple(by:0) + .map { process, tool_versions -> + tool_versions.unique().sort() + "${process}:\n${tool_versions.join('\n')}" + } + + def ch_collated_versions = softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) + .mix(topic_versions_string) .collectFile( - storeDir: "${params.outdir}/pipeline_info", + storeDir: "${outdir}/pipeline_info", name: 'nf_core_' + 'seqsubmit_software_' + 'mqc_' + 'versions.yml', sort: true, newLine: true - ).set { ch_collated_versions } - + ) // // MODULE: MultiQC // - ch_multiqc_config = channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - channel.fromPath(params.multiqc_config, checkIfExists: true) : - channel.empty() - ch_multiqc_logo = params.multiqc_logo ? 
- channel.fromPath(params.multiqc_logo, checkIfExists: true) : - channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? - file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true - ) - ) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] + def ch_summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + def ch_workflow_summary = channel.value(paramsSummaryMultiqc(ch_summary_params)) + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + def ch_multiqc_custom_methods_description = multiqc_methods_description + ? 
file(multiqc_methods_description, checkIfExists: true) + : file("${projectDir}/assets/methods_description_template.yml", checkIfExists: true) + def ch_methods_description = channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description)) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: true)) + ch_multiqc_files = ch_multiqc_files.mix(CONCAT_ACCESSIONS.out.file_out.map{_meta, file -> file}) + MULTIQC( + ch_multiqc_files.flatten().collect().map { files -> + [ + [id: 'seqsubmit'], + files, + multiqc_config + ? file(multiqc_config, checkIfExists: true) + : file("${projectDir}/assets/multiqc_config.yml", checkIfExists: true), + multiqc_logo ? file(multiqc_logo, checkIfExists: true) : [], + [], + [], + ] + } ) - emit: - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + multiqc_report = MULTIQC.out.report.map { _meta, report -> [report] }.toList() // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] - } /* From 00307d0e41112887cbf2362bd2c2899d6ad21069 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Wed, 13 May 2026 10:00:45 +0100 Subject: [PATCH 5/6] remove --fasta-dir because it doesn't work for fastqs staged to different folders --- modules/local/ena_webin_cli_wrapper/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/local/ena_webin_cli_wrapper/main.nf b/modules/local/ena_webin_cli_wrapper/main.nf index 8a365bb..17607e0 100644 --- a/modules/local/ena_webin_cli_wrapper/main.nf +++ b/modules/local/ena_webin_cli_wrapper/main.nf @@ -22,14 +22,12 @@ process ENA_WEBIN_CLI_WRAPPER { def args = task.ext.args ?: "" def prefix = task.ext.prefix ?: "${meta.id}" def test_flag = test_upload ? 
"--test" : "" - def fasta_dir = submission_item.toRealPath().parent """ webin_cli_handler \\ -m ${manifest} \\ -o ${prefix}_accessions.tsv \\ --mode ${webincli_mode} \\ - --fasta-dir ${fasta_dir} \\ ${test_flag} \\ ${args} From 5678294acc92e61295d427953cf2a130a5bbc2ef Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Thu, 14 May 2026 16:04:26 +0100 Subject: [PATCH 6/6] disable publishDir for CREATE_READS_MANIFEST --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 931f57e..9f4d1fe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -176,7 +176,7 @@ process { ] } - withName: 'REGISTERSTUDY|GENERATE_ASSEMBLY_MANIFEST' { + withName: 'REGISTERSTUDY|GENERATE_ASSEMBLY_MANIFEST|CREATE_READS_MANIFEST' { publishDir = [ enabled: false ]