From 7d1c0ddffca25888fd443929bf0e970e5d996842 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Wed, 13 May 2026 12:18:54 +0100
Subject: [PATCH 1/7] explicitly set test_upload = true in test configs

---
 ...est_assembly_coassembly_no_coverage.config | 34 ++++++++++++++++++
 ...t_assembly_coassembly_with_coverage.config | 35 +++++++++++++++++++
 conf/test_assembly_complete_metadata.config   |  1 +
 ...t_assembly_no_coverage_paired_reads.config |  1 +
 ...t_assembly_no_coverage_single_reads.config |  1 +
 ...assembly_no_study_complete_metadata.config |  1 -
 conf/test_assembly_one_contig.config          |  1 +
 7 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 conf/test_assembly_coassembly_no_coverage.config
 create mode 100644 conf/test_assembly_coassembly_with_coverage.config
diff --git a/conf/test_assembly_coassembly_no_coverage.config b/conf/test_assembly_coassembly_no_coverage.config
new file mode 100644
index 0000000..eda3ca5
--- /dev/null
+++ b/conf/test_assembly_coassembly_no_coverage.config
@@ -0,0 +1,34 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/seqsubmit -profile test_assembly_coassembly_no_coverage,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+process {
+    resourceLimits = [
+        cpus: 2,
+        memory: '8.GB',
+        time: '1.h'
+    ]
+}
+
+params {
+    config_profile_name        = 'Test --mode metagenomic_assemblies co-assembly without coverage profile'
+    config_profile_description = 'Co-assembly test with missing coverage and paired-end reads from two runs'
+
+    // Input data
+    input  = "${projectDir}/assets/samplesheet_assembly_coassembly_no_coverage.csv"
+    outdir = 'test_output'
+
+    mode             = "metagenomic_assemblies"
+    submission_study = "PRJEB98843"
+    centre_name      = "TEST_CENTER"
+
+    test_upload      = true
+}
diff --git a/conf/test_assembly_coassembly_with_coverage.config b/conf/test_assembly_coassembly_with_coverage.config
new file mode 100644
index 0000000..910231a
--- /dev/null
+++ b/conf/test_assembly_coassembly_with_coverage.config
@@ -0,0 +1,35 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/seqsubmit -profile test_assembly_coassembly_with_coverage,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+process {
+    resourceLimits = [
+        cpus: 2,
+        memory: '8.GB',
+        time: '1.h'
+    ]
+}
+
+params {
+    config_profile_name        = 'Test --mode metagenomic_assemblies co-assembly with coverage profile'
+    config_profile_description = 'Co-assembly test with pre-provided coverage and multiple run accessions'
+
+    // Input data
+    input  = "${projectDir}/assets/samplesheet_assembly_coassembly_with_coverage.csv"
+    outdir = 'test_output'
+
+    mode             = "metagenomic_assemblies"
+    submission_study = "PRJEB98843"
+    centre_name      = "TEST_CENTER"
+
+    test_upload      = true
+
+}
diff --git a/conf/test_assembly_complete_metadata.config b/conf/test_assembly_complete_metadata.config
index 16e4ccf..ce366be 100644
--- a/conf/test_assembly_complete_metadata.config
+++ b/conf/test_assembly_complete_metadata.config
@@ -30,4 +30,5 @@ params {
     submission_study = "PRJEB98843"
     centre_name      = "TEST_CENTER"
 
+    test_upload      = true
 }
diff --git a/conf/test_assembly_no_coverage_paired_reads.config b/conf/test_assembly_no_coverage_paired_reads.config
index 65c73e4..4571a1c 100644
--- a/conf/test_assembly_no_coverage_paired_reads.config
+++ b/conf/test_assembly_no_coverage_paired_reads.config
@@ -30,4 +30,5 @@ params {
     submission_study = "PRJEB98843"
     centre_name      = "TEST_CENTER"
 
+    test_upload      = true
 }
diff --git a/conf/test_assembly_no_coverage_single_reads.config b/conf/test_assembly_no_coverage_single_reads.config
index 814de31..ee269e9 100644
--- a/conf/test_assembly_no_coverage_single_reads.config
+++ b/conf/test_assembly_no_coverage_single_reads.config
@@ -30,4 +30,5 @@ params {
     submission_study = "PRJEB98843"
     centre_name      = "TEST_CENTER"
 
+    test_upload      = true
 }
diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config
index b1c96d7..6831007 100644
--- a/conf/test_assembly_no_study_complete_metadata.config
+++ b/conf/test_assembly_no_study_complete_metadata.config
@@ -31,5 +31,4 @@ params {
     centre_name      = "TEST_CENTER"
 
     test_upload      = true
-
 }
diff --git a/conf/test_assembly_one_contig.config b/conf/test_assembly_one_contig.config
index b27784e..ce2e86c 100644
--- a/conf/test_assembly_one_contig.config
+++ b/conf/test_assembly_one_contig.config
@@ -30,4 +30,5 @@ params {
     submission_study = "PRJEB98843"
     centre_name      = "TEST_CENTER"
 
+    test_upload      = true
 }

From 100c7c0c832dfcf6f6343fc0de21f39db10ec048 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Wed, 13 May 2026 12:21:55 +0100
Subject: [PATCH 2/7] add partial co assembly support in ASSEMBLYSUBMIT

Co-authored-by: Copilot <copilot@github.com>
---
 ...esheet_assembly_coassembly_no_coverage.csv |  2 +
 ...heet_assembly_coassembly_with_coverage.csv |  2 +
 assets/schema_input_assembly.json             | 18 +++---
 .../create_assembly_metadata_csv/main.nf      |  5 +-
 nextflow.config                               |  2 +
 tests/assembly_coassembly_no_coverage.nf.test | 39 ++++++++++++
 .../assembly_coassembly_with_coverage.nf.test | 39 ++++++++++++
 workflows/assemblysubmit.nf                   | 60 ++++++++++++++++---
 8 files changed, 146 insertions(+), 21 deletions(-)
 create mode 100644 assets/samplesheet_assembly_coassembly_no_coverage.csv
 create mode 100644 assets/samplesheet_assembly_coassembly_with_coverage.csv
 create mode 100644 tests/assembly_coassembly_no_coverage.nf.test
 create mode 100644 tests/assembly_coassembly_with_coverage.nf.test

diff --git a/assets/samplesheet_assembly_coassembly_no_coverage.csv b/assets/samplesheet_assembly_coassembly_no_coverage.csv
new file mode 100644
index 0000000..e0a4f22
--- /dev/null
+++ b/assets/samplesheet_assembly_coassembly_no_coverage.csv
@@ -0,0 +1,2 @@
+sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version
+no_coverage_paired_reads,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_2.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_2.fastq.gz,,ERR000001;ERR000002,SPAdes,3.15
diff --git a/assets/samplesheet_assembly_coassembly_with_coverage.csv b/assets/samplesheet_assembly_coassembly_with_coverage.csv
new file mode 100644
index 0000000..7f51cca
--- /dev/null
+++ b/assets/samplesheet_assembly_coassembly_with_coverage.csv
@@ -0,0 +1,2 @@
+sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version
+coassembly_sample,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,,,45.5,ERR000001;ERR000002,SPAdes,3.15
diff --git a/assets/schema_input_assembly.json b/assets/schema_input_assembly.json
index 2b5fcea..929589a 100644
--- a/assets/schema_input_assembly.json
+++ b/assets/schema_input_assembly.json
@@ -25,33 +25,29 @@
                 "anyOf": [
                     {
                         "type": "string",
-                        "format": "file-path",
-                        "exists": true,
-                        "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$"
+                        "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?(;\\S+\\.(fq|fastq)(\\.gz)?)*$"
                     },
                     {
                         "type": "string",
                         "maxLength": 0
                     }
                 ],
-                "errorMessage": "FASTQ file must have extension '.fq' or '.fastq' (optionally gzipped)",
-                "description": "Forward reads if paired-end or single-end reads FASTQ file"
+                "errorMessage": "FASTQ file(s) must have extension '.fq' or '.fastq' (optionally gzipped); for co-assemblies use semicolon-separated paths",
+                "description": "Forward reads if paired-end or single-end reads FASTQ file. For co-assemblies, semicolon-separated paths (e.g. run1_R1.fq.gz;run2_R1.fq.gz)"
             },
             "fastq_2": {
                 "anyOf": [
                     {
                         "type": "string",
-                        "format": "file-path",
-                        "exists": true,
-                        "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$"
+                        "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?(;\\S+\\.(fq|fastq)(\\.gz)?)*$"
                     },
                     {
                         "type": "string",
                         "maxLength": 0
                     }
                 ],
-                "errorMessage": "FASTQ file for reverse reads must have extension '.fq' or '.fastq' (optionally gzipped)",
-                "description": "Reverse reads FASTQ file if paired-end. Leave empty for single-end reads"
+                "errorMessage": "FASTQ file(s) for reverse reads must have extension '.fq' or '.fastq' (optionally gzipped); for co-assemblies use semicolon-separated paths",
+                "description": "Reverse reads FASTQ file if paired-end. Leave empty for single-end reads. For co-assemblies, semicolon-separated paths (e.g. run1_R2.fq.gz;run2_R2.fq.gz)"
             },
             "coverage": {
                 "anyOf": [
@@ -71,7 +67,7 @@
                 "type": "string",
                 "pattern": "^\\S+$",
                 "errorMessage": "Accession must be provided and cannot contain spaces",
-                "description": "Accession of the run used to generate the assembly"
+                "description": "Accession(s) of the run(s) used to generate the assembly. For co-assemblies, use semicolon-separated values (e.g. ERR000001;ERR000002)"
             },
             "assembler": {
                 "type": "string",
diff --git a/modules/local/create_assembly_metadata_csv/main.nf b/modules/local/create_assembly_metadata_csv/main.nf
index b7d7151..f3f8312 100644
--- a/modules/local/create_assembly_metadata_csv/main.nf
+++ b/modules/local/create_assembly_metadata_csv/main.nf
@@ -19,8 +19,11 @@ process CREATE_ASSEMBLY_METADATA_CSV {
 
     script:
     def header = 'Runs,Coverage,Assembler,Version,Filepath,Sample'
+    // Format run accessions: wrap in quotes if it's a list (co-assembly) or single value
+    def runs_joined = meta.run_accession instanceof List ? meta.run_accession.join(',') : meta.run_accession
+    def runs = "\"${runs_joined}\""
     def row = [
-        meta.run_accession,
+        runs,
         meta.coverage,
         meta.assembler,
         meta.assembler_version,
diff --git a/nextflow.config b/nextflow.config
index 4f9e21a..5482eed 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -199,6 +199,8 @@ profiles {
     test_assembly_no_coverage_single_reads   { includeConfig 'conf/test_assembly_no_coverage_single_reads.config' }
     test_assembly_no_coverage_paired_reads   { includeConfig 'conf/test_assembly_no_coverage_paired_reads.config' }
     test_assembly_one_contig                 { includeConfig 'conf/test_assembly_one_contig.config' }
+    test_assembly_coassembly_with_coverage   { includeConfig 'conf/test_assembly_coassembly_with_coverage.config' }
+    test_assembly_coassembly_no_coverage     { includeConfig 'conf/test_assembly_coassembly_no_coverage.config' }
 }
 
 // Load nf-core custom profiles from different institutions
diff --git a/tests/assembly_coassembly_no_coverage.nf.test b/tests/assembly_coassembly_no_coverage.nf.test
new file mode 100644
index 0000000..a4eefd8
--- /dev/null
+++ b/tests/assembly_coassembly_no_coverage.nf.test
@@ -0,0 +1,39 @@
+nextflow_pipeline {
+
+    name "Test assembly submission workflow - coassembly_no_coverage"
+    script "../main.nf"
+    tag "pipeline"
+    tag "mode_assembly"
+    tag "test_assembly_coassembly_no_coverage"
+    profile "test_assembly_coassembly_no_coverage"
+
+    test("-profile test_assembly_coassembly_no_coverage") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+            // stable_name: All files + folders in ${params.outdir}/ with a stable name
+            def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}'])
+            // stable_path: All files in ${params.outdir}/ with stable content
+            def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore')
+            // Early failure no need to test the rest of snapshots
+            assert workflow.success
+            assertAll(
+                { assert snapshot(
+                    // Number of successful tasks
+                    workflow.trace.succeeded().size(),
+                    // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions
+                    removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"),
+                    // All stable path name, with a relative path
+                    stable_name,
+                    // All files with stable contents
+                    stable_path
+                ).match() }
+            )
+        }
+    }
+}
diff --git a/tests/assembly_coassembly_with_coverage.nf.test b/tests/assembly_coassembly_with_coverage.nf.test
new file mode 100644
index 0000000..60f15e9
--- /dev/null
+++ b/tests/assembly_coassembly_with_coverage.nf.test
@@ -0,0 +1,39 @@
+nextflow_pipeline {
+
+    name "Test assembly submission workflow - coassembly_with_coverage"
+    script "../main.nf"
+    tag "pipeline"
+    tag "mode_assembly"
+    tag "test_assembly_coassembly_with_coverage"
+    profile "test_assembly_coassembly_with_coverage"
+
+    test("-profile test_assembly_coassembly_with_coverage") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+            // stable_name: All files + folders in ${params.outdir}/ with a stable name
+            def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}'])
+            // stable_path: All files in ${params.outdir}/ with stable content
+            def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore')
+            // Early failure no need to test the rest of snapshots
+            assert workflow.success
+            assertAll(
+                { assert snapshot(
+                    // Number of successful tasks
+                    workflow.trace.succeeded().size(),
+                    // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions
+                    removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"),
+                    // All stable path name, with a relative path
+                    stable_name,
+                    // All files with stable contents
+                    stable_path
+                ).match() }
+            )
+        }
+    }
+}
diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf
index 1f4dc7b..6d8aae4 100644
--- a/workflows/assemblysubmit.nf
+++ b/workflows/assemblysubmit.nf
@@ -49,11 +49,13 @@ workflow ASSEMBLYSUBMIT {
     // Create assembly channel with proper metadata structure
     assembly_fasta = ch_samplesheet
         .map { row ->
+            // support semicolon-separated values for co-assemblies (e.g. ERR000001;ERR000002)
+            def run_accessions = row[5].split(';').toList()
             def meta = [
                 id: row[0].id,
                 single_end: row[3] ? false : true,
                 coverage: row[4] ?: null,
-                run_accession: row[5],
+                run_accession: run_accessions.size() == 1 ? run_accessions[0] : run_accessions,
                 assembler: row[6],
                 assembler_version: row[7]
             ]
@@ -63,21 +65,36 @@ workflow ASSEMBLYSUBMIT {
     reads_fastq = ch_samplesheet
         .filter { row -> row[2] && row[2] != "" } // Check if fastq_1 exists and is not empty
         .map { row ->
+            def run_accessions = row[5].split(';').toList()
             def meta = [
                 id: row[0].id,
                 single_end: row[3] ? false : true,
                 coverage: row[4] ?: null,
-                run_accession: row[5],
+                run_accession: run_accessions.size() == 1 ? run_accessions[0] : run_accessions,
                 assembler: row[6],
                 assembler_version: row[7]
             ]
 
-            if (row[3] && row[3] != "") {
+            def fastq1_list = row[2].split(';').toList()
+            def fastq2_list = (row[3] && row[3] != "") ? row[3].split(';').toList() : []
+            // Validation: Check that number of read files matches number of accessions
+            if (fastq1_list.size() != run_accessions.size()) {
+                error "Sample ${meta.id}: Number of forward read files (${fastq1_list.size()}) does not match number of run accessions (${run_accessions.size()})"
+            }
+
+            if (fastq2_list && fastq2_list.size() != run_accessions.size()) {
+                error "Sample ${meta.id}: Number of reverse read files (${fastq2_list.size()}) does not match number of run accessions (${run_accessions.size()})"
+            }
+
+            // Convert paths to file objects
+            def fastq1_paths = fastq1_list.collect { path -> file(path) }
+            def fastq2_paths = fastq2_list ? fastq2_list.collect { path -> file(path) } : []
+            if (fastq2_paths) {
                 // If paired end reads
-                [meta, [file(row[2]), file(row[3])]]
+                [meta, fastq1_paths, fastq2_paths]
             } else {
                 // If single end
-                [meta, file(row[2])]
+                [meta, fastq1_paths]
             }
         }
 
@@ -94,11 +111,36 @@ workflow ASSEMBLYSUBMIT {
         }
 
     // For assemblies without coverage, calculate coverage with CoverM
+    // Transform reads into the format CoverM expects: list of all read files
     validated_fastas.filter { meta, _fasta -> meta.coverage == null }
         .join(reads_fastq)
-        .multiMap { meta, fasta, fastq ->
+        .map { tuple ->
+            def meta = tuple[0]
+            def fasta = tuple[1]
+            def reads_data = tuple[2..-1]
+
+            // Transform reads into flat list for CoverM
+            def all_reads = []
+            if (meta.single_end) {
+                // Single-end: just flatten the R1 list
+                def fastq1_list = reads_data[0]
+                all_reads = fastq1_list
+            } else {
+                // Paired-end: interleave R1 and R2 files
+                // Input format: [meta, [R1_1, R1_2, ...], [R2_1, R2_2, ...]]
+                def fastq1_list = reads_data[0]
+                def fastq2_list = reads_data[1]
+
+                // Create list as: R1_1, R2_1, R1_2, R2_2, ...
+                // Use collectMany to flatten the pairs
+                all_reads = [fastq1_list, fastq2_list].transpose().collectMany { pair -> pair }
+            }
+
+            [meta, fasta, all_reads]
+        }
+        .multiMap { meta, fasta, reads ->
             assembly: [ meta, fasta ]
-            reads: [ meta, fastq ]
+            reads: [ meta, reads ]
         }
         .set { coverm_input }
 
@@ -232,8 +274,8 @@ workflow ASSEMBLYSUBMIT {
         : file("${projectDir}/assets/methods_description_template.yml", checkIfExists: true)
     def ch_methods_description = channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: true))
-    ch_multiqc_files = ch_multiqc_files.mix(CONCAT_ACCESSIONS.out.file_out.map{meta, file -> file})
-    ch_multiqc_files = ch_multiqc_files.mix(CONCAT_METADATA.out.file_out.map{meta, file -> file})
+    ch_multiqc_files = ch_multiqc_files.mix(CONCAT_ACCESSIONS.out.file_out.map{_meta, file -> file})
+    ch_multiqc_files = ch_multiqc_files.mix(CONCAT_METADATA.out.file_out.map{_meta, file -> file})
     MULTIQC(
         ch_multiqc_files.flatten().collect().map { files ->
             [

From a65c94c3b0f956f3896e522c78f057e8332ec433 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Thu, 14 May 2026 13:26:32 +0100
Subject: [PATCH 3/7] add REGISTER_COASSEMBLY_SAMPLE

Co-authored-by: Copilot <copilot@github.com>
---
 ...esheet_assembly_coassembly_no_coverage.csv |   2 +-
 ...heet_assembly_coassembly_with_coverage.csv |   2 +-
 bin/register_coassembly_sample.py             | 484 ++++++++++++++++++
 ...t_assembly_coassembly_with_coverage.config |   1 -
 .../create_assembly_metadata_csv/main.nf      |   2 +-
 .../environment.yml                           |   8 +
 .../local/register_coassembly_sample/main.nf  |  52 ++
 .../local/register_coassembly_sample/meta.yml |  44 ++
 .../tests/coassembly_metadata.csv             |  11 +
 .../tests/main.nf.test                        |  36 ++
 .../tests/nextflow.config                     |   6 +
 workflows/assemblysubmit.nf                   |  22 +-
 12 files changed, 664 insertions(+), 6 deletions(-)
 create mode 100755 bin/register_coassembly_sample.py
 create mode 100644 modules/local/register_coassembly_sample/environment.yml
 create mode 100644 modules/local/register_coassembly_sample/main.nf
 create mode 100644 modules/local/register_coassembly_sample/meta.yml
 create mode 100644 modules/local/register_coassembly_sample/tests/coassembly_metadata.csv
 create mode 100644 modules/local/register_coassembly_sample/tests/main.nf.test
 create mode 100644 modules/local/register_coassembly_sample/tests/nextflow.config

diff --git a/assets/samplesheet_assembly_coassembly_no_coverage.csv b/assets/samplesheet_assembly_coassembly_no_coverage.csv
index e0a4f22..db29939 100644
--- a/assets/samplesheet_assembly_coassembly_no_coverage.csv
+++ b/assets/samplesheet_assembly_coassembly_no_coverage.csv
@@ -1,2 +1,2 @@
 sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version
-no_coverage_paired_reads,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_2.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_2.fastq.gz,,ERR000001;ERR000002,SPAdes,3.15
+no_coverage_paired_reads,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_2.fastq.gz;https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_2.fastq.gz,,SRR3183850;SRR3183951,SPAdes,3.15
diff --git a/assets/samplesheet_assembly_coassembly_with_coverage.csv b/assets/samplesheet_assembly_coassembly_with_coverage.csv
index 7f51cca..cf9cab3 100644
--- a/assets/samplesheet_assembly_coassembly_with_coverage.csv
+++ b/assets/samplesheet_assembly_coassembly_with_coverage.csv
@@ -1,2 +1,2 @@
 sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version
-coassembly_sample,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,,,45.5,ERR000001;ERR000002,SPAdes,3.15
+coassembly_sample,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz,,,45.5,SRR3183850;SRR3183951,SPAdes,3.15
diff --git a/bin/register_coassembly_sample.py b/bin/register_coassembly_sample.py
new file mode 100755
index 0000000..8238f50
--- /dev/null
+++ b/bin/register_coassembly_sample.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import logging
+import os
+import xml.etree.ElementTree as ET
+from collections import OrderedDict
+import re
+
+import requests
+from requests.auth import HTTPBasicAuth
+from retry import retry
+
+logger = logging.getLogger("register_coassembly_sample")
+
+
+RUN_XML_API = "https://www.ebi.ac.uk/ena/browser/api/xml/{accession}"
+SAMPLE_XML_API = "https://www.ebi.ac.uk/ena/browser/api/xml/{accession}"
+SAMPLE_PORTAL_API = "https://www.ebi.ac.uk/ena/portal/api/search"
+CHECKLIST_XML_API = "https://www.ebi.ac.uk/ena/browser/api/xml/ERC000011"
+
+TEST_SUBMIT_URL = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit/"
+PROD_SUBMIT_URL = "https://www.ebi.ac.uk/ena/submit/drop-box/submit/"
+
+INSDC_BIOSAMPLE_ACCESSION_REGEX = re.compile(
+    r"SAM[AG]?[0-9]+"
+)
+EXISTING_ACCESSION_IN_ERROR_REGEX = re.compile(
+    r'accession:\s*"([A-Z]+[0-9]+)"',
+    re.IGNORECASE,
+)
+
+
+class CoassemblyRegistrationError(RuntimeError):
+    """Raised for hard validation/submission errors."""
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Register virtual ENA sample for co-assemblies in the metadata CSV")
+    parser.add_argument("--input", required=True, help="Input assembly metadata CSV")
+    parser.add_argument("--output", required=True, help="Output assembly metadata CSV")
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help="Register sample using the test submission endpoint",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable verbose debug logging",
+    )
+    return parser.parse_args()
+
+
+def setup_logging(debug: bool) -> None:
+    """Configure logging level and format based on CLI flags."""
+    level = logging.DEBUG if debug else logging.INFO
+    logging.basicConfig(level=level, format="%(levelname)s: %(message)s", force=True)
+    if debug:
+        logging.getLogger("requests").setLevel(logging.INFO)
+        logging.getLogger("urllib3").setLevel(logging.INFO)
+        logging.getLogger("requests.packages.urllib3").setLevel(logging.INFO)
+    logger.debug("Debug logging enabled")
+
+
+def get_credentials() -> tuple[str, str]:
+    username = os.environ.get("ENA_WEBIN", "").strip()
+    password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip()
+    if not username or not password:
+        raise CoassemblyRegistrationError(
+            "Missing credentials. Set ENA_WEBIN/ENA_WEBIN_PASSWORD."
+        )
+    return username, password
+
+
+@retry(tries=3, delay=2, backoff=2)
+def get_xml(url: str) -> ET.Element:
+    logger.debug(f"Requesting XML URL: {url}")
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+    return ET.fromstring(response.content)
+
+@retry(tries=3, delay=2, backoff=2)
+def get_json(url: str, params: dict[str, str]) -> list[dict[str, str]]:
+    prepared = requests.Request("GET", url, params=params).prepare()
+    logger.debug(f"Requesting JSON URL: {prepared.url}")
+    response = requests.get(url, params=params, timeout=60)
+    response.raise_for_status()
+    return response.json()
+
+def to_pretty_xml(root: ET.Element) -> str:
+    """Serialize an XML element to a pretty-printed UTF-8 XML string."""
+    # Re-parse to avoid side effects from in-place indentation on caller-owned trees.
+    normalized_root = ET.fromstring(ET.tostring(root, encoding="utf-8"))
+    ET.indent(normalized_root, space="  ")
+    return ET.tostring(normalized_root, encoding="utf-8", xml_declaration=True).decode("utf-8")
+
+@retry(tries=3, delay=2, backoff=2)
+def submit_sample_xml(auth: HTTPBasicAuth, submit_url: str, sample_xml: str) -> ET.Element:
+    submission_xml = """<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<SUBMISSION><ACTIONS><ACTION><ADD/></ACTION></ACTIONS></SUBMISSION>\n"""
+    files = {
+        "SUBMISSION": ("submission.xml", submission_xml, "application/xml"),
+        "SAMPLE": ("sample.xml", sample_xml, "application/xml"),
+    }
+    response = requests.post(submit_url, files=files, auth=auth, timeout=120)
+    response.raise_for_status()
+    return ET.fromstring(response.content)
+
+def merge_or_not_provided(values: list[str]) -> str:
+    unique = list(OrderedDict.fromkeys(values))
+    logger.debug(f"Merging values: {unique}")
+    if any(not v for v in unique):
+        logger.info("Merge decision: 'not provided' (at least one value is empty)")
+        return "not provided"
+    if len(unique) != 1:
+        logger.info("Merge decision: 'not provided' (multiple conflicting values)")
+        return "not provided"
+    logger.info(f"Merge decision: using unique value '{unique[0]}'")
+    return unique[0]
+
+def get_run_sample_from_xml(run_accession: str) -> tuple[str, str | None]:
+    """
+    Retrieve the sample accession from the RUN XML for a given run accession.
+
+    Args:
+        run_accession (str): The run accession to query.
+
+    Returns:
+        tuple[str, str | None]: The sample accession and an optional secondary sample accession.
+
+    Raises:
+        CoassemblyRegistrationError: If the RUN XML does not contain a sample accession.
+    """
+    run_root = get_xml(RUN_XML_API.format(accession=run_accession))
+    sample_ref = run_root.find(".//RUN_LINKS/RUN_LINK/XREF_LINK[DB='ENA-SAMPLE']/ID")
+
+    if sample_ref is not None and sample_ref.text:
+        sample_accession = sample_ref.text.strip()
+    else:
+        raise CoassemblyRegistrationError(
+            f"Run {run_accession} has no sample accession in RUN XML (ENA-SAMPLE link)"
+        )
+
+    return sample_accession
+
+
+def get_sample_metadata(sample_accession: str) -> tuple[str, str, str, str]:
+    """
+    Retrieve metadata for a given sample accession from the ENA portal.
+
+    Args:
+        sample_accession (str): The sample accession to query.
+
+    Returns:
+        tuple[str, str, str, str]: A tuple containing tax_id, scientific_name, location, and collection_date.
+    """
+    if INSDC_BIOSAMPLE_ACCESSION_REGEX.match(sample_accession):
+        query = f'"sample_accession={sample_accession}"'
+    else:
+        query = f'"secondary_sample_accession={sample_accession}"'
+
+    rows = get_json(
+        SAMPLE_PORTAL_API,
+        params={
+            "result": "sample",
+            "query": query,
+            "fields": "country,collection_date,tax_id,scientific_name",
+            "format": "json",
+        },
+    )
+
+    if not rows:
+        logger.warning(f"No portal metadata found for sample {sample_accession}")
+        return "", "", "", ""
+
+    row = rows[0]
+    return (
+        row.get("tax_id"),
+        row.get("scientific_name"),
+        row.get("country"),
+        row.get("collection_date"),
+    )
+
+def get_allowed_countries() -> set[str]:
+    """
+    Retrieve the set of allowed countries from the ENA checklist XML.
+
+    Returns:
+        set[str]: A set of allowed country/sea values.
+    """
+    root = get_xml(CHECKLIST_XML_API)
+    values = set()
+    field_nodes = root.findall(".//FIELD")
+    for field in field_nodes:
+        label = field.findtext("LABEL")
+        name = field.findtext("NAME")
+        if label == "geographic location (country and/or sea)" or name == "geographic_location_country_andor_sea":
+            for entry in field.findall(".//TEXT_VALUE/VALUE"):
+                if entry.text:
+                    values.add(entry.text.strip())
+            break
+    return values
+
+def build_sample_xml(
+    alias: str,
+    title: str,
+    description: str,
+    taxon_id: str,
+    scientific_name: str,
+    composed_of: str,
+    collection_date: str,
+    geographic_location: str,
+) -> str:
+    """
+    Build the XML representation of a sample for ENA submission.
+
+    Args:
+        alias (str): The sample alias.
+        title (str): The sample title.
+        description (str): The sample description.
+        taxon_id (str): The taxon ID of the sample.
+        scientific_name (str): The scientific name of the sample.
+        composed_of (str): The list of source samples.
+        collection_date (str): The collection date of the sample.
+        geographic_location (str): The geographic location of the sample.
+
+    Returns:
+        str: The XML string representation of the sample.
+    """
+    root = ET.Element("SAMPLE_SET")
+    sample = ET.SubElement(root, "SAMPLE", {"alias": alias})
+
+    title_node = ET.SubElement(sample, "TITLE")
+    title_node.text = title
+
+    sample_name = ET.SubElement(sample, "SAMPLE_NAME")
+    taxon = ET.SubElement(sample_name, "TAXON_ID")
+    taxon.text = taxon_id
+    sci = ET.SubElement(sample_name, "SCIENTIFIC_NAME")
+    sci.text = scientific_name
+
+    description_node = ET.SubElement(sample, "DESCRIPTION")
+    description_node.text = description
+
+    attrs = ET.SubElement(sample, "SAMPLE_ATTRIBUTES")
+
+    def add_attr(tag: str, value: str) -> None:
+        item = ET.SubElement(attrs, "SAMPLE_ATTRIBUTE")
+        tag_node = ET.SubElement(item, "TAG")
+        tag_node.text = tag
+        value_node = ET.SubElement(item, "VALUE")
+        value_node.text = value
+
+    add_attr("ENA-CHECKLIST", "ERC000011")
+    add_attr("organism", scientific_name)
+    add_attr("collection date", collection_date)
+    add_attr("composed of", composed_of)
+    add_attr("geographic location (country and/or sea)", geographic_location)
+    add_attr("scientific_name", scientific_name)
+
+    return to_pretty_xml(root)
+
+
+def build_virtual_sample_alias(source_samples: list[str], max_length: int = 50) -> str:
+    """Build deterministic alias from source samples with an md5 suffix."""
+    sorted_samples = sorted(source_samples)
+    hash8 = hashlib.md5(",".join(sorted_samples).encode("utf-8")).hexdigest()[:8]
+
+    first_two = sorted_samples[:2]
+    remaining = len(sorted_samples) - len(first_two)
+
+    alias_core = f"coassembly_{'_'.join(first_two)}"
+    if remaining > 0:
+        alias_core = f"{alias_core}_{remaining}_others"
+
+    alias = f"{alias_core}_{hash8}"
+    if len(alias) <= max_length:
+        return alias
+
+    # Keep the hash suffix intact while trimming the front portion for ENA alias limits.
+    # suffix = f"_{hash8}"
+    # max_core_length = max_length - len(suffix)
+    # trimmed_core = alias_core[:max_core_length]
+    # trimmed_alias = f"{trimmed_core}{suffix}"
+    # logger.warning(f"Generated alias is too long, truncated to: {trimmed_alias}")
+    # return trimmed_alias
+    raise CoassemblyRegistrationError(
+        f"Hash suffix '{alias}' exceeds maximum alias length of {max_length}"
+    )
+
+
+def extract_accession_from_receipt(receipt: ET.Element) -> str:
+    """
+    Extract the SAMEA accession from the ENA submission receipt XML.
+
+    Args:
+        receipt (ET.Element): The root element of the receipt XML.
+
+    Returns:
+        str: The SAMEA accession.
+
+    Raises:
+        CoassemblyRegistrationError: If the receipt does not contain a SAMEA accession.
+    """
+    success = receipt.attrib.get("success", "false").lower() == "true"
+    if not success:
+        messages = [m.text for m in receipt.findall(".//MESSAGES/*") if m.text]
+        message_text = "; ".join(messages) if messages else "no error details in receipt"
+
+        # If this is a duplicate submission, ENA returns the existing accession.
+        existing_match = EXISTING_ACCESSION_IN_ERROR_REGEX.search(message_text)
+        if existing_match and "already exists" in message_text.lower():
+            existing_accession = existing_match.group(1)
+            logger.info(f"Virtual sample already exists; reusing accession: {existing_accession}")
+            return existing_accession
+
+        raise CoassemblyRegistrationError(
+            f"ENA sample submission failed: {message_text}"
+        )
+
+    biosample = receipt.find(".//SAMPLE/EXT_ID[@type='biosample']")
+    if biosample is not None and biosample.get("accession"):
+        return biosample.get("accession", "")
+
+    external = receipt.find(".//SAMPLE/EXT_ID")
+    if external is not None and external.get("accession", "").startswith("SAMEA"):
+        return external.get("accession", "")
+
+    for node in receipt.findall(".//SAMPLE"):
+        accession = node.get("accession", "")
+        if accession.startswith("SAMEA"):
+            return accession
+
+    raise CoassemblyRegistrationError("Could not find SAMEA accession in ENA receipt XML")
+
+def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
+    """
+    Register a virtual sample for co-assemblies based on run accessions.
+
+    Args:
+        run_accessions (list[str]): A list of run accessions for the co-assembly.
+        test (bool): Whether to use the test submission endpoint.
+
+    Returns:
+        str: The SAMEA accession of the registered virtual sample.
+
+    Raises:
+        CoassemblyRegistrationError: If validation or submission fails.
+    """
+    if test:
+        submit_url = TEST_SUBMIT_URL
+    else:
+        submit_url = PROD_SUBMIT_URL
+
+    sample_accessions = []
+    taxon_ids = []
+    scientific_names = []
+    locations = []
+    collection_dates = []
+
+    for run in run_accessions:
+        sample_acc = get_run_sample_from_xml(run)
+        logger.debug(f"Run {run} is linked to sample {sample_acc}")
+        sample_accessions.append(sample_acc)
+
+        taxon_id, scientific_name, country, collection_date = get_sample_metadata(sample_acc)
+        logger.debug(f"Metadata for sample {sample_acc}: tax_id={taxon_id}, scientific_name={scientific_name}, country={country}, collection_date={collection_date}")
+        if not taxon_id:
+            raise CoassemblyRegistrationError(f"Sample {sample_acc} has no tax_id in ENA portal response")
+        if not scientific_name:
+            raise CoassemblyRegistrationError(f"Sample {sample_acc} has no scientific_name in ENA portal response")
+        taxon_ids.append(taxon_id)
+        scientific_names.append(scientific_name)
+        locations.append(country)
+        collection_dates.append(collection_date)
+
+    unique_samples = list(OrderedDict.fromkeys(sample_accessions))
+    if len(unique_samples) == 1:
+        logger.info(f"All source runs belong to one sample ({unique_samples[0]}); keeping original row unchanged")
+        return ""
+
+    unique_taxa = list(OrderedDict.fromkeys([t for t in taxon_ids if t]))
+    if len(unique_taxa) != 1:
+        raise CoassemblyRegistrationError(
+            f"Co-assembly source samples have mixed taxa: {', '.join(unique_taxa)}"
+        )
+
+    unique_scientific_names = list(OrderedDict.fromkeys([s for s in scientific_names if s]))
+    if len(unique_scientific_names) != 1:
+        raise CoassemblyRegistrationError(
+            f"Co-assembly source samples have mixed scientific names: {', '.join(unique_scientific_names)}"
+        )
+
+    scientific_name = unique_scientific_names[0]
+
+    merged_collection_date = merge_or_not_provided(collection_dates)
+    merged_location = merge_or_not_provided(locations)
+
+    allowed_countries = get_allowed_countries()
+    if merged_location != "not provided" and merged_location not in allowed_countries:
+        logger.warning(
+            f"Location '{merged_location}' is not in ERC000011 allowed country/sea values"
+        )
+
+    sample_list = ",".join(unique_samples)
+    title = f"Combined sample from {sample_list}"
+    description = (
+        "This sample is a virtual sample of co-assembled raw reads from multiple samples "
+        f"of {scientific_name}. Co-assembly was performed from runs of the samples {sample_list}"
+    )
+    alias = build_virtual_sample_alias(unique_samples)
+    logger.info(f"Registering virtual sample with alias '{alias}' for co-assembly of samples: {sample_list}")
+
+    sample_xml = build_sample_xml(
+        alias=alias,
+        title=title,
+        description=description,
+        taxon_id=unique_taxa[0],
+        scientific_name=scientific_name,
+        composed_of=sample_list,
+        collection_date=merged_collection_date,
+        geographic_location=merged_location,
+    )
+
+    with open(f"{alias}.xml", "w", encoding="utf-8") as sample_file:
+        sample_file.write(sample_xml)
+
+    username, password = get_credentials()
+    receipt = submit_sample_xml(HTTPBasicAuth(username, password), submit_url, sample_xml)
+    samea = extract_accession_from_receipt(receipt)
+    logger.info(f"Registered virtual co-assembly sample: {samea}")
+    return samea
+
+
+def main() -> int:
+    args = parse_args()
+    setup_logging(args.debug)
+
+    logger.info(f"Starting coassembly registration script: input={args.input}, output={args.output}, test={args.test}")
+
+    with open(args.input, newline="", encoding="utf-8") as handle:
+        reader = csv.DictReader(handle)
+        fieldnames = reader.fieldnames or []
+        rows = list(reader)
+
+    # The script may add/update the Sample value; ensure the output schema includes it
+    if "Sample" not in fieldnames:
+        fieldnames.append("Sample")
+
+    updated_rows: list[dict[str, str]] = []
+    for row in rows:
+        run_accessions = row["Runs"].split(",")
+
+        logger.info(f"Processing assembly with runs: {run_accessions}")
+
+        # Skip non-coassembly rows by design
+        if len(run_accessions) <= 1:
+            logger.info("Single run found; skipping coassembly sample registration for this row")
+            updated_rows.append(row)
+            continue
+
+        virtual_sample = register_virtual_sample(run_accessions, args.test)
+        if virtual_sample or virtual_sample == "":
+            row["Sample"] = virtual_sample
+        updated_rows.append(row)
+
+    logger.info(f"Writing updated assembly metadata to: {args.output}")
+    with open(args.output, "w", newline="", encoding="utf-8") as out_handle:
+        writer = csv.DictWriter(out_handle, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(updated_rows)
+
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except Exception as exc:  # noqa: BLE001
+        logger.error(str(exc))
+        raise
diff --git a/conf/test_assembly_coassembly_with_coverage.config b/conf/test_assembly_coassembly_with_coverage.config
index 910231a..ba3d683 100644
--- a/conf/test_assembly_coassembly_with_coverage.config
+++ b/conf/test_assembly_coassembly_with_coverage.config
@@ -31,5 +31,4 @@ params {
     centre_name      = "TEST_CENTER"
 
     test_upload      = true
-
 }
diff --git a/modules/local/create_assembly_metadata_csv/main.nf b/modules/local/create_assembly_metadata_csv/main.nf
index f3f8312..576f781 100644
--- a/modules/local/create_assembly_metadata_csv/main.nf
+++ b/modules/local/create_assembly_metadata_csv/main.nf
@@ -28,7 +28,7 @@ process CREATE_ASSEMBLY_METADATA_CSV {
         meta.assembler,
         meta.assembler_version,
         fasta.name,
-        ''    // Sample column left empty because co-assemblies are not supported
+        ''    // Sample column is filled later for co-assemblies that require virtual sample registration
     ].join(',')
     """
     cat <<-END_CSV > ${meta.id}_assembly_metadata.csv
diff --git a/modules/local/register_coassembly_sample/environment.yml b/modules/local/register_coassembly_sample/environment.yml
new file mode 100644
index 0000000..66d3d46
--- /dev/null
+++ b/modules/local/register_coassembly_sample/environment.yml
@@ -0,0 +1,8 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.11
+  - conda-forge::requests=2.32.5
+  - conda-forge::retry=0.9.2
diff --git a/modules/local/register_coassembly_sample/main.nf b/modules/local/register_coassembly_sample/main.nf
new file mode 100644
index 0000000..e0d3d8e
--- /dev/null
+++ b/modules/local/register_coassembly_sample/main.nf
@@ -0,0 +1,52 @@
+process REGISTER_COASSEMBLY_SAMPLE {
+    tag "${meta.id}"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/pip_requests_retry:d07e257656d938ab' :
+        'community.wave.seqera.io/library/pip_requests_retry:eb16563fc2cb641f' }"
+
+    input:
+    tuple val(meta), path(csv)
+    val(test)
+
+    output:
+    tuple val(meta), path("*_updated.csv"), emit: csv
+    path "versions.yml",                    emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def test_arg = test ? "--test" : ""
+    """
+    register_coassembly_sample.py \\
+        --input ${csv} \\
+        --output ${prefix}_updated.csv \\
+        ${test_arg} \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //')
+        requests: \$(python -c "import requests; print(requests.__version__)")
+        retry: \$(python -c "import importlib.metadata as m; print(m.version('retry'))")
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    cp ${csv} ${prefix}_updated.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //')
+        requests: \$(python -c "import requests; print(requests.__version__)")
+        retry: \$(python -c "import importlib.metadata as m; print(m.version('retry'))")
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/register_coassembly_sample/meta.yml b/modules/local/register_coassembly_sample/meta.yml
new file mode 100644
index 0000000..977b239
--- /dev/null
+++ b/modules/local/register_coassembly_sample/meta.yml
@@ -0,0 +1,44 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "register_coassembly_sample"
+description: Register a virtual ENA sample for co-assemblies and write sample accession into assembly metadata CSV
+keywords:
+  - ena
+  - sample
+  - metadata
+  - co-assembly
+tools:
+  - python:
+      description: Python
+      homepage: https://www.python.org/
+  - retry:
+      description: Retry decorator for resilient network calls
+      homepage: https://pypi.org/project/retry/
+input:
+  - - meta:
+        type: map
+        description: Groovy map containing sample metadata
+    - csv:
+        type: file
+        description: Assembly metadata CSV input for assembly_uploader tool
+        pattern: "*.csv"
+  - test:
+      type: string
+      description: Whether to submit to the ENA test server (`true`) instead of production.
+output:
+  - csv:
+      - meta:
+          type: map
+          description: Same metadata map as input
+      - "*.csv":
+          type: file
+          description: Updated assembly metadata CSV with "Sample" column filled for co-assemblies
+          pattern: "*.csv"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@ochkalova"
+maintainers:
+  - "@ochkalova"
diff --git a/modules/local/register_coassembly_sample/tests/coassembly_metadata.csv b/modules/local/register_coassembly_sample/tests/coassembly_metadata.csv
new file mode 100644
index 0000000..689d97b
--- /dev/null
+++ b/modules/local/register_coassembly_sample/tests/coassembly_metadata.csv
@@ -0,0 +1,11 @@
+Runs,Coverage,Assembler,Version,Filepath
+"SRR3183853,SRR3183854,SRR3183855,SRR3183856,SRR3183857,SRR3183858,SRR3183859,SRR3183860,SRR3183861,SRR3183952,SRR3183953,SRR3183954,SRR3183955,SRR3183956,SRR3183957,SRR3183958,SRR3183959,SRR3183960",10.0,MEGAHIT,1.2.9,contigs1.fasta.gz
+"SRR3183850,SRR3183851,SRR3183852,SRR3183949,SRR3183950,SRR3183951",12.0,MEGAHIT,1.2.9,contigs2.fasta.gz
+"SRR1631070,SRR1631071,SRR1631072,SRR1631073,SRR1631074,SRR1631075,SRR1631076,SRR1631077,SRR1631078,SRR1631079,SRR1631080,SRR1631081,SRR1631082,SRR1631083,SRR3187740,SRR3187741,SRR3187742,SRR3187743,SRR3187744,SRR3187745,SRR3187746,SRR3187747,SRR3187748,SRR3188145,SRR3188146,SRR3188147,SRR3188148,SRR3188149,SRR3188150,SRR3188151,SRR3188152,SRR3188153",15.0,MEGAHIT,1.2.9,contigs3.fasta.gz
+"SRR1622826,SRR3187094,SRR3187149",8.0,MEGAHIT,1.2.9,contigs4.fasta.gz
+"SRR1622910,SRR1622911",7.0,MEGAHIT,1.2.9,contigs5.fasta.gz
+"SRR1624846,SRR1624847,SRR1624848,SRR1624849,SRR1624850,SRR1624851,SRR1624852,SRR1624853,SRR1624854",9.0,MEGAHIT,1.2.9,contigs6.fasta.gz
+"SRR1631360,SRR1631361,SRR1631362,SRR1631363,SRR1631364,SRR1631365",6.0,MEGAHIT,1.2.9,contigs9.fasta.gz
+"SRR3183597,SRR3183598,SRR3183599,SRR3183600",3.0,MEGAHIT,1.2.9,contigs12.fasta.gz
+"SRR3183638,SRR3183639,SRR3183640,SRR3183641,SRR3183642,SRR3183643,SRR3183644,SRR3183645,SRR3183646,SRR3183647",2.0,MEGAHIT,1.2.9,contigs13.fasta.gz
+"SRR3183844,SRR3183845,SRR3183846",1.0,MEGAHIT,1.2.9,contigs14.fasta.gz
diff --git a/modules/local/register_coassembly_sample/tests/main.nf.test b/modules/local/register_coassembly_sample/tests/main.nf.test
new file mode 100644
index 0000000..1a93313
--- /dev/null
+++ b/modules/local/register_coassembly_sample/tests/main.nf.test
@@ -0,0 +1,36 @@
+nextflow_process {
+    name "Test Process REGISTER_COASSEMBLY_SAMPLE"
+    script "../main.nf"
+    config "./nextflow.config"
+    process "REGISTER_COASSEMBLY_SAMPLE"
+
+    tag "modules"
+    tag "register_coassembly_sample"
+
+    test("REGISTER_COASSEMBLY_SAMPLE completes with expected outputs") {
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test' ],
+                    file("${moduleDir}/tests/coassembly_metadata.csv", checkIfExists: true)
+                ]
+                input[1] = true
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assertAll(
+                { assert process.out.csv.size() == 1 },
+                { assert process.out.csv[0][1].toString().endsWith("_updated.csv") },
+                {
+                    def csvContent = path(process.out.csv[0][1]).text
+                    assert csvContent.contains("Sample")
+                    assert csvContent.contains("SRR3183853")
+                }
+            )
+        }
+    }
+}
diff --git a/modules/local/register_coassembly_sample/tests/nextflow.config b/modules/local/register_coassembly_sample/tests/nextflow.config
new file mode 100644
index 0000000..58a5702
--- /dev/null
+++ b/modules/local/register_coassembly_sample/tests/nextflow.config
@@ -0,0 +1,6 @@
+process {
+    withName: 'REGISTER_COASSEMBLY_SAMPLE' {
+        ext.args = '--debug'
+
+    }
+}
diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf
index 6d8aae4..97d19d8 100644
--- a/workflows/assemblysubmit.nf
+++ b/workflows/assemblysubmit.nf
@@ -7,6 +7,7 @@
 include { COVERM_CONTIG                         } from '../modules/nf-core/coverm/contig/main'
 include { FASTAVALIDATOR                        } from '../modules/nf-core/fastavalidator/main'
 include { CREATE_ASSEMBLY_METADATA_CSV          } from '../modules/local/create_assembly_metadata_csv/main'
+include { REGISTER_COASSEMBLY_SAMPLE            } from '../modules/local/register_coassembly_sample/main'
 include { GENERATE_ASSEMBLY_MANIFEST            } from '../modules/local/generate_assembly_manifest/main'
 include { REGISTERSTUDY                         } from '../modules/local/registerstudy/main'
 include { ENA_WEBIN_CLI_WRAPPER as SUBMIT       } from '../modules/local/ena_webin_cli_wrapper'
@@ -187,9 +188,26 @@ workflow ASSEMBLYSUBMIT {
     )
     ch_versions = ch_versions.mix(CREATE_ASSEMBLY_METADATA_CSV.out.versions)
 
+    // For co-assemblies (multiple run accessions), register a virtual ENA sample and fill Sample column.
+    // Rows with a single run accession bypass this step unchanged.
+    assembly_metadata_by_type = CREATE_ASSEMBLY_METADATA_CSV.out.csv
+        .branch { meta, _csv ->
+            coassembly: meta.run_accession instanceof List && meta.run_accession.size() > 1
+            standard: true
+        }
+
+    REGISTER_COASSEMBLY_SAMPLE(
+        assembly_metadata_by_type.coassembly,
+        test_upload
+    )
+    ch_versions = ch_versions.mix(REGISTER_COASSEMBLY_SAMPLE.out.versions)
+
+    assembly_metadata_with_sample = assembly_metadata_by_type.standard
+        .mix(REGISTER_COASSEMBLY_SAMPLE.out.csv)
+
     // Concatenate assembly metadata CSVs into single file to publish
     CONCAT_METADATA (
-        CREATE_ASSEMBLY_METADATA_CSV.out.csv.map { _meta, file -> file }.collect().map { files -> [ [id: "assemblies_metadata"], files ] },
+        assembly_metadata_with_sample.map { _meta, file -> file }.collect().map { files -> [ [id: "assemblies_metadata"], files ] },
         'true' // skip_header - we want to keep the header from the first file and skip it for the rest
     )
 
@@ -213,7 +231,7 @@ workflow ASSEMBLYSUBMIT {
 
     // Generate assembly manifest files and submit them to ENA
     GENERATE_ASSEMBLY_MANIFEST(
-        assemblies_with_coverage.join(CREATE_ASSEMBLY_METADATA_CSV.out.csv),
+        assemblies_with_coverage.join(assembly_metadata_with_sample),
         study_accession_ch.first(),
         upload_tpa,
         test_upload

From b0689c69c3a856e36a57011078cd3d591586a533 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Thu, 14 May 2026 14:17:37 +0100
Subject: [PATCH 4/7] minor refactoring of register_coassembly_sample.py

Co-authored-by: Copilot <copilot@github.com>
---
 bin/register_coassembly_sample.py | 45 ++++++++++++++-----------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/bin/register_coassembly_sample.py b/bin/register_coassembly_sample.py
index 8238f50..7a7bfdc 100755
--- a/bin/register_coassembly_sample.py
+++ b/bin/register_coassembly_sample.py
@@ -7,7 +7,6 @@
 import logging
 import os
 import xml.etree.ElementTree as ET
-from collections import OrderedDict
 import re
 
 import requests
@@ -110,7 +109,7 @@ def submit_sample_xml(auth: HTTPBasicAuth, submit_url: str, sample_xml: str) ->
     return ET.fromstring(response.content)
 
 def merge_or_not_provided(values: list[str]) -> str:
-    unique = list(OrderedDict.fromkeys(values))
+    unique = sorted(set(values))
     logger.debug(f"Merging values: {unique}")
     if any(not v for v in unique):
         logger.info("Merge decision: 'not provided' (at least one value is empty)")
@@ -155,7 +154,7 @@ def get_sample_metadata(sample_accession: str) -> tuple[str, str, str, str]:
         sample_accession (str): The sample accession to query.
 
     Returns:
-        tuple[str, str, str, str]: A tuple containing tax_id, scientific_name, location, and collection_date.
+        tuple[str, str, str, str]: A tuple containing tax_id, scientific_name, country, and collection_date.
     """
     if INSDC_BIOSAMPLE_ACCESSION_REGEX.match(sample_accession):
         query = f'"sample_accession={sample_accession}"'
@@ -345,7 +344,7 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
         test (bool): Whether to use the test submission endpoint.
 
     Returns:
-        str: The SAMEA accession of the registered virtual sample.
+        str: The sample accession of the registered virtual sample.
 
     Raises:
         CoassemblyRegistrationError: If validation or submission fails.
@@ -358,7 +357,7 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
     sample_accessions = []
     taxon_ids = []
     scientific_names = []
-    locations = []
+    countries = []
     collection_dates = []
 
     for run in run_accessions:
@@ -374,36 +373,36 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
             raise CoassemblyRegistrationError(f"Sample {sample_acc} has no scientific_name in ENA portal response")
         taxon_ids.append(taxon_id)
         scientific_names.append(scientific_name)
-        locations.append(country)
+        countries.append(country)
         collection_dates.append(collection_date)
 
-    unique_samples = list(OrderedDict.fromkeys(sample_accessions))
+    unique_samples = sorted(set(sample_accessions))
     if len(unique_samples) == 1:
         logger.info(f"All source runs belong to one sample ({unique_samples[0]}); keeping original row unchanged")
         return ""
 
-    unique_taxa = list(OrderedDict.fromkeys([t for t in taxon_ids if t]))
+    unique_taxa = sorted(set([t for t in taxon_ids if t]))
     if len(unique_taxa) != 1:
         raise CoassemblyRegistrationError(
             f"Co-assembly source samples have mixed taxa: {', '.join(unique_taxa)}"
         )
 
-    unique_scientific_names = list(OrderedDict.fromkeys([s for s in scientific_names if s]))
+    unique_scientific_names = sorted(set([s for s in scientific_names if s]))
     if len(unique_scientific_names) != 1:
         raise CoassemblyRegistrationError(
             f"Co-assembly source samples have mixed scientific names: {', '.join(unique_scientific_names)}"
         )
-
     scientific_name = unique_scientific_names[0]
 
     merged_collection_date = merge_or_not_provided(collection_dates)
-    merged_location = merge_or_not_provided(locations)
+    merged_country = merge_or_not_provided(countries)
 
     allowed_countries = get_allowed_countries()
-    if merged_location != "not provided" and merged_location not in allowed_countries:
+    if merged_country != "not provided" and merged_country not in allowed_countries:
         logger.warning(
-            f"Location '{merged_location}' is not in ERC000011 allowed country/sea values"
+            f"Country '{merged_country}' is not in ERC000011 allowed country/sea values; setting to 'not provided'"
         )
+        merged_country = "not provided"
 
     sample_list = ",".join(unique_samples)
     title = f"Combined sample from {sample_list}"
@@ -422,7 +421,7 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
         scientific_name=scientific_name,
         composed_of=sample_list,
         collection_date=merged_collection_date,
-        geographic_location=merged_location,
+        geographic_location=merged_country,
     )
 
     with open(f"{alias}.xml", "w", encoding="utf-8") as sample_file:
@@ -430,20 +429,22 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
 
     username, password = get_credentials()
     receipt = submit_sample_xml(HTTPBasicAuth(username, password), submit_url, sample_xml)
-    samea = extract_accession_from_receipt(receipt)
-    logger.info(f"Registered virtual co-assembly sample: {samea}")
-    return samea
+    virtual_sample = extract_accession_from_receipt(receipt)
+    logger.info(f"Registered virtual co-assembly sample: {virtual_sample}")
+    return virtual_sample
 
 
 def main() -> int:
     args = parse_args()
     setup_logging(args.debug)
 
-    logger.info(f"Starting coassembly registration script: input={args.input}, output={args.output}, test={args.test}")
+    logger.debug(f"Starting coassembly registration script: input={args.input}, output={args.output}, test={args.test}")
 
     with open(args.input, newline="", encoding="utf-8") as handle:
         reader = csv.DictReader(handle)
         fieldnames = reader.fieldnames or []
+        if "Runs" not in fieldnames:
+            raise CoassemblyRegistrationError("Input CSV must have a 'Runs' column with ENA run accessions")
         rows = list(reader)
 
     # The script may add/update the Sample value; ensure the output schema includes it
@@ -473,12 +474,6 @@ def main() -> int:
         writer.writeheader()
         writer.writerows(updated_rows)
 
-    return 0
-
 
 if __name__ == "__main__":
-    try:
-        raise SystemExit(main())
-    except Exception as exc:  # noqa: BLE001
-        logger.error(str(exc))
-        raise
+    main()

From 33f3967a4322ce0789c56d5bee76b306a20895f6 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Thu, 14 May 2026 14:34:29 +0100
Subject: [PATCH 5/7] add default metadata values for
 register_coassembly_sample.py

Co-authored-by: Copilot <copilot@github.com>
---
 bin/register_coassembly_sample.py | 72 ++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/bin/register_coassembly_sample.py b/bin/register_coassembly_sample.py
index 7a7bfdc..f98b0da 100755
--- a/bin/register_coassembly_sample.py
+++ b/bin/register_coassembly_sample.py
@@ -38,7 +38,7 @@ class CoassemblyRegistrationError(RuntimeError):
 
 
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Register virtual ENA sample for co-assemblies in the metadata CSV")
+    parser = argparse.ArgumentParser(description="Register co-assembly samples.")
     parser.add_argument("--input", required=True, help="Input assembly metadata CSV")
     parser.add_argument("--output", required=True, help="Output assembly metadata CSV")
     parser.add_argument(
@@ -51,6 +51,10 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Enable verbose debug logging",
     )
+    parser.add_argument("--default-country", type=str, help="Default country to use if values differ or are missing.")
+    parser.add_argument("--default-date", type=str, help="Default collection date to use if values differ or are missing.")
+    parser.add_argument("--default-taxid", type=str, help="Default taxon ID to use if values differ or are missing.")
+    parser.add_argument("--default-tax-name", type=str, help="Default taxon name to use if values differ or are missing.")
     return parser.parse_args()
 
 
@@ -108,17 +112,19 @@ def submit_sample_xml(auth: HTTPBasicAuth, submit_url: str, sample_xml: str) ->
     response.raise_for_status()
     return ET.fromstring(response.content)
 
-def merge_or_not_provided(values: list[str]) -> str:
-    unique = sorted(set(values))
-    logger.debug(f"Merging values: {unique}")
-    if any(not v for v in unique):
-        logger.info("Merge decision: 'not provided' (at least one value is empty)")
-        return "not provided"
-    if len(unique) != 1:
-        logger.info("Merge decision: 'not provided' (multiple conflicting values)")
+def merge_or_not_provided(values: list[str], default_value=None) -> str:
+    """Merge values or return default/not provided."""
+    unique_values = sorted(set(values))
+    logger.debug(f"Merging values: {unique_values}")
+    if len(unique_values) == 1 and unique_values[0]:
+        logger.info(f"Merge decision: using unique value '{unique_values[0]}'")
+        return unique_values[0]
+    elif default_value:
+        logger.info(f"Merge decision: using default value '{default_value}' due to multiple/conflicting values or missing data")
+        return default_value
+    else:
+        logger.info("Merge decision: 'not provided' (multiple conflicting values or some missing)")
         return "not provided"
-    logger.info(f"Merge decision: using unique value '{unique[0]}'")
-    return unique[0]
 
 def get_run_sample_from_xml(run_accession: str) -> tuple[str, str | None]:
     """
@@ -293,16 +299,16 @@ def build_virtual_sample_alias(source_samples: list[str], max_length: int = 50)
 
 def extract_accession_from_receipt(receipt: ET.Element) -> str:
     """
-    Extract the SAMEA accession from the ENA submission receipt XML.
+    Extract the sample accession from the ENA submission receipt XML.
 
     Args:
         receipt (ET.Element): The root element of the receipt XML.
 
     Returns:
-        str: The SAMEA accession.
+        str: The sample accession.
 
     Raises:
-        CoassemblyRegistrationError: If the receipt does not contain a SAMEA accession.
+        CoassemblyRegistrationError: If the receipt does not contain a sample accession.
     """
     success = receipt.attrib.get("success", "false").lower() == "true"
     if not success:
@@ -333,9 +339,9 @@ def extract_accession_from_receipt(receipt: ET.Element) -> str:
         if accession.startswith("SAMEA"):
             return accession
 
-    raise CoassemblyRegistrationError("Could not find SAMEA accession in ENA receipt XML")
+    raise CoassemblyRegistrationError("Could not find sample accession in ENA receipt XML")
 
-def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
+def register_virtual_sample(run_accessions: list[str], test: bool, default_country=None, default_date=None, default_taxid=None, default_tax_name=None) -> str:
     """
     Register a virtual sample for co-assemblies based on run accessions.
 
@@ -381,21 +387,20 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
         logger.info(f"All source runs belong to one sample ({unique_samples[0]}); keeping original row unchanged")
         return ""
 
-    unique_taxa = sorted(set([t for t in taxon_ids if t]))
-    if len(unique_taxa) != 1:
+    merged_collection_date = merge_or_not_provided(collection_dates, default_date)
+    merged_country = merge_or_not_provided(countries, default_country)
+    merged_taxid = merge_or_not_provided(taxon_ids, default_taxid)
+    merged_tax_name = merge_or_not_provided(scientific_names, default_tax_name)
+
+    if merged_taxid == "not provided":
         raise CoassemblyRegistrationError(
-            f"Co-assembly source samples have mixed taxa: {', '.join(unique_taxa)}"
+            f"Co-assembly source samples have mixed taxa: {', '.join(taxon_ids)}"
         )
 
-    unique_scientific_names = sorted(set([s for s in scientific_names if s]))
-    if len(unique_scientific_names) != 1:
+    if merged_tax_name == "not provided":
         raise CoassemblyRegistrationError(
-            f"Co-assembly source samples have mixed scientific names: {', '.join(unique_scientific_names)}"
+            f"Co-assembly source samples have mixed scientific names: {', '.join(scientific_names)}"
         )
-    scientific_name = unique_scientific_names[0]
-
-    merged_collection_date = merge_or_not_provided(collection_dates)
-    merged_country = merge_or_not_provided(countries)
 
     allowed_countries = get_allowed_countries()
     if merged_country != "not provided" and merged_country not in allowed_countries:
@@ -408,7 +413,7 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
     title = f"Combined sample from {sample_list}"
     description = (
         "This sample is a virtual sample of co-assembled raw reads from multiple samples "
-        f"of {scientific_name}. Co-assembly was performed from runs of the samples {sample_list}"
+        f"of {merged_tax_name}. Co-assembly was performed from runs of the samples {sample_list}"
     )
     alias = build_virtual_sample_alias(unique_samples)
     logger.info(f"Registering virtual sample with alias '{alias}' for co-assembly of samples: {sample_list}")
@@ -417,8 +422,8 @@ def register_virtual_sample(run_accessions: list[str], test: bool) -> str:
         alias=alias,
         title=title,
         description=description,
-        taxon_id=unique_taxa[0],
-        scientific_name=scientific_name,
+        taxon_id=merged_taxid,
+        scientific_name=merged_tax_name,
         composed_of=sample_list,
         collection_date=merged_collection_date,
         geographic_location=merged_country,
@@ -463,7 +468,14 @@ def main() -> int:
             updated_rows.append(row)
             continue
 
-        virtual_sample = register_virtual_sample(run_accessions, args.test)
+        virtual_sample = register_virtual_sample(
+            run_accessions=run_accessions,
+            test=args.test,
+            default_country=args.default_country,
+            default_date=args.default_date,
+            default_taxid=args.default_taxid,
+            default_tax_name=args.default_tax_name
+        )
         if virtual_sample or virtual_sample == "":
             row["Sample"] = virtual_sample
         updated_rows.append(row)

From 1f738cbaee2cf13e9dbb138d8121b4ec1bacae1c Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Thu, 14 May 2026 14:50:57 +0100
Subject: [PATCH 6/7] minor refactoring of register_coassembly_sample.py

Co-authored-by: Copilot <copilot@github.com>
---
 bin/register_coassembly_sample.py | 44 ++++++++++++-------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/bin/register_coassembly_sample.py b/bin/register_coassembly_sample.py
index f98b0da..f3cf212 100755
--- a/bin/register_coassembly_sample.py
+++ b/bin/register_coassembly_sample.py
@@ -32,6 +32,8 @@
     re.IGNORECASE,
 )
 
+DEFAULT_NA_VALUE = "not provided"
+
 
 class CoassemblyRegistrationError(RuntimeError):
     """Raised for hard validation/submission errors."""
@@ -113,7 +115,7 @@ def submit_sample_xml(auth: HTTPBasicAuth, submit_url: str, sample_xml: str) ->
     return ET.fromstring(response.content)
 
 def merge_or_not_provided(values: list[str], default_value=None) -> str:
-    """Merge values or return default/not provided."""
+    """Merge values or return default_value or DEFAULT_NA_VALUE."""
     unique_values = sorted(set(values))
     logger.debug(f"Merging values: {unique_values}")
     if len(unique_values) == 1 and unique_values[0]:
@@ -123,8 +125,8 @@ def merge_or_not_provided(values: list[str], default_value=None) -> str:
         logger.info(f"Merge decision: using default value '{default_value}' due to multiple/conflicting values or missing data")
         return default_value
     else:
-        logger.info("Merge decision: 'not provided' (multiple conflicting values or some missing)")
-        return "not provided"
+        logger.info(f"Merge decision: '{DEFAULT_NA_VALUE}' (multiple conflicting values or some missing)")
+        return DEFAULT_NA_VALUE
 
 def get_run_sample_from_xml(run_accession: str) -> tuple[str, str | None]:
     """
@@ -285,13 +287,6 @@ def build_virtual_sample_alias(source_samples: list[str], max_length: int = 50)
     if len(alias) <= max_length:
         return alias
 
-    # Keep the hash suffix intact while trimming the front portion for ENA alias limits.
-    # suffix = f"_{hash8}"
-    # max_core_length = max_length - len(suffix)
-    # trimmed_core = alias_core[:max_core_length]
-    # trimmed_alias = f"{trimmed_core}{suffix}"
-    # logger.warning(f"Generated alias is too long, truncated to: {trimmed_alias}")
-    # return trimmed_alias
     raise CoassemblyRegistrationError(
         f"Hash suffix '{alias}' exceeds maximum alias length of {max_length}"
     )
@@ -326,20 +321,14 @@ def extract_accession_from_receipt(receipt: ET.Element) -> str:
             f"ENA sample submission failed: {message_text}"
         )
 
-    biosample = receipt.find(".//SAMPLE/EXT_ID[@type='biosample']")
-    if biosample is not None and biosample.get("accession"):
-        return biosample.get("accession", "")
-
-    external = receipt.find(".//SAMPLE/EXT_ID")
-    if external is not None and external.get("accession", "").startswith("SAMEA"):
-        return external.get("accession", "")
+    sample_node = receipt.find(".//SAMPLE")
+    if sample_node is not None:
+        ena_accession = sample_node.get("accession", "")
+        if ena_accession:
+            return ena_accession
 
-    for node in receipt.findall(".//SAMPLE"):
-        accession = node.get("accession", "")
-        if accession.startswith("SAMEA"):
-            return accession
+    raise CoassemblyRegistrationError("ENA sample submission succeeded but no SAMPLE accession found in receipt XML")
 
-    raise CoassemblyRegistrationError("Could not find sample accession in ENA receipt XML")
 
 def register_virtual_sample(run_accessions: list[str], test: bool, default_country=None, default_date=None, default_taxid=None, default_tax_name=None) -> str:
     """
@@ -392,22 +381,22 @@ def register_virtual_sample(run_accessions: list[str], test: bool, default_count
     merged_taxid = merge_or_not_provided(taxon_ids, default_taxid)
     merged_tax_name = merge_or_not_provided(scientific_names, default_tax_name)
 
-    if merged_taxid == "not provided":
+    if merged_taxid == DEFAULT_NA_VALUE:
         raise CoassemblyRegistrationError(
             f"Co-assembly source samples have mixed taxa: {', '.join(taxon_ids)}"
         )
 
-    if merged_tax_name == "not provided":
+    if merged_tax_name == DEFAULT_NA_VALUE:
         raise CoassemblyRegistrationError(
             f"Co-assembly source samples have mixed scientific names: {', '.join(scientific_names)}"
         )
 
     allowed_countries = get_allowed_countries()
-    if merged_country != "not provided" and merged_country not in allowed_countries:
+    if merged_country != DEFAULT_NA_VALUE and merged_country not in allowed_countries:
         logger.warning(
-            f"Country '{merged_country}' is not in ERC000011 allowed country/sea values; setting to 'not provided'"
+            f"Country '{merged_country}' is not in ERC000011 allowed country/sea values; setting to '{DEFAULT_NA_VALUE}'"
         )
-        merged_country = "not provided"
+        merged_country = DEFAULT_NA_VALUE
 
     sample_list = ",".join(unique_samples)
     title = f"Combined sample from {sample_list}"
@@ -434,6 +423,7 @@ def register_virtual_sample(run_accessions: list[str], test: bool, default_count
 
     username, password = get_credentials()
     receipt = submit_sample_xml(HTTPBasicAuth(username, password), submit_url, sample_xml)
+    logger.debug(f"ENA receipt XML:\n{ET.tostring(receipt, encoding='unicode')}")
     virtual_sample = extract_accession_from_receipt(receipt)
     logger.info(f"Registered virtual co-assembly sample: {virtual_sample}")
     return virtual_sample

From 25d01719bf24cf6fa079eb48726006431b14d002 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova <so.ochkalova@gmail.com>
Date: Thu, 14 May 2026 15:01:41 +0100
Subject: [PATCH 7/7] disable publishDir for REGISTER_COASSEMBLY_SAMPLE

---
 conf/modules.config                                            | 2 +-
 modules/local/register_coassembly_sample/tests/nextflow.config | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 931f57e..3a16be0 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -176,7 +176,7 @@ process {
         ]
     }
 
-    withName: 'REGISTERSTUDY|GENERATE_ASSEMBLY_MANIFEST' {
+    withName: 'REGISTERSTUDY|GENERATE_ASSEMBLY_MANIFEST|REGISTER_COASSEMBLY_SAMPLE' {
         publishDir = [
             enabled: false
         ]
diff --git a/modules/local/register_coassembly_sample/tests/nextflow.config b/modules/local/register_coassembly_sample/tests/nextflow.config
index 58a5702..ea49989 100644
--- a/modules/local/register_coassembly_sample/tests/nextflow.config
+++ b/modules/local/register_coassembly_sample/tests/nextflow.config
@@ -1,6 +1,5 @@
 process {
     withName: 'REGISTER_COASSEMBLY_SAMPLE' {
         ext.args = '--debug'
-
     }
 }