From fc66e8f541222236244943cb65366e9a27e6e32a Mon Sep 17 00:00:00 2001
From: oqadiSAK <salihardakizils@gmail.com>
Date: Thu, 5 Mar 2026 21:27:19 +0100
Subject: [PATCH] Add comment option to splitCsv operator (#6705)

Signed-off-by: oqadiSAK <salihardakizils@gmail.com>
---
 docs/reference/operator.md                    |   3 +
 .../nextflow/splitter/CsvSplitter.groovy      |  36 +++--
 .../nextflow/splitter/CsvSplitterTest.groovy  | 144 ++++++++++++++++++
 .../src/main/nextflow/util/CsvParser.groovy   |  21 +++
 .../test/nextflow/util/CsvParserTest.groovy   |  20 +++
 5 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/docs/reference/operator.md b/docs/reference/operator.md
index 859276abcf..8db2a87cd5 100644
--- a/docs/reference/operator.md
+++ b/docs/reference/operator.md
@@ -1122,6 +1122,9 @@ Available options:
 `charset`
 : Parse the content with the specified charset, e.g. `UTF-8`. See the list of [standard charsets](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/charset/StandardCharsets.html) for available options.
 
+`comment`
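+: The character used to denote comments (default: none, i.e. comment handling is disabled). Must be a single character. Lines starting with this character are skipped, and when it appears mid-line the rest of the line is ignored; by default, whitespace preceding an inline comment is kept as part of the value.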
+
 `decompress`
 : When `true`, decompress the content using the GZIP format before processing it (default: `false`). Files with the `.gz` extension are decompressed automatically.
 
diff --git a/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy b/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy
index 9d343a3943..b8a2802149 100644
--- a/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy
+++ b/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy
@@ -91,6 +91,10 @@ class CsvSplitter extends AbstractTextSplitter {
         if( options.skip )
             skipLines = options.skip as int
 
+        // the comment character if used
+        if( options.comment )
+            parser.setComment(options.comment as String)
+
         return this
     }
 
@@ -107,6 +111,7 @@ class CsvSplitter extends AbstractTextSplitter {
         result.header = [ Boolean, List ]
         result.quote = String
         result.skip = Integer
+        result.comment = String
         return result
     }
 
@@ -132,7 +137,7 @@ class CsvSplitter extends AbstractTextSplitter {
         while( z++ < skipLines && reader.readLine() != null ) { /* nope */ }
 
         if( firstLineAsHeader ) {
-            line = reader.readLine()
+            line = readParsableLine(reader)
             if( !line ) throw new IllegalStateException("Missing 'header' in CSV file")
             List allCols = parser.parse(line)
             columnsHeader = new ArrayList<>(allCols.size())
@@ -144,6 +149,24 @@ class CsvSplitter extends AbstractTextSplitter {
         }
     }
 
+    /**
+     * Read the next parsable line, skipping empty lines and, when a comment
+     * character is set on the parser, lines whose first character is that marker
+     * @param reader The reader from which to read
+     * @return The next parsable line, or {@code null} if end of file
+     */
+    protected String readParsableLine(BufferedReader reader) {
+        String line
+        while( (line = reader.readLine()) != null ) {
+            if( !line )   // zero-length line; whitespace-only lines are not filtered here
+                continue
+            if( parser.comment && line.charAt(0) == parser.comment )   // only column 0 is tested, so an indented marker is not skipped here
+                continue
+            return line
+        }
+        return null
+    }
+
     /**
      * Process a CSV row at time
      *
@@ -152,14 +175,9 @@ class CsvSplitter extends AbstractTextSplitter {
      */
     @Override
     protected fetchRecord(BufferedReader reader) {
-        String line
-        while( true ) {
-            line = reader.readLine()
-            if( line )
-                break
-            if( line==null )
-                return null
-        }
+        final line = readParsableLine(reader)
+        if( line==null )
+            return null
 
         final tokens = parser.parse(line)
 
diff --git a/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy b/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy
index d8d92af0cd..019a270f2b 100644
--- a/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy
+++ b/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy
@@ -317,4 +317,148 @@ class CsvSplitterTest extends Specification {
 
     }
 
+    def 'should skip comment lines' () {
+        given:   // two '#' lines: one before the data, one between rows
+        def LINES = '''
+                # This is a comment
+                alpha,beta,delta
+                gamma,,zeta
+                # Another comment
+                eta,theta,iota
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(comment: '#').list()
+
+        then:
+        items.size() == 3   // both comment lines dropped, three data rows remain
+        items[0] == ['alpha', 'beta', 'delta']
+        items[1] == ['gamma', '', 'zeta']
+        items[2] == ['eta', 'theta', 'iota']
+    }
+
+    def 'should skip comment lines with header' () {
+        given:   // a '#' line precedes the header row and another follows it
+        def LINES = '''
+                # This is a comment at the start
+                x,y,z
+                # Comment after header
+                alpha,beta,delta
+                gamma,,zeta
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(header: true, comment: '#').list()
+
+        then:
+        items.size() == 2   // leading '#' line skipped, so 'x,y,z' is taken as the header
+        items[0].x == 'alpha'
+        items[0].y == 'beta'
+        items[0].z == 'delta'
+        items[1].x == 'gamma'
+        items[1].y == ''
+        items[1].z == 'zeta'
+    }
+
+    def 'should skip comment lines with skip option' () {
+        given:   // first line is removed by `skip`, second by the comment filter
+        def LINES = '''
+                skip this line
+                # This is a comment
+                alpha,beta,delta
+                gamma,,zeta
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(skip: 1, comment: '#').list()
+
+        then:
+        items.size() == 2   // skip: 1 discards one raw line; the '#' line after it is then filtered as a comment
+        items[0] == ['alpha', 'beta', 'delta']
+        items[1] == ['gamma', '', 'zeta']
+    }
+
+    def 'should handle TSV with comments' () {
+        given:   // tab-separated fixture with a leading '#' line
+        def LINES = '''
+                # Comment line
+                alpha\tbeta\tdelta
+                gamma\t\tzeta
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(sep: '\t', comment: '#').list()
+
+        then:
+        items.size() == 2   // comment line dropped with a non-default separator as well
+        items[0] == ['alpha', 'beta', 'delta']
+        items[1] == ['gamma', '', 'zeta']
+    }
+
+    def 'should not skip comments when option not set' () {
+        given:   // first row starts with '#', but no comment option is passed below
+        def LINES = '''
+                #alpha,beta,delta
+                gamma,,zeta
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).list()
+
+        then:
+        items.size() == 2
+        items[0] == ['#alpha', 'beta', 'delta']   // without the option a leading '#' is ordinary data
+        items[1] == ['gamma', '', 'zeta']
+    }
+
+    def 'should handle custom comment character' () {
+        given:   // ';' instead of '#' as the comment marker
+        def LINES = '''
+                ; This is a comment
+                alpha,beta,delta
+                gamma,,zeta
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(comment: ';').list()
+
+        then:
+        items.size() == 2   // the ';' line is dropped
+        items[0] == ['alpha', 'beta', 'delta']
+        items[1] == ['gamma', '', 'zeta']
+    }
+
+    def 'should reject multi-character comment' () {
+        when:
+        new CsvSplitter().options(comment: '//')   // no target needed: the failure is expected while setting options
+
+        then:
+        thrown(IllegalArgumentException)   // presumably raised while reducing '//' to one char (CsvParser.firstChar, not shown here) — TODO confirm
+    }
+
+    def 'should handle inline comments' () {
+        given:   // second row carries a trailing '# ...' comment
+        def LINES = '''
+                alpha,beta,delta
+                gamma,zeta # inline comment
+                eta,theta,iota
+                '''
+                .stripIndent().trim()
+
+        when:
+        def items = new CsvSplitter().target(LINES).options(comment: '#').list()
+
+        then:
+        items.size() == 3
+        items[0] == ['alpha', 'beta', 'delta']
+        items[1] == ['gamma', 'zeta ']   // text from '#' onward is dropped; the blank before it stays in the value
+        items[2] == ['eta', 'theta', 'iota']
+    }
+
 }
diff --git a/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy b/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy
index f7a15bf7bf..775958b3a7 100644
--- a/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy
+++ b/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy
@@ -39,6 +39,22 @@ class CsvParser {
 
     private boolean strip
 
+    private char comment   // comment marker; the zero char (field default) means comment handling is off
+
+    CsvParser setComment(char ch) {
+        this.comment = ch
+        return this   // returns the parser so setter calls can be chained
+    }
+
+    CsvParser setComment(String ch) {
+        this.comment = firstChar(ch)   // firstChar is defined outside this hunk — presumably reduces/validates the string to a single char; TODO confirm
+        return this
+    }
+
+    char getComment() {
+        return this.comment
+    }
+
     CsvParser setQuote(char ch) {
         this.quote = ch
         return this
@@ -91,6 +107,11 @@ class CsvParser {
 
     private String readSimpleValue(String line, List<String> result) {
         def p = line.indexOf( (int)separator )
+        def c = comment ? line.indexOf( (int)comment ) : -1
+        if( c != -1 && (p == -1 || c < p) ) {
+            result.add(stripBlanks(line.substring(0,c)) ?: empty)
+            return null
+        }
         if( p == -1 ) {
             result.add(stripBlanks(line))
             return null
diff --git a/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy b/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy
index 75a91a5f98..4af091296b 100644
--- a/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy
+++ b/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy
@@ -94,4 +94,24 @@ class CsvParserTest extends Specification {
 
     }
 
+    def 'should handle comment character' () {
+
+        given:
+        def parser = new CsvParser()
+                    .setComment(COMMENT)
+
+        expect:
+        parser.parse(LINE) == EXPECTED
+
+        where:
+        LINE                        | COMMENT   | EXPECTED
+        'a,b,c'                     | '#'       | ['a','b','c']
+        'a,b # comment'             | '#'       | ['a','b ']   // blank before the marker is kept
+        'a,b,c # comment'           | '#'       | ['a','b','c ']
+        'a # comment,ignored'       | '#'       | ['a ']   // separators after the marker belong to the comment
+        '# full line comment'       | '#'       | ['']   // the parser alone yields one empty value for a comment-only line
+        'a,b,c'                     | null      | ['a','b','c']
+        'a,b # not a comment'       | null      | ['a','b # not a comment']   // null COMMENT: '#' is ordinary data
+    }
+
 }