From fc66e8f541222236244943cb65366e9a27e6e32a Mon Sep 17 00:00:00 2001 From: oqadiSAK Date: Thu, 5 Mar 2026 21:27:19 +0100 Subject: [PATCH] Add comment option to splitCsv operator (#6705) Signed-off-by: oqadiSAK --- docs/reference/operator.md | 3 + .../nextflow/splitter/CsvSplitter.groovy | 36 +++-- .../nextflow/splitter/CsvSplitterTest.groovy | 144 ++++++++++++++++++ .../src/main/nextflow/util/CsvParser.groovy | 21 +++ .../test/nextflow/util/CsvParserTest.groovy | 20 +++ 5 files changed, 215 insertions(+), 9 deletions(-) diff --git a/docs/reference/operator.md b/docs/reference/operator.md index 859276abcf..8db2a87cd5 100644 --- a/docs/reference/operator.md +++ b/docs/reference/operator.md @@ -1122,6 +1122,9 @@ Available options: `charset` : Parse the content with the specified charset, e.g. `UTF-8`. See the list of [standard charsets](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/charset/StandardCharsets.html) for available options. +`comment` +: The character used to denote comments (default: disabled). Must be a single character. Lines that start with this character are skipped entirely; when it appears mid-line, the character and the rest of the line are ignored. + `decompress` : When `true`, decompress the content using the GZIP format before processing it (default: `false`). Files with the `.gz` extension are decompressed automatically. 
diff --git a/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy b/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy index 9d343a3943..b8a2802149 100644 --- a/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/splitter/CsvSplitter.groovy @@ -91,6 +91,10 @@ class CsvSplitter extends AbstractTextSplitter { if( options.skip ) skipLines = options.skip as int + // the comment character if used + if( options.comment ) + parser.setComment(options.comment as String) + return this } @@ -107,6 +111,7 @@ class CsvSplitter extends AbstractTextSplitter { result.header = [ Boolean, List ] result.quote = String result.skip = Integer + result.comment = String return result } @@ -132,7 +137,7 @@ class CsvSplitter extends AbstractTextSplitter { while( z++ < skipLines && reader.readLine() != null ) { /* nope */ } if( firstLineAsHeader ) { - line = reader.readLine() + line = readParsableLine(reader) if( !line ) throw new IllegalStateException("Missing 'header' in CSV file") List allCols = parser.parse(line) columnsHeader = new ArrayList<>(allCols.size()) @@ -144,6 +149,24 @@ class CsvSplitter extends AbstractTextSplitter { } } + /** + * Read the next parsable line, skipping empty lines and comment lines + * + * @param reader The reader from which to read + * @return The next parsable line, or {@code null} if end of file + */ + protected String readParsableLine(BufferedReader reader) { + String line + while( (line = reader.readLine()) != null ) { + if( !line ) + continue + if( parser.comment && line.charAt(0) == parser.comment ) + continue + return line + } + return null + } + /** * Process a CSV row at time * @@ -152,14 +175,9 @@ class CsvSplitter extends AbstractTextSplitter { */ @Override protected fetchRecord(BufferedReader reader) { - String line - while( true ) { - line = reader.readLine() - if( line ) - break - if( line==null ) - return null - } + final line = 
readParsableLine(reader) + if( line==null ) + return null final tokens = parser.parse(line) diff --git a/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy b/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy index d8d92af0cd..019a270f2b 100644 --- a/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy +++ b/modules/nextflow/src/test/groovy/nextflow/splitter/CsvSplitterTest.groovy @@ -317,4 +317,148 @@ class CsvSplitterTest extends Specification { } + def 'should skip comment lines' () { + given: + def LINES = ''' + # This is a comment + alpha,beta,delta + gamma,,zeta + # Another comment + eta,theta,iota + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).options(comment: '#').list() + + then: + items.size() == 3 + items[0] == ['alpha', 'beta', 'delta'] + items[1] == ['gamma', '', 'zeta'] + items[2] == ['eta', 'theta', 'iota'] + } + + def 'should skip comment lines with header' () { + given: + def LINES = ''' + # This is a comment at the start + x,y,z + # Comment after header + alpha,beta,delta + gamma,,zeta + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).options(header: true, comment: '#').list() + + then: + items.size() == 2 + items[0].x == 'alpha' + items[0].y == 'beta' + items[0].z == 'delta' + items[1].x == 'gamma' + items[1].y == '' + items[1].z == 'zeta' + } + + def 'should skip comment lines with skip option' () { + given: + def LINES = ''' + skip this line + # This is a comment + alpha,beta,delta + gamma,,zeta + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).options(skip: 1, comment: '#').list() + + then: + items.size() == 2 + items[0] == ['alpha', 'beta', 'delta'] + items[1] == ['gamma', '', 'zeta'] + } + + def 'should handle TSV with comments' () { + given: + def LINES = ''' + # Comment line + alpha\tbeta\tdelta + gamma\t\tzeta + ''' + .stripIndent().trim() + + when: + def items = 
new CsvSplitter().target(LINES).options(sep: '\t', comment: '#').list() + + then: + items.size() == 2 + items[0] == ['alpha', 'beta', 'delta'] + items[1] == ['gamma', '', 'zeta'] + } + + def 'should not skip comments when option not set' () { + given: + def LINES = ''' + #alpha,beta,delta + gamma,,zeta + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).list() + + then: + items.size() == 2 + items[0] == ['#alpha', 'beta', 'delta'] + items[1] == ['gamma', '', 'zeta'] + } + + def 'should handle custom comment character' () { + given: + def LINES = ''' + ; This is a comment + alpha,beta,delta + gamma,,zeta + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).options(comment: ';').list() + + then: + items.size() == 2 + items[0] == ['alpha', 'beta', 'delta'] + items[1] == ['gamma', '', 'zeta'] + } + + def 'should reject multi-character comment' () { + when: + new CsvSplitter().options(comment: '//') + + then: + thrown(IllegalArgumentException) + } + + def 'should handle inline comments' () { + given: + def LINES = ''' + alpha,beta,delta + gamma,zeta # inline comment + eta,theta,iota + ''' + .stripIndent().trim() + + when: + def items = new CsvSplitter().target(LINES).options(comment: '#').list() + + then: + items.size() == 3 + items[0] == ['alpha', 'beta', 'delta'] + items[1] == ['gamma', 'zeta '] + items[2] == ['eta', 'theta', 'iota'] + } + } diff --git a/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy b/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy index f7a15bf7bf..775958b3a7 100644 --- a/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy +++ b/modules/nf-commons/src/main/nextflow/util/CsvParser.groovy @@ -39,6 +39,22 @@ class CsvParser { private boolean strip + private char comment + + CsvParser setComment(char ch) { + this.comment = ch + return this + } + + CsvParser setComment(String ch) { + this.comment = firstChar(ch) + return this + } + + char getComment() { + 
return this.comment + } + CsvParser setQuote(char ch) { this.quote = ch return this @@ -91,6 +107,11 @@ class CsvParser { private String readSimpleValue(String line, List result) { def p = line.indexOf( (int)separator ) + def c = comment ? line.indexOf( (int)comment ) : -1 + if( c != -1 && (p == -1 || c < p) ) { + result.add(stripBlanks(line.substring(0,c)) ?: empty) + return null + } if( p == -1 ) { result.add(stripBlanks(line)) return null diff --git a/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy b/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy index 75a91a5f98..4af091296b 100644 --- a/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy +++ b/modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy @@ -94,4 +94,24 @@ class CsvParserTest extends Specification { } + def 'should handle comment character' () { + + given: + def parser = new CsvParser() + .setComment(COMMENT) + + expect: + parser.parse(LINE) == EXPECTED + + where: + LINE | COMMENT | EXPECTED + 'a,b,c' | '#' | ['a','b','c'] + 'a,b # comment' | '#' | ['a','b '] + 'a,b,c # comment' | '#' | ['a','b','c '] + 'a # comment,ignored' | '#' | ['a '] + '# full line comment' | '#' | [''] + 'a,b,c' | null | ['a','b','c'] + 'a,b # not a comment' | null | ['a','b # not a comment'] + } + }