Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/reference/operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,9 @@ Available options:
`charset`
: Parse the content with the specified charset, e.g. `UTF-8`. See the list of [standard charsets](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/charset/StandardCharsets.html) for available options.

`comment`
: The single character that marks a comment (default: disabled). Lines starting with this character are skipped, and any text from this character to the end of a line is ignored.

`decompress`
: When `true`, decompress the content using the GZIP format before processing it (default: `false`). Files with the `.gz` extension are decompressed automatically.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ class CsvSplitter extends AbstractTextSplitter {
if( options.skip )
skipLines = options.skip as int

// the comment character if used
if( options.comment )
parser.setComment(options.comment as String)

return this
}

Expand All @@ -107,6 +111,7 @@ class CsvSplitter extends AbstractTextSplitter {
result.header = [ Boolean, List ]
result.quote = String
result.skip = Integer
result.comment = String
return result
}

Expand All @@ -132,7 +137,7 @@ class CsvSplitter extends AbstractTextSplitter {
while( z++ < skipLines && reader.readLine() != null ) { /* nope */ }

if( firstLineAsHeader ) {
line = reader.readLine()
line = readParsableLine(reader)
if( !line ) throw new IllegalStateException("Missing 'header' in CSV file")
List allCols = parser.parse(line)
columnsHeader = new ArrayList<>(allCols.size())
Expand All @@ -144,6 +149,24 @@ class CsvSplitter extends AbstractTextSplitter {
}
}

/**
 * Fetch the next line that should be handed to the CSV parser.
 * Empty lines are ignored, and so are lines whose first character is the
 * parser's comment character (when one is configured).
 *
 * @param reader The reader to consume lines from
 * @return The next line to parse, or {@code null} once the reader is exhausted
 */
protected String readParsableLine(BufferedReader reader) {
    String candidate = reader.readLine()
    while( candidate != null ) {
        // an empty string is falsy; checking it first also guards the charAt(0) below
        final boolean blank = !candidate
        final boolean commented = !blank && parser.comment && candidate.charAt(0) == parser.comment
        if( !blank && !commented )
            return candidate
        candidate = reader.readLine()
    }
    return null
}

/**
* Process one CSV row at a time
*
Expand All @@ -152,14 +175,9 @@ class CsvSplitter extends AbstractTextSplitter {
*/
@Override
protected fetchRecord(BufferedReader reader) {
String line
while( true ) {
line = reader.readLine()
if( line )
break
if( line==null )
return null
}
final line = readParsableLine(reader)
if( line==null )
return null

final tokens = parser.parse(line)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -317,4 +317,148 @@ class CsvSplitterTest extends Specification {

}

def 'should skip comment lines' () {
    // comment lines appear before the first row and between data rows
    given:
    def text = '''
# This is a comment
alpha,beta,delta
gamma,,zeta
# Another comment
eta,theta,iota
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(comment: '#').list()

    // only the three data rows are emitted
    then:
    rows.size() == 3
    rows[0] == ['alpha', 'beta', 'delta']
    rows[1] == ['gamma', '', 'zeta']
    rows[2] == ['eta', 'theta', 'iota']
}

def 'should skip comment lines with header' () {
    // one comment precedes the header line and another follows it
    given:
    def text = '''
# This is a comment at the start
x,y,z
# Comment after header
alpha,beta,delta
gamma,,zeta
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(header: true, comment: '#').list()

    // records are keyed by the header columns x, y, z
    then:
    rows.size() == 2
    rows[0].x == 'alpha'
    rows[0].y == 'beta'
    rows[0].z == 'delta'
    rows[1].x == 'gamma'
    rows[1].y == ''
    rows[1].z == 'zeta'
}

def 'should skip comment lines with skip option' () {
    // the first line is dropped by `skip`, the second one is a comment
    given:
    def text = '''
skip this line
# This is a comment
alpha,beta,delta
gamma,,zeta
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(skip: 1, comment: '#').list()

    then:
    rows.size() == 2
    rows[0] == ['alpha', 'beta', 'delta']
    rows[1] == ['gamma', '', 'zeta']
}

def 'should handle TSV with comments' () {
    // tab-separated values preceded by a single comment line
    given:
    def text = '''
# Comment line
alpha\tbeta\tdelta
gamma\t\tzeta
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(sep: '\t', comment: '#').list()

    then:
    rows.size() == 2
    rows[0] == ['alpha', 'beta', 'delta']
    rows[1] == ['gamma', '', 'zeta']
}

def 'should not skip comments when option not set' () {
    // no `comment` option is given, so a leading '#' is ordinary data
    given:
    def text = '''
#alpha,beta,delta
gamma,,zeta
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.list()

    then:
    rows.size() == 2
    rows[0] == ['#alpha', 'beta', 'delta']
    rows[1] == ['gamma', '', 'zeta']
}

def 'should handle custom comment character' () {
    // ';' is used as the comment character instead of '#'
    given:
    def text = '''
; This is a comment
alpha,beta,delta
gamma,,zeta
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(comment: ';').list()

    then:
    rows.size() == 2
    rows[0] == ['alpha', 'beta', 'delta']
    rows[1] == ['gamma', '', 'zeta']
}

def 'should reject multi-character comment' () {
    // a two-character comment marker is passed as the option value
    when:
    def splitter = new CsvSplitter()
    splitter.options(comment: '//')

    then:
    thrown(IllegalArgumentException)
}

def 'should handle inline comments' () {
    // the second row carries a trailing comment after its last value
    given:
    def text = '''
alpha,beta,delta
gamma,zeta # inline comment
eta,theta,iota
'''.stripIndent().trim()

    when:
    def splitter = new CsvSplitter().target(text)
    def rows = splitter.options(comment: '#').list()

    // the blank before the comment character stays part of the value
    then:
    rows.size() == 3
    rows[0] == ['alpha', 'beta', 'delta']
    rows[1] == ['gamma', 'zeta ']
    rows[2] == ['eta', 'theta', 'iota']
}

}
21 changes: 21 additions & 0 deletions modules/nf-commons/src/main/nextflow/util/CsvParser.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ class CsvParser {

private boolean strip

private char comment

/**
 * Set the character that introduces a comment.
 *
 * @param ch The comment character
 * @return This parser, so that calls can be chained
 */
CsvParser setComment(char ch) {
    comment = ch
    return this
}

/**
 * Set the comment character from a string value.
 *
 * @param ch The string holding the comment character; it is converted with {@code firstChar}
 *           (NOTE(review): presumably rejects strings longer than one character -- confirm in firstChar)
 * @return This parser, so that calls can be chained
 */
CsvParser setComment(String ch) {
    final marker = firstChar(ch)
    this.comment = marker
    return this
}

/**
 * @return The character currently configured as the comment marker
 */
char getComment() {
    return comment
}

CsvParser setQuote(char ch) {
this.quote = ch
return this
Expand Down Expand Up @@ -91,6 +107,11 @@ class CsvParser {

private String readSimpleValue(String line, List<String> result) {
def p = line.indexOf( (int)separator )
def c = comment ? line.indexOf( (int)comment ) : -1
if( c != -1 && (p == -1 || c < p) ) {
result.add(stripBlanks(line.substring(0,c)) ?: empty)
return null
}
if( p == -1 ) {
result.add(stripBlanks(line))
return null
Expand Down
20 changes: 20 additions & 0 deletions modules/nf-commons/src/test/nextflow/util/CsvParserTest.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,24 @@ class CsvParserTest extends Specification {

}

def 'should handle comment character' () {

    // each table row is parsed with the given comment setting (null leaves it unset)
    given:
    def csv = new CsvParser()
    csv.setComment(COMMENT)

    expect:
    csv.parse(LINE) == EXPECTED

    where:
    LINE                    | COMMENT   | EXPECTED
    'a,b,c'                 | '#'       | ['a','b','c']
    'a,b # comment'         | '#'       | ['a','b ']
    'a,b,c # comment'       | '#'       | ['a','b','c ']
    'a # comment,ignored'   | '#'       | ['a ']
    '# full line comment'   | '#'       | ['']
    'a,b,c'                 | null      | ['a','b','c']
    'a,b # not a comment'   | null      | ['a','b # not a comment']
}

}