diff --git a/R/XStringSet-io.R b/R/XStringSet-io.R index e594f71b..13a6fdf5 100644 --- a/R/XStringSet-io.R +++ b/R/XStringSet-io.R @@ -362,8 +362,11 @@ readAAStringSet <- function(filepath, format="fasta", stop(wmsg("'width' must be a single integer")) if (!is.integer(width)) width <- as.integer(width) - if (width < 1L) - stop(wmsg("'width' must be an integer >= 1")) + if (width < 1L) { + ## set to maximum possible width + ## using `lengths` because the function overrides `width` + width <- max(lengths(x)) + } lkup <- get_seqtype_conversion_lookup(seqtype(x), "B") .Call2("write_XStringSet_to_fasta", x, filexp_list, width, lkup, diff --git a/man/XStringSet-io.Rd b/man/XStringSet-io.Rd index 1f389842..bea1cb7d 100644 --- a/man/XStringSet-io.Rd +++ b/man/XStringSet-io.Rd @@ -169,7 +169,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE) If \code{format="fasta"}, the \code{width} argument can be used to specify the maximum number of letters per line of sequence. - \code{width} must be a single integer. + \code{width} must be a single integer, and uses an unbounded width if set to a value less than 1. If \code{format="fastq"}, the \code{qualities} argument can be used to specify the quality strings. \code{qualities} must be a \link{BStringSet} diff --git a/src/read_fasta_files.c b/src/read_fasta_files.c index b480060b..52d4c1fc 100644 --- a/src/read_fasta_files.c +++ b/src/read_fasta_files.c @@ -628,6 +628,7 @@ SEXP write_XStringSet_to_fasta(SEXP x, SEXP filexp_list, SEXP width, SEXP lkup) { XStringSet_holder X; int x_length, width0, lkup_len, i, j1, j2, dest_nbytes; + int nmax_write_bytes, nwrote_bytes; const int *lkup0; SEXP filexp, x_names, desc; Chars_holder X_elt; @@ -636,8 +637,7 @@ SEXP write_XStringSet_to_fasta(SEXP x, SEXP filexp_list, SEXP width, SEXP lkup) x_length = _get_length_from_XStringSet_holder(&X); filexp = VECTOR_ELT(filexp_list, 0); width0 = INTEGER(width)[0]; - if (width0 >= IOBUF_SIZE) - error("'width' must be <= %d", IOBUF_SIZE - 1); + nmax_write_bytes = width0 >= IOBUF_SIZE ? IOBUF_SIZE - 1 : width0; iobuf[width0] = 0; if (lkup == R_NilValue) { lkup0 = NULL; @@ -657,21 +657,33 @@ SEXP write_XStringSet_to_fasta(SEXP x, SEXP filexp_list, SEXP width, SEXP lkup) } filexp_puts(filexp, "\n"); X_elt = _get_elt_from_XStringSet_holder(&X, i); - for (j1 = 0; j1 < X_elt.length; j1 += width0) { - j2 = j1 + width0; + j1 = 0; + nwrote_bytes = 0; + while(j1 < X_elt.length){ + j2 = j1 + nmax_write_bytes; if (j2 > X_elt.length) j2 = X_elt.length; dest_nbytes = j2 - j1; + if(nwrote_bytes + dest_nbytes > width0){ + // align write to size width0 + dest_nbytes = width0 - nwrote_bytes; + j2 = j1 + dest_nbytes; + } j2--; Ocopy_bytes_from_i1i2_with_lkup(j1, j2, iobuf, dest_nbytes, X_elt.ptr, X_elt.length, lkup0, lkup_len); iobuf[dest_nbytes] = 0; + nwrote_bytes += dest_nbytes; + j1 = j2+1; filexp_puts(filexp, iobuf); - filexp_puts(filexp, "\n"); + if(j1 == X_elt.length || nwrote_bytes == width0){ + // add newline if at end of line (width0) or end of sequence + filexp_puts(filexp, "\n"); + nwrote_bytes = 0; + } } } return R_NilValue; } - diff --git a/src/read_fastq_files.c b/src/read_fastq_files.c index 61770577..a1133cb7 100644 --- a/src/read_fastq_files.c +++ b/src/read_fastq_files.c @@ -615,9 +615,25 @@ static void write_FASTQ_id(SEXP filexp, const char *markup, const char *id) filexp_puts(filexp, "\n"); } -static void write_FASTQ_seq(SEXP filexp, const char *buf) +static void write_FASTQ_seq(SEXP filexp, const Chars_holder X, + const int *lkup0, int lkup_len) { - filexp_puts(filexp, buf); + // write sequences in chunks in case sequence is longer than I/O buffer + int i1, i2, bytes_to_write, bytes_remaining; + bytes_remaining = X.length; + i1 = 0; + while(bytes_remaining){ + bytes_to_write = bytes_remaining >= IOBUF_SIZE ? IOBUF_SIZE-1 : bytes_remaining; + i2 = i1+bytes_to_write-1; + Ocopy_bytes_from_i1i2_with_lkup(i1, i2, + iobuf, bytes_to_write, + X.ptr, X.length, + lkup0, lkup_len); + iobuf[bytes_to_write] = 0; + filexp_puts(filexp, iobuf); + i1 = i2 + 1; + bytes_remaining -= bytes_to_write; + } filexp_puts(filexp, "\n"); } @@ -677,18 +693,8 @@ SEXP write_XStringSet_to_fastq(SEXP x, SEXP filexp_list, for (i = 0; i < x_length; i++) { id = get_FASTQ_rec_id(x_names, q_names, i); X_elt = _get_elt_from_XStringSet_holder(&X, i); - if (X_elt.length >= IOBUF_SIZE) - error("XStringSet object (or derivative) to " - "write 'x' cannot contain strings\n longer " - "than %d ('x[[%d]]' has %d characters)", - IOBUF_SIZE - 1, i + 1, X_elt.length); - Ocopy_bytes_from_i1i2_with_lkup(0, X_elt.length - 1, - iobuf, X_elt.length, - X_elt.ptr, X_elt.length, - lkup0, lkup_len); - iobuf[X_elt.length] = 0; write_FASTQ_id(filexp, FASTQ_line1_markup, id); - write_FASTQ_seq(filexp, iobuf); + write_FASTQ_seq(filexp, X_elt, lkup0, lkup_len); write_FASTQ_id(filexp, FASTQ_line3_markup, id); if (qualities != R_NilValue) { write_FASTQ_qual(filexp, X_elt.length, &Q, i); diff --git a/tests/testthat/test-XStringSet-io.R b/tests/testthat/test-XStringSet-io.R index 40c52551..e502a547 100644 --- a/tests/testthat/test-XStringSet-io.R +++ b/tests/testthat/test-XStringSet-io.R @@ -175,3 +175,43 @@ test_that("XStringSet read/write is correct", { unlink(tmpfilefq) }) +test_that("Writing XStringSets larger than IOBUF_SIZE functions correctly", { + ## IOBUF_SIZE is 200,003 + tf <- tempfile() + set.seed(456L) + x1 <- paste(sample(DNA_BASES, 400020, replace=TRUE), collapse='') + x2 <- paste(sample(DNA_BASES, 200127, replace=TRUE), collapse='') + xss <- DNAStringSet(c(x1, x2)) + names(xss) <- c("seq1", "seq2") + exp_output_fasta <- paste0('>seq1', x1, '>seq2', x2) + for(w in c(5, 80, 213, 200003, width(xss)[1], width(xss)[2], -1)){ + writeXStringSet(xss, tf, width = w) + opt <- readLines(tf) + opt <- paste(opt, collapse='') + expect_equal(opt, exp_output_fasta) + read_seqs <- readDNAStringSet(tf) + expect_true(all(as.character(read_seqs) == as.character(xss))) + } +}) + +test_that("Writing QualityScaledXStringSets larger than IOBUF_SIZE functions correctly", { + ## IOBUF_SIZE is 200,003 + tf <- tempfile() + set.seed(456L) + x1 <- paste(sample(DNA_BASES, 200127, replace=TRUE), collapse='') + x2 <- paste(sample(DNA_BASES, 400020, replace=TRUE), collapse='') + q1 <- paste(sample(as.character(1:9), 200127, replace=TRUE), collapse='') + q2 <- paste(sample(as.character(1:9), 400020, replace=TRUE), collapse='') + xss <- DNAStringSet(c(x1, x2)) + quals <- PhredQuality(c(q1, q2)) + names(xss) <- c("seq1", "seq2") + names(quals) <- c("seq1", "seq2") + exp_output_fastq <- paste0("@seq1", x1, "+seq1", q1, "@seq2", x2, "+seq2", q2) + qss <- QualityScaledDNAStringSet(xss, quals) + writeQualityScaledXStringSet(qss, tf) + opt <- paste(readLines(tf), collapse='') + expect_equal(opt, exp_output_fastq) + read_seqs <- readQualityScaledDNAStringSet(tf) + expect_true(all(as.character(read_seqs) == as.character(xss))) + expect_true(all(as.character(quality(read_seqs)) == as.character(quals))) +})