diff --git a/R/XString-class.R b/R/XString-class.R index 5e6333cc..fbb85294 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -139,6 +139,70 @@ setMethod("extract_character_from_XString_by_ranges", "XString", } ) +### BString methods to support 0:255 input + +## BSTRING_RAW_LOOKUP is initialized in `zzz.R` +setMethod("extract_character_from_XString_by_ranges", "BString", + function(x, start, width, collapse=FALSE) + { + SHOW_RAW <- getOption("Biostrings.showRaw") + if(!is.logical(SHOW_RAW)){ + warning("Invalid value for option 'Biostrings.showRaw', ", + "resetting to FALSE") + SHOW_RAW <- FALSE + options(Biostrings.showRaw=FALSE) + } + if(!SHOW_RAW){ + callNextMethod() + } else { + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) + lkup <- xs_dec_lkup(x) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_ranges(x, start, width, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], + collapse='') + xs + } + } +) + +setMethod("extract_character_from_XString_by_positions", "BString", + function(x, pos, collapse=FALSE) + { + SHOW_RAW <- getOption("Biostrings.showRaw") + if(!is.logical(SHOW_RAW)){ + warning("Invalid value for option 'Biostrings.showRaw', ", + "resetting to FALSE") + SHOW_RAW <- FALSE + options(Biostrings.showRaw=FALSE) + } + if(!SHOW_RAW){ + callNextMethod() + } else { + lkup <- xs_dec_lkup(x) + bstring_lookup <- get("BSTRING_RAW_LOOKUP", envir=.pkgenv) + + ## need to remap null bytes, they have to be in 0:255 + ## so we have to overload some value + if(is.null(lkup)) lkup <- c(255L,1:255) + xs <- XVector:::extract_character_from_XRaw_by_positions(x, pos, + collapse=collapse, + lkup=lkup) + ## replace all undisplayable characters + for(i in seq_along(xs)) + xs[i] <- paste(bstring_lookup[as.integer(charToRaw(xs[i]))+1L], + collapse='') + xs + } + } +) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### make_XString_from_string() diff --git a/R/zzz.R b/R/zzz.R index 65e22402..6b03cfec 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,7 @@ ### +.pkgenv <- new.env(parent=emptyenv()) + .onLoad <- function(libname, pkgname) { .Call2("init_DNAlkups", @@ -16,6 +18,34 @@ option_name <- "Biostrings.coloring" if (!(option_name %in% names(.Options))) options(setNames(list(TRUE), option_name)) + + option_name <- "Biostrings.showRaw" + if (!(option_name %in% names(.Options))) + options(setNames(list(FALSE), option_name)) + + ## BString lookup for raw strings + ## 256 char lookup table for 0:255 (note off by one) + ## characters 0-31 and 127-255 are not displayable + ## so positions 1-32 and 128-256 should be replaced + encoding_details <- l10n_info() + bstring_lookup <- rawToChar(as.raw(0:255), multiple=TRUE) + invalid_chars <- c(1:32,128:256) + # if(encoding_details$`UTF-8`){ + # # braille is nice if supported + # # allows for char comparisons after as.character() comparisons + # # I think it's overkill, though...uncomment this section if we need it + # bstring_lookup[invalid_chars] <- + # as.character(parse(text=paste0("'\\U28", as.raw(95:255), "'"))) + # } else + if (encoding_details$MBCS){ + # use multibyte question mark if supported + compact_unknown <- rawToChar(as.raw(c(0xef, 0xbf, 0xbd))) + bstring_lookup[invalid_chars] <- compact_unknown + } else { + # otherwise just use the regular '?' + bstring_lookup[invalid_chars] <- "?" + } + assign("BSTRING_RAW_LOOKUP", bstring_lookup, envir=.pkgenv) } .onUnload <- function(libpath) diff --git a/man/XString-class.Rd b/man/XString-class.Rd index f1f8f3b9..96b67c2a 100644 --- a/man/XString-class.Rd +++ b/man/XString-class.Rd @@ -105,7 +105,7 @@ \describe{ \item{\code{alphabet(x)}:}{ \code{NULL} for a \code{BString} object. - See the corresponding man pages when \code{x} is a + See the corresponding man pages when \code{x} is a \link{DNAString}, \link{RNAString} or \link{AAString} object. } \item{\code{length(x)}:}{ or \code{nchar(x)}: @@ -167,6 +167,14 @@ } } +\note{ +BString objects can technically hold any valid ASCII code, which includes all values in the range 0:255. However, non-displayable characters may cause the BString object to display in a weird format (see Examples for one such case). Even worse, if a 0 value is included in the BString, attempting to display it will throw a cryptic error that looks like this: + +\code{Error in XVector:::extract_character_from_XRaw_by_ranges: embedded nul in string: ...} + +BString objects are intended to hold displayable characters, so this shouldn't be an issue for most cases. However, if you need BString objects to hold non-displayable values, you can set \code{options(Biostrings.showRaw=TRUE)} to fix the formatting of non-displayable characters. Note that this adds some overhead to methods that show BString objects. +} + \author{H. Pagès} \seealso{ @@ -214,6 +222,18 @@ identical(b, 1:length(b)) # FALSE ## Compacting. As a particular type of XVector objects, XString ## objects can optionally be compacted. Compacting is done typically ## before serialization. See ?compact for more information. + +## Non-displayable characters in BStrings +## BString objects support any value, though this isn't encouraged: +b_bad <- as(as(as.raw(c(10,65,10,66,10,67,68)), "XRaw"), "BString") +b_bad ## formatting is all messed up because 10 = \n + +## if you really need to display characters like this, set this option: +options(Biostrings.showRaw=TRUE) +b_bad ## now all 7 characters "display" + +## reseting to default +options(Biostrings.showRaw=FALSE) } \keyword{methods} diff --git a/tests/testthat/test-XString-class.R b/tests/testthat/test-XString-class.R index 6253f758..de946148 100644 --- a/tests/testthat/test-XString-class.R +++ b/tests/testthat/test-XString-class.R @@ -250,6 +250,29 @@ test_that("reverse, complement, reverseComplement work correctly", { expect_equal(as.character(reverseComplement(mrna)), .revString(mr_comp)) }) +test_that("BStrings display correctly with full 0-255 value range", { + orig_setting <- getOption("Biostrings.showRaw") + full_bstring <- as(as(as.raw(0:255),"XRaw"),"BString") + options(Biostrings.showRaw=FALSE) + expect_error(extract_character_from_XString_by_ranges(full_bstring, 1L, 256L), + "embedded nul in string") + + ## can't really test MBCS vs. non-MBCS because we can't guarantee + ## the test suites will run on a platform with(out) MBCS + options(Biostrings.showRaw=TRUE) + expect_is(extract_character_from_XString_by_ranges(full_bstring, 1L, 256L), + "character") + + options(Biostrings.showRaw=10) + expect_warning(extract_character_from_XString_by_ranges(BString("ABCD"), 1L, 4L), + "Invalid value for option 'Biostrings.showRaw'") + expect_false(getOption("Biostrings.showRaw")) + + ## make sure we leave the system as we found it + options(Biostrings.showRaw=orig_setting) + +}) + ## Porting RUnit tests test_that("alphabet finds the correct values", { expect_equal(alphabet(DNAString(dnastr)), strsplit(dnastr, "")[[1]])