diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index 7a09f86..0c7288e 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -223,6 +223,118 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesList"), } ) +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### "findOverlaps" methods for GRangesFactor objects +### + +.findOverlaps_Factor_other <- function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + select <- match.arg(select) + type <- match.arg(type) + FUN <- function(Query, Select) { + findOverlaps(Query, subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + + if (length(query) < length(levels(query))) { + FUN(unfactor(query), Select=select) + } else { + if (select=="all") { + lev.hits <- FUN(levels(query), "all") + idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) + Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + lev.hits <- FUN(levels(query), select) + lev.hits[as.integer(query)] + } + } +} + +setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), .findOverlaps_Factor_other) + +setMethod("findOverlaps", c("GRangesFactor", "GRangesList"), .findOverlaps_Factor_other) + +.findOverlaps_other_Factor <- function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + select <- match.arg(select) + type <- match.arg(type) + FUN <- function(Subject, Select) { + findOverlaps(query, Subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + + if (length(subject) < length(levels(subject))) { + FUN(unfactor(subject), select) + } else { + if (select=="all") { + lev.hits <- FUN(levels(subject), "all") + idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) + Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + s.idx <- as.integer(subject) + if (select=="first") { + # Get the index of the first range for each level. + u <- which(!duplicated(s.idx)) + } else { + # Get the index of the last range for each level. + u <- which(!duplicated(s.idx, fromLast=TRUE)) + } + lev.hits <- FUN(levels(subject)[s.idx[u]], select) + u[lev.hits] + } + } +} + +setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), .findOverlaps_other_Factor) + +setMethod("findOverlaps", c("GRangesList", "GRangesFactor"), .findOverlaps_other_Factor) + +setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + if (length(query) < length(levels(query))) { + query <- unfactor(query) + callGeneric() + } else if (length(subject) < length(levels(subject))) { + subject <- unfactor(subject) + callGeneric() + } else { + FUN <- function(Query, Subject, Select) { + findOverlaps(Query, Subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + + select <- match.arg(select) + if (select=="all") { + lev.hits <- FUN(levels(query), levels(subject), "all") + q.idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) + s.idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) + reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) + + Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], + to=subjectHits(s.idx.hits)[subjectHits(reconciler)], + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + s.idx <- as.integer(subject) + if (select=="first") { + # Get the index of the first range for each level. + u <- which(!duplicated(s.idx)) + } else { + # Get the index of the last range for each level. + u <- which(!duplicated(s.idx, fromLast=TRUE)) + } + lev.hits <- FUN(levels(query), levels(subject)[s.idx[u]], select) + u[lev.hits][as.integer(query)] + } + } +}) ### ========================================================================= ### findOverlaps-based methods diff --git a/inst/unitTests/test_findOverlaps-methods.R b/inst/unitTests/test_findOverlaps-methods.R index b479919..c26d2ab 100644 --- a/inst/unitTests/test_findOverlaps-methods.R +++ b/inst/unitTests/test_findOverlaps-methods.R @@ -318,6 +318,97 @@ test_findOverlaps_with_circular_sequences <- function() .checkHits(1:4, 1:4, 4, 4, current5, select="all") } +test_findOverlaps_with_GRangesFactors <- function() { + ir0 <- IRanges(c(5, 25, 20, 30, 45, 35, 10, 15), width=10) + gr0 <- GRanges("chrA", ir0) + F0 <- Factor(gr0[rep(seq_along(gr0), seq_along(gr0))]) + + ir1 <- IRanges(c(18, 8, 28, 38), width=5) + gr1 <- GRanges("chrA", ir1) + F1 <- Factor(gr1[rep(seq_along(gr1), rev(seq_along(gr1)))]) + + ###################### + # findOverlaps works with a Factor as the query. + out <- findOverlaps(F0, gr1) + ref <- findOverlaps(unfactor(F0), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(F0, gr1, minoverlap=4) + ref <- findOverlaps(unfactor(F0), gr1, minoverlap=4) + checkIdentical(out, ref) + + # findOverlaps works with a Factor as the subject. + out <- findOverlaps(gr0, F1) + ref <- findOverlaps(gr0, unfactor(F1)) + checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees. + + out <- findOverlaps(gr0, F1, maxgap=4) + ref <- findOverlaps(gr0, unfactor(F1), maxgap=4) + checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees. + + # findOverlaps works with two Factors. + out <- findOverlaps(F0, F1) + ref <- findOverlaps(unfactor(F0), unfactor(F1)) + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, maxgap=2) + ref <- findOverlaps(unfactor(F0), unfactor(F1), maxgap=2) + checkIdentical(out, ref) + + ###################### + # All methods work with different settings for 'select'. + + out <- findOverlaps(F0, gr1, select="first") + ref <- findOverlaps(unfactor(F0), gr1, select="first") + checkIdentical(out, ref) + + out <- findOverlaps(F0, gr1, select="last") + ref <- findOverlaps(unfactor(F0), gr1, select="last") + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1, select="first") + ref <- findOverlaps(gr0, unfactor(F1), select="first") + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1, select="last") + ref <- findOverlaps(gr0, unfactor(F1), select="last") + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, select="first") + ref <- findOverlaps(unfactor(F0), unfactor(F1), select="first") + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, select="last") + ref <- findOverlaps(unfactor(F0), unfactor(F1), select="last") + checkIdentical(out, ref) + + ###################### + # All methods work correctly with small Factors that cause unfactor()ing. + out <- findOverlaps(F0[1:2], gr1) + ref <- findOverlaps(unfactor(F0[1:2]), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(F0[3], gr1) + ref <- findOverlaps(unfactor(F0[3]), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1[4:3]) + ref <- findOverlaps(gr0, unfactor(F1[4:3])) + checkIdentical(sort(out), sort(ref)) + + out <- findOverlaps(gr0, F1[1]) + ref <- findOverlaps(gr0, unfactor(F1[1])) + checkIdentical(sort(out), sort(ref)) + + out <- findOverlaps(F0[10:13], F1) + ref <- findOverlaps(unfactor(F0[10:13]), unfactor(F1)) + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1[2]) + ref <- findOverlaps(unfactor(F0), unfactor(F1[2])) + checkIdentical(out, ref) +} + test_poverlaps <- function() { ans <- poverlaps(GRanges(), GRanges()) checkIdentical(ans, Rle()) diff --git a/man/findOverlaps-methods.Rd b/man/findOverlaps-methods.Rd index be2e737..7467f50 100644 --- a/man/findOverlaps-methods.Rd +++ b/man/findOverlaps-methods.Rd @@ -6,6 +6,9 @@ \alias{findOverlaps,GRangesList,GenomicRanges-method} \alias{findOverlaps,GenomicRanges,GRangesList-method} \alias{findOverlaps,GRangesList,GRangesList-method} +\alias{findOverlaps,GRangesFactor,GenomicRanges-method} +\alias{findOverlaps,GenomicRanges,GRangesFactor-method} +\alias{findOverlaps,GRangesFactor,GRangesFactor-method} \alias{countOverlaps} \alias{countOverlaps,GenomicRanges,GenomicRanges-method} @@ -97,6 +100,10 @@ For \code{type="equal"} with GRangesList objects, \code{query[[i]]} matches \code{subject[[j]]} iff for each range in \code{query[[i]]} there is an identical range in \code{subject[[j]]}, and vice versa. + + If either or both \code{query} or \code{subject} are \link{GRangesFactor} + objects, overlaps are identified based on the unique levels. This improves + the efficiency of this function for large GRangesFactors with few levels. } \value{