Skip to content
This repository was archived by the owner on May 14, 2018. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ Imports:
whisker,
lubridate,
stringr,
outliers
outliers,
stringdist
Suggests:
testthat,
xtable
48 changes: 48 additions & 0 deletions R/spelling.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#' Detect possible capitalization errors
#'
#' Looks for distinct values of \code{char} that become identical once they
#' are lower-cased, e.g. \code{"Smith"} and \code{"smith"}.
#'
#' @param char a character vector
#' @return If at least one such set exists, a list with one element per set;
#'   each element holds the distinct original values that collapse to the same
#'   lower-case string. A message is emitted in that case. If no sets are
#'   found, \code{NULL} is returned invisibly.
#' @export
detect_cap <- function(char) {
  uniques <- unique(char)
  lower_uniques <- tolower(uniques)

  # Lower-cased forms shared by more than one distinct original value.
  # table() ignores NA by default, so missing values never form a set.
  matches <- names(which(table(lower_uniques) > 1))

  if (length(matches) == 0) {
    return(invisible(NULL))
  }

  message("The following sets of elements are identical except for capitalization")
  lapply(
    matches,
    function(x) {
      # which() discards NA comparisons; indexing with the raw logical vector
      # would append an NA to every set whenever `char` contains a missing value.
      uniques[which(lower_uniques == x)]
    }
  )
}

#' Detect uncomfortably similar string pairs
#'
#' Computes the pairwise string distance between the distinct values of
#' \code{char} and reports every pair whose distance is at or below
#' \code{tol}.
#'
#' @details The default method is Jaccard because it is pre-scaled, which
#'   makes a single default \code{tol} usable. It is probably not the best
#'   distance metric for this purpose.
#'
#' @param char a character vector
#' @param method a valid \code{method} for \code{stringdistmatrix}
#' @param tol largest distance at which two values are still reported as
#'   uncomfortably similar
#' @param ... additional arguments to \code{stringdistmatrix}
#' @return A data frame with columns \code{a} and \code{b}, one row per pair of
#'   distinct values whose distance is \code{<= tol}. The column names carry
#'   no meaning beyond identifying the two members of a pair. The data frame
#'   has zero rows when no pairs are found.
#' @importFrom stringdist stringdistmatrix
#' @export
detect_similar <- function(char, method = "jaccard", tol = 0.1, ...) {
  uniques <- unique(char)
  distmat <- stringdistmatrix(uniques, uniques, method = method, ...)

  # Only the strict upper triangle is inspected so that each pair is reported
  # once and a value is never paired with itself. which() drops NA distances.
  indices <- which(distmat <= tol & upper.tri(distmat))

  # Only announce findings when there are some to report.
  if (length(indices) > 0) {
    message("The following pairs of elements were uncomfortably similar")
  }

  # Could do something fancy like grouping them together (not just pairs) &
  # indicating which name is more common.

  # Column names aren't meaningful
  data.frame(
    a = uniques[row(distmat)[indices]],
    b = uniques[col(distmat)[indices]]
  )
}