diff --git a/NAMESPACE b/NAMESPACE index c298730..fe42466 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,8 +35,6 @@ export(qLap) export(rLap) export(release2json) export(sgn) -export(treeGetAccuracy) -export(treeGetParameters) exportClasses(dpCovariance) exportClasses(dpGLM) exportClasses(dpHeavyHitters) diff --git a/R/statistic-tree.R b/R/statistic-tree.R index 95de968..97ed977 100644 --- a/R/statistic-tree.R +++ b/R/statistic-tree.R @@ -1,145 +1,27 @@ -#' Accuracy for a differentially private binary tree +#' Calculate bins of each leaf in the tree by row #' -#' @param epsilon Numeric differential privacy parameter -#' @param rng Numeric a priori estimate of the variable range -#' @param gran Numeric granularity -#' @param alpha Numeric level of statistical significance, default 0.05 -#' @return Accuracy guarantee for the tree given epsilon -#' @export treeGetAccuracy -#' @rdname treeGetAccuracy - -treeGetAccuracy <- function(epsilon, rng, gran, alpha=0.05) { - universeSize <- diff(rng) / gran + 1 - accuracy <- (2 * sqrt(2) / epsilon) * sqrt(log(2 / alpha)) * log2(universeSize)^(1.5) - return(accuracy) -} - - -#' Epsilon for a differentially private binary tree -#' -#' @param accuracy Numeric accuracy needed -#' @param rng Numeric a priori estimate of the variable range -#' @param gran Numeric granularity -#' @param alpha Numeric level of statistical significance, default 0.05 -#' @return Epsilon necessary to guarantee the given accuracy -#' @export treeGetParameters -#' @rdname treeGetParameters - -treeGetParameters <- function(accuracy, rng, gran, alpha=0.05) { - universeSize <- diff(rng) / gran + 1 - epsilon <- (2 * sqrt(2) / accuracy) * sqrt(log(2 / alpha)) * log2(universeSize)^(1.5) - return(epsilon) -} - - -#' Function to truncate negative noisy node counts at zero -#' -#' @param release The differentially private noisy binary tree -#' @return Noisy binary tree truncated at zero - -treePostFormatRelease <- function(release) { - release <- round(release) - release[release < 0] <- 0 - return(release) -} - - -#' Function to derive CDF from efficient terminal node counts -#' -#' @param release Efficient differentially private binary tree -#' @param rng An a priori estimate of the range of the vector -#' being represented as a binary tree -#' @param terminalIndex Vector of indices corresponding to the terminal -#' leaf nodes of the binary tree -#' @return Differentially private estimate of the empirical cumulative -#' distribution function - -treePostCDF <- function(release, rng, terminalIndex) { - terminal <- release[terminalIndex] - stepSize <- diff(rng) / length(terminal) - cdfSteps <- seq(rng[1], rng[2], stepSize) - cdf <- c(0, cumsum(terminal) / sum(terminal)) - cdf <- data.frame(list('val' = cdfSteps, 'cdf' = cdf)) - return(cdf) -} - - -#' Function to evaluate the mean using the DP CDF -#' -#' @param cdf Differentially private estimate of the empirical cumulative -#' distribution function -#' @param rng Numeric a priori estimate of the range -#' @param gran Granularity -#' @return Differentially private estimate of the mean - -treePostMean <- function(cdf, rng) { - ecdf <- cdf$cdf - pdf <- sapply(2:length(ecdf), function(i) ecdf[i] - ecdf[i - 1]) - p <- c(ecdf[1], pdf) * cdf$val - return(sum(p)) -} - - -#' Function to evaluate the median using the DP CDF -#' -#' @param cdf Differentially private estimate of the empirical cumulative -#' distribution function -#' @return Differentially private estimate of the median - -treePostMedian <- function(cdf) { - outMedian <- 
treePostPercentiles(cdf, 0.5)$value
-    return(outMedian)
-}
-
-
-#' Quantile function using the DP CDF
+#' @param rng Total range of the data to be binned into the tree, in the form (min, max): a numeric vector of length 2.
+#' @param depth Depth of the tree, considering the root to have depth 0.
+#' @param n Number of data points to be binned.
 #'
-#' @param cdf Differentially private estimate of the empirical cumulative
-#'    distribution function
-#' @param percentiles Vector of probabilities given to the quantile function
-#' @return Differnetially private estimate of the values corresponding to
-#'    the provided probabilities
-
-treePostPercentiles <- function(cdf, percentiles) {
-    absArgMin <- function(q, cdf) {
-        target <- abs(q - cdf$cdf)
-        out <- cdf$val[which(target == min(target))]
-        return(c(q, mean(out)))
-    }
-    outValues <- lapply(percentiles, absArgMin, cdf)
-    outValues <- data.frame(do.call(rbind, outValues))
-    names(outValues) <- c('percentile', 'value')
-    return(outValues)
+#' Note: This can be calculated entirely publicly. The 'n' input is not strictly necessary,
+#' but is a residual effect of the fact that the helper function called here can also be used when the user
+#' specifies a desired granularity for the bins instead of supplying a range, which may be something
+#' we want to eventually implement here.
+#'
+#' @return A list of the bins used in the tree, from the first level of the tree to the leaves.
+treeBins <- function(rng, depth, n){
+    binsByLevel <- list()
+    i <- 1
+    while(i <= depth){
+        nBins <- 2^i
+        bins <- determineNumericIntegerBins(rng, n, nBins, NULL) # NULL is passed as granularity since it is unnecessary here
+        binsByLevel <- append(binsByLevel, list(bins))
+        i <- i+1
+    }
+    return(binsByLevel)
}
-
-#' Function to efficiently estimate noisy node counts
-#'
-#' @param release The truncated differentially private noisy binary tree
-#'    in vector form
-#' @param treeData Data frame with binary tree attributes, including depth
-#'    and indicators of parent and adjacent nodes. Note that
-#'    \code{nrow(treeData) == length(release)}
-#' @param n Number of observations
-#' @param nNodes Number of nodes in the binary tree, also \code{length(release)}
-#' @param variance The variance of the noise used to perturb tree nodes
-#' @param terminalIndex Vector of indices corresponding to the terminal
-#'    leaf nodes of the binary tree
-#' @return Efficient differentially private binary tree
-
-treePostEfficient <- function(release, treeData, n, variance, terminalIndex) {
-    nNodes <- length(release)
-    sigma <- sqrt(variance)
-    invSigmaSq <- 1 / variance
-    tree <- cbind(treeData, release)
-    names(tree)[ncol(tree)] <- 'noisy'
-    tree <- estBottomUp(tree, min(terminalIndex), nNodes, sigma, invSigmaSq)
-    tree <- estTopDown(tree, n, nNodes, sigma, invSigmaSq)
-    tree <- estEfficiently(tree, n, nNodes, sigma, invSigmaSq)
-    return(round(tree$est.efficient))
-}
-
-
 #' Differentially private binary tree
 #'
 #' @param mechanism Character, the privacy mechanism.
@@ -166,73 +48,97 @@ treePostEfficient <- function(release, treeData, n, variance, terminalIndex) {
 
 dpTree <- setRefClass(
     Class = 'dpTree',
-    contains = 'mechanismLaplace'
+    contains = 'mechanismLaplace',
+    fields = list(
+        globalEps = 'numeric',
+        depth = 'numeric',
+        binsByLevel = 'list',
+        bins = 'numeric'
+    )
+)
 
 dpTree$methods(
-    # DO NOT USE
-    initialize = function(mechanism, varType, variable, n, rng=NULL, gran, epsilon=NULL,
-                          accuracy=NULL, imputeRng=NULL, percentiles=NULL, alpha=0.05, ...) {
+    initialize = function(varType, variable, n, depth, rng=NULL, globalEps=NULL, epsilon=NULL,
+                          accuracy=NULL, imputeRng=NULL, alpha=0.05, ...) {
         .self$name <- 'Differentially private binary tree'
-        .self$mechanism <- checkMechanism(mechanism, "mechanismLaplace")
-        .self$varType <- checkVariableType(varType, c('numeric', 'integer', 'logical', 'character'))
+        .self$mechanism <- "mechanismLaplace"
+        .self$varType <- checkVariableType(varType, c('numeric', 'integer'))
         .self$variable <- variable
         .self$n <- checkN(n)
-        .self$rng <- checkRange(rng) # CHANGE
-        .self$gran <- checkN(gran, emptyOkay=TRUE) #should be positive whole number
+        .self$depth <- checkN(depth)
+        .self$rng <- checkRange(rng, .self$varType, 'vector')
+        .self$rngFormat <- 'vector'
         .self$alpha <- checkNumeric(alpha)
-        .self$sens <- 2 * log2(diff(rng) / gran + 1)
-        checkVariableType(variable, "character")
+        checkVariableType(typeof(variable), "character")
+        .self$sens <- 2
 
-        if (is.null(epsilon)) {
-            .self$accuracy <- checkAccuracy(accuracy)
-            .self$epsilon <- treeGetParameters(accuracy, rng, gran, alpha)
-        } else {
-            .self$epsilon <- checkEpsilon(epsilon)
-            .self$accuracy <- treeGetAccuracy(epsilon, rng, gran, alpha)
+        # Option 1: Specify global epsilon value
+        if (!is.null(globalEps)){
+            .self$globalEps <- checkEpsilon(globalEps)
+            .self$epsilon <- .self$globalEps/depth
+            .self$accuracy <- laplaceGetAccuracy(.self$sens, .self$epsilon)
+        }
+        # Option 2: Specify epsilon value for each row of tree
+        else if (!is.null(epsilon)){
+            .self$epsilon <- checkEpsilon(epsilon)
+            .self$globalEps <- checkEpsilon(epsilon*.self$depth)
+            .self$accuracy <- laplaceGetAccuracy(.self$sens, .self$epsilon)
         }
+        # Option 3: Specify an accuracy value
+        else if (!is.null(accuracy)){
+            .self$accuracy <- checkAccuracy(accuracy)
+            .self$epsilon <- laplaceGetEpsilon(.self$sens, .self$accuracy, .self$alpha)
+            .self$globalEps <- .self$epsilon * .self$depth
+        }
+
+        .self$binsByLevel <- treeBins(rng, depth, n)
+
         if (is.null(imputeRng)) {
             .self$imputeRng <- rng
         } else {
             .self$imputeRng <- imputeRng
         }
-        .self$percentiles <- percentiles
})

dpTree$methods(
    release = function(data) {
        x <- data[, variable]
-        variance <- 2 * sens / epsilon
-        universeSize <- floor(diff(rng) / gran + 1)
-        depth <- ceiling(log2(universeSize))
-        terminalIndex <- seq(2^(depth - 1), 2^depth - 1)
-        .self$result <- export(mechanism)$evaluate(.self$treeFun, x, sens, .self$postProcess,
-                                                   variance=variance, universeSize=universeSize,
-                                                   depth=depth, terminalIndex=terminalIndex, self=.self)
-})
-
-dpTree$methods(
-    treeFun = function(x, universeSize, depth) {
-        tree <- binaryTree(x, n, rng, gran, universeSize, depth)
-        .self$treeData <- tree[, which(names(tree) != 'count')]
-        return(tree$count)
+        counts <- list(n) # n is public, so the root of the tree need not be noisy. Double-nested to match the fact that later elements are also lists.
+        names(counts[[1]]) <- paste("[",toString(.self$rng),"]") # adding the bin range to the root node
+        i <- 1
+        while(i <= .self$depth){
+            .self$bins <- .self$binsByLevel[[i]] # bins of the ith row (note this is publicly computable)
+            noisyCount <- export(mechanism)$evaluate(funHist, x, .self$sens, (function(out) return(out)))
+            # In evaluate, the identity function is passed as the postProcess function, since we want to postprocess
+            # on all of the noisy counts together.
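+            # For intuition, under hypothetical inputs rng = c(0, 8) and depth = 2
+            # (and assuming equal-width bins from determineNumericIntegerBins):
+            # level 1 holds noisy counts for bins [0,4) [4,8] and level 2 for
+            # [0,2) [2,4) [4,6) [6,8], so one vector of noisy counts is appended per level.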
+ counts <- append(counts, list(noisyCount$release)) + i <- i+1 + } + #Note: postprocessing is called here instead of in the evaluate function + out <- list('release' = counts) + .self$result <- .self$postProcess(out) }) dpTree$methods( - postProcess = function(out, ...) { - out$variable <- variable - out$release <- treePostFormatRelease(out$release) - ellipsisVals <- getFuncArgs(list(...), treePostEfficient) - out$release <- do.call(treePostEfficient, c(list(release=out$release, treeData=treeData, n=n), ellipsisVals)) - ellipsisVals <- getFuncArgs(list(...), treePostCDF) - out$cdf <- do.call(treePostCDF, c(list(release=out$release, rng=rng), ellipsisVals)) - out$mean <- treePostMean(out$cdf, rng) - out$median <- treePostMedian(out$cdf) + postProcess = function(out) { + + out$epsilon <- .self$epsilon # epsilon used for each of the node calculations + out$globalEps <- .self$globalEps # global epsilon used in total out$accuracy <- .self$accuracy - out$epsilon <- .self$epsilon - if (!is.null(percentiles)) { - out$percentiles <- treePostPercentiles(out$cdf, percentiles) - } + out$variable <- variable + out$bins <- .self$binsByLevel + + # release an optimal version of the tree that leverages the multiple levels of information to produce higher accuracy counts + out$optimalPostProcess <- optimalPostProcess(out$release, .self$epsilon) + # release a cdf of the tree with highest granularity possible (equivalent to granularity of the bins in the leaf level of the tree) + out$postCDF <- treePostCDF(out$optimalPostProcess$optimalTree, out$bins) + # release the median from the cdf, or a best estimate if the 50th percentile is not an option from the cdf calculation + out$postMedian <- cdfMedian(out$postCDF) + # release an estimate of the mean from the tree, using the leaf bins and counts. + out$postMean <- treeMean(out$optimalPostProcess$optimalTree, out$bins) + return(out) }) diff --git a/R/utilities-histogram.R b/R/utilities-histogram.R index df50a13..b53ddd6 100644 --- a/R/utilities-histogram.R +++ b/R/utilities-histogram.R @@ -466,7 +466,7 @@ determineLogicalBins <- function(impute, object) { } } -# only called by determineBins() +# Called by determineBins() and the tree statistic determineNumericIntegerBins <- function(rng, n, nBins, granularity) { # first check if nBins is NULL, nBins is considered the truth for the number # of bins if the user has entered both nBins and granularity. 
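A minimal usage sketch of the new statistic, with hypothetical data and argument values (the data frame and column name are stand-ins; the constructor and release signatures are those introduced above):

dt <- dpTree$new(varType='numeric', variable='income', n=1000,
                 depth=3, rng=c(0, 100000), globalEps=1)
dt$release(myData)       # myData: hypothetical data frame with an 'income' column
dt$result$postMedian     # DP median estimate post-processed from the tree
dt$result$postMean       # DP mean estimate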
diff --git a/R/utilities-tree.R b/R/utilities-tree.R
index 5135eac..a80d9fe 100644
--- a/R/utilities-tree.R
+++ b/R/utilities-tree.R
@@ -1,214 +1,411 @@
-#' Function to evaluate weights from the noise variance and standard errors in child nodes for the
-#' node of a differentially private binary tree
-#'
-#' @param invSigmaSq Inverse variance of the noise used in perturbing nodes
-#' @param tree Data frame with binary tree attributes and node values
-#' @param idx Index of the node for which the weight is evaluated
-#' @return Weight
-
-wBelow <- function(invSigmaSq, tree, idx) {
-    leftIndex <- 2 * idx
-    rightIndex <- leftIndex + 1
-    w <- invSigmaSq / (invSigmaSq + 1 / (tree$seBelow[leftIndex]^2 + tree$seBelow[rightIndex]^2))
-    return(w)
-}
-
-
-#' Function to evaluate weights from the noise variance and standard errors in a parent and adjacent
-#' nodes for the node of a differentially private binary tree
-#'
-#' @param invSigmaSq Inverse variance of the noise used in perturbing nodes
-#' @param tree Data frame with binary tree attributes and node values
-#' @param parent Index of the parnet node
-#' @param adjacent Index of the adjacent node
-#' @return Weight
-
-wAbove <- function(invSigmaSq, tree, parent, adjacent) {
-    w <- invSigmaSq / (invSigmaSq + 1 / (tree$seAbove[parent]^2 + tree$seBelow[adjacent]^2))
-    return(w)
-}
-
+#### Optimal Post-Processed Counts ####
 
-#' Function to evaluate weights efficiently using the noise variance and standard errors in parent and adjacent
-#' nodes as well child nodes for the node of a differentially private binary tree
-#'
-#' @param tree Data frame with binary tree attributes and node values
-#' @param idx Index of the node for which the weight is evaluated
-#' @param parent Index of the parnet node
-#' @param adjacent Index of the adjacent node
-#' @return Weight
+#' Inverse variance of Laplace noise.
+#'
+#' Recall that the variance of the Laplace distribution is 2b^2, where b is the classic
+#' scale parameter of the Laplace distribution (which is the inverse of what we
+#' in this library call the scaling parameter). Here, b = eps/sens.
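+#'
+#' As a quick worked check with hypothetical numbers: sens = 2 and eps = 0.1 give
+#' b = 0.1/2 = 0.05, so the function returns 2 * 0.05^2 = 0.005.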
+#'
+#' @param sens Sensitivity of the function that is to be perturbed
+#' @param eps Privacy parameter used
 
-wEfficient <- function(tree, idx, parent, adjacent) {
-    w <- tree$seBelow[idx]^(-2) / (tree$seBelow[idx]^(-2) + (1 / (tree$seAbove[parent]^2 + tree$seBelow[adjacent]^2)))
-    return(w)
+inverseVariance <- function(sens, eps){
+    b <- eps/sens
+    return (2*b^2)
 }
 
-
-#' Function to estimate the nodes of a tree using noisy child nodes
+#' Creates array of adjacent elements in an almost certainly not optimally-efficient way
 #'
-#' @param w Weight used construct the estimate
-#' @param tree Data frame with binary tree attributes and node values
-#' @param idx Index of the node for which the estimate is evaluated
-#' @return Noisy node estimate
-
-estBelow <- function(w, tree, idx) {
-    leftIndex <- 2 * idx
-    rightIndex <- leftIndex + 1
-    est <- w * tree$noisy[idx] + (1 - w) * (tree$estBelow[leftIndex] + tree$estBelow[rightIndex])
-    return(est)
+#' @param ls 1D array of even length n
+#'
+#' @return 1D array of length n equal to ls with each adjacent pair of elements swapped
+#'
+#' @examples
+#' ls <- c(1,2,3,4)
+#' adjacentElements(ls) # outputs c(2,1,4,3)
+#'
+adjacentElements <- function(ls){
+    adj <- vector("numeric", length=length(ls))
+    i <- 1
+    while (i <= length(ls)){
+        if (i%%2 == 0){
+            adj[i] = ls[i-1]
+        }
+        else {
+            adj[i] = ls[i+1]
+        }
+        i <- i + 1
+    }
+    return(adj)
 }
 
-
-#' Function to estimate the nodes of a tree using noisy parent and adjacent nodes
-#'
-#' @param w Weight used construct the estimate
-#' @param tree Data frame with binary tree attributes and node values
-#' @param idx Index of the node for which the estimate is evaluated
-#' @param parent Index of the parnet node
-#' @param adjacent Index of the adjacent node
-#' @return Noisy node estimate
-
-estAbove <- function(w, tree, idx, parent, adjacent) {
-    est <- w * tree$noisy[idx] + (1 - w) * (tree$estAbove[parent] - tree$estBelow[adjacent])
-    return(est)
+#' Recursive weight estimate from below
+#'
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+#'
+#' @param tree Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.
+#'
+#' @return Single weight for each level of the tree.
+#'
+#' @examples
+#' t <- list(c(10), c(6,4), c(3,3,1,3))
+#' w <- wBelow(t) # should output c(4/7, 2/3, 1)
+#'
+wBelow <- function(tree){
+    weights <- numeric(length(tree)) # initialize with one weight per level of the tree
+    i <- length(tree)
+    while (i>0){
+        if (i == length(tree)){
+            weights[i] <- 1
+        }
+        else {
+            prev <- weights[i+1]
+            weights[i] <- (2*prev)/(2*prev+1)
+        }
+        i <- i - 1
+    }
+    return(weights)
 }
 
-
-#' Function to efficiently estimate the nodes of a tree using all available information in the tree
-#'
-#' @param w Weight used construct the estimate
-#' @param tree Data frame with binary tree attributes and node values
-#' @param idx Index of the node for which the estimate is evaluated
-#' @param parent Index of the parnet node
-#' @param adjacent Index of the adjacent node
-#' @return Weight
-
-estEfficient <- function(w, tree, idx, parent, adjacent) {
-    est <- w * tree$estBelow[idx] + (1 - w) * (tree$estAbove[parent] - tree$estBelow[adjacent])
-    return(est)
+#' Recursively compute counts from below
+#'
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
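+#'
+#' As a worked instance of the recursion (using the example tree below): an internal
+#' node with noisy count 6, children (3, 3), and level weight w = 2/3 gets the estimate
+#' from below (2/3)*6 + (1 - 2/3)*(3 + 3) = 6.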
+#'
+#' @param tree Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.
+#' @param wBelows Array of weights of same length as tree, where the ith weight corresponds to the ith weight calculated from below.
+#'
+#' @return List of counts for each node of the tree.
+#'
+#' @examples
+#' t <- list(c(10), c(6,4), c(3,3,1,3))
+#' w <- wBelow(t)
+#' c <- countBelow(t, w) # t and c should be equal, since counts in t have no added noise.
+countBelow <- function(tree, wBelows){
+    counts <- vector('list', length(tree))
+    i <- length(tree)
+    while (i > 0){
+        # taking advantage of R's vector operations to find all weighted counts per layer of tree at once
+        if (i == length(tree)){
+            counts[i] <- tree[i]
+        }
+        else{
+            w <- wBelows[i]
+            child <- tree[[i+1]]
+            l <- length(child)
+            childSum <- child[1:(l-1)] + child[2:l] # sum all pairs of children counts (note this also sums adjacent nodes that don't have the same parent)
+            childSum <- childSum[seq(1, l, by=2)] # pick out the sums that are of children with same parents
+            counts[[i]] <- w*tree[[i]]+(1-w)*(childSum)
+        }
+        i <- i - 1
+    }
+    return(counts)
 }
 
-
-#' Function to evaluate the standard error of a node estimate given a weight and the standard
-#' deviation of the noise used to perturb the nodes
+#' Recursive weight estimation from above
 #'
-#' @param w Weight used construct the estimate
-#' @param sigma Standard deviation of the noise used to perturb the estimates
-#' @return Standard error of the node estimate
-
-stErr <- function(w, sigma) {
-    return(sigma * sqrt(w))
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+#'
+#' @param tree Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.
+#' @param wBelows Array of weights of same length as tree, where the ith weight corresponds to the ith weight calculated from below.
+#'
+#' @return Single weight for each level of the tree.
+#'
+#' @examples
+#' t <- list(c(10), c(6,4), c(3,3,1,3))
+#' wB <- wBelow(t)
+#' wA <- wAbove(t, wB) # Should return c(1, 5/8, 13/21).
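+#' # e.g. the level-2 weight, assuming equal per-node noise variance:
+#' # 1/(1 + (wA[1] + wB[2])^(-1)) = 1/(1 + (1 + 2/3)^(-1)) = 5/8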
+wAbove <- function(tree, wBelows){
+    weights <- numeric(length(tree))
+    i <- 1
+    while (i <= length(tree)){
+        if (i == 1){
+            weights[i] <- 1
+        }
+        else {
+            prevAbove <- weights[i-1]
+            prevBelow <- wBelows[i]
+            weights[i] <- 1/(1 + (prevAbove+prevBelow)^(-1))
+        }
+        i <- i + 1
+    }
+    return(weights)
 }
 
-
-#' Function to estimate a noisy binary tree from the terminal nodes
-#'
-#' @param tree Data frame with binary tree attributes and node values
-#' @param terminalLevelIndex Index of the first terminal leaf node
-#' @param nNodes Number of nodes in the binary tree
-#' @param sigma Standard deviation of the noise used to perturb the estimates
-#' @param invSigmaSq Inverse variance of the noise used in perturbing nodes
-#' @return Bottom-up estimate of noisy binary tree in vector form
-
-estBottomUp <- function(tree, terminalLevelIndex, nNodes, sigma, invSigmaSq) {
-    tree$estBelow <- c(rep(NA, (terminalLevelIndex - 1)), tree$noisy[terminalLevelIndex:nrow(tree)])
-    tree$seBelow <- c(rep(NA, (terminalLevelIndex - 1)), rep(sigma, nNodes - (terminalLevelIndex - 1)))
-    tree$wBelow <- rep(NA, nNodes)
-    for (i in (terminalLevelIndex - 1):2) {
-        tree$wBelow[i] <- wBelow(invSigmaSq, tree, i)
-        tree$estBelow[i] <- estBelow(tree$wBelow[i], tree, i)
-        tree$seBelow[i] <- stErr(tree$wBelow[i], sigma)
+#' Recursively compute counts from above
+#'
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+#' @param tree Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.
+#' @param countsBelow Array of counts of same length as tree, as calculated by the countBelow function
+#' @param wAboves Weights computed from above, assuming each node in the tree has the same variance.
+#'
+#' @return List of counts for each node of the tree.
+#'
+#' @examples
+#' t <- list(c(10), c(6,4), c(3,3,1,3))
+#' wB <- wBelow(t)
+#' wA <- wAbove(t, wB)
+#' cB <- countBelow(t, wB)
+#' cA <- countAbove(t, cB, wA) # t and cA should be equal, since counts in t have no added noise.
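+#' # e.g. the first level-2 node: wA[2]*6 + (1 - wA[2])*(10 - 4) = (5/8)*6 + (3/8)*6 = 6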
+countAbove <- function(tree, countsBelow, wAboves){
+    counts <- vector('list', length(tree))
+    i <- 1
+    while (i <= length(tree)){
+        # taking advantage of R's vector operations to find all weighted counts per layer of tree at once
+        if (i == 1){
+            counts[[i]] <- tree[[i]]
+        }
+        else {
+            w <- wAboves[i]
+            parents <- rep(counts[[i-1]], each=2) # replicating each parent to make the dimension correct
+            adjacents <- adjacentElements(tree[[i]])
+            counts[[i]] <- w*tree[[i]] + (1-w)*(parents - adjacents)
     }
-    tree$estBelow[tree$estBelow < 0] <- 0
-    return(tree)
+        i <- i + 1
+    }
+    return(counts)
 }
 
-
-#' Function to estimate a noisy binary tree from the top down
-#'
-#' @param tree Data frame with binary tree attributes and node values
-#' @param n Number of observations in the vector represented by the binary tree
-#' @param nNodes Number of nodes in the binary tree
-#' @param sigma Standard deviation of the noise used to perturb the estimates
-#' @param invSigmaSq Inverse variance of the noise used in perturbing nodes
-#' @return Top-down estimate of noisy binary tree in vector form
-
-estTopDown <- function(tree, n, nNodes, sigma, invSigmaSq) {
-    tree$estAbove <- c(n, rep(NA, (nNodes - 1)))
-    tree$seAbove <- c(0, rep(NA, (nNodes - 1)))
-    tree$wAbove <- rep(NA, nNodes)
-    for (i in 2:nNodes) {
-        tree$wAbove[i] <- wAbove(invSigmaSq, tree, tree$parent[i], tree$adjacent[i])
-        tree$estAbove[i] <- estAbove(tree$wAbove[i], tree, i, tree$parent[i], tree$adjacent[i])
-        tree$seAbove[i] <- stErr(tree$wAbove[i], sigma)
+#' Optimal counts for nodes of tree
+#'
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+#'
+#' @param tree Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.
+#' @param wA Array of weights calculated from above with wAbove
+#' @param cA Array of counts calculated from above with countAbove
+#' @param cB Array of counts calculated from below with countBelow
+#'
+#' @return List of counts for each node of the tree.
+#'
+#' @examples
+#' t <- list(c(10), c(6,4), c(3,3,1,3))
+#' wB <- wBelow(t)
+#' wA <- wAbove(t, wB)
+#' cB <- countBelow(t, wB)
+#' cA <- countAbove(t, cB, wA)
+#' c <- optimalCount(t, wA, cA, cB) # will return t
+#'
+optimalCount <- function(tree, wA, cA, cB){
+
+    counts <- vector('list', length(tree))
+    i <- 1
+    while (i <= length(tree)){
+        if (i == 1){
+            counts[[i]] <- tree[[i]]
     }
-    tree$estAbove[tree$estAbove < 0] <- 0
-    return(tree)
+        else{
+            w <- wA[i]
+            parents <- rep(cA[[i-1]], each=2) # replicating each parent to make the dimension correct
+            adjacents <- adjacentElements(cB[[i]])
+            counts[[i]] <- w*cB[[i]] + (1-w)*(parents - adjacents)
+        }
+        i <- i + 1
+    }
+    return(counts)
 }
 
+#' Optimal Sigma Estimation
+#'
+#' See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
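+#'
+#' In the notation of the tex writeup, the optimal spread satisfies sigma* = sigma^- * sqrt(w),
+#' and under the equal-variance assumption the optimal weight w coincides with the weight from
+#' above (see equation 13 in Honaker).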
+#'
+#' @param wA Array of weights calculated from above
+#' @param wB Array of weights calculated from below
+#' @param epsilon Epsilon used for Laplace noise addition
+#'
+#' @return Value of variance for optimal estimate
+optimalSigma <- function(wA, wB, epsilon){
+    sB <- inverseVariance(2, epsilon)*wB # the histogram of counts has sensitivity 2
+    sOpt <- sB*sqrt(wA)
+    return(sOpt)
+}

-#' Function to estimate a noisy binary tree efficiently using all available information in the tree
-#'
-#' @param tree Data frame with binary tree attributes and node values
-#' @param n Number of observations in the vector represented by the binary tree
-#' @param nNodes Number of nodes in the binary tree
-#' @param sigma Standard deviation of the noise used to perturb the estimates
-#' @param invSigmaSq Inverse variance of the noise used in perturbing nodes
-#' @return Efficient estimate of noisy binary tree in vector form
-
-estEfficiently <- function(tree, n, nNodes, sigma, invSigmaSq) {
-    tree$estEfficient <- c(n, rep(NA, (nNodes - 1)))
-    tree$seEfficient <- rep(NA, nNodes)
-    tree$wEfficient <- rep(NA, nNodes)
-    for (i in 2:nNodes) {
-        tree$wEfficient[i] <- wEfficient(tree, i, tree$parent[i], tree$adjacent[i])
-        tree$estEfficient[i] <- estEfficient(tree$wEfficient[i], tree, i, tree$parent[i], tree$adjacent[i])
-        tree$seEfficient[i] <- stErr(tree$wEfficient[i], sigma)
-    }
-    tree$estEfficient[tree$estEfficient < 0] <- 0
-    return(tree)
+#' Optimal Post Processing
+#'
+#' Wrapper function that generates the optimal tree from a noisy tree generated with the Laplace mechanism. The general idea is that
+#' you can leverage the fact that the child counts at each node should sum to the parent count, and that a node's count should be
+#' equal to its parent count minus the count of the adjacent child node, to generate less noisy counts at every node of the tree.
+#'
+#' You can think of leveraging the child nodes' information to get a parent node's count as a recursive process that
+#' begins at the leaves of the tree, which here we refer to with the tag "Below" in the helper function names. Similarly, leveraging
+#' a parent node and the adjacent child node can be thought of as a recursive process that begins with the root node, which
+#' is referred to here with the tag "Above" in the helper function names. A new count at every node in the tree can then be calculated
+#' using the counts that are generated in this way, each contributing an amount according to a weight, where the weights are calculated
+#' by the wBelow and wAbove functions respectively.
+#'
+#' The theory behind this is explored in detail in the Honaker paper, whose implementation here is described in extra_docs/tree-post-processing.
+#' The implementation here assumes that the variance of the noise added is equal at every level of the tree, which also means that the weights
+#' wBelow and wAbove are the same for nodes at the same level of the tree. Honaker provides a more general implementation in his work.
+#'
+#' @references Honaker, James. "Efficient Use of Differentially Private Binary Trees." (2015).
+#'
+#' @param tree Differentially private tree generated from the dpTree$release method
+#' @param epsilon The epsilon value used for the noise addition at each node (note: this is not the same as the global epsilon value)
+#'
+#' @return A list that includes:
+#'    optimalTree: a new tree with optimized counts at each level
+#'    wBelow: the weights used to calculate the optimized counts from the leaf nodes to the top of the tree.
+#'    wAbove: the weights used to calculate the optimized counts from the top of the tree to the leaf nodes.
+#'    optVariance: the variance of the optimal tree counts.
+#'
+#' See extra_docs/tree-post-processing for more details on the meanings of wBelow, wAbove, and optVariance
+optimalPostProcess <- function(tree, epsilon){
+    wB <- wBelow(tree) # calculate the weights of counts from below
+    wA <- wAbove(tree, wB) # calculate the weights of counts from above
+    cB <- countBelow(tree, wB) # calculate counts from below
+    cA <- countAbove(tree, cB, wA) # calculate counts from above
+
+    out <- list('optimalTree'= optimalCount(tree, wA, cA, cB)) # generate the optimal counts
+    out$wBelow <- wB # save the weights used for counts from below
+    out$wAbove <- wA # save the weights used for counts from above
+    out$optVariance <- optimalSigma(wA, wB, epsilon) # calculate the variance of the optimal counts
+
+    return(out)
}

+### Post-Processed CDF ###

-#' Function to evaluate a binary tree
-#'
-#' @param x Numeric vector to be represented as a binary tree in vector form
-#' @param n Number of observations in \code{x}
-#' @param rng An a priori estimate of the range of \code{x}
-#' @param gran The granularity at which \code{x} is represented in the tree
-#' @param universeSize Difference in the range of \code{x} over the granularity, plus 1
-#' @param depth The depth of the binary tree
-#' @return A binary tree in vector form
+#' Empirical CDF for tree statistic.
+#'
+#' Generates the least noisy CDF for the tree.
+#'
+#' Note that for any numeric histogram, you can generate an empirical CDF for each of the maximal values of the bins by counting the number of items to the
+#' left of that bin edge in the histogram. Here, we could do that by just using the histogram at the lowest level of the tree. This is not desirable
+#' because you would have to sum many noisy counts in order to get each value. Instead, we can leverage the tree structure and minimize the number of counts
+#' you need to sum for each value by traversing the tree.
+#'
+#' For example, if a tree has bins with ranges [0,2), [2,4), [4,6), [6,8] at the leaf nodes, you could use the counts at the leaf nodes associated with [0,2) and [2,4)
+#' to get an estimate for Pr(x < 4), or you could instead just use the count one level up the tree, which has range [0,4), to output this directly. Similarly,
+#' if you want to get an estimate for Pr(x < 6), you could combine the count for [4,6) with the count for [0,4) to minimize the number of sums you must make.
+#'
+#' There are two edge cases here: the probability of a value less than the minimum of the tree is always 0, since we assume the minimum and maximum values are exact;
+#' and the count at the maximum is the total sum, which is public knowledge.
+#'
+#' Note that you could also minimize the number of counts you need to sum by calculating some of them as a difference from the total sum, and combine these two methods to get the best
+#' value for each of the counts. (E.g. in the previous example, you could get an estimate for Pr(x < 6) from the fact that you know Pr(x < 8) = 1 and the [6,8] count, which would
+#' reduce the number of noisy counts used.) We do not do this here. This code could certainly be optimized further in terms of runtime efficiency as well.
+#'
+#' @param tree (Optimized) differentially private tree of counts generated by optimalPostProcess.
Note that you could instead pass in the differentially private unoptimized tree
+#' and the algorithm will still run correctly, but you will get more accurate results if you pass in the optimized version.
+#' @param bins The bins of the dpTree object, which are stored as $binsByLevel.
+#'
+#' @return An empirical cdf that is as granular as the tree allows. The output consists of
+#'    $bins: the values that the empirical cdf is evaluated at, which correspond to the ranges of the bins of the leaf nodes of the tree.
+#'    $counts: the number of values that appear to the left of the corresponding value in $bins
+#'    $proportions: the proportion of values that appear to the left of the corresponding value in $bins, i.e. proportions[i] is an approximation of Pr(x < bins[i])
+treePostCDF <- function(tree, bins){
+    counts <- c() # initializing counts
+    vals <- bins[[length(bins)]] # vals are the values we can compute an empirical cdf for, i.e. the points that demarcate the bin ranges of the most granular bins in the tree
+    i <- 1
+    while (i <= length(vals)){ # generate the least noisy count for each of the values
+        # start out with the min and max values at the top of the tree
+        m <- vals[1]
+        M <- vals[length(vals)]
-binaryTree <- function(x, n, rng, gran, universeSize, depth) {
-    tree <- rep(0, times=(2^depth + universeSize))
-    for (i in 1:n) {
-        idx <- ((x[i] - rng[1]) / gran) + 2^depth
-        tree[idx] <- tree[idx] + 1
-    }
-    d <- c()
-    for (i in seq(2^depth, 2^depth - 1 + universeSize, 2)) {
-        tree[i / 2] <- tree[i] + tree[i + 1]
-        d <- c(d, depth)
-    }
-    depthCounter <- depth - 1
-    while (depthCounter > 0) {
-        for (i in seq(2^depthCounter, 2^(depthCounter + 1) - 1, 2)) {
-            tree[i / 2] <- tree[i] + tree[i + 1]
-            d <- c(d, depthCounter)
+        # initialize the count for i
+        count <- 0
+
+        # iterate through the layers of the tree
+        index <- 1
+        j <- 1
+        while (j<=length(tree)){
+            # determine whether to traverse the tree to the left or right
+            mid <- m + (M-m)/2
+            if (i == 1){ # at the leftmost bin edge of the tree, the empirical cdf evaluates to 0, not to the bin count
+                count <- 0
+                break
+            }
+            if (vals[i] == M){ # if you don't need higher granularity, stop the traversal
+                count <- count + as.numeric(tree[[j]][index]) # as.numeric strips the name that R propagates from named operands of binary operations
+                break
+            }
+            if (j == length(tree)){ # if at the leaves of the tree, record the count there
+                count <- count + as.numeric(tree[[j]][index])
+                break
+            }
+            else if (vals[i] <= mid){ # if traversing left
+                # reset the max value to the mid, don't add to the count
+                M <- mid
+                # set the index of the node to look at in the next layer
+                index <- index*2 - 1
+            }
+            else { # if traversing right
+                # reset the min value to the mid
+                m <- mid
+                # set the index of the node to look at in the next layer
+                index <- index*2
+                count <- count + as.numeric(tree[[j+1]][index - 1]) # add the node's left child to the count
+            }
+            j <- j + 1
        }
-        depthCounter <- depthCounter - 1
-    }
-    tree <- data.frame(tree[1:(2^depth - 1)])
-    names(tree) <- 'count'
-    r <- c(0, rep(c(1, -1), nrow(tree) - 1))
-    tree$depth <- 1
-    tree$parent <- NA
-    tree$adjacent <- NA
-    for(i in 2:nrow(tree)) {
-        tree$parent[i] <- trunc(i/2)
-        tree$depth[i] <- trunc(log2(i)) + 1
-        tree$adjacent[i] <- i + r[i]
+        counts <- append(counts, count)
+        i <- i + 1
    }
-    return(tree)
+    out <- c()
+    out$counts <- counts
+    out$proportions <- counts/counts[length(counts)] # counts[length(counts)] is the total number of values in the database (which is a publicly known quantity)
+    out$bins <- vals
+    return (out)
}
+
+#' Median of Input Data
+#'
+#' Estimated median of the input data as postprocessed from the tree.
+#'
+#' Note that the median is just the value at which the cdf of a distribution reaches the 50th percentile. Given an empirical cdf generated by the
+#' treePostCDF function, we can report this value according to the noisy counts of the tree exactly if the 50th percentile is included.
+#' If the granularity of the bins and counts of the data at the lowest level of the noisy tree are such that the 50th percentile is
+#' not included in the output cdf, the function will instead output the closest count.
+#'
+#' E.g. if the input has
+#'
+#' cdf$proportions <- c(0, 0.3, 0.4, 0.7, 1.0),
+#' cdf$counts <- c(0,3,4,7,10)
+#'
+#' then the function will output the cdf value associated with 0.4 as an estimate. The output is a tuple of both the estimated median,
+#' med$val, and the value of the proportion that was used to create the estimate.
+#'
+#' @param cdf CDF generated with the treePostCDF function.
+#'
+#' @return Estimate of the median and the proportion that was used to create the estimate.
+cdfMedian <- function(cdf){
+    med <- c()
+    if (0.5 %in% cdf$proportions){
+        med$val <- cdf$bins[cdf$proportions==0.5]
+        med$prop <- 0.5
+    }
+    else{
+        # otherwise, estimate the median with the proportion closest to the 50th percentile
+        distances <- sapply(cdf$proportions, (function(x) abs(x-0.5)))
+        i <- which.min(distances)
+        med$val <- cdf$bins[i]
+        med$prop <- cdf$proportions[i]
+    }
+    return (med)
}

+#' Mean of Input Data
+#'
+#' Estimated mean of the input data.
+#'
+#' Given a histogram of numeric values, you can estimate the mean of the underlying data by weighting the midpoint of each bin by the bin's count.
+#'
+#' Let c_i be the count associated with bin i, and let mid_i be the midpoint of the data range that bin i represents.
+#' Let n be the total number of data points. Then,
+#' \deqn{(1/n)\sum c_i * mid_i}
+#' is an estimate of the mean.
+#'
+#' Here, we calculate this estimate using the highest granularity bins possible, so the histogram created by the leaf nodes of the tree.
+#'
+#' @param tree Differentially private tree of histogram counts generated with dpTree.
+#' @param bins The bins by level of the tree, which correspond to the binsByLevel attribute of dpTree.
+#'
+#' @return Estimate of the mean of the underlying data that created the dpTree.
+treeMean <- function(tree, bins){
+    # pull the highest granularity bins from the list of all the tree's bins
+    rng <- bins[[length(bins)]]
+    # find the midpoints of each of the ranges of the bins
+    mids <- (rng[2:length(rng)] - rng[1:(length(rng)-1)])/2 + rng[1:(length(rng)-1)]
+    # estimate the mean by weighting the bin midpoints by the counts in the lowest level of the tree
+    meanEst <- sum(mids * tree[[length(tree)]])/tree[[1]]
+    # convert to numeric to strip the bin labels that R's name propagation attaches to the result
+    return(as.numeric(meanEst))
+}
\ No newline at end of file
diff --git a/extra_docs/tree-post-processing/Tree Statistic.pdf b/extra_docs/tree-post-processing/Tree Statistic.pdf
new file mode 100644
index 0000000..c1a2aaf
Binary files /dev/null and b/extra_docs/tree-post-processing/Tree Statistic.pdf differ
diff --git a/extra_docs/tree-post-processing/Tree Statistic.tex b/extra_docs/tree-post-processing/Tree Statistic.tex
new file mode 100644
index 0000000..40ee6cc
--- /dev/null
+++ b/extra_docs/tree-post-processing/Tree Statistic.tex
@@ -0,0 +1,131 @@
+\documentclass[11pt, oneside]{article}
+\usepackage{geometry}
+\geometry{letterpaper}
+\usepackage{amssymb, amsmath, amsthm}
+\usepackage{algorithmic, algorithm}
+
+\newtheorem{theorem}{Theorem}
+
+\newcommand{\x}{\boldsymbol{x}}
+\newcommand{\lap}{\text{Lap}}
+\newcommand{\eps}{\epsilon}
+
+\begin{document}
+
+The idea here is to make a binary tree with weights on each node so that the input data gets binned into each node as it would in a histogram, where the histogram has finer granularity at each level of the binary tree.
+
+Let $\epsilon$ be the privacy parameter, $\x$ be the input data, $m$ be a lower bound on points in $\x$, and $M$ be an upper bound on points in $\x$. Let $n$ be the number of leaves in the tree that will be constructed. Let $T$ be a perfect binary tree with $n$ leaves. For each node $v$ of $T$, associate a range such that the root node of $T$ has range $[m,M)$, and for all nodes $v$ with range $[m', M')$, $v$'s left child has range $[m', m' + \frac{M'-m'}{2})$ and its right child has range $[m' + \frac{M'-m'}{2}, M')$. Let the weight $w$ of each vertex $v$ be the number of points in $\x$ that are within $v$'s range. The idea for the differentially private algorithm is to make a histogram at each level of the tree.
+\begin{algorithm}
+\begin{algorithmic}
+\STATE \textbf{Input:} $\x, \eps, T$.
+\STATE Let $\eps' = \eps / \lg n$.
+\STATE Initialize $T'$ as a tree identical to $T$ with the weights of each node $w'$ set to 0.
+\FOR{every level of $T$}
+	\STATE For each node at that level, $w' = w + z$, where $z \sim \lap(2/\eps')$.
+\ENDFOR
+\RETURN $T'$
+\end{algorithmic}
+\caption{Differentially private weighted tree using the histogram method to add noise}
+\label{alg: tree}
+\end{algorithm}
+
+\begin{theorem}
+Algorithm \ref{alg: tree} is $\eps$-differentially private.
+\end{theorem}
+
+\begin{proof}
+By definition of the Laplace mechanism, the histograms on each level of the tree are each $\eps'$-differentially private. Then, if $\boldsymbol{\eps}$ is the net privacy budget used across the whole algorithm and $d$ is the depth of the tree, by composition
+
+\begin{align*}
+\boldsymbol{\eps} &= d \eps' \\
+ &= \lg n (\eps/\lg n) \\
+ &= \eps,
+\end{align*}
+
+since the tree's depth $d = \lg n$.
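+
+For instance, with $n = 8$ leaves, $d = \lg 8 = 3$: each of the three levels is an $(\eps/3)$-differentially private histogram, and the levels compose to give $\eps$ overall.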
+\end{proof}
+
+Note that, since $\eps'$ does not need to be split further across the nodes within a level (the nodes at each level form a histogram), this gives much better accuracy than naively adding noise to each node's weight individually. The resulting tree may then be used to estimate a cumulative distribution function for the data.
+
+As discussed in more detail in \cite{honaker}, the fact that the count at a node is equal to the composition of the sums of its child nodes, or of its parent node minus the count of the adjacent node, etc., may be leveraged to calculate a more efficient count at each node of the tree.
+
+You can recursively create an estimate of the count $t_i$ at node $i$ ``from below'', which we demarcate as $t_i^-$, by setting $t_i^- = t_i$ and $\sigma(t_i^-)=\sigma^-_i$ at the leaves, then recursively define for all other nodes
+
+$$ t_i^- = w_i^- t_i + (1-w_i^-)(t_{2i}^- + t_{2i+1}^-), $$
+
+where
+
+$$ w_i^- = \frac{(\sigma_i)^{-2}}{(\sigma_i)^{-2} + \left[ (\sigma_{2i}^-)^2 + (\sigma_{2i+1}^-)^2\right]^{-1}},$$
+
+and
+
+$$ \sigma_i^- = \sigma_i \sqrt{w_i^-},$$
+
+which is equation 10 in \cite{honaker}.
+
+Note that if each node has noise with standard deviation $s$ added to it, then $\forall i, \sigma_i=s$, $\sigma_{2i}^- = \sigma_{2i+1}^-$, and $w$ will be the same for every node at depth $d$. Then, $w_i^-$ may be recursively defined as
+
+\begin{align*}
+w_i^- &= \frac{s^{-2}}{s^{-2} + (1/2)(\sigma_{2i}^-)^{-2}}\\
+ &= \frac{s^{-2}}{s^{-2} + (1/2)\left(s\sqrt{w_{2i}^-}\right)^{-2}}\\
+ &= \frac{s^{-2}}{s^{-2}(1+(1/2)(w_{2i}^-)^{-1})}\\
+ &= \frac{2w_{2i}^-}{2w_{2i}^- + 1},
+\end{align*}
+
+and only needs to be calculated once per level of the tree.
+
+For the base case,
+
+$$ w_i^- = 1,$$
+
+since $t_i^- = t_i.$
+
+Similarly, you can recursively create an estimate of the count $t_i$ at node $i$ ``from above'', which we demarcate as $t_i^+$. Set $t_1^+ = t_1$, and $\sigma_1^+ = \sigma_1$ if $N$ is private and 0 if $N$ is public. Denote the parent node of node $i$ as $i \Phi 1$ and denote the adjacent\footnote{Here, adjacent means ``the node which is the other child of node $i$'s parent node''.} node to node $i$ as $i\Lambda1$.
+
+Then, we can recursively define $t_i^+$ for all other nodes as
+
+$$ t_i^+ = w_i^+ t_i + (1-w_i^+) (t_{i\Phi1}^+ - t_{i \Lambda 1}^-),$$
+
+where
+
+$$ w_i^+ = \frac{(\sigma_i)^{-2}}{(\sigma_i)^{-2} + \left[(\sigma^+_{i\Phi 1})^2 + (\sigma^-_{i \Lambda 1})^2\right]^{-1}},$$
+
+and
+
+$$ \sigma^+_i = \sigma_i \sqrt{w_i^+},$$
+
+which is equation 11 in \cite{honaker}.
+
+Note that if each node has noise with standard deviation $s$, then
+
+\begin{align*}
+w_i^+ &= \frac{1}{1 + (w_{i \Phi 1}^+ + w_{i \Lambda 1}^-)^{-1}}, \\
+\end{align*}
+
+and
+
+$$w_1^+ = 1,$$
+
+since $t_1^+ = t_1$.
+
+An optimal estimate may then be derived using both the estimates from below and above:
+
+$$ t_i^* = w_i t_i^- + (1-w_i) (t_{i \Phi 1} ^+ - t_{i \Lambda 1}^-),$$
+
+where
+
+$$ w_i = \frac{(\sigma_i^-)^{-2}}{(\sigma_i^-)^{-2} + \left[(\sigma_{i\Phi 1}^+)^2 + (\sigma_{i \Lambda 1}^-)^2\right]^{-1}}, $$
+
+and
+
+$$ \sigma_i^* = \sigma_i^-\sqrt{w_i},$$
+
+which is equation 13 in \cite{honaker}.
+
+Note that if each node has noise with standard deviation $s$, then the expression for the weights here is the same as the weight for estimation from above.
+
+\begin{thebibliography}{2}
+	\bibitem[1]{honaker} Honaker, James, ``Efficient Use of Differentially Private Binary Trees,'' http://hona.kr/papers/files/privatetrees.pdf.
+\end{thebibliography}
+
+\end{document}
\ No newline at end of file
diff --git a/extra_docs/tree-post-processing/efficient_binary_tree b/extra_docs/tree-post-processing/efficient_binary_tree
new file mode 100644
index 0000000..e4bc823
Binary files /dev/null and b/extra_docs/tree-post-processing/efficient_binary_tree differ
diff --git a/man/adjacentElements.Rd b/man/adjacentElements.Rd
new file mode 100644
index 0000000..89a6e78
--- /dev/null
+++ b/man/adjacentElements.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{adjacentElements}
+\alias{adjacentElements}
+\title{Creates array of adjacent elements in an almost certainly not optimally-efficient way}
+\usage{
+adjacentElements(ls)
+}
+\arguments{
+\item{ls}{1D array of even length n}
+}
+\value{
+1D array of length n equal to ls with each adjacent pair of elements swapped
+}
+\description{
+Creates array of adjacent elements in an almost certainly not optimally-efficient way
+}
+\examples{
+ls <- c(1,2,3,4)
+adjacentElements(ls) # outputs c(2,1,4,3)
+
+}
diff --git a/man/binaryTree.Rd b/man/binaryTree.Rd
deleted file mode 100644
index 00685f5..0000000
--- a/man/binaryTree.Rd
+++ /dev/null
@@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utilities-tree.R
-\name{binaryTree}
-\alias{binaryTree}
-\title{Function to evaluate a binary tree}
-\usage{
-binaryTree(x, n, rng, gran, universeSize, depth)
-}
-\arguments{
-\item{x}{Numeric vector to be represented as a binary tree in vector form}
-
-\item{n}{Number of observations in \code{x}}
-
-\item{rng}{An a priori estimate of the range of \code{x}}
-
-\item{gran}{The granularity at which \code{x} is represented in the tree}
-
-\item{universeSize}{Difference in the range of \code{x} over the granularity, plus 1}
-
-\item{depth}{The depth of the binary tree}
-}
-\value{
-A binary tree in vector form
-}
-\description{
-Function to evaluate a binary tree
-}
diff --git a/man/cdfMedian.Rd b/man/cdfMedian.Rd
new file mode 100644
index 0000000..a1dc31b
--- /dev/null
+++ b/man/cdfMedian.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{cdfMedian}
+\alias{cdfMedian}
+\title{Median of Input Data}
+\usage{
+cdfMedian(cdf)
+}
+\arguments{
+\item{cdf}{CDF generated with the treePostCDF function.}
+}
+\value{
+Estimate of the median and the proportion that was used to create the estimate.
+}
+\description{
+Estimated median of the input data as postprocessed from the tree.
+}
+\details{
+Note that the median is just the value at which the cdf of a distribution reaches the 50th percentile. Given an empirical cdf generated by the
+treePostCDF function, we can report this value according to the noisy counts of the tree exactly if the 50th percentile is included.
+If the granularity of the bins and counts of the data at the lowest level of the noisy tree are such that the 50th percentile is
+not included in the output cdf, the function will instead output the closest count.
+
+E.g. if the input has
+
+cdf$proportions <- c(0, 0.3, 0.4, 0.7, 1.0),
+cdf$counts <- c(0,3,4,7,10)
+
+then the function will output the cdf value associated with 0.4 as an estimate. The output is a tuple of both the estimated median,
+med$val, and the value of the proportion that was used to create the estimate.
+}
diff --git a/man/countAbove.Rd b/man/countAbove.Rd
new file mode 100644
index 0000000..26a9768
--- /dev/null
+++ b/man/countAbove.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{countAbove}
+\alias{countAbove}
+\title{Recursively compute counts from above}
+\usage{
+countAbove(tree, countsBelow, wAboves)
+}
+\arguments{
+\item{tree}{Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.}
+
+\item{countsBelow}{Array of counts of same length as tree, as calculated by the countBelow function}
+
+\item{wAboves}{Weights computed from above, assuming each node in the tree has the same variance.}
+}
+\value{
+List of counts for each node of the tree.
+}
+\description{
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
+\examples{
+t <- list(c(10), c(6,4), c(3,3,1,3))
+wB <- wBelow(t)
+wA <- wAbove(t, wB)
+cB <- countBelow(t, wB)
+cA <- countAbove(t, cB, wA) # t and cA should be equal, since counts in t have no added noise.
+}
diff --git a/man/countBelow.Rd b/man/countBelow.Rd
new file mode 100644
index 0000000..213e418
--- /dev/null
+++ b/man/countBelow.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{countBelow}
+\alias{countBelow}
+\title{Recursively compute counts from below}
+\usage{
+countBelow(tree, wBelows)
+}
+\arguments{
+\item{tree}{Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.}
+
+\item{wBelows}{Array of weights of same length as tree, where the ith weight corresponds to the ith weight calculated from below.}
+}
+\value{
+List of counts for each node of the tree.
+}
+\description{
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
+\examples{
+t <- list(c(10), c(6,4), c(3,3,1,3))
+w <- wBelow(t)
+c <- countBelow(t, w) # t and c should be equal, since counts in t have no added noise.
+} diff --git a/man/dLap.Rd b/man/dLap.Rd new file mode 100644 index 0000000..fbb8502 --- /dev/null +++ b/man/dLap.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-noise-generation.R +\name{dLap} +\alias{dLap} +\title{Probability density for Laplace distribution} +\usage{ +dLap(x, mu = 0, b = 1) +} +\arguments{ +\item{x}{numeric, value} + +\item{mu}{numeric, center of the distribution} + +\item{b}{numeric, spread} +} +\value{ +Density for elements of x +} +\description{ +Probability density for Laplace distribution +} +\examples{ + +x <- seq(-3, 3, length.out=61) +dLap(x) +} diff --git a/man/estAbove.Rd b/man/estAbove.Rd deleted file mode 100644 index ce06843..0000000 --- a/man/estAbove.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities-tree.R -\name{estAbove} -\alias{estAbove} -\title{Function to estimate the nodes of a tree using noisy parent and adjacent nodes} -\usage{ -estAbove(w, tree, idx, parent, adjacent) -} -\arguments{ -\item{w}{Weight used construct the estimate} - -\item{tree}{Data frame with binary tree attributes and node values} - -\item{idx}{Index of the node for which the estimate is evaluated} - -\item{parent}{Index of the parnet node} - -\item{adjacent}{Index of the adjacent node} -} -\value{ -Noisy node estimate -} -\description{ -Function to estimate the nodes of a tree using noisy parent and adjacent nodes -} diff --git a/man/estBelow.Rd b/man/estBelow.Rd deleted file mode 100644 index 7383eb0..0000000 --- a/man/estBelow.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities-tree.R -\name{estBelow} -\alias{estBelow} -\title{Function to estimate the nodes of a tree using noisy child nodes} -\usage{ -estBelow(w, tree, idx) -} -\arguments{ -\item{w}{Weight used construct the estimate} - -\item{tree}{Data frame with binary tree attributes and node values} - -\item{idx}{Index of the node for which the estimate is evaluated} -} -\value{ -Noisy node estimate -} -\description{ -Function to estimate the nodes of a tree using noisy child nodes -} diff --git a/man/estBottomUp.Rd b/man/estBottomUp.Rd deleted file mode 100644 index 5899295..0000000 --- a/man/estBottomUp.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities-tree.R -\name{estBottomUp} -\alias{estBottomUp} -\title{Function to estimate a noisy binary tree from the terminal nodes} -\usage{ -estBottomUp(tree, terminalLevelIndex, nNodes, sigma, invSigmaSq) -} -\arguments{ -\item{tree}{Data frame with binary tree attributes and node values} - -\item{terminalLevelIndex}{Index of the first terminal leaf node} - -\item{nNodes}{Number of nodes in the binary tree} - -\item{sigma}{Standard deviation of the noise used to perturb the estimates} - -\item{invSigmaSq}{Inverse variance of the noise used in perturbing nodes} -} -\value{ -Bottom-up estimate of noisy binary tree in vector form -} -\description{ -Function to estimate a noisy binary tree from the terminal nodes -} diff --git a/man/estEfficient.Rd b/man/estEfficient.Rd deleted file mode 100644 index 7acba37..0000000 --- a/man/estEfficient.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities-tree.R -\name{estEfficient} -\alias{estEfficient} -\title{Function to efficiently estimate the nodes of a tree using all available information in 
the tree}
-\usage{
-estEfficient(w, tree, idx, parent, adjacent)
-}
-\arguments{
-\item{w}{Weight used construct the estimate}
-
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{idx}{Index of the node for which the estimate is evaluated}
-
-\item{parent}{Index of the parnet node}
-
-\item{adjacent}{Index of the adjacent node}
-}
-\value{
-Efficient noisy node estimate
-}
-\description{
-Function to efficiently estimate the nodes of a tree using all available information in the tree
-}
diff --git a/man/estEfficiently.Rd b/man/estEfficiently.Rd
deleted file mode 100644
index 1a04b3d..0000000
--- a/man/estEfficiently.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utilities-tree.R
-\name{estEfficiently}
-\alias{estEfficiently}
-\title{Function to estimate a noisy binary tree efficiently using all available information in the tree}
-\usage{
-estEfficiently(tree, n, nNodes, sigma, invSigmaSq)
-}
-\arguments{
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{n}{Number of observations in the vector represented by the binary tree}
-
-\item{nNodes}{Number of nodes in the binary tree}
-
-\item{sigma}{Standard deviation of the noise used to perturb the estimates}
-
-\item{invSigmaSq}{Inverse variance of the noise used in perturbing nodes}
-}
-\value{
-Efficient estimate of noisy binary tree in vector form
-}
-\description{
-Function to estimate a noisy binary tree efficiently using all available information in the tree
-}
diff --git a/man/estTopDown.Rd b/man/estTopDown.Rd
deleted file mode 100644
index 857da56..0000000
--- a/man/estTopDown.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utilities-tree.R
-\name{estTopDown}
-\alias{estTopDown}
-\title{Function to estimate a noisy binary tree from the top down}
-\usage{
-estTopDown(tree, n, nNodes, sigma, invSigmaSq)
-}
-\arguments{
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{n}{Number of observations in the vector represented by the binary tree}
-
-\item{nNodes}{Number of nodes in the binary tree}
-
-\item{sigma}{Standard deviation of the noise used to perturb the estimates}
-
-\item{invSigmaSq}{Inverse variance of the noise used in perturbing nodes}
-}
-\value{
-Top-down estimate of noisy binary tree in vector form
-}
-\description{
-Function to estimate a noisy binary tree from the top down
-}
diff --git a/man/inverseVariance.Rd b/man/inverseVariance.Rd
new file mode 100644
index 0000000..8385082
--- /dev/null
+++ b/man/inverseVariance.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{inverseVariance}
+\alias{inverseVariance}
+\title{Inverse variance of Laplace noise.}
+\usage{
+inverseVariance(sens, eps)
+}
+\arguments{
+\item{sens}{Sensitivity of the function that is to be perturbed}
+
+\item{eps}{Privacy parameter used}
+}
+\description{
+Recall that the variance of the Laplace distribution is 2b^2, where b is the classic
+scale parameter of the Laplace distribution (which is the inverse of what we
+in this library call the scaling parameter). Here, b = eps/sens.
diff --git a/man/optimalCount.Rd b/man/optimalCount.Rd
new file mode 100644
index 0000000..ff3277b
--- /dev/null
+++ b/man/optimalCount.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{optimalCount}
+\alias{optimalCount}
+\title{Optimal counts for nodes of tree}
+\usage{
+optimalCount(tree, wA, cA, wB, cB)
+}
+\arguments{
+\item{tree}{Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.}
+
+\item{wA}{Array of weights calculated from above with wAbove}
+
+\item{cA}{Array of counts calculated from above with countAbove}
+
+\item{wB}{Array of weights calculated from below with wBelow}
+
+\item{cB}{Array of counts calculated from below with countBelow}
+}
+\value{
+List of counts for each node of the tree.
+}
+\description{
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
+\examples{
+t <- list(c(10), c(6,4), c(3,3,1,3))
+wB <- wBelow(t)
+wA <- wAbove(t, wB)
+cB <- countBelow(t, wB)
+cA <- countAbove(t, cB, wA)
+c <- optimalCount(t, wA, cA, wB, cB) # will return t
+
+}
diff --git a/man/optimalPostProcess.Rd b/man/optimalPostProcess.Rd
new file mode 100644
index 0000000..c0799be
--- /dev/null
+++ b/man/optimalPostProcess.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{optimalPostProcess}
+\alias{optimalPostProcess}
+\title{Optimal Post Processing}
+\usage{
+optimalPostProcess(tree, epsilon)
+}
+\arguments{
+\item{tree}{Differentially private tree generated from the dpTree$release method}
+
+\item{epsilon}{The epsilon value used for the noise addition at each node (note that this is not the same as the global epsilon value).}
+}
+\value{
+A list that includes:
+  optimalTree: a new tree with optimized counts at each level.
+  wBelow: the weights used to calculate the optimized counts from the leaf nodes to the top of the tree.
+  wAbove: the weights used to calculate the optimized counts from the top of the tree to the leaf nodes.
+  optVariance: the variance of the optimal tree counts.
+
+See extra_docs/tree-post-processing for more details on the meanings of wBelow, wAbove, and optVariance.
+}
+\description{
+Wrapper function that generates an optimal tree from a noisy tree generated with the Laplace mechanism. The general idea is that
+you can leverage the facts that the child counts at each node should sum to the parent count, and that a node's count should be
+equal to its parent count minus the count of the adjacent child node, to generate less noisy counts at every node of the tree.
+}
+\details{
+You can think of leveraging the child nodes' information to get a parent node's count as a recursive process that
+begins at the leaves of the tree, which we refer to here with the tag "Below" in the helper function names. Similarly, leveraging
+a parent node and the adjacent child node can be thought of as a recursive process that begins with the root node, which
+is referred to here with the tag "Above" in the helper function names. A new count at every node in the tree can then be calculated
+from the counts generated in these two ways, each contributing an amount according to weights calculated by the wBelow and
+wAbove functions, respectively.
+
+The theory behind this is explored in detail in the Honaker paper, whose implementation here is described in extra_docs/tree-post-processing.
+The implementation here assumes that the variance of the noise added is equal at every level of the tree, which also means that the weights
+wBelow and wAbove are the same for nodes at the same level of the tree. Honaker provides a more general implementation in his work.
+}
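+\examples{
+# A sketch based on the package's unit tests: for a tree whose child counts
+# already sum exactly to each parent count, optimal post-processing leaves the
+# counts unchanged and also returns the weights and variance it used.
+t <- list(c(10), c(6,4), c(3,3,1,3))
+out <- optimalPostProcess(t, 1)
+out$optimalTree  # identical to t for this internally consistent tree
+out$optVariance  # variance of the optimized counts at each level
+}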
+\references{
+Honaker, James. "Efficient Use of Differentially Private Binary Trees." (2015).
+}
diff --git a/man/optimalSigma.Rd b/man/optimalSigma.Rd
new file mode 100644
index 0000000..3115e6b
--- /dev/null
+++ b/man/optimalSigma.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{optimalSigma}
+\alias{optimalSigma}
+\title{Optimal Sigma Estimation}
+\usage{
+optimalSigma(wA, wB, epsilon)
+}
+\arguments{
+\item{wA}{Array of weights calculated from above}
+
+\item{wB}{Array of weights calculated from below}
+
+\item{epsilon}{Epsilon used for Laplace noise addition}
+}
+\value{
+Value of the variance for the optimal estimate
+}
+\description{
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
diff --git a/man/pLap.Rd b/man/pLap.Rd
new file mode 100644
index 0000000..098c4af
--- /dev/null
+++ b/man/pLap.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-noise-generation.R
+\name{pLap}
+\alias{pLap}
+\title{Laplace Cumulative Distribution Function}
+\usage{
+pLap(x, mu = 0, b = 1)
+}
+\arguments{
+\item{x}{Numeric, the value(s) at which the user wants to know the CDF height.}
+
+\item{mu}{Numeric, the center of the Laplace distribution, defaults to 0.}
+
+\item{b}{Numeric, the spread of the Laplace distribution, defaults to 1.}
+}
+\value{
+Probability the Laplace draw is less than or equal to \code{x}.
+}
+\description{
+Determines the probability a draw from a Laplace distribution is less than
+  or equal to the specified value.
+}
+\examples{
+
+x <- 0
+pLap(x)
+}
diff --git a/man/qLap.Rd b/man/qLap.Rd
new file mode 100644
index 0000000..2dc6efc
--- /dev/null
+++ b/man/qLap.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-noise-generation.R
+\name{qLap}
+\alias{qLap}
+\title{Quantile function for Laplace distribution}
+\usage{
+qLap(p, mu = 0, b = 1)
+}
+\arguments{
+\item{p}{Numeric, vector of probabilities}
+
+\item{mu}{numeric, center of the distribution}
+
+\item{b}{numeric, spread}
+}
+\value{
+Quantiles of the Laplace distribution corresponding to the probabilities in \code{p}
+}
+\description{
+Quantile function for Laplace distribution
+}
+\examples{
+probs <- c(0.05, 0.50, 0.95)
+qLap(probs)
+}
diff --git a/man/rLap.Rd b/man/rLap.Rd
new file mode 100644
index 0000000..2370563
--- /dev/null
+++ b/man/rLap.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-noise-generation.R
+\name{rLap}
+\alias{rLap}
+\title{Random draw from Laplace distribution}
+\usage{
+rLap(mu = 0, b = 1, size = 1)
+}
+\arguments{
+\item{mu}{numeric, center of the distribution}
+
+\item{b}{numeric, spread}
+
+\item{size}{integer, number of draws}
+}
+\value{
+Random draws from Laplace distribution
+}
+\description{
+Random draw from Laplace distribution
+}
+\examples{
+
+rLap(size=1000)
+}
diff --git a/man/stErr.Rd b/man/stErr.Rd
deleted file mode 100644
index 4329796..0000000
--- a/man/stErr.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utilities-tree.R
-\name{stErr}
-\alias{stErr}
-\title{Function to evaluate the standard error of a node estimate given a weight and the standard
- deviation of the noise used to perturb the nodes}
-\usage{
-stErr(w, sigma)
-}
-\arguments{
-\item{w}{Weight used construct the estimate}
-
-\item{sigma}{Standard deviation of the noise used to perturb the estimates}
-}
-\value{
-Standard error of the node estimate
-}
-\description{
-Function to evaluate the standard error of a node estimate given a weight and the standard
- deviation of the noise used to perturb the nodes
-}
diff --git a/man/treeBins.Rd b/man/treeBins.Rd
new file mode 100644
index 0000000..f1b4a7a
--- /dev/null
+++ b/man/treeBins.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/statistic-tree.R
+\name{treeBins}
+\alias{treeBins}
+\title{Calculate bins of each leaf in the tree by row}
+\usage{
+treeBins(rng, depth, n)
+}
+\arguments{
+\item{rng}{Total range of data to be binned into the tree, in the form (min, max); a numeric vector of length 2.}
+
+\item{depth}{Depth of the tree, considering the root to have depth 0.}
+
+\item{n}{Number of data points to be binned.
+
+Note: This can be calculated entirely publicly. The fact that 'n' is an input is unnecessary,
+but is a residual effect of the fact that the helper function called can also be used if the user
+specifies their desired granularity of the bins instead of supplying a range, which may be something
+we want to eventually implement here.}
+}
+\value{
+A list of the bins used in the tree, from the first level of the tree to the leaves.
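+}
+\description{
+Calculate bins of each leaf in the tree by row
+}
+\examples{
+# A small sketch mirroring the package's unit tests: a depth-2 tree over the
+# range (0, 10) produces bin edges for 2 bins at level 1 and 4 bins at level 2.
+bins <- treeBins(c(0, 10), 2, 10)
+bins[[1]]  # c(0, 5, 10), the edges of the two level-1 bins
+}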
diff --git a/man/treeGetAccuracy.Rd b/man/treeGetAccuracy.Rd
deleted file mode 100644
index 14f051b..0000000
--- a/man/treeGetAccuracy.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treeGetAccuracy}
-\alias{treeGetAccuracy}
-\title{Accuracy for a differentially private binary tree}
-\usage{
-treeGetAccuracy(epsilon, rng, gran, alpha = 0.05)
-}
-\arguments{
-\item{epsilon}{Numeric differential privacy parameter}
-
-\item{rng}{Numeric a priori estimate of the variable range}
-
-\item{gran}{Numeric granularity}
-
-\item{alpha}{Numeric level of statistical significance, default 0.05}
-}
-\value{
-Accuracy guarantee for the tree given epsilon
-}
-\description{
-Accuracy for a differentially private binary tree
-}
diff --git a/man/treeGetParameters.Rd b/man/treeGetParameters.Rd
deleted file mode 100644
index 18a0dcb..0000000
--- a/man/treeGetParameters.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treeGetParameters}
-\alias{treeGetParameters}
-\title{Epsilon for a differentially private binary tree}
-\usage{
-treeGetParameters(accuracy, rng, gran, alpha = 0.05)
-}
-\arguments{
-\item{accuracy}{Numeric accuracy needed}
-
-\item{rng}{Numeric a priori estimate of the variable range}
-
-\item{gran}{Numeric granularity}
-
-\item{alpha}{Numeric level of statistical significance, default 0.05}
-}
-\value{
-Epsilon necessary to guarantee the given accuracy
-}
-\description{
-Epsilon for a differentially private binary tree
-}
diff --git a/man/treeMean.Rd b/man/treeMean.Rd
new file mode 100644
index 0000000..e74bc82
--- /dev/null
+++ b/man/treeMean.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utilities-tree.R
+\name{treeMean}
+\alias{treeMean}
+\title{Mean of Input Data}
+\usage{
+treeMean(tree, bins)
+}
+\arguments{
+\item{tree}{Differentially private tree of histogram counts generated with dpTree.}
+
+\item{bins}{The bins by level of the tree, which correspond to the binsByLevel attribute of dpTree.}
+}
+\value{
+Estimate of the mean of the underlying data that created the dpTree.
+}
+\description{
+Estimated mean of the input data.
+}
+\details{
+Given a histogram of numeric values, you can estimate the mean of the underlying data by calculating the midpoint of each of the bins.
+
+Let c_i be the count associated with bin i, and let mid_i be the midpoint of the data range that bin i represents.
+Let n be the total number of data points. Then,
+\deqn{(1/n)\sum_i c_i \cdot mid_i}
+is an estimate of the mean.
+
+Here, we calculate this estimate using the highest-granularity bins available, i.e. the histogram formed by the leaf nodes of the tree.
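+}
+\examples{
+# A worked sketch based on the package's unit tests: leaf counts (3,3,1,3)
+# over bins [0,2), [2,4), [4,6), [6,8] have midpoints 1, 3, 5, 7, so the
+# estimated mean is (3*1 + 3*3 + 1*5 + 3*7)/10 = 3.8.
+t <- list(c(10), c(6,4), c(3,3,1,3))
+b <- list(c(0,2,4,6,8))
+treeMean(t, b)  # 3.8
+}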
diff --git a/man/treePostCDF.Rd b/man/treePostCDF.Rd
index 18d180f..3b643d5 100644
--- a/man/treePostCDF.Rd
+++ b/man/treePostCDF.Rd
@@ -1,24 +1,40 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
+% Please edit documentation in R/utilities-tree.R
 \name{treePostCDF}
 \alias{treePostCDF}
-\title{Function to derive CDF from efficient terminal node counts}
+\title{Empirical CDF for tree statistic.}
 \usage{
-treePostCDF(release, rng, terminalIndex)
+treePostCDF(tree, bins)
 }
 \arguments{
-\item{release}{Efficient differentially private binary tree}
+\item{tree}{(Optimized) differentially private tree of counts generated by dpTree$optimalPostProcess. Note that you could instead pass in the unoptimized differentially private tree
+and the algorithm will still run correctly, but you will get more accurate results if you pass in the optimized version.}
 
-\item{rng}{An a priori estimate of the range of the vector
-being represented as a binary tree}
-
-\item{terminalIndex}{Vector of indices corresponding to the terminal
-leaf nodes of the binary tree}
+\item{bins}{The bins of the dpTree object, which are stored as $binsByLevel.}
 }
 \value{
-Differentially private estimate of the empirical cumulative
- distribution function
+An empirical cdf that is as granular as the tree allows. The output consists of
+  $bins: the values at which the empirical cdf is evaluated, which correspond to the edges of the bins of the leaf nodes of the tree.
+  $counts: the number of values that appear to the left of the corresponding value in $bins
+  $proportions: the proportion of values that appear to the left of the corresponding value in $bins, i.e. proportions[i] is an approximation of Pr(x < bins[i])
 }
 \description{
-Function to derive CDF from efficient terminal node counts
+Generates the least noisy CDF for the tree.
+}
+\details{
+Note that for any numeric histogram, you can generate an empirical CDF at each of the maximal values of the bins by counting the number of items to the
+left of that bin edge in the histogram. Here, we could do that by just using the histogram at the finest level of the tree. This is not desirable
+because you would have to sum many noisy counts to get each cumulative count. Instead, we can leverage the tree structure and minimize the number of counts
+you need to sum for each cumulative count by traversing the tree.
+
+For example, if a tree has bins with ranges [0,2), [2,4), [4,6), [6,8] at the leaf nodes, you could use the counts at the leaf nodes associated with [0,2) and [2,4)
+to get an estimate for Pr(x < 4), or you could instead just use the count that is one level up on the tree, which has range [0,4), to output this directly. Similarly,
+if you want to get an estimate for Pr(x < 6), you could combine the count for [4,6) with the count for [0,4) to minimize the number of sums you must make.
+
+There are two edge cases here: the probability of a value less than the minimum of the tree is always 0, since we assume the minimum and maximum values are exact;
+and the probability of a value less than or equal to the maximum is always 1, since the total count is public knowledge.
+
+Note that you could also minimize the number of counts you need to sum by calculating each as a difference from the total count, and combine these two methods to get the
+best estimate for each of the counts. (E.g. in the previous example, you could get an estimate for Pr(x < 6) from the fact that you know Pr(x<8)=1 and the [6,8] count, which would
+reduce the number of noisy counts used.) We do not do this here.
+This code could certainly be optimized further in terms of runtime efficiency as well.
+}
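+\examples{
+# A basic sketch drawn from the package's unit tests: for a public (noiseless)
+# tree, the CDF counts are exact cumulative counts at each leaf-bin edge.
+t <- list(c(10), c(6,4))
+b <- list(c(0,2,4))
+cdf <- treePostCDF(t, b)
+cdf$counts       # c(0, 6, 10)
+cdf$proportions  # c(0, 0.6, 1)
+}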
diff --git a/man/treePostEfficient.Rd b/man/treePostEfficient.Rd
deleted file mode 100644
index b10dfa4..0000000
--- a/man/treePostEfficient.Rd
+++ /dev/null
@@ -1,31 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treePostEfficient}
-\alias{treePostEfficient}
-\title{Function to efficiently estimate noisy node counts}
-\usage{
-treePostEfficient(release, treeData, n, variance, terminalIndex)
-}
-\arguments{
-\item{release}{The truncated differentially private noisy binary tree
-in vector form}
-
-\item{treeData}{Data frame with binary tree attributes, including depth
-and indicators of parent and adjacent nodes. Note that
-\code{nrow(treeData) == length(release)}}
-
-\item{n}{Number of observations}
-
-\item{variance}{The variance of the noise used to perturb tree nodes}
-
-\item{terminalIndex}{Vector of indices corresponding to the terminal
-leaf nodes of the binary tree}
-
-\item{nNodes}{Number of nodes in the binary tree, also \code{length(release)}}
-}
-\value{
-Efficient differentially private binary tree
-}
-\description{
-Function to efficiently estimate noisy node counts
-}
diff --git a/man/treePostFormatRelease.Rd b/man/treePostFormatRelease.Rd
deleted file mode 100644
index 779fb01..0000000
--- a/man/treePostFormatRelease.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treePostFormatRelease}
-\alias{treePostFormatRelease}
-\title{Function to truncate negative noisy node counts at zero}
-\usage{
-treePostFormatRelease(release)
-}
-\arguments{
-\item{release}{The differentially private noisy binary tree}
-}
-\value{
-Noisy binary tree truncated at zero
-}
-\description{
-Function to truncate negative noisy node counts at zero
-}
diff --git a/man/treePostMean.Rd b/man/treePostMean.Rd
deleted file mode 100644
index a243c46..0000000
--- a/man/treePostMean.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treePostMean}
-\alias{treePostMean}
-\title{Function to evaluate the mean using the DP CDF}
-\usage{
-treePostMean(cdf, rng)
-}
-\arguments{
-\item{cdf}{Differentially private estimate of the empirical cumulative
-distribution function}
-
-\item{rng}{Numeric a priori estimate of the range}
-
-\item{gran}{Granularity}
-}
-\value{
-Differentially private estimate of the mean
-}
-\description{
-Function to evaluate the mean using the DP CDF
-}
diff --git a/man/treePostMedian.Rd b/man/treePostMedian.Rd
deleted file mode 100644
index fee8374..0000000
--- a/man/treePostMedian.Rd
+++ /dev/null
@@ -1,18 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treePostMedian}
-\alias{treePostMedian}
-\title{Function to evaluate the median using the DP CDF}
-\usage{
-treePostMedian(cdf)
-}
-\arguments{
-\item{cdf}{Differentially private estimate of the empirical cumulative
-distribution function}
-}
-\value{
-Differentially private estimate of the median
-}
-\description{
-Function to evaluate the median using the DP CDF
-}
diff --git a/man/treePostPercentiles.Rd b/man/treePostPercentiles.Rd
deleted file mode 100644
index 7ca06be..0000000
--- a/man/treePostPercentiles.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/statistic-tree.R
-\name{treePostPercentiles}
-\alias{treePostPercentiles}
-\title{Quantile function using the DP CDF}
-\usage{
-treePostPercentiles(cdf, percentiles)
-}
-\arguments{
-\item{cdf}{Differentially private estimate of the empirical cumulative
-distribution function}
-
-\item{percentiles}{Vector of probabilities given to the quantile function}
-}
-\value{
-Differnetially private estimate of the values corresponding to
- the provided probabilities
-}
-\description{
-Quantile function using the DP CDF
-}
diff --git a/man/wAbove.Rd b/man/wAbove.Rd
index 7f0281b..67c040a 100644
--- a/man/wAbove.Rd
+++ b/man/wAbove.Rd
@@ -2,24 +2,23 @@
 % Please edit documentation in R/utilities-tree.R
 \name{wAbove}
 \alias{wAbove}
-\title{Function to evaluate weights from the noise variance and standard errors in a parent and adjacent
- nodes for the node of a differentially private binary tree}
+\title{Recursive weight estimation from above}
 \usage{
-wAbove(invSigmaSq, tree, parent, adjacent)
+wAbove(tree, wBelows)
 }
 \arguments{
-\item{invSigmaSq}{Inverse variance of the noise used in perturbing nodes}
+\item{tree}{Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.}
 
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{parent}{Index of the parnet node}
-
-\item{adjacent}{Index of the adjacent node}
+\item{wBelows}{Array of weights with one entry per level of the tree, where the ith entry is the weight calculated from below for the ith level.}
 }
 \value{
-Weight
+Single weight for each level of the tree.
 }
 \description{
-Function to evaluate weights from the noise variance and standard errors in a parent and adjacent
- nodes for the node of a differentially private binary tree
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
+\examples{
+t <- list(c(10), c(6,4), c(3,3,1,3))
+wB <- wBelow(t)
+wA <- wAbove(t, wB) # Should return c(1, 5/8, 13/21).
 }
diff --git a/man/wBelow.Rd b/man/wBelow.Rd
index a6810ae..7030573 100644
--- a/man/wBelow.Rd
+++ b/man/wBelow.Rd
@@ -2,22 +2,21 @@
 % Please edit documentation in R/utilities-tree.R
 \name{wBelow}
 \alias{wBelow}
-\title{Function to evaluate weights from the noise variance and standard errors in child nodes for the
- node of a differentially private binary tree}
+\title{Recursive weight estimate from below}
 \usage{
-wBelow(invSigmaSq, tree, idx)
+wBelow(tree)
 }
 \arguments{
-\item{invSigmaSq}{Inverse variance of the noise used in perturbing nodes}
-
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{idx}{Index of the node for which the weight is evaluated}
+\item{tree}{Tree, formatted as a list of arrays where the contents of the ith array in the list is the ith level of the tree.}
 }
 \value{
-Weight
+Single weight for each level of the tree.
 }
 \description{
-Function to evaluate weights from the noise variance and standard errors in child nodes for the
- node of a differentially private binary tree
+See extra_docs/tree-post-processing for the formula. This assumes that the variance is the same for every node in the tree.
+}
+\examples{
+t <- list(c(10), c(6,4), c(3,3,1,3))
+w <- wBelow(t) # should output c(4/7, 2/3, 1)
+
 }
diff --git a/man/wEfficient.Rd b/man/wEfficient.Rd
deleted file mode 100644
index 0a4e08c..0000000
--- a/man/wEfficient.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utilities-tree.R
-\name{wEfficient}
-\alias{wEfficient}
-\title{Function to evaluate weights efficiently using the noise variance and standard errors in parent and adjacent
- nodes as well child nodes for the node of a differentially private binary tree}
-\usage{
-wEfficient(tree, idx, parent, adjacent)
-}
-\arguments{
-\item{tree}{Data frame with binary tree attributes and node values}
-
-\item{idx}{Index of the node for which the weight is evaluated}
-
-\item{parent}{Index of the parnet node}
-
-\item{adjacent}{Index of the adjacent node}
-}
-\value{
-Weight
-}
-\description{
-Function to evaluate weights efficiently using the noise variance and standard errors in parent and adjacent
- nodes as well child nodes for the node of a differentially private binary tree
-}
diff --git a/tests/testthat/test-tree.R b/tests/testthat/test-tree.R
new file mode 100644
index 0000000..7a77cf4
--- /dev/null
+++ b/tests/testthat/test-tree.R
@@ -0,0 +1,32 @@
+context('tree statistic')
+
+test_that('Tree binning runs',{
+    out <- treeBins(c(0,10), 2, 10)
+    expect_equal(length(out), 2)
+    expect_equal(out[[1]], c(0,5,10))
+})
+
+test_that('Tree statistic initialization as expected', {
+    x <- c(1:10)
+    x <- data.frame(x)
+    stat <- dpTree$new('numeric', 'x', 10, 3, c(0,10), globalEps=1)
+    expect_equal(stat$epsilon, 1/3)
+})
+
+test_that('Tree workflow runs', {
+    x <- c(1:10)
+    x <- data.frame(x)
+
+    # will raise a warning due to the high epsilon
+    stat <- expect_warning(dpTree$new('numeric', 'x', 10, 3, c(0,10), globalEps=10000))
+
+    o <- stat$release(x)
+    out <- o$release
+
+    expect_equal(length(out), 4)
+    expect_equal(as.integer(out[[1]]), 10)
+    expect_true(!is.null(names(out[[1]])))
+    expect_equal(length(out[[4]]), 8)
+    expect_equal(as.vector(round(out[[4]][8])), 2)
+    expect_equal(o$bins[[1]], c(0,5,10))
+})
\ No newline at end of file
diff --git a/tests/testthat/test-utilities-tree.R b/tests/testthat/test-utilities-tree.R
new file mode 100644
index 0000000..84a53b5
--- /dev/null
+++ b/tests/testthat/test-utilities-tree.R
@@ -0,0 +1,92 @@
+context('tree utils')
+
+test_that('adjacent element flip', {
+    ls <- c(1,2,3,4)
+    expect_equal(adjacentElements(ls), c(2,1,4,3))
+})
+
+test_that('estimation from below runs',{
+    t <- list(c(10), c(6,4), c(3,3,1,3))
+
+    w <- wBelow(t)
+    expect_equal(w, c(4/7,2/3,1))
+
+    c <- countBelow(t, w)
+    expect_equal(t, c)
+})
+
+test_that('estimation from above runs', {
+    t <- list(c(10), c(6,4), c(3,3,1,3))
+
+    wB <- wBelow(t)
+    wA <- wAbove(t, wB)
+    expect_equal(wA, c(1, 5/8, 13/21))
+
+    cB <- countBelow(t, wB)
+    cA <- countAbove(t, cB, wA)
+    expect_equal(t, cA)
+})
+
+test_that('optimal estimation runs', {
+    t <- list(c(10), c(6,4), c(3,3,1,3))
+
+    wB <- wBelow(t)
+    wA <- wAbove(t, wB)
+    cB <- countBelow(t, wB)
+    cA <- countAbove(t, cB, wA)
+
+    c <- optimalCount(t, wA, cA, wB, cB)
+    expect_equal(t,c)
+})
+
+test_that('optimal sigma est runs', {
+    i <- inverseVariance(2,1)
+    expect_equal(i, 1/2)
+
+    t <- list(c(10), c(6,4), c(3,3,1,3))
+    wB <- wBelow(t)
+    wA <- wAbove(t, wB)
+    s <- optimalSigma(wA, wB, 1)
+    expect <- c(4/14, 1/3, 1/2) * sqrt(wA)
+    expect_equal(expect, s)
+})
+
+test_that('optimal post-process script runs',{
+    t <- list(c(10), c(6,4), c(3,3,1,3))
+
out <- optimalPostProcess(t, 1) + + expect_equal(length(out), 4) + expect_equal(out$optVariance, c(4/14, 1/3, 1/2) * sqrt(out$wAbove)) + expect_equal(out$optimalTree, t) +}) + +test_that('CDF created correctly', { + # basic test case for public tree + expect_equal(treePostCDF(list(c(10), c(6,4)),list(c(0,2,4)))$counts, c(0,6,10)) + expect_equal(treePostCDF(list(c(10), c(6,4)),list(c(0,2,4)))$proportions, c(0,6,10)/10) + expect_equal(treePostCDF(list(c(10), c(6,4), c(3,3,1,3)),list(c(0,2,4,6,8)))$counts, c(0,3,6,7,10)) + + # basic test cases for noisy tree to verify counts are pulled from correct spots + expect_equal(treePostCDF(list(c(9), c(6,4)),list(c(0,2,4)))$counts, c(0,6,9)) + expect_equal(treePostCDF(list(c(10), c(6,3), c(1,4,2,5)), list(c(0,2,4,6,8)))$counts, c(0,1,6,8,10)) +}) + +test_that('cdfMedian runs correctly', { + t <- list(c(10), c(6,4), c(3,3,1,3)) + b <- list(c(0,2,4,6,8)) + cdf <- treePostCDF(t,b) + med <- cdfMedian(cdf) + expect_equal(med$val, 6) + + t <- list(c(10), c(5,5), c(2,3,3,2)) + b <- list(c(0,2,4,6,8)) + cdf <- treePostCDF(t,b) + med <- cdfMedian(cdf) + expect_equal(med$val, 4) +}) + +test_that('treeMean runs correctly', { + t <- list(c(10), c(6,4), c(3,3,1,3)) + b <- list(c(0,2,4,6,8)) + expect_equal(treeMean(t, b), 3.8) +}) \ No newline at end of file