diff --git a/NEWS.md b/NEWS.md index f83f23a3..458315a8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # tidyr (development version) +* `chop()` gains a `by` argument for specifying grouping columns, similar to + `nest(.by =)` (@hrryt, #1490). + * `fill()` gains a `.by` argument as an alternative to `dplyr::group_by()` for applying the fill per group, similar to `nest(.by =)` and `dplyr::mutate(.by =)` (@olivroy, #1439). diff --git a/R/chop.R b/R/chop.R index 5720644a..3810fb67 100644 --- a/R/chop.R +++ b/R/chop.R @@ -28,9 +28,20 @@ #' @param data A data frame. #' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to chop or unchop. #' +#' If not supplied for `chop()`, then `cols` is derived as all columns _not_ +#' selected by `by`. +#' #' For `unchop()`, each column should be a list-column containing generalised #' vectors (e.g. any mix of `NULL`s, atomic vector, S3 vectors, a lists, #' or data frames). +#' @param by <[`tidy-select`][tidyr_tidy_select]> Columns to chop _by_; these +#' will not be chopped. +#' +#' `by` can be used in place of or in conjunction with columns supplied +#' through `cols`. +#' +#' If not supplied, then `by` is derived as all columns _not_ selected by +#' `cols`. #' @param keep_empty By default, you get one row of output for each element #' of the list that you are unchopping/unnesting. This means that if there's a #' size-0 element (like `NULL` or an empty data frame or vector), then that @@ -51,6 +62,19 @@ #' # cf nest #' df %>% nest(data = c(y, z)) #' +#' # Specify variables to chop by (rather than variables to chop) using `by` +#' df %>% chop(by = x) +#' +#' # Use tidyselect syntax and helpers, just like in `dplyr::select()` +#' df %>% chop(any_of(c("y", "z"))) +#' +#' # `cols` and `by` can be used together to drop columns you no longer need, +#' # or to chop the columns you are chopping by too. +#' # This drops `z`: +#' df %>% chop(y, by = x) +#' # This includes `x` in the chopped columns: +#' df %>% chop(everything(), by = x) +#' #' # Unchop -------------------------------------------------------------------- #' df <- tibble(x = 1:4, y = list(integer(), 1L, 1:2, 1:3)) #' df %>% unchop(y) @@ -65,20 +89,16 @@ #' df <- tibble(x = 1:3, y = list(NULL, tibble(x = 1), tibble(y = 1:2))) #' df %>% unchop(y) #' df %>% unchop(y, keep_empty = TRUE) -chop <- function(data, cols, ..., error_call = current_env()) { +chop <- function(data, cols = NULL, ..., by = NULL, error_call = current_env()) { check_dots_empty0(...) check_data_frame(data, call = error_call) - check_required(cols, call = error_call) - cols <- tidyselect::eval_select( - expr = enquo(cols), - data = data, - allow_rename = FALSE, - error_call = error_call - ) + info <- chop_info(data, cols = {{ cols }}, by = {{ by }}) + cols <- info$cols + by <- info$by cols <- tidyr_new_list(data[cols]) - keys <- data[setdiff(names(data), names(cols))] + keys <- data[by] info <- vec_group_loc(keys) keys <- info$key @@ -94,6 +114,59 @@ chop <- function(data, cols, ..., error_call = current_env()) { reconstruct_tibble(data, out) } +chop_info <- function( + data, + cols = NULL, + by = NULL, + error_call = caller_env() +) { + by <- enquo(by) + cols <- enquo(cols) + + cols_is_null <- quo_is_null(cols) + by_is_null <- quo_is_null(by) + + if (cols_is_null && by_is_null) { + stop_use_cols_or_by(error_call = error_call) + } + + names <- names(data) + + cols <- names(tidyselect::eval_select( + expr = cols, + data = data, + allow_rename = FALSE, + error_call = error_call + )) + + by <- names(tidyselect::eval_select( + expr = by, + data = data, + allow_rename = FALSE, + error_call = error_call + )) + + if (cols_is_null) { + # Derive `cols` names from `by` + cols <- setdiff(names, by) + } + + if (by_is_null) { + # Derive `by` names from `cols` + by <- setdiff(names, cols) + } + + list( + cols = cols, + by = by + ) +} + +stop_use_cols_or_by <- function(error_call = caller_env()) { + message <- c("At least one of {.var cols} or {.var by} must be supplied.") + cli::cli_abort(message, call = error_call) +} + col_chop <- function(x, indices) { ptype <- vec_ptype(x) diff --git a/man/chop.Rd b/man/chop.Rd index 5a165f5b..1084a3f3 100644 --- a/man/chop.Rd +++ b/man/chop.Rd @@ -5,7 +5,7 @@ \alias{unchop} \title{Chop and unchop} \usage{ -chop(data, cols, ..., error_call = current_env()) +chop(data, cols = NULL, ..., by = NULL, error_call = current_env()) unchop( data, @@ -21,12 +21,24 @@ unchop( \item{cols}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> Columns to chop or unchop. +If not supplied for \code{chop()}, then \code{cols} is derived as all columns \emph{not} +selected by \code{by}. + For \code{unchop()}, each column should be a list-column containing generalised vectors (e.g. any mix of \code{NULL}s, atomic vector, S3 vectors, a lists, or data frames).} \item{...}{These dots are for future extensions and must be empty.} +\item{by}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> Columns to chop \emph{by}; these +will not be chopped. + +\code{by} can be used in place of or in conjunction with columns supplied +through \code{cols}. + +If not supplied, then \code{by} is derived as all columns \emph{not} selected by +\code{cols}.} + \item{error_call}{The execution environment of a currently running function, e.g. \code{caller_env()}. The function will be mentioned in error messages as the source of the error. See the @@ -75,6 +87,19 @@ df \%>\% chop(c(y, z)) # cf nest df \%>\% nest(data = c(y, z)) +# Specify variables to chop by (rather than variables to chop) using `by` +df \%>\% chop(by = x) + +# Use tidyselect syntax and helpers, just like in `dplyr::select()` +df \%>\% chop(any_of(c("y", "z"))) + +# `cols` and `by` can be used together to drop columns you no longer need, +# or to chop the columns you are chopping by too. +# This drops `z`: +df \%>\% chop(y, by = x) +# This includes `x` in the chopped columns: +df \%>\% chop(everything(), by = x) + # Unchop -------------------------------------------------------------------- df <- tibble(x = 1:4, y = list(integer(), 1L, 1:2, 1:3)) df \%>\% unchop(y) diff --git a/tests/testthat/_snaps/chop.md b/tests/testthat/_snaps/chop.md index 9292d453..9c31bf68 100644 --- a/tests/testthat/_snaps/chop.md +++ b/tests/testthat/_snaps/chop.md @@ -12,7 +12,7 @@ chop(df) Condition Error in `chop()`: - ! `cols` is absent but must be supplied. + ! At least one of `cols` or `by` must be supplied. # incompatible ptype mentions the column (#1477) diff --git a/tests/testthat/test-chop.R b/tests/testthat/test-chop.R index f71a153a..147a5c66 100644 --- a/tests/testthat/test-chop.R +++ b/tests/testthat/test-chop.R @@ -47,6 +47,41 @@ test_that("can chop empty data frame (#1206)", { ) }) +test_that("can chop `by` columns (#1490)", { + df <- tibble(x = c(1, 1, 1, 2, 2), y = c(2, 1, 2, 3, 4), z = 1:5) + + expect_identical( + chop(df, by = c(x, y)), + chop(df, z) + ) +}) + +test_that("can combine `by` with `cols` (#1490)", { + df <- tibble(x = c(1, 1, 1, 2, 2), y = c(2, 1, 2, 3, 4), z = 1:5) + + expect_identical( + chop(df, x, by = y), + chop(dplyr::select(df, -z), x) + ) +}) + +test_that("union of `by` and `cols` results in renaming (#1490)", { + df <- tibble(x = 1, y = 1) + one <- vctrs::list_of(1) + + with_options(rlib_name_repair_verbosity = "quiet", { + expect_identical( + invisible(chop(df, everything(), by = x)), + tibble(x = 1, x = one, y = one, .name_repair = "unique") + ) + + expect_identical( + invisible(chop(df, x, by = everything())), + tibble(x = 1, y = 1, x = one, .name_repair = "unique") + ) + }) +}) + # unchop ------------------------------------------------------------------ test_that("extends into rows", {