From 1b52611ee2aeb7598dc900c6c3ad519304e08d27 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 19 Jul 2022 12:14:47 -0400 Subject: [PATCH] Rewrite `nth()`, `first()`, and `last()` using vctrs (#6331) * Rewrite `nth()`, `first()`, and `last()` using vctrs * NEWS bullet * Switch to a `vec_slice2()`-like behavior Because using `vec_slice()` breaks too many revdeps that use these functions on lists and expect to get list elements back * NEWS bullet tweaks * Apply suggestions from code review * Typo --- NAMESPACE | 2 - NEWS.md | 14 +++ R/nth-value.R | 152 +++++++++++++++++-------- man/nth.Rd | 72 ++++++++---- tests/testthat/_snaps/nth-value.md | 57 +++++++++- tests/testthat/test-nth-value.R | 173 ++++++++++++++++++++++++++--- 6 files changed, 379 insertions(+), 91 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b71bf7d219..e1554e8ffc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,8 +36,6 @@ S3method(compute,data.frame) S3method(copy_to,DBIConnection) S3method(copy_to,src_local) S3method(count,data.frame) -S3method(default_missing,data.frame) -S3method(default_missing,default) S3method(distinct,data.frame) S3method(distinct_,data.frame) S3method(distinct_,grouped_df) diff --git a/NEWS.md b/NEWS.md index 3068c8cf1f..52cd47f094 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,19 @@ # dplyr (development version) +* `first()`, `last()`, and `nth()` have been rewritten to use vctrs. This comes + with the following improvements (#6331): + + * When used on a data frame, these functions now return a single row rather + than a single column. This is more consistent with the vctrs principle that + a data frame is generally treated as a vector of rows. + + * The `default` is no longer "guessed", and will always automatically be set + to a missing value appropriate for the type of `x`. + + * Fractional values of `n` are no longer truncated to integers, and will now + cause an error. For example, `nth(x, n = 2)` is fine, but + `nth(x, n = 2.5)` is now an error. + * `lag()` and `lead()` now cast `default` to the type of `x`, rather than taking the common type. This ensures that these functions are type stable on `x` (#6330). diff --git a/R/nth-value.R b/R/nth-value.R index 8efef2262d..0a7a21584b 100644 --- a/R/nth-value.R +++ b/R/nth-value.R @@ -1,25 +1,37 @@ -#' Extract the first, last or nth value from a vector +#' Extract the first, last, or nth value from a vector #' -#' These are straightforward wrappers around \code{\link{[[}}. The main -#' advantage is that you can provide an optional secondary vector that defines -#' the ordering, and provide a default value to use when the input is shorter -#' than expected. +#' These are useful helpers for extracting a single value from a vector. They +#' are guaranteed to return a meaningful value, even when the input is shorter +#' than expected. You can also provide an optional secondary vector that defines +#' the ordering. +#' +#' @details +#' For most vector types, `first(x)`, `last(x)`, and `nth(x, n)` work like +#' `x[[1]]`, `x[[length(x)]`, and `x[[n]]`, respectively. The primary exception +#' is data frames, where they instead retrieve rows, i.e. `x[1, ]`, `x[nrow(x), +#' ]`, and `x[n, ]`. This is consistent with the tidyverse/vctrs principle which +#' treats data frames as a vector of rows, rather than a vector of columns. #' #' @param x A vector #' @param n For `nth()`, a single integer specifying the position. #' Negative integers index from the end (i.e. `-1L` will return the #' last value in the vector). +#' @param order_by An optional vector the same size as `x` used to determine the +#' order. +#' @param default A default value to use if the position does not exist in `x`. +#' +#' If `NULL`, the default, a missing value is used. +#' +#' If supplied, this must be a single value, which will be cast to the type of +#' `x`. +#' +#' When `x` is a list , `default` is allowed to be any value. There are no +#' type or size restrictions in this case. #' -#' If a double is supplied, it will be silently truncated. -#' @param order_by An optional vector used to determine the order -#' @param default A default value to use if the position does not exist in -#' the input. This is guessed by default for base vectors, where a -#' missing value of the appropriate type is returned, and for lists, where -#' a `NULL` is return. +#' @return +#' If `x` is a list, a single element from that list. Otherwise, a vector the +#' same type as `x` with size 1. #' -#' For more complicated objects, you'll need to supply this value. -#' Make sure it is the same type as `x`. -#' @return A single value. `[[` is used to do the subsetting. #' @export #' @examples #' x <- 1:10 @@ -31,62 +43,114 @@ #' nth(x, 1) #' nth(x, 5) #' nth(x, -2) +#' +#' # `first()` and `last()` are often useful in `summarise()` +#' df <- tibble(x = x, y = y) +#' df %>% +#' summarise( +#' across(x:y, first, .names = "{col}_first"), +#' y_last = last(y) +#' ) +#' +#' # Selecting a position that is out of bounds returns a default value #' nth(x, 11) +#' nth(x, 0) +#' +#' # This out of bounds behavior also applies to empty vectors +#' first(integer()) #' +#' # You can customize the default value with `default` +#' nth(x, 11, default = -1L) +#' first(integer(), default = 0L) +#' +#' # `order_by` provides optional ordering #' last(x) -#' # Second argument provides optional ordering -#' last(x, y) +#' last(x, order_by = y) #' -#' # These functions always return a single value -#' first(integer()) -nth <- function(x, n, order_by = NULL, default = default_missing(x)) { - if (length(n) != 1 || !is.numeric(n)) { - abort("`n` must be a single integer.") +#' # For data frames, these select entire rows +#' df <- tibble(a = 1:5, b = 6:10) +#' first(df) +#' nth(df, 4) +nth <- function(x, n, order_by = NULL, default = NULL) { + size <- vec_size(x) + + vec_assert(n, size = 1L, arg = "n") + n <- vec_cast(n, to = integer(), x_arg = "n") + + if (!is.null(order_by)) { + vec_assert(order_by, size = size, arg = "order_by") } - n <- trunc(n) - if (n == 0 || n > length(x) || n < -length(x)) { - return(default) + default <- check_nth_default(default, x = x) + + if (n < 0L) { + # Negative values index from RHS + n <- size + n + 1L } - # Negative values index from RHS - if (n < 0) { - n <- length(x) + n + 1 + if (n <= 0L || n > size) { + return(default) } - if (is.null(order_by)) { - x[[n]] - } else { - x[[ order(order_by)[[n]] ]] + if (!is.null(order_by)) { + order <- vec_order_base(order_by) + n <- order[[n]] } + + vec_slice2(x, n) } #' @export #' @rdname nth -first <- function(x, order_by = NULL, default = default_missing(x)) { +first <- function(x, order_by = NULL, default = NULL) { nth(x, 1L, order_by = order_by, default = default) } #' @export #' @rdname nth -last <- function(x, order_by = NULL, default = default_missing(x)) { +last <- function(x, order_by = NULL, default = NULL) { nth(x, -1L, order_by = order_by, default = default) } -default_missing <- function(x) { - UseMethod("default_missing") +check_nth_default <- function(default, x, ..., error_call = caller_env()) { + check_dots_empty0(...) + + if (vec_is_list(x)) { + # Very special behavior for lists, since we use `[[` on them. + # Valid to use any `default` here (even non-vectors). + # And `default = NULL` is the correct default `default` for lists. + return(default) + } + + if (is.null(default)) { + return(vec_init(x)) + } + + vec_assert(default, size = 1L, arg = "default", call = error_call) + + default <- vec_cast( + x = default, + to = x, + x_arg = "default", + to_arg = "x", + call = error_call + ) + + default } -#' @export -default_missing.default <- function(x) { - if (!is.object(x) && is.list(x)) { - NULL +vec_slice2 <- function(x, i) { + # Our unimplemented vctrs equivalent of `[[` + # https://github.com/r-lib/vctrs/pull/1228/ + + i <- vec_as_location2(i, vec_size(x)) + + if (vec_is_list(x)) { + out <- .subset2(x, i) } else { - x[NA_real_] + out <- vec_slice(x, i) + out <- vec_set_names(out, NULL) } -} -#' @export -default_missing.data.frame <- function(x) { - rep(NA, nrow(x)) + out } diff --git a/man/nth.Rd b/man/nth.Rd index 57eeb78fef..2ade48350c 100644 --- a/man/nth.Rd +++ b/man/nth.Rd @@ -4,41 +4,49 @@ \alias{nth} \alias{first} \alias{last} -\title{Extract the first, last or nth value from a vector} +\title{Extract the first, last, or nth value from a vector} \usage{ -nth(x, n, order_by = NULL, default = default_missing(x)) +nth(x, n, order_by = NULL, default = NULL) -first(x, order_by = NULL, default = default_missing(x)) +first(x, order_by = NULL, default = NULL) -last(x, order_by = NULL, default = default_missing(x)) +last(x, order_by = NULL, default = NULL) } \arguments{ \item{x}{A vector} \item{n}{For \code{nth()}, a single integer specifying the position. Negative integers index from the end (i.e. \code{-1L} will return the -last value in the vector). +last value in the vector).} -If a double is supplied, it will be silently truncated.} +\item{order_by}{An optional vector the same size as \code{x} used to determine the +order.} -\item{order_by}{An optional vector used to determine the order} +\item{default}{A default value to use if the position does not exist in \code{x}. -\item{default}{A default value to use if the position does not exist in -the input. This is guessed by default for base vectors, where a -missing value of the appropriate type is returned, and for lists, where -a \code{NULL} is return. +If \code{NULL}, the default, a missing value is used. -For more complicated objects, you'll need to supply this value. -Make sure it is the same type as \code{x}.} +If supplied, this must be a single value, which will be cast to the type of +\code{x}. + +When \code{x} is a list , \code{default} is allowed to be any value. There are no +type or size restrictions in this case.} } \value{ -A single value. \code{[[} is used to do the subsetting. +If \code{x} is a list, a single element from that list. Otherwise, a vector the +same type as \code{x} with size 1. } \description{ -These are straightforward wrappers around \code{\link{[[}}. The main -advantage is that you can provide an optional secondary vector that defines -the ordering, and provide a default value to use when the input is shorter -than expected. +These are useful helpers for extracting a single value from a vector. They +are guaranteed to return a meaningful value, even when the input is shorter +than expected. You can also provide an optional secondary vector that defines +the ordering. +} +\details{ +For most vector types, \code{first(x)}, \code{last(x)}, and \code{nth(x, n)} work like +\code{x[[1]]}, \verb{x[[length(x)]}, and \code{x[[n]]}, respectively. The primary exception +is data frames, where they instead retrieve rows, i.e. \code{x[1, ]}, \code{x[nrow(x), ]}, and \code{x[n, ]}. This is consistent with the tidyverse/vctrs principle which +treats data frames as a vector of rows, rather than a vector of columns. } \examples{ x <- 1:10 @@ -50,12 +58,32 @@ last(y) nth(x, 1) nth(x, 5) nth(x, -2) + +# `first()` and `last()` are often useful in `summarise()` +df <- tibble(x = x, y = y) +df \%>\% + summarise( + across(x:y, first, .names = "{col}_first"), + y_last = last(y) + ) + +# Selecting a position that is out of bounds returns a default value nth(x, 11) +nth(x, 0) +# This out of bounds behavior also applies to empty vectors +first(integer()) + +# You can customize the default value with `default` +nth(x, 11, default = -1L) +first(integer(), default = 0L) + +# `order_by` provides optional ordering last(x) -# Second argument provides optional ordering -last(x, y) +last(x, order_by = y) -# These functions always return a single value -first(integer()) +# For data frames, these select entire rows +df <- tibble(a = 1:5, b = 6:10) +first(df) +nth(df, 4) } diff --git a/tests/testthat/_snaps/nth-value.md b/tests/testthat/_snaps/nth-value.md index df4527a91f..da06f0841a 100644 --- a/tests/testthat/_snaps/nth-value.md +++ b/tests/testthat/_snaps/nth-value.md @@ -1,9 +1,56 @@ -# nth() gives meaningful error message (#5466) +# `default` must be size 1 (when not used with lists) Code - (expect_error(nth(1:10, "x"))) - Output - + nth(1L, n = 2L, default = 1:2) + Condition Error in `nth()`: - ! `n` must be a single integer. + ! `default` must have size 1, not size 2. + +# `default` is cast to the type of `x` (when not used with lists) + + Code + nth("x", 2, default = 2) + Condition + Error in `nth()`: + ! Can't convert `default` to match type of `x` . + +# `n` is validated (#5466) + + Code + nth(1:10, n = "x") + Condition + Error in `nth()`: + ! Can't convert `n` to . + +--- + + Code + nth(1:10, n = 1:2) + Condition + Error in `nth()`: + ! `n` must have size 1, not size 2. + +# `x` must be a vector + + Code + nth(environment(), 1L) + Condition + Error in `vec_size()`: + ! `x` must be a vector, not an environment. + +# `order_by` must be the same size as `x` + + Code + nth(1:5, n = 1L, order_by = 1:2) + Condition + Error in `nth()`: + ! `order_by` must have size 5, not size 2. + +--- + + Code + nth(1:5, n = 6L, order_by = 1:2) + Condition + Error in `nth()`: + ! `order_by` must have size 5, not size 2. diff --git a/tests/testthat/test-nth-value.R b/tests/testthat/test-nth-value.R index 9ec9fe1734..c1952141f4 100644 --- a/tests/testthat/test-nth-value.R +++ b/tests/testthat/test-nth-value.R @@ -1,44 +1,181 @@ -test_that("nth works with lists", { - x <- list(1, 2, 3) +# ------------------------------------------------------------------------------ +# nth() + +test_that("nth works with lists and uses `vec_slice2()` to return elements (#6331)", { + # We'd like to use `vec_slice()` everywhere, but it breaks too many revdeps + # that rely on `nth()` returning list elements + x <- list(1, 2, 3:5) expect_equal(nth(x, 1), 1) - expect_null(nth(x, 4)) - expect_equal(nth(x, 4, default = 1), 1) + expect_equal(nth(x, 3), 3:5) +}) + +test_that("nth `default` for lists defaults to `NULL` since it uses `vec_slice2()`", { + expect_null(nth(list(1), 2)) + expect_null(nth(list(), 1)) +}) + +test_that("nth `default` for lists can be anything", { + # Because list elements can be anything + x <- list(1, 2) + + default <- environment() + expect_identical(nth(x, 3, default = default), default) + + default <- 1:3 + expect_identical(nth(x, 3, default = default), default) +}) + +test_that("nth treats list-of like lists", { + x <- list_of(1, 2, c(3, 4)) + + expect_identical(nth(x, 3), c(3, 4)) + expect_identical(nth(x, 4), NULL) + + # Not particularly strict about `default` here, + # even though `list_of()` elements are typed + expect_identical(nth(x, 4, default = "x"), "x") +}) + +test_that("nth works with data frames and always returns a single row", { + x <- tibble(x = 1:3, y = 4:6) + + expect_identical(nth(x, 1), tibble(x = 1L, y = 4L)) + expect_identical(nth(x, 4), tibble(x = NA_integer_, y = NA_integer_)) + expect_identical(nth(x, 4, default = tibble(x = 0, y = 0)), tibble(x = 0L, y = 0L)) +}) + +test_that("nth works with rcrds", { + x <- new_rcrd(list(x = 1:3, y = 4:6)) + + expect_identical(nth(x, 1), vec_slice(x, 1)) + expect_identical(nth(x, 4), vec_init(x)) + expect_identical(nth(x, 4, default = x[2]), x[2]) +}) + +test_that("drops names, because it uses `vec_slice2()`", { + x <- c(a = 1, b = 2) + expect_named(nth(x, 2), NULL) }) test_that("negative values index from end", { x <- 1:5 - expect_equal(nth(x, -1), 5) - expect_equal(nth(x, -3), 3) + expect_equal(nth(x, -1), 5L) + expect_equal(nth(x, -3), 3L) }) test_that("indexing past ends returns default value", { expect_equal(nth(1:4, 5), NA_integer_) expect_equal(nth(1:4, -5), NA_integer_) expect_equal(nth(1:4, -10), NA_integer_) + expect_equal(nth(1:4, -10, default = 6L), 6L) +}) + +test_that("gets corner case indexing correct", { + expect_identical(nth(1:4, -5), NA_integer_) + expect_identical(nth(1:4, -4), 1L) + expect_identical(nth(1:4, -3), 2L) + + expect_identical(nth(1:4, -1), 4L) + expect_identical(nth(1:4, 0), NA_integer_) + expect_identical(nth(1:4, 1), 1L) + + expect_identical(nth(1:4, 3), 3L) + expect_identical(nth(1:4, 4), 4L) + expect_identical(nth(1:4, 5), NA_integer_) +}) + +test_that("`order_by` can be used to alter the order", { + expect_identical(nth(1:5, n = 1L, order_by = 5:1), 5L) + expect_identical(nth(as.list(1:5), n = 1L, order_by = 5:1), 5L) }) -test_that("first uses default value for 0 length vectors", { +test_that("can use a data frame as `order_by`", { + x <- 1:3 + order_by <- tibble(a = c(1, 1, 2), b = c(2, 1, 0)) + + expect_identical(nth(x, 1, order_by = order_by), 2L) + expect_identical(nth(x, 2, order_by = order_by), 1L) +}) + +test_that("`default` must be size 1 (when not used with lists)", { + expect_snapshot(error = TRUE, { + nth(1L, n = 2L, default = 1:2) + }) +}) + +test_that("`default` is cast to the type of `x` (when not used with lists)", { + expect_snapshot(error = TRUE, { + nth("x", 2, default = 2) + }) +}) + +test_that("`n` is validated (#5466)", { + expect_snapshot(error = TRUE, { + nth(1:10, n = "x") + }) + expect_snapshot(error = TRUE, { + nth(1:10, n = 1:2) + }) +}) + +test_that("`x` must be a vector", { + expect_snapshot(error = TRUE, { + nth(environment(), 1L) + }) +}) + +test_that("`order_by` must be the same size as `x`", { + expect_snapshot(error = TRUE, { + nth(1:5, n = 1L, order_by = 1:2) + }) + + # Ensure that this is checked before `default` is early returned + expect_snapshot(error = TRUE, { + nth(1:5, n = 6L, order_by = 1:2) + }) +}) + +# ------------------------------------------------------------------------------ +# first() + +test_that("`first()` selects the first value", { + expect_identical(first(1:5), 1L) +}) + +test_that("`first()` uses default value for 0 length vectors", { expect_equal(first(logical()), NA) expect_equal(first(integer()), NA_integer_) expect_equal(first(numeric()), NA_real_) expect_equal(first(character()), NA_character_) - expect_null(first(list())) }) -test_that("firsts uses default value for 0 length augmented vectors", { +test_that("`first()` uses `NULL` default for 0 length lists", { + expect_identical(first(list()), NULL) +}) + +test_that("`first()` uses default value for 0 length augmented vectors", { fc <- factor("a")[0] - dt <- Sys.Date() - tm <- Sys.time() + dt <- Sys.Date()[0] + tm <- Sys.time()[0] - expect_equal(first(fc[0]), fc[NA]) - expect_equal(first(dt[0]), dt[NA]) - expect_equal(first(tm[0]), tm[NA]) + expect_equal(first(fc), vec_init(fc)) + expect_equal(first(dt), vec_init(dt)) + expect_equal(first(tm), vec_init(tm)) }) -test_that("nth() gives meaningful error message (#5466)", { - expect_snapshot({ - (expect_error(nth(1:10, "x"))) - }) +test_that("`first()` returns list elements", { + expect_identical(first(list(2:3, 4:5)), 2:3) +}) + +# ------------------------------------------------------------------------------ +# last() + +test_that("`last()` selects the last value", { + expect_identical(last(1:5), 5L) +}) + +test_that("`last()` returns list elements", { + expect_identical(last(list(2:3, 4:5)), 4:5) })