diff --git a/NAMESPACE b/NAMESPACE index 85dd9fe84..5b8705d41 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -563,6 +563,7 @@ export(vec_chop) export(vec_compare) export(vec_count) export(vec_data) +export(vec_deduplicated) export(vec_default_cast) export(vec_default_ptype2) export(vec_detect_complete) diff --git a/R/deduplicated.R b/R/deduplicated.R new file mode 100644 index 000000000..b69cbaa1e --- /dev/null +++ b/R/deduplicated.R @@ -0,0 +1,40 @@ +#' Modify a function to act on a deduplicated vector input +#' +#' @description +#' +#' `r lifecycle::badge("experimental")` +#' +#' The deduplicated function acts on the unique values in the first input `x` +#' and expands the output back to return. The return value is equivalent to `f(x)` +#' but is significantly faster for inputs with significant duplication. +#' +#' @param f Function whose first argument will be deduplicated. +#' +#' @return A deduplicated function +#' @export +#' +#' @examples +#' x <- sample(LETTERS, 10) +#' x +#' +#' large_x <- sample(rep(x, 10)) +#' length(large_x) +#' +#' long_func <- function(x) for(i in x) {Sys.sleep(0.001)} +#' +#' system.time(y <- long_func(large_x)) +#' system.time(y2 <- vec_deduplicate(long_func)(large_x)) +#' all(y == y2) +vec_deduplicated <- function(f) { + function(x, ...) { + res <- vec_group_id_and_loc(x) + group_id <- unclass(res) + unique_loc <- attr(res, "unique_loc") + unique_x <- vec_slice(x, unique_loc) + f(unique_x, ...)[group_id] + } +} + +vec_group_id_and_loc <- function(x) { + .Call(vctrs_group_id_and_loc, x) +} diff --git a/R/group.R b/R/group.R index 2c91df28a..579cc20d0 100644 --- a/R/group.R +++ b/R/group.R @@ -30,6 +30,7 @@ #' Note that when using `vec_group_loc()` for complex types, the default #' `data.frame` print method will be suboptimal, and you will want to coerce #' into a tibble to better understand the output. +#' #' @name vec_group #' #' @section Dependencies: diff --git a/_pkgdown.yml b/_pkgdown.yml index 812117b9f..76873bf97 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -131,3 +131,4 @@ reference: - vec_names - vec_as_location - vec_as_subscript + - vec_deduplicated diff --git a/man/vec_deduplicated.Rd b/man/vec_deduplicated.Rd new file mode 100644 index 000000000..5962a817a --- /dev/null +++ b/man/vec_deduplicated.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deduplicated.R +\name{vec_deduplicated} +\alias{vec_deduplicated} +\title{Modify a function to act on a deduplicated vector input} +\usage{ +vec_deduplicated(f) +} +\arguments{ +\item{f}{Function whose first argument will be deduplicated.} +} +\value{ +A deduplicated function +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +The deduplicated function acts on the unique values in the first input \code{x} +and expands the output back to return. The return value is equivalent to \code{f(x)} +but is significantly faster for inputs with significant duplication. +} +\examples{ +x <- sample(LETTERS, 10) +x + +large_x <- sample(rep(x, 10)) +length(large_x) + +long_func <- function(x) for(i in x) {Sys.sleep(0.001)} + +system.time(y <- long_func(large_x)) +system.time(y2 <- vec_deduplicate(long_func)(large_x)) +all(y == y2) +} diff --git a/src/group.c b/src/group.c index 81c321783..ebc09f7cf 100644 --- a/src/group.c +++ b/src/group.c @@ -233,3 +233,54 @@ SEXP vec_group_loc(SEXP x) { UNPROTECT(nprot); return out; } + + +// ----------------------------------------------------------------------------- + +// [[ register() ]] +SEXP vctrs_group_id_and_loc(SEXP x) { + int nprot = 0; + + R_len_t n = vec_size(x); + + x = PROTECT_N(vec_proxy_equal(x), &nprot); + x = PROTECT_N(vec_normalize_encoding(x), &nprot); + + struct dictionary* d = new_dictionary(x); + PROTECT_DICT(d, &nprot); + + SEXP out = PROTECT_N(Rf_allocVector(INTSXP, n), &nprot); + int* p_out = INTEGER(out); + + R_len_t g_id = 1; + + struct growable g_unq = new_growable(INTSXP, 256); + PROTECT_GROWABLE(&g_unq, &nprot); + + for (int i = 0; i < n; ++i) { + uint32_t hash = dict_hash_scalar(d, i); + R_len_t key = d->key[hash]; + + if (key == DICT_EMPTY) { + dict_put(d, hash, i); + // Record group id + p_out[i] = g_id; + ++g_id; + + // Record unique locs + growable_push_int(&g_unq, i + 1); + + } else { + p_out[i] = p_out[key]; + } + } + + SEXP n_groups = PROTECT_N(Rf_ScalarInteger(d->used), &nprot); + Rf_setAttrib(out, syms_n, n_groups); + + SEXP unq_vals = growable_values(&g_unq); + Rf_setAttrib(out, Rf_install("unique_loc"), unq_vals); + + UNPROTECT(nprot); + return out; +} diff --git a/src/init.c b/src/init.c index 9b338e3df..90495be30 100644 --- a/src/init.c +++ b/src/init.c @@ -27,6 +27,7 @@ extern SEXP vec_split(SEXP, SEXP); extern SEXP vctrs_group_id(SEXP); extern SEXP vctrs_group_rle(SEXP); extern SEXP vec_group_loc(SEXP); +extern SEXP vctrs_group_id_and_loc(SEXP); extern SEXP vctrs_equal(SEXP, SEXP, SEXP); extern r_obj* ffi_vec_detect_missing(r_obj*); extern r_obj* ffi_vec_any_missing(r_obj* x); @@ -211,6 +212,7 @@ static const R_CallMethodDef CallEntries[] = { {"vctrs_group_id", (DL_FUNC) &vctrs_group_id, 1}, {"vctrs_group_rle", (DL_FUNC) &vctrs_group_rle, 1}, {"vctrs_group_loc", (DL_FUNC) &vec_group_loc, 1}, + {"vctrs_group_id_and_loc", (DL_FUNC) &vctrs_group_id_and_loc, 1}, {"ffi_size", (DL_FUNC) &ffi_size, 2}, {"ffi_list_sizes", (DL_FUNC) &ffi_list_sizes, 2}, {"vctrs_dim", (DL_FUNC) &vctrs_dim, 1}, diff --git a/tests/testthat/test-deduplicated.R b/tests/testthat/test-deduplicated.R new file mode 100644 index 000000000..cf59259f5 --- /dev/null +++ b/tests/testthat/test-deduplicated.R @@ -0,0 +1,73 @@ +# group_id_and_loc ---------------------------------------------------------------- + +expect_matches_separate_calls <- function(x) { + expect_equal( + as.numeric(vec_group_id_and_loc(x)), + as.numeric(vec_group_id(x)) + ) + expect_equal( + attr(vec_group_id_and_loc(x), "unique_loc"), + vec_unique_loc(x) + ) +} + +test_that("vec_group_id_and_loc matches vec_group_id and vec_unique_loc", { + x <- c(2, 4, 2, 1, 4) + expect_matches_separate_calls(x) +}) + +test_that("vec_group_id_and_loc works for size 0 input", { + expect <- structure(integer(), n = 0L, unique_loc=integer()) + expect_equal(vec_group_id_and_loc(NULL), expect) + expect_equal(vec_group_id_and_loc(numeric()), expect) +}) + +test_that("vec_group_id_and_loc works on base S3 objects", { + x <- factor(c("x", "y", "x")) + expect_matches_separate_calls(x) + + x <- new_date(c(0, 1, 0)) + expect_matches_separate_calls(x) +}) + +test_that("vec_group_id_and_loc works on data frames", { + df <- data.frame(x = c(1, 2, 1, 1), y = c(2, 3, 2, 3)) + expect_matches_separate_calls(df) +}) + +test_that("vec_group_id_and_loc works on arrays", { + x <- array(c(1, 1, 1, 2, 4, 2), c(3, 2)) + expect_matches_separate_calls(x) +}) + +test_that("vec_group_id takes the equality proxy", { + local_comparable_tuple() + x <- tuple(c(1, 2, 1, 1), c(1, 1, 1, 2)) + # Compares on only the first field + expect_matches_separate_calls(x) +}) + +test_that("vec_group_id takes the equality proxy recursively", { + local_comparable_tuple() + + x <- tuple(c(1, 2, 1, 1), 1:4) + df <- data_frame(x = x) + expect_matches_separate_calls(df) +}) + + +# vec_deduplicate --------------------------------------------------------- + +test_that("vec_deduplicated(f) runs only on deduplicated values", { + ncalls <<- 0 + f <- function(ii) for(i in ii) ncalls <<- ncalls + 1 + + x <- c(1, 1, 1, 2, 3) + vec_deduplicated(f)(x) + expect_equal(ncalls, 3) + + ncalls <<- 0 + x <- 1:5 + vec_deduplicated(f)(x) + expect_equal(ncalls, 5) +})