From 7b61793cb32edf138985b70146f29d5f21be3af3 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 11 Aug 2020 14:36:48 +0200 Subject: [PATCH 1/3] speedup $set(name, chunks) --- R/data-mask.R | 4 +--- src/dplyr.h | 2 ++ src/init.cpp | 3 +++ src/mask.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 src/mask.cpp diff --git a/R/data-mask.R b/R/data-mask.R index 74000384c2..a56d6bd163 100644 --- a/R/data-mask.R +++ b/R/data-mask.R @@ -107,9 +107,7 @@ DataMask <- R6Class("DataMask", }, set = function(name, chunks) { - private$resolved[[name]] <- chunks - private$used <- !map_lgl(private$resolved, is.null) - private$which_used <- which(private$used) + .Call(`dplyr_mask_set`, private, name, chunks) }, remove = function(name) { diff --git a/src/dplyr.h b/src/dplyr.h index b4ac20e7d0..1d2e02f65a 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -26,6 +26,7 @@ struct symbols { static SEXP which_used; static SEXP dot_drop; static SEXP abort_glue; + static SEXP used; }; struct vectors { @@ -62,6 +63,7 @@ SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP env_private, SEXP s_n, SEXP env_ SEXP dplyr_summarise_recycle_chunks(SEXP chunks, SEXP rows, SEXP ptypes); SEXP dplyr_group_indices(SEXP data, SEXP s_nr); SEXP dplyr_group_keys(SEXP group_data); +SEXP dplyr_mask_set(SEXP env_private, SEXP name, SEXP chunks); #define DPLYR_MASK_INIT() \ SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \ diff --git a/src/init.cpp b/src/init.cpp index 7adf2f2c24..6525c2e0fb 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -48,6 +48,7 @@ SEXP symbols::bindings = Rf_install("bindings"); SEXP symbols::which_used = Rf_install("which_used"); SEXP symbols::dot_drop = Rf_install(".drop"); SEXP symbols::abort_glue = Rf_install("abort_glue"); +SEXP symbols::used = Rf_install("used"); SEXP vectors::classes_vctrs_list_of = get_classes_vctrs_list_of(); SEXP vectors::empty_int_vector = get_empty_int_vector(); @@ -82,6 +83,8 @@ static const R_CallMethodDef CallEntries[] = { {"dplyr_group_indices", (DL_FUNC)& dplyr_group_indices, 2}, {"dplyr_group_keys", (DL_FUNC)& dplyr_group_keys, 1}, + {"dplyr_mask_set", (DL_FUNC)& dplyr_mask_set, 3}, + {NULL, NULL, 0} }; diff --git a/src/mask.cpp b/src/mask.cpp new file mode 100644 index 0000000000..1b017787b3 --- /dev/null +++ b/src/mask.cpp @@ -0,0 +1,49 @@ +#include "dplyr.h" + +SEXP dplyr_mask_set(SEXP env_private, SEXP name, SEXP chunks) { + // we assume control over these + SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved); + SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol)); + SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used); + + // search for position of name + SEXP char_name = STRING_ELT(name, 0); + R_xlen_t n = XLENGTH(resolved); + R_xlen_t i_name = 0; + for (; i_name < n; i_name++) { + if (char_name == STRING_ELT(names_resolved, i_name)) break; + } + UNPROTECT(1); // names_resolved + + if (i_name == n && chunks == R_NilValue) { + // early return, as this is removing a resolved that wasn't + // so it does nothing + return R_NilValue; + } + + // update used + LOGICAL(used)[i_name] = chunks != R_NilValue; + SET_VECTOR_ELT(resolved, i_name, chunks); + + // count how many are used + int* p_used = LOGICAL(used); + R_xlen_t n_used = 0; + for (R_xlen_t i = 0; i < n; i++, ++p_used) { + n_used += *p_used; + } + + // update which_used + SEXP which_used = PROTECT(Rf_allocVector(INTSXP, n_used)); + int* p_which_used = INTEGER(which_used); + p_used = LOGICAL(used); + for (R_xlen_t i = 0; i < n; i++, ++p_used) { + if (*p_used) { + *p_which_used = i + 1; + ++p_which_used; + } + } + Rf_defineVar(dplyr::symbols::which_used, which_used, env_private); + + UNPROTECT(1); // which_used + return R_NilValue; +} From 7e1dc48f50dcbf168db40aea242e7007c404afc5 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 11 Aug 2020 16:31:34 +0200 Subject: [PATCH 2/3] Also internalise $add() --- R/data-mask.R | 17 +--------- src/dplyr.h | 4 ++- src/init.cpp | 1 + src/mask.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++------ 4 files changed, 85 insertions(+), 27 deletions(-) diff --git a/R/data-mask.R b/R/data-mask.R index a56d6bd163..f7e9c16459 100644 --- a/R/data-mask.R +++ b/R/data-mask.R @@ -88,22 +88,7 @@ DataMask <- R6Class("DataMask", } } - pos <- which(names(private$resolved) == name) - is_new_column <- length(pos) == 0L - - if (is_new_column) { - pos <- length(private$resolved) + 1L - used <- FALSE - } else { - used <- private$used[[pos]] - } - - if (!used) { - private$used[[pos]] <- TRUE - private$which_used <- c(private$which_used, pos) - } - - private$resolved[[name]] <- chunks + .Call(`dplyr_mask_add`, private, name, chunks) }, set = function(name, chunks) { diff --git a/src/dplyr.h b/src/dplyr.h index 1d2e02f65a..7074c9d47f 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -63,7 +63,9 @@ SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP env_private, SEXP s_n, SEXP env_ SEXP dplyr_summarise_recycle_chunks(SEXP chunks, SEXP rows, SEXP ptypes); SEXP dplyr_group_indices(SEXP data, SEXP s_nr); SEXP dplyr_group_keys(SEXP group_data); -SEXP dplyr_mask_set(SEXP env_private, SEXP name, SEXP chunks); + +SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks); +SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks); #define DPLYR_MASK_INIT() \ SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \ diff --git a/src/init.cpp b/src/init.cpp index 6525c2e0fb..b7ce369e8f 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -84,6 +84,7 @@ static const R_CallMethodDef CallEntries[] = { {"dplyr_group_keys", (DL_FUNC)& dplyr_group_keys, 1}, {"dplyr_mask_set", (DL_FUNC)& dplyr_mask_set, 3}, + {"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 3}, {NULL, NULL, 0} }; diff --git a/src/mask.cpp b/src/mask.cpp index 1b017787b3..2d5200d5b0 100644 --- a/src/mask.cpp +++ b/src/mask.cpp @@ -1,19 +1,90 @@ #include "dplyr.h" -SEXP dplyr_mask_set(SEXP env_private, SEXP name, SEXP chunks) { +R_xlen_t find_first(SEXP haystack, SEXP needle) { + R_xlen_t n = XLENGTH(haystack); + R_xlen_t i_name = 0; + for (; i_name < n; i_name++) { + if (needle == STRING_ELT(haystack, i_name)) break; + } + + return i_name; +} + +SEXP integers_append(SEXP ints, int x) { + R_xlen_t n = XLENGTH(ints); + SEXP new_ints = PROTECT(Rf_allocVector(INTSXP, n + 1)); + int* p_ints = INTEGER(ints); + int* p_new_ints = INTEGER(new_ints); + for (R_xlen_t i = 0; i < n; i++) { + p_new_ints[i] = p_ints[i]; + } + p_new_ints[n] = x; + UNPROTECT(1); + return new_ints; +} + +SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks) { + SEXP name = STRING_ELT(s_name, 0); + // we assume control over these SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved); SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol)); SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used); + SEXP which_used = Rf_findVarInFrame(env_private, dplyr::symbols::which_used); // search for position of name - SEXP char_name = STRING_ELT(name, 0); - R_xlen_t n = XLENGTH(resolved); - R_xlen_t i_name = 0; - for (; i_name < n; i_name++) { - if (char_name == STRING_ELT(names_resolved, i_name)) break; + R_xlen_t n = XLENGTH(names_resolved); + R_xlen_t i_name = find_first(names_resolved, name); + + int* p_used = LOGICAL(used); + bool is_new_column = i_name == n; + if (is_new_column) { + SEXP new_used = PROTECT(Rf_allocVector(LGLSXP, n + 1)); + SEXP new_resolved = PROTECT(Rf_allocVector(VECSXP, n + 1)); + SEXP new_names_resolved = PROTECT(Rf_allocVector(STRSXP, n + 1)); + int* p_new_used = LOGICAL(new_used); + + for (R_xlen_t i = 0; i < n; i++) { + SET_VECTOR_ELT(new_resolved, i, VECTOR_ELT(resolved, i)); + SET_STRING_ELT(new_names_resolved, i, STRING_ELT(names_resolved, i)); + p_new_used[i] = p_used[i]; + } + SET_VECTOR_ELT(new_resolved, n, chunks); + SET_STRING_ELT(new_names_resolved, n, name); + p_new_used[n] = TRUE; + + SEXP new_which_used = PROTECT(integers_append(which_used, n + 1)); + + Rf_namesgets(new_resolved, new_names_resolved); + Rf_defineVar(dplyr::symbols::resolved, new_resolved, env_private); + Rf_defineVar(dplyr::symbols::used, new_used, env_private); + Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private); + + UNPROTECT(4); + } else { + SET_VECTOR_ELT(resolved, i_name, chunks); + p_used[i_name] = TRUE; + + SEXP new_which_used = PROTECT(integers_append(which_used, i_name + 1)); + Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private); + UNPROTECT(1); } UNPROTECT(1); // names_resolved + return R_NilValue; +} + +SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks) { + SEXP name = STRING_ELT(s_name, 0); + + // we assume control over these + SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved); + SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol)); + SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used); + + // search for position of name + R_xlen_t n = XLENGTH(names_resolved); + R_xlen_t i_name = find_first(names_resolved, name); + UNPROTECT(1); // names_resolved if (i_name == n && chunks == R_NilValue) { // early return, as this is removing a resolved that wasn't @@ -36,10 +107,9 @@ SEXP dplyr_mask_set(SEXP env_private, SEXP name, SEXP chunks) { SEXP which_used = PROTECT(Rf_allocVector(INTSXP, n_used)); int* p_which_used = INTEGER(which_used); p_used = LOGICAL(used); - for (R_xlen_t i = 0; i < n; i++, ++p_used) { - if (*p_used) { - *p_which_used = i + 1; - ++p_which_used; + for (R_xlen_t i = 0, j = 0; i < n; i++) { + if (p_used[i]) { + p_which_used[j++] = i + 1; } } Rf_defineVar(dplyr::symbols::which_used, which_used, env_private); From d88ff163164be5744f9746e0b23df3a6de71ffa8 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 31 Aug 2020 15:56:55 +0200 Subject: [PATCH 3/3] convert to utf8 if needed before comparing strings --- src/dplyr.h | 6 ++++++ src/mask.cpp | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/dplyr.h b/src/dplyr.h index 7074c9d47f..c44a0d1a5d 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -6,6 +6,12 @@ #include #include +#define UTF8_MASK (1<<3) +#define ASCII_MASK (1<<6) + +#define IS_ASCII(x) (LEVELS(x) & ASCII_MASK) +#define IS_UTF8(x) (LEVELS(x) & UTF8_MASK) + namespace dplyr { struct envs { diff --git a/src/mask.cpp b/src/mask.cpp index 2d5200d5b0..47ec861cad 100644 --- a/src/mask.cpp +++ b/src/mask.cpp @@ -1,10 +1,18 @@ #include "dplyr.h" +SEXP as_utf8(SEXP s) { + if (!IS_UTF8(s) && !IS_ASCII(s)) { + s = Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8); + } + return s; +} + R_xlen_t find_first(SEXP haystack, SEXP needle) { + needle = as_utf8(needle); R_xlen_t n = XLENGTH(haystack); R_xlen_t i_name = 0; for (; i_name < n; i_name++) { - if (needle == STRING_ELT(haystack, i_name)) break; + if (needle == as_utf8(STRING_ELT(haystack, i_name))) break; } return i_name;