Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speedup <DataMask>$set(name, chunks) #5474

Merged
merged 3 commits into from
Sep 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 2 additions & 19 deletions R/data-mask.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,11 @@ DataMask <- R6Class("DataMask",
}
}

pos <- which(names(private$resolved) == name)
is_new_column <- length(pos) == 0L

if (is_new_column) {
pos <- length(private$resolved) + 1L
used <- FALSE
} else {
used <- private$used[[pos]]
}

if (!used) {
private$used[[pos]] <- TRUE
private$which_used <- c(private$which_used, pos)
}

private$resolved[[name]] <- chunks
.Call(`dplyr_mask_add`, private, name, chunks)
},

set = function(name, chunks) {
private$resolved[[name]] <- chunks
private$used <- !map_lgl(private$resolved, is.null)
private$which_used <- which(private$used)
.Call(`dplyr_mask_set`, private, name, chunks)
},

remove = function(name) {
Expand Down
10 changes: 10 additions & 0 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

#define UTF8_MASK (1<<3)
#define ASCII_MASK (1<<6)

#define IS_ASCII(x) (LEVELS(x) & ASCII_MASK)
#define IS_UTF8(x) (LEVELS(x) & UTF8_MASK)

namespace dplyr {

struct envs {
Expand All @@ -26,6 +32,7 @@ struct symbols {
static SEXP which_used;
static SEXP dot_drop;
static SEXP abort_glue;
static SEXP used;
};

struct vectors {
Expand Down Expand Up @@ -63,6 +70,9 @@ SEXP dplyr_summarise_recycle_chunks(SEXP chunks, SEXP rows, SEXP ptypes);
SEXP dplyr_group_indices(SEXP data, SEXP s_nr);
SEXP dplyr_group_keys(SEXP group_data);

SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks);
SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks);

#define DPLYR_MASK_INIT() \
SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \
R_xlen_t ngroups = XLENGTH(rows); \
Expand Down
4 changes: 4 additions & 0 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ SEXP symbols::bindings = Rf_install("bindings");
SEXP symbols::which_used = Rf_install("which_used");
SEXP symbols::dot_drop = Rf_install(".drop");
SEXP symbols::abort_glue = Rf_install("abort_glue");
SEXP symbols::used = Rf_install("used");

SEXP vectors::classes_vctrs_list_of = get_classes_vctrs_list_of();
SEXP vectors::empty_int_vector = get_empty_int_vector();
Expand Down Expand Up @@ -82,6 +83,9 @@ static const R_CallMethodDef CallEntries[] = {
{"dplyr_group_indices", (DL_FUNC)& dplyr_group_indices, 2},
{"dplyr_group_keys", (DL_FUNC)& dplyr_group_keys, 1},

{"dplyr_mask_set", (DL_FUNC)& dplyr_mask_set, 3},
{"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 3},

{NULL, NULL, 0}
};

Expand Down
127 changes: 127 additions & 0 deletions src/mask.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "dplyr.h"

SEXP as_utf8(SEXP s) {
if (!IS_UTF8(s) && !IS_ASCII(s)) {
s = Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8);
}
return s;
}

R_xlen_t find_first(SEXP haystack, SEXP needle) {
needle = as_utf8(needle);
R_xlen_t n = XLENGTH(haystack);
R_xlen_t i_name = 0;
for (; i_name < n; i_name++) {
if (needle == as_utf8(STRING_ELT(haystack, i_name))) break;
}

return i_name;
}

SEXP integers_append(SEXP ints, int x) {
R_xlen_t n = XLENGTH(ints);
SEXP new_ints = PROTECT(Rf_allocVector(INTSXP, n + 1));
int* p_ints = INTEGER(ints);
int* p_new_ints = INTEGER(new_ints);
for (R_xlen_t i = 0; i < n; i++) {
p_new_ints[i] = p_ints[i];
}
p_new_ints[n] = x;
UNPROTECT(1);
return new_ints;
}

SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP chunks) {
SEXP name = STRING_ELT(s_name, 0);

// we assume control over these
SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved);
SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol));
SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used);
SEXP which_used = Rf_findVarInFrame(env_private, dplyr::symbols::which_used);

// search for position of name
R_xlen_t n = XLENGTH(names_resolved);
R_xlen_t i_name = find_first(names_resolved, name);

int* p_used = LOGICAL(used);
bool is_new_column = i_name == n;
if (is_new_column) {
SEXP new_used = PROTECT(Rf_allocVector(LGLSXP, n + 1));
SEXP new_resolved = PROTECT(Rf_allocVector(VECSXP, n + 1));
SEXP new_names_resolved = PROTECT(Rf_allocVector(STRSXP, n + 1));
int* p_new_used = LOGICAL(new_used);

for (R_xlen_t i = 0; i < n; i++) {
SET_VECTOR_ELT(new_resolved, i, VECTOR_ELT(resolved, i));
SET_STRING_ELT(new_names_resolved, i, STRING_ELT(names_resolved, i));
p_new_used[i] = p_used[i];
}
SET_VECTOR_ELT(new_resolved, n, chunks);
SET_STRING_ELT(new_names_resolved, n, name);
p_new_used[n] = TRUE;

SEXP new_which_used = PROTECT(integers_append(which_used, n + 1));

Rf_namesgets(new_resolved, new_names_resolved);
Rf_defineVar(dplyr::symbols::resolved, new_resolved, env_private);
Rf_defineVar(dplyr::symbols::used, new_used, env_private);
Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private);

UNPROTECT(4);
} else {
SET_VECTOR_ELT(resolved, i_name, chunks);
p_used[i_name] = TRUE;

SEXP new_which_used = PROTECT(integers_append(which_used, i_name + 1));
Rf_defineVar(dplyr::symbols::which_used, new_which_used, env_private);
UNPROTECT(1);
}
UNPROTECT(1); // names_resolved
return R_NilValue;
}

SEXP dplyr_mask_set(SEXP env_private, SEXP s_name, SEXP chunks) {
SEXP name = STRING_ELT(s_name, 0);

// we assume control over these
SEXP resolved = Rf_findVarInFrame(env_private, dplyr::symbols::resolved);
SEXP names_resolved = PROTECT(Rf_getAttrib(resolved, R_NamesSymbol));
SEXP used = Rf_findVarInFrame(env_private, dplyr::symbols::used);

// search for position of name
R_xlen_t n = XLENGTH(names_resolved);
R_xlen_t i_name = find_first(names_resolved, name);
UNPROTECT(1); // names_resolved

if (i_name == n && chunks == R_NilValue) {
// early return, as this is removing a resolved that wasn't
// so it does nothing
return R_NilValue;
}

// update used
LOGICAL(used)[i_name] = chunks != R_NilValue;
SET_VECTOR_ELT(resolved, i_name, chunks);

// count how many are used
int* p_used = LOGICAL(used);
R_xlen_t n_used = 0;
for (R_xlen_t i = 0; i < n; i++, ++p_used) {
n_used += *p_used;
}

// update which_used
SEXP which_used = PROTECT(Rf_allocVector(INTSXP, n_used));
int* p_which_used = INTEGER(which_used);
p_used = LOGICAL(used);
for (R_xlen_t i = 0, j = 0; i < n; i++) {
if (p_used[i]) {
p_which_used[j++] = i + 1;
}
}
Rf_defineVar(dplyr::symbols::which_used, which_used, env_private);

UNPROTECT(1); // which_used
return R_NilValue;
}