Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(r): Implement dictionary conversion #285

Merged
merged 25 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
7808a6f
fix typo in error message
paleolimbot Aug 21, 2023
5e4c497
make the "call into R" materializer generic
paleolimbot Aug 21, 2023
76a9e95
with passing test
paleolimbot Aug 21, 2023
2462279
with passing test
paleolimbot Aug 21, 2023
aca0325
support converting to factor
paleolimbot Aug 21, 2023
54e8260
notes about conversions
paleolimbot Aug 21, 2023
ac8f304
prepare to handle "other"
paleolimbot Aug 21, 2023
31c7f0d
start on materialize other
paleolimbot Aug 21, 2023
3589d2a
run pre-commit
paleolimbot Aug 22, 2023
5093002
more formatting
paleolimbot Aug 22, 2023
5edd4ea
start to simplify calls from C into R for conversion
paleolimbot Aug 22, 2023
1404770
possibly simplify C -> S3 -> C circle
paleolimbot Aug 22, 2023
4c9f9ce
move materializer for header out of the common header
paleolimbot Aug 22, 2023
ed7739a
fix include
paleolimbot Aug 22, 2023
97bf004
reuse convert_fallback_other everywhere
paleolimbot Aug 22, 2023
8e2f405
format
paleolimbot Aug 22, 2023
4c6969f
document dictionary conversion
paleolimbot Aug 22, 2023
70b390b
add more tests for dictionaries
paleolimbot Aug 22, 2023
6898c6c
make sure convert to partial_factor fails in a batched convert
paleolimbot Aug 22, 2023
6c3ec16
fix for reprex
paleolimbot Aug 23, 2023
07c38f8
document, fix ordered conversion
paleolimbot Aug 23, 2023
4451f5f
Update r/tests/testthat/test-convert-array.R
paleolimbot Aug 23, 2023
fb59210
Update r/R/convert-array.R
paleolimbot Aug 23, 2023
907a0a8
Update r/R/convert-array.R
paleolimbot Aug 23, 2023
7b54981
document
paleolimbot Aug 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ S3method(as_nanoarrow_schema,Field)
S3method(as_nanoarrow_schema,Schema)
S3method(as_nanoarrow_schema,nanoarrow_schema)
S3method(convert_array,default)
S3method(convert_array,double)
S3method(convert_array,factor)
S3method(convert_array,vctrs_partial_frame)
S3method(format,nanoarrow_array)
S3method(format,nanoarrow_array_stream)
Expand Down
114 changes: 93 additions & 21 deletions r/R/convert-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,13 @@
#' - [character()]: String and large string types can be converted to
#' [character()]. The conversion does not check for valid UTF-8: if you need
#' finer-grained control over encodings, use `to = blob::blob()`.
#' - [Date][as.Date]: Only the date32 type can be converted to an R Date vector.
#' - [factor()]: Dictionary-encoded arrays of strings can be converted to
#' `factor()`; however, this must be specified explicitly (i.e.,
#' `convert_array(array, factor())`) because arrays arriving
#' in chunks can have dictionaries that contain different levels. Use
#' `convert_array(array, factor(levels = c(...)))` to materialize an array
#' into a vector with known levels.
#' - [Date][as.Date()]: Only the date32 type can be converted to an R Date vector.
#' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
#' - [difftime()]: Time32, time64, and duration types can be converted to
#' R [difftime()] vectors. The value is converted to match the [units()]
Expand All @@ -64,8 +70,7 @@
#'
#' In addition to the above conversions, a null array may be converted to any
#' target prototype except [data.frame()]. Extension arrays are currently
#' converted as their storage type; dictionary-encoded arrays are not
#' currently supported.
#' converted as their storage type.
#'
#' @examples
#' array <- as_nanoarrow_array(data.frame(x = 1:5))
Expand All @@ -80,6 +85,16 @@ convert_array <- function(array, to = NULL, ...) {
#' @export
convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
if (.from_c) {
# Handle default dictionary conversion since it's the same for all types
dictionary <- array$dictionary

if (!is.null(dictionary)) {
values <- .Call(nanoarrow_c_convert_array, dictionary, to)
array$dictionary <- NULL
indices <- .Call(nanoarrow_c_convert_array, array, integer())
return(values[indices + 1L])
}

stop_cant_convert_array(array, to)
}

Expand All @@ -96,10 +111,45 @@ convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
# we call convert_array() to dispatch to conversions defined via S3
# dispatch, making sure to let the default method know that we've already
# tried the internal C conversions.
convert_array_from_c <- function(array, to) {
convert_fallback_other <- function(array, offset, length, to) {
  # Fallback conversion of a single chunk via convert_array() S3 dispatch.
  #
  # array:  the array to convert.
  # offset: NULL to convert `array` as-is; otherwise the offset to apply
  #         (together with `length`) before converting.
  # length: the length to apply alongside `offset`; only read when `offset`
  #         is not NULL.
  # to:     the target prototype passed on to convert_array().
  #
  # Returns whatever the dispatched convert_array() method returns.
  needs_slice <- !is.null(offset)

  if (needs_slice) {
    # Apply the new offset/length through nanoarrow_array_modify() rather
    # than touching the input; validation is explicitly disabled here.
    slice_spec <- list(offset = offset, length = length)
    array <- nanoarrow_array_modify(array, slice_spec, validate = FALSE)
  }

  # .from_c = TRUE tells methods that the internal C conversions have already
  # been attempted, so they must not hand the same array straight back to C.
  # The result is only checked enough to avoid a segfault when it returns to
  # C, not for correctness (e.g., factor levels that do not match 'to'); it
  # may be used as-is or copied into a slice of another vector.
  convert_array(array, to, .from_c = TRUE)
}

#' @export
convert_array.double <- function(array, to, ...) {
  # convert_array() method for double() targets.
  #
  # decimal128 arrays are converted by going through the arrow package; every
  # other type is deferred to the next convert_array() method.
  array_schema <- infer_nanoarrow_schema(array)
  is_decimal128 <- nanoarrow_schema_parse(array_schema)$type == "decimal128"

  if (!is_decimal128) {
    return(NextMethod())
  }

  # Fail with a descriptive message when arrow is not available.
  action <- sprintf(
    "convert %s array to object of type double",
    nanoarrow_schema_formatted(array_schema)
  )
  assert_arrow_installed(action)

  as_arrow_array.nanoarrow_array(array)$as_vector()
}

#' @export
convert_array.vctrs_partial_frame <- function(array, to, ...) {
ptype <- infer_nanoarrow_ptype(array)
Expand All @@ -111,6 +161,45 @@ convert_array.vctrs_partial_frame <- function(array, to, ...) {
.Call(nanoarrow_c_convert_array, array, ptype)
}

#' @export
convert_array.factor <- function(array, to, ...) {
  # convert_array() method for factor() targets.
  #
  # array: the array to convert. Dictionary-encoded arrays use the dictionary
  #        (converted to character()) as the source of levels; any other
  #        array is first converted to character().
  # to:    a factor prototype. A prototype with zero levels is the sentinel
  #        for "auto levels": levels are taken from the dictionary, or from
  #        the sorted unique strings for a non-dictionary array.
  #
  # Returns a vector of integer codes carrying the attributes of `to`
  # (levels, class, and anything else set on the prototype).
  #
  # Errors if a dictionary contains values missing from the requested levels.
  auto_levels <- identical(levels(to), character())

  if (!is.null(array$dictionary)) {
    dict_levels <- convert_array(array$dictionary, character())
    array$dictionary <- NULL
    # Shift the dictionary indices to R's one-based indexing
    codes <- convert_array(array, integer()) + 1L

    if (auto_levels) {
      levels(to) <- dict_levels
    }

    if (identical(dict_levels, levels(to))) {
      # Dictionary order already matches the target levels: use codes as-is
      fct_data <- codes
    } else if (all(dict_levels %in% levels(to))) {
      # Same values in a different order (or a subset): remap each
      # dictionary position to its position in the target levels
      fct_data <- match(dict_levels, levels(to))[codes]
    } else {
      stop("Error converting to factor: some levels in data do not exist in levels")
    }
  } else {
    strings <- convert_array(array, character())

    if (auto_levels) {
      # Let factor() derive the levels from the data. The previous code passed
      # the bare symbol `levels` here, which resolves to the base function on
      # this branch and made factor() fail.
      fct_data <- factor(strings)
      levels(to) <- levels(fct_data)
    } else {
      fct_data <- factor(strings, levels = levels(to))
    }
  }

  # Restore other attributes (e.g., ordered, labels)
  attributes(fct_data) <- attributes(to)
  fct_data
}


# Signal a "can't convert" error for `array` and the target `to` by inferring
# the array's schema and delegating to stop_cant_convert_schema().
#
# array: the array whose conversion failed.
# to:    the requested target, forwarded unchanged.
# n:     forwarded as `n - 1`.
#        NOTE(review): presumably a call-stack offset that accounts for this
#        extra wrapper frame; confirm against stop_cant_convert_schema().
stop_cant_convert_array <- function(array, to, n = 0) {
stop_cant_convert_schema(infer_nanoarrow_schema(array), to, n - 1)
}
Expand Down Expand Up @@ -141,20 +230,3 @@ stop_cant_convert_schema <- function(schema, to, n = 0) {

stop(cnd)
}

# Called from C for decimal types
convert_decimal_to_double <- function(array, schema, offset, length) {
  # Convert a slice of an array to an R vector by importing it into the arrow
  # package.
  #
  # array, schema:  the array and its schema; both are exported into freshly
  #                 allocated nanoarrow objects before being handed to arrow.
  # offset, length: the slice of the imported arrow array to convert.
  #
  # Returns the result of as_vector() on the sliced arrow array.
  action <- sprintf(
    "convert %s array to object of type double",
    nanoarrow_schema_formatted(schema)
  )
  assert_arrow_installed(action)

  # Export into newly allocated objects and give those to arrow
  exported_array <- nanoarrow_allocate_array()
  exported_schema <- nanoarrow_allocate_schema()
  nanoarrow_pointer_export(array, exported_array)
  nanoarrow_pointer_export(schema, exported_schema)

  imported <- arrow::Array$import_from_c(exported_array, exported_schema)
  sliced <- imported$Slice(offset, length)
  sliced$as_vector()
}
9 changes: 8 additions & 1 deletion r/R/infer-ptype.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ infer_ptype_other <- function(schema) {
ptype <- infer_nanoarrow_ptype(schema$children[[1]])
vctrs::list_of(.ptype = ptype)
},
"dictionary" = {
# Even though R's 'factor' can handle a dictionary of strings
# (perhaps the most common case), an array arriving in chunks may have
# different dictionary arrays. Thus, the best type-stable default we can
# achieve is to expand dictionaries.
infer_nanoarrow_ptype(schema$dictionary)
},
stop_cant_infer_ptype(schema, n = -1)
)
}
Expand All @@ -108,7 +115,7 @@ stop_cant_infer_ptype <- function(schema, n = 0) {
if (is.null(schema$name) || identical(schema$name, "")) {
cnd <- simpleError(
sprintf(
"Can't infer R vector type for array <%s>",
"Can't infer R vector type for <%s>",
schema_label
),
call = sys.call(n - 1)
Expand Down
4 changes: 2 additions & 2 deletions r/man/as_nanoarrow_schema.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions r/man/convert_array.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 0 additions & 7 deletions r/src/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,13 +309,6 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) {
UNPROTECT(1);
}

// Sub-par error for dictionary types until we have a way to deal with them
if (converter->schema_view.type == NANOARROW_TYPE_DICTIONARY) {
ArrowErrorSet(&converter->error,
"Conversion to dictionary-encoded array is not supported");
return ENOTSUP;
}

SET_VECTOR_ELT(converter_shelter, 1, schema_xptr);

ArrowArrayViewReset(&converter->array_view);
Expand Down
22 changes: 14 additions & 8 deletions r/src/convert_array.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr);
// dispatch to find a convert_array() method (or error if there
// isn't one)
static SEXP call_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
SEXP fun = PROTECT(Rf_install("convert_array_from_c"));
SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
SEXP fun = PROTECT(Rf_install("convert_fallback_other"));
// offset/length don't need to be modified in this case
SEXP call = PROTECT(Rf_lang5(fun, array_xptr, R_NilValue, R_NilValue, ptype_sexp));
SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
UNPROTECT(3);
return result;
Expand Down Expand Up @@ -100,12 +101,17 @@ static SEXP convert_array_default(SEXP array_xptr, enum VectorType vector_type,
}

static SEXP convert_array_chr(SEXP array_xptr) {
SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
if (result == R_NilValue) {
call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
if (array->dictionary == NULL) {
SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
if (result == R_NilValue) {
call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
}
UNPROTECT(1);
return result;
} else {
return convert_array_default(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
}
UNPROTECT(1);
return result;
}

SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
Expand Down Expand Up @@ -210,7 +216,7 @@ SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
Rf_inherits(ptype_sexp, "Date") || Rf_inherits(ptype_sexp, "hms") ||
Rf_inherits(ptype_sexp, "POSIXct") ||
Rf_inherits(ptype_sexp, "difftime")) {
return convert_array_default(array_xptr, VECTOR_TYPE_OTHER, ptype_sexp);
return convert_array_default(array_xptr, VECTOR_TYPE_UNINITIALIZED, ptype_sexp);
} else {
return call_convert_array(array_xptr, ptype_sexp);
}
Expand Down
Loading