From fa0440f1cfafa5937cadc491e53f575a57e70cfa Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 29 May 2019 16:55:24 +0200 Subject: [PATCH 1/3] update R to changes from ARROW-3144 #4316 --- r/R/RcppExports.R | 12 +++++------ r/R/dictionary.R | 16 +++++++-------- r/src/RcppExports.cpp | 28 ++++++++++++------------- r/src/array_from_vector.cpp | 35 ++++++++++---------------------- r/src/datatype.cpp | 16 +++++++-------- r/src/message.cpp | 9 ++++++-- r/src/recordbatch.cpp | 4 +++- r/tests/testthat/test-DataType.R | 4 ++-- 8 files changed, 59 insertions(+), 65 deletions(-) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 3940cc208efda..81e1c0404448f 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -417,20 +417,20 @@ TimestampType__unit <- function(type) { .Call(`_arrow_TimestampType__unit`, type) } -DictionaryType__initialize <- function(type, array, ordered) { - .Call(`_arrow_DictionaryType__initialize`, type, array, ordered) +DictionaryType__initialize <- function(index_type, value_type, ordered) { + .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered) } DictionaryType__index_type <- function(type) { .Call(`_arrow_DictionaryType__index_type`, type) } -DictionaryType__name <- function(type) { - .Call(`_arrow_DictionaryType__name`, type) +DictionaryType__value_type <- function(type) { + .Call(`_arrow_DictionaryType__value_type`, type) } -DictionaryType__dictionary <- function(type) { - .Call(`_arrow_DictionaryType__dictionary`, type) +DictionaryType__name <- function(type) { + .Call(`_arrow_DictionaryType__name`, type) } DictionaryType__ordered <- function(type) { diff --git a/r/R/dictionary.R b/r/R/dictionary.R index 3c3758df303e8..bfe2373aefe74 100644 --- a/r/R/dictionary.R +++ b/r/R/dictionary.R @@ -34,7 +34,7 @@ active = list( index_type = function() `arrow::DataType`$dispatch(DictionaryType__index_type(self)), - dictionary = function() shared_ptr(`arrow::Array`, DictionaryType__dictionary(self)), + value_type = function() `arrow::DataType`$dispatch(DictionaryType__value_type(self)), name = function() DictionaryType__name(self), ordered = function() DictionaryType__ordered(self) ) @@ -42,17 +42,17 @@ #' dictionary type factory #' -#' @param type indices type, e.g. [int32()] -#' @param values values array, typically an arrow array of strings -#' @param ordered Is this an ordered dictionary +#' @param index_type index type, e.g. [int32()] +#' @param value_type value type, probably [utf8()] +#' @param ordered Is this an ordered dictionary ? #' #' @return a [arrow::DictionaryType][arrow__DictionaryType] #' #' @export -dictionary <- function(type, values, ordered = FALSE) { +dictionary <- function(index_type, value_type, ordered = FALSE) { assert_that( - inherits(type, "arrow::DataType"), - inherits(values, "arrow::Array") + inherits(index_type, "arrow::DataType"), + inherits(index_type, "arrow::DataType") ) - shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(type, values, ordered)) + shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(index_type, value_type, ordered)) } diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index b39e63503b781..1ac96d43a5b28 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1172,15 +1172,15 @@ BEGIN_RCPP END_RCPP } // DictionaryType__initialize -std::shared_ptr DictionaryType__initialize(const std::shared_ptr& type, const std::shared_ptr& array, bool ordered); -RcppExport SEXP _arrow_DictionaryType__initialize(SEXP typeSEXP, SEXP arraySEXP, SEXP orderedSEXP) { +std::shared_ptr DictionaryType__initialize(const std::shared_ptr& index_type, const std::shared_ptr& value_type, bool ordered); +RcppExport SEXP _arrow_DictionaryType__initialize(SEXP index_typeSEXP, SEXP value_typeSEXP, SEXP orderedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type type(typeSEXP); - Rcpp::traits::input_parameter< const std::shared_ptr& >::type array(arraySEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type index_type(index_typeSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type value_type(value_typeSEXP); Rcpp::traits::input_parameter< bool >::type ordered(orderedSEXP); - rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(type, array, ordered)); + rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(index_type, value_type, ordered)); return rcpp_result_gen; END_RCPP } @@ -1195,25 +1195,25 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// DictionaryType__name -std::string DictionaryType__name(const std::shared_ptr& type); -RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) { +// DictionaryType__value_type +std::shared_ptr DictionaryType__value_type(const std::shared_ptr& type); +RcppExport SEXP _arrow_DictionaryType__value_type(SEXP typeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::shared_ptr& >::type type(typeSEXP); - rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type)); + rcpp_result_gen = Rcpp::wrap(DictionaryType__value_type(type)); return rcpp_result_gen; END_RCPP } -// DictionaryType__dictionary -std::shared_ptr DictionaryType__dictionary(const std::shared_ptr& type); -RcppExport SEXP _arrow_DictionaryType__dictionary(SEXP typeSEXP) { +// DictionaryType__name +std::string DictionaryType__name(const std::shared_ptr& type); +RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::shared_ptr& >::type type(typeSEXP); - rcpp_result_gen = Rcpp::wrap(DictionaryType__dictionary(type)); + rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type)); return rcpp_result_gen; END_RCPP } @@ -2419,8 +2419,8 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, {"_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, {"_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, + {"_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, {"_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, - {"_arrow_DictionaryType__dictionary", (DL_FUNC) &_arrow_DictionaryType__dictionary, 1}, {"_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, {"_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, {"_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index ac72a4a821407..509a39a5e08f8 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -141,8 +141,11 @@ std::shared_ptr MakeFactorArrayImpl(Rcpp::IntegerVector_ factor, ArrayData::Make(std::make_shared(), n, std::move(buffers), null_count, 0); auto array_indices = MakeArray(array_indices_data); + SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol); + auto dict = MakeStringArray(levels); + std::shared_ptr out; - STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, &out)); + STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, dict, &out)); return out; } @@ -741,22 +744,20 @@ Status GetConverter(const std::shared_ptr& type, } template -std::shared_ptr GetFactorTypeImpl(Rcpp::IntegerVector_ factor) { - auto dict_values = MakeStringArray(Rf_getAttrib(factor, R_LevelsSymbol)); - auto dict_type = - dictionary(std::make_shared(), dict_values, Rf_inherits(factor, "ordered")); - return dict_type; +std::shared_ptr GetFactorTypeImpl(bool ordered) { + return dictionary(std::make_shared(), arrow::utf8(), ordered); } std::shared_ptr GetFactorType(SEXP factor) { SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol); + bool is_ordered = Rf_inherits(factor, "ordered"); int n = Rf_length(levels); if (n < 128) { - return GetFactorTypeImpl(factor); + return GetFactorTypeImpl(is_ordered); } else if (n < 32768) { - return GetFactorTypeImpl(factor); + return GetFactorTypeImpl(is_ordered); } else { - return GetFactorTypeImpl(factor); + return GetFactorTypeImpl(is_ordered); } } @@ -909,21 +910,7 @@ bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr& typ arrow::DictionaryType* dict_type = arrow::checked_cast(type.get()); - auto dictionary = dict_type->dictionary(); - if (dictionary->type() != utf8()) return false; - - // then compare levels - auto typed_dict = checked_cast(dictionary.get()); - SEXP levels = Rf_getAttrib(obj, R_LevelsSymbol); - - R_xlen_t n = XLENGTH(levels); - if (n != typed_dict->length()) return false; - - for (R_xlen_t i = 0; i < n; i++) { - if (typed_dict->GetString(i) != CHAR(STRING_ELT(levels, i))) return false; - } - - return true; + return dict_type->value_type() == utf8(); } std::shared_ptr Array__from_vector( diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index e392f09136b59..b0becb458b765 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -250,9 +250,9 @@ arrow::TimeUnit::type TimestampType__unit( // [[Rcpp::export]] std::shared_ptr DictionaryType__initialize( - const std::shared_ptr& type, - const std::shared_ptr& array, bool ordered) { - return arrow::dictionary(type, array, ordered); + const std::shared_ptr& index_type, + const std::shared_ptr& value_type, bool ordered) { + return arrow::dictionary(index_type, value_type, ordered); } // [[Rcpp::export]] @@ -262,14 +262,14 @@ std::shared_ptr DictionaryType__index_type( } // [[Rcpp::export]] -std::string DictionaryType__name(const std::shared_ptr& type) { - return type->name(); +std::shared_ptr DictionaryType__value_type( + const std::shared_ptr& type) { + return type->value_type(); } // [[Rcpp::export]] -std::shared_ptr DictionaryType__dictionary( - const std::shared_ptr& type) { - return type->dictionary(); +std::string DictionaryType__name(const std::shared_ptr& type) { + return type->name(); } // [[Rcpp::export]] diff --git a/r/src/message.cpp b/r/src/message.cpp index 3aec1beeef796..cb8a3c97f57b3 100644 --- a/r/src/message.cpp +++ b/r/src/message.cpp @@ -56,7 +56,10 @@ std::shared_ptr ipc___ReadRecordBatch__Message__Schema( const std::unique_ptr& message, const std::shared_ptr& schema) { std::shared_ptr batch; - STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &batch)); + + // TODO: perhaps this should come from the R side + arrow::ipc::DictionaryMemo memo; + STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &memo, &batch)); return batch; } @@ -64,7 +67,9 @@ std::shared_ptr ipc___ReadRecordBatch__Message__Schema( std::shared_ptr ipc___ReadSchema_InputStream( const std::shared_ptr& stream) { std::shared_ptr schema; - STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &schema)); + // TODO: promote to function argument + arrow::ipc::DictionaryMemo memo; + STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &memo, &schema)); return schema; } diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index f2bec018cc1ce..31fefa8de60d3 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -140,6 +140,8 @@ std::shared_ptr ipc___ReadRecordBatch__InputStream__Schema( const std::shared_ptr& stream, const std::shared_ptr& schema) { std::shared_ptr batch; - STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, stream.get(), &batch)); + // TODO: promote to function arg + arrow::ipc::DictionaryMemo memo; + STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, &memo, stream.get(), &batch)); return batch; } diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R index fc9fc896eaee8..5faf7214649fe 100644 --- a/r/tests/testthat/test-DataType.R +++ b/r/tests/testthat/test-DataType.R @@ -314,7 +314,7 @@ test_that("struct type works as expected", { }) test_that("DictionaryType works as expected (ARROW-3355)", { - d <- dictionary(int32(), array(c("foo", "bar", "baz"))) + d <- dictionary(int32(), utf8()) expect_equal(d, d) expect_true(d == d) expect_false(d == int32()) @@ -322,5 +322,5 @@ test_that("DictionaryType works as expected (ARROW-3355)", { expect_equal(d$bit_width, 32L) expect_equal(d$ToString(), "dictionary") expect_equal(d$index_type, int32()) - expect_equal(d$dictionary, array(c("foo", "bar", "baz"))) + expect_equal(d$value_type, utf8()) }) From 2556c16ffd748066807b870194c23ec0ba3b0e5a Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 31 May 2019 08:32:30 +0200 Subject: [PATCH 2/3] document() --- r/man/dictionary.Rd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd index 340283ec4dafc..9662328b11a3a 100644 --- a/r/man/dictionary.Rd +++ b/r/man/dictionary.Rd @@ -4,14 +4,14 @@ \alias{dictionary} \title{dictionary type factory} \usage{ -dictionary(type, values, ordered = FALSE) +dictionary(index_type, value_type, ordered = FALSE) } \arguments{ -\item{type}{indices type, e.g. \code{\link[=int32]{int32()}}} +\item{index_type}{index type, e.g. \code{\link[=int32]{int32()}}} -\item{values}{values array, typically an arrow array of strings} +\item{value_type}{value type, probably \code{\link[=utf8]{utf8()}}} -\item{ordered}{Is this an ordered dictionary} +\item{ordered}{Is this an ordered dictionary ?} } \value{ a \link[=arrow__DictionaryType]{arrow::DictionaryType} From b0de1a8a1922ed4552d1dc45d0550d98716e2002 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 31 May 2019 08:57:38 +0200 Subject: [PATCH 3/3] R should pass now --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 79ce8cd29ca1b..622e820840f71 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,8 +47,6 @@ before_install: - if [ $TRAVIS_OS_NAME == "linux" ]; then ccache -s; fi matrix: - allow_failures: - - language: r fast_finish: true include: - name: "Lint, Release tests"