From ec24c02c1d1f83fe5e407a61dd77d0024d5ebc77 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Mar 2024 09:17:00 -0800 Subject: [PATCH] Rewrite conversion in terms of column (#15213) It looks like soon after I started investigating scalar conversions for https://github.com/rapidsai/cudf/pull/14121 (but well before I made the PR) a major underlying hole was plugged in pyarrow via https://github.com/apache/arrow/pull/36162. Most of #14121 was created to give us a way to handle scalars from pyarrow generically in libcudf. Now that pyarrow scalars can be easily tossed into arrays, we no longer really need separate scalar functions in libcudf; we can simply create an array from the scalar, put it into a table, and then call the table function. Additionally, arrow also has a function for creating an array from a scalar. This function is not new but [was previously undocumented](https://github.com/apache/arrow/pull/40373). The builder code added to libcudf in #14121 can be removed and replaced with that factory. The scalar conversion is as simple as calling that arrow function and then using our preexisting `from_arrow` function on the resulting array. For now this PR is just a simplification of internals. Future PRs will remove the scalar API once we have a more standard path for the conversion of arrays via the C Data Interface. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15213 --- cpp/include/cudf/detail/interop.hpp | 55 ------------------------- cpp/src/interop/from_arrow.cu | 63 +---------------------------- python/cudf/cudf/_lib/scalar.pyx | 25 ++++++++---- 3 files changed, 20 insertions(+), 123 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 683b49e1813..296b68d22a9 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -105,61 +105,6 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) } } -/** - * @brief Invokes an `operator()` template with the type instantiation based on - * the specified `arrow::DataType`'s `id()`. - * - * This function is analogous to libcudf's type_dispatcher, but instead applies - * to Arrow functions. Its primary use case is to leverage Arrow's - * metaprogramming facilities like arrow::TypeTraits that require translating - * the runtime dtype information into compile-time types. - */ -template -constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, - Functor f, - Ts&&... args) -{ - switch (dtype.id()) { - case arrow::Type::INT8: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT16: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT32: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT64: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT8: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT16: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT32: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT64: - return f.template operator()(std::forward(args)...); - case arrow::Type::FLOAT: - return f.template operator()(std::forward(args)...); - case arrow::Type::DOUBLE: - return f.template operator()(std::forward(args)...); - case arrow::Type::BOOL: - return f.template operator()(std::forward(args)...); - case arrow::Type::TIMESTAMP: - return f.template operator()(std::forward(args)...); - case arrow::Type::DURATION: - return f.template operator()(std::forward(args)...); - case arrow::Type::STRING: - return f.template operator()(std::forward(args)...); - case arrow::Type::LIST: - return f.template operator()(std::forward(args)...); - case arrow::Type::DECIMAL128: - return f.template operator()(std::forward(args)...); - case arrow::Type::STRUCT: - return f.template operator()(std::forward(args)...); - default: { - CUDF_FAIL("Invalid type."); - } - } -} - // Converting arrow type to cudf type data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 7b44fb41288..2a524c773c0 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -419,52 +419,6 @@ std::unique_ptr get_column(arrow::Array const& array, : get_empty_type_column(array.length()); } -struct BuilderGenerator { - template && - !std::is_same_v)> - std::shared_ptr operator()(std::shared_ptr const& type) - { - return std::make_shared::BuilderType>( - type, arrow::default_memory_pool()); - } - - template || - std::is_same_v)> - std::shared_ptr operator()(std::shared_ptr const& type) - { - CUDF_FAIL("Type not supported by BuilderGenerator"); - } -}; - -std::shared_ptr make_builder(std::shared_ptr const& type) -{ - switch (type->id()) { - case arrow::Type::STRUCT: { - std::vector> field_builders; - - for (auto field : type->fields()) { - auto const vt = field->type(); - if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { - field_builders.push_back(make_builder(vt)); - } else { - field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); - } - } - return std::make_shared( - type, arrow::default_memory_pool(), field_builders); - } - case arrow::Type::LIST: { - return std::make_shared(arrow::default_memory_pool(), - make_builder(type->field(0)->type())); - } - default: { - return arrow_type_dispatcher(*type, BuilderGenerator{}, type); - } - } -} - } // namespace std::unique_ptr from_arrow(arrow::Table const& input_table, @@ -512,21 +466,8 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Get a builder for the scalar type - auto builder = detail::make_builder(input.type); - - auto status = builder->AppendScalar(input); - if (status != arrow::Status::OK()) { - if (status.IsNotImplemented()) { - // The only known failure case here is for nulls - CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", - std::invalid_argument); - } - CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); - } - - auto maybe_array = builder->Finish(); - if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + auto maybe_array = arrow::MakeArrayFromScalar(input, 1); + if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); } auto array = *maybe_array; auto field = arrow::field("", input.type); diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 37708a4e3ba..cd9793270e2 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -129,18 +129,29 @@ cdef class DeviceScalar: else: pa_type = pa.from_numpy_dtype(dtype) - pa_scalar = pa.scalar(value, type=pa_type) + if isinstance(pa_type, pa.ListType) and value is None: + # pyarrow doesn't correctly handle None values for list types, so + # we have to create this one manually. + # https://github.com/apache/arrow/issues/40319 + pa_array = pa.array([None], type=pa_type) + else: + pa_array = pa.array([pa.scalar(value, type=pa_type)]) + + pa_table = pa.Table.from_arrays([pa_array], names=[""]) + table = pylibcudf.Table.from_arrow(pa_table) - data_type = None + column = table.columns()[0] if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - tid = pylibcudf.TypeId.DECIMAL128 if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - tid = pylibcudf.TypeId.DECIMAL32 + column = pylibcudf.unary.cast( + column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale) + ) elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - tid = pylibcudf.TypeId.DECIMAL64 - data_type = pylibcudf.DataType(tid, -dtype.scale) + column = pylibcudf.unary.cast( + column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale) + ) - self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) + self.c_value = pylibcudf.copying.get_element(column, 0) self._dtype = dtype def _to_host_scalar(self):