Skip to content

Commit

Permalink
Rewrite conversion in terms of column (#15213)
Browse files Browse the repository at this point in the history
It looks like soon after I started investigating scalar conversions for #14121 (but well before I made the PR) a major underlying hole was plugged in pyarrow via apache/arrow#36162. Most of #14121 was created to give us a way to handle scalars from pyarrow generically in libcudf. Now that pyarrow scalars can be easily tossed into arrays, we no longer really need separate scalar functions in libcudf; we can simply create an array from the scalar, put it into a table, and then call the table function.

Additionally, arrow also has a function for creating an array from a scalar. This function is not new but [was previously undocumented](apache/arrow#40373). The builder code added to libcudf in #14121 can be removed and replaced with that factory. The scalar conversion is as simple as calling that arrow function and then using our preexisting `from_arrow` function on the resulting array.

For now this PR is just a simplification of internals. Future PRs will remove the scalar API once we have a more standard path for the conversion of arrays via the C Data Interface.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: #15213
  • Loading branch information
vyasr authored Mar 8, 2024
1 parent 7b0eee1 commit ec24c02
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 123 deletions.
55 changes: 0 additions & 55 deletions cpp/include/cudf/detail/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,61 +105,6 @@ std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
}
}

/**
* @brief Invokes an `operator()` template with the type instantiation based on
* the specified `arrow::DataType`'s `id()`.
*
* This function is analogous to libcudf's type_dispatcher, but instead applies
* to Arrow functions. Its primary use case is to leverage Arrow's
* metaprogramming facilities like arrow::TypeTraits that require translating
* the runtime dtype information into compile-time types.
*/
template <typename Functor, typename... Ts>
constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype,
Functor f,
Ts&&... args)
{
switch (dtype.id()) {
case arrow::Type::INT8:
return f.template operator()<arrow::Int8Type>(std::forward<Ts>(args)...);
case arrow::Type::INT16:
return f.template operator()<arrow::Int16Type>(std::forward<Ts>(args)...);
case arrow::Type::INT32:
return f.template operator()<arrow::Int32Type>(std::forward<Ts>(args)...);
case arrow::Type::INT64:
return f.template operator()<arrow::Int64Type>(std::forward<Ts>(args)...);
case arrow::Type::UINT8:
return f.template operator()<arrow::UInt8Type>(std::forward<Ts>(args)...);
case arrow::Type::UINT16:
return f.template operator()<arrow::UInt16Type>(std::forward<Ts>(args)...);
case arrow::Type::UINT32:
return f.template operator()<arrow::UInt32Type>(std::forward<Ts>(args)...);
case arrow::Type::UINT64:
return f.template operator()<arrow::UInt64Type>(std::forward<Ts>(args)...);
case arrow::Type::FLOAT:
return f.template operator()<arrow::FloatType>(std::forward<Ts>(args)...);
case arrow::Type::DOUBLE:
return f.template operator()<arrow::DoubleType>(std::forward<Ts>(args)...);
case arrow::Type::BOOL:
return f.template operator()<arrow::BooleanType>(std::forward<Ts>(args)...);
case arrow::Type::TIMESTAMP:
return f.template operator()<arrow::TimestampType>(std::forward<Ts>(args)...);
case arrow::Type::DURATION:
return f.template operator()<arrow::DurationType>(std::forward<Ts>(args)...);
case arrow::Type::STRING:
return f.template operator()<arrow::StringType>(std::forward<Ts>(args)...);
case arrow::Type::LIST:
return f.template operator()<arrow::ListType>(std::forward<Ts>(args)...);
case arrow::Type::DECIMAL128:
return f.template operator()<arrow::Decimal128Type>(std::forward<Ts>(args)...);
case arrow::Type::STRUCT:
return f.template operator()<arrow::StructType>(std::forward<Ts>(args)...);
default: {
CUDF_FAIL("Invalid type.");
}
}
}

// Converting arrow type to cudf type
data_type arrow_to_cudf_type(arrow::DataType const& arrow_type);

Expand Down
63 changes: 2 additions & 61 deletions cpp/src/interop/from_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -419,52 +419,6 @@ std::unique_ptr<column> get_column(arrow::Array const& array,
: get_empty_type_column(array.length());
}

struct BuilderGenerator {
template <typename T,
CUDF_ENABLE_IF(!std::is_same_v<T, arrow::ListType> &&
!std::is_same_v<T, arrow::StructType>)>
std::shared_ptr<arrow::ArrayBuilder> operator()(std::shared_ptr<arrow::DataType> const& type)
{
return std::make_shared<typename arrow::TypeTraits<T>::BuilderType>(
type, arrow::default_memory_pool());
}

template <typename T,
CUDF_ENABLE_IF(std::is_same_v<T, arrow::ListType> ||
std::is_same_v<T, arrow::StructType>)>
std::shared_ptr<arrow::ArrayBuilder> operator()(std::shared_ptr<arrow::DataType> const& type)
{
CUDF_FAIL("Type not supported by BuilderGenerator");
}
};

std::shared_ptr<arrow::ArrayBuilder> make_builder(std::shared_ptr<arrow::DataType> const& type)
{
switch (type->id()) {
case arrow::Type::STRUCT: {
std::vector<std::shared_ptr<arrow::ArrayBuilder>> field_builders;

for (auto field : type->fields()) {
auto const vt = field->type();
if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) {
field_builders.push_back(make_builder(vt));
} else {
field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt));
}
}
return std::make_shared<arrow::StructBuilder>(
type, arrow::default_memory_pool(), field_builders);
}
case arrow::Type::LIST: {
return std::make_shared<arrow::ListBuilder>(arrow::default_memory_pool(),
make_builder(type->field(0)->type()));
}
default: {
return arrow_type_dispatcher(*type, BuilderGenerator{}, type);
}
}
}

} // namespace

std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
Expand Down Expand Up @@ -512,21 +466,8 @@ std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Get a builder for the scalar type
auto builder = detail::make_builder(input.type);

auto status = builder->AppendScalar(input);
if (status != arrow::Status::OK()) {
if (status.IsNotImplemented()) {
// The only known failure case here is for nulls
CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes",
std::invalid_argument);
}
CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed");
}

auto maybe_array = builder->Finish();
if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); }
auto maybe_array = arrow::MakeArrayFromScalar(input, 1);
if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); }
auto array = *maybe_array;

auto field = arrow::field("", input.type);
Expand Down
25 changes: 18 additions & 7 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,29 @@ cdef class DeviceScalar:
else:
pa_type = pa.from_numpy_dtype(dtype)

pa_scalar = pa.scalar(value, type=pa_type)
if isinstance(pa_type, pa.ListType) and value is None:
# pyarrow doesn't correctly handle None values for list types, so
# we have to create this one manually.
# https://github.com/apache/arrow/issues/40319
pa_array = pa.array([None], type=pa_type)
else:
pa_array = pa.array([pa.scalar(value, type=pa_type)])

pa_table = pa.Table.from_arrays([pa_array], names=[""])
table = pylibcudf.Table.from_arrow(pa_table)

data_type = None
column = table.columns()[0]
if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
tid = pylibcudf.TypeId.DECIMAL128
if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
tid = pylibcudf.TypeId.DECIMAL32
column = pylibcudf.unary.cast(
column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale)
)
elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
tid = pylibcudf.TypeId.DECIMAL64
data_type = pylibcudf.DataType(tid, -dtype.scale)
column = pylibcudf.unary.cast(
column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale)
)

self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type)
self.c_value = pylibcudf.copying.get_element(column, 0)
self._dtype = dtype

def _to_host_scalar(self):
Expand Down

0 comments on commit ec24c02

Please sign in to comment.