diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 95c509efc5b..310bc99b279 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -365,6 +365,7 @@ add_library(
   src/interop/dlpack.cpp
   src/interop/from_arrow.cu
   src/interop/arrow_utilities.cpp
+  src/interop/decimal_conversion_utilities.cu
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
   src/interop/to_arrow_host.cu
diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu
new file mode 100644
index 00000000000..2f81c754a30
--- /dev/null
+++ b/cpp/src/interop/decimal_conversion_utilities.cu
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "decimal_conversion_utilities.cuh"
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+template <typename DecimalType>
+std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
+{
+  static_assert(std::is_same_v<DecimalType, int32_t> or std::is_same_v<DecimalType, int64_t>,
+                "Only int32 and int64 decimal types can be converted to decimal128.");
+
+  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType);
+  auto buf = std::make_unique<rmm::device_buffer>(column.size() * sizeof(__int128_t), stream, mr);
+
+  thrust::for_each(rmm::exec_policy_nosync(stream, mr),
+                   thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator(column.size()),
+                   [in  = column.begin<DecimalType>(),
+                    out = reinterpret_cast<DecimalType*>(buf->data()),
+                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
+                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+                     // the lowest order bits are the value, the remainder
+                     // simply matches the sign bit to satisfy the two's
+                     // complement integer representation of negative numbers.
+                     out[out_idx] = in[in_idx];
+#pragma unroll BIT_WIDTH_RATIO - 1
+                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+                     }
+                   });
+
+  return buf;
+}
+
+// Instantiate templates for int32_t and int64_t decimal types
+template std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128<int32_t>(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128<int64_t>(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+}  // namespace detail
+}  // namespace cudf
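Note on the conversion scheme above: each 32-bit or 64-bit decimal value is written into the low-order words of a 128-bit slot and every remaining word is filled with the sign bit, i.e. ordinary two's-complement sign extension. A minimal host-side C++ sketch of the same idea (illustrative only, not part of the patch; the helper name is made up):

#include <array>
#include <cassert>
#include <cstdint>

// Widen one 32-bit value to four little-endian 32-bit words (128 bits total):
// the value goes in the lowest word, the sign bit fills the rest.
std::array<int32_t, 4> widen_to_decimal128(int32_t v)
{
  std::array<int32_t, 4> out{};
  out[0] = v;
  for (int i = 1; i < 4; ++i) { out[i] = v < 0 ? -1 : 0; }
  return out;
}

int main()
{
  // Matches the expected buffers used by the FixedPoint32Table test further below.
  assert((widen_to_decimal128(-1000) == std::array<int32_t, 4>{{-1000, -1, -1, -1}}));
  assert((widen_to_decimal128(2400) == std::array<int32_t, 4>{{2400, 0, 0, 0}}));
  return 0;
}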
diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh
new file mode 100644
index 00000000000..41263147404
--- /dev/null
+++ b/cpp/src/interop/decimal_conversion_utilities.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf::detail {
+
+/**
+ * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device buffer
+ *
+ * @tparam DecimalType to convert from
+ *
+ * @param input A view of the input column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return A device buffer containing the converted decimal128 data
+ */
+template <typename DecimalType>
+std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128(
+  cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::detail
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 6b163e3441e..3d41f856f4f 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 #include "detail/arrow_allocator.hpp"
 
 #include
@@ -158,8 +159,11 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input,
                                               arrow::MemoryPool* ar_mr,
                                               rmm::cuda_stream_view stream)
 {
-  auto buf =
-    detail::decimals_to_arrow<DeviceType>(input, stream, rmm::mr::get_current_device_resource());
+  auto buf = detail::convert_decimals_to_decimal128<DeviceType>(
+    input, stream, rmm::mr::get_current_device_resource());
+
+  // Synchronize stream here to ensure the decimal128 buffer is ready.
+  stream.synchronize();
 
   auto const buf_size_in_bytes = buf->size();
   auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index 2eb9b912054..cea7cdebcba 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 
 #include
 #include
@@ -141,7 +142,9 @@ int construct_decimals(cudf::column_view input,
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));
 
-  auto buf = detail::decimals_to_arrow<DeviceType>(input, stream, mr);
+  auto buf = detail::convert_decimals_to_decimal128<DeviceType>(input, stream, mr);
+  // Synchronize stream here to ensure the decimal128 buffer is ready.
+  stream.synchronize();
   NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
 
   ArrowArrayMove(tmp.get(), out);
diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu
index c9e53ebaab7..193b3a3b5a2 100644
--- a/cpp/src/interop/to_arrow_host.cu
+++ b/cpp/src/interop/to_arrow_host.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 
 #include
 #include
@@ -50,41 +51,6 @@
 namespace cudf {
 namespace detail {
 
-template <typename DeviceType>
-std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
-                                                      rmm::cuda_stream_view stream,
-                                                      rmm::device_async_resource_ref mr)
-{
-  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
-  auto buf = std::make_unique<rmm::device_buffer>(input.size() * sizeof(__int128_t), stream, mr);
-
-  auto count = thrust::counting_iterator(0);
-  thrust::for_each(rmm::exec_policy(stream, mr),
-                   count,
-                   count + input.size(),
-                   [in  = input.begin<DeviceType>(),
-                    out = reinterpret_cast<DeviceType*>(buf->data()),
-                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
-                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-                     // the lowest order bits are the value, the remainder
-                     // simply matches the sign bit to satisfy the two's
-                     // complement integer representation of negative numbers.
-                     out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-                     }
-                   });
-
-  return buf;
-}
-
-template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int32_t>(
-  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
-
-template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int64_t>(
-  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
-
 namespace {
 
 struct dispatch_to_arrow_host {
@@ -156,7 +122,9 @@ struct dispatch_to_arrow_host {
     NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
     NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
 
-    auto buf = detail::decimals_to_arrow<DeviceType>(column, stream, mr);
+    auto buf = detail::convert_decimals_to_decimal128<DeviceType>(column, stream, mr);
+    // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data
+    // to host.
     NANOARROW_RETURN_NOT_OK(
       populate_data_buffer(device_span<__int128_t const>(
                              reinterpret_cast<__int128_t const*>(buf->data()), column.size()),
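The two synchronization comments above follow the usual CUDA stream-ordering rule: work enqueued on the same stream executes in order, so a same-stream copy needs no intervening synchronization, while handing the raw pointer to stream-unaware code (such as a host-side Arrow allocator) requires synchronizing first. A minimal standalone CUDA sketch of that rule (hypothetical kernel and variable names, not cudf code):

#include <cuda_runtime.h>

__global__ void produce(int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = i;
}

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int* d_data = nullptr;
  cudaMallocAsync(&d_data, 4 * sizeof(int), stream);
  produce<<<1, 4, 0, stream>>>(d_data, 4);

  // Same-stream consumer: the copy is ordered after the kernel on `stream`,
  // so nothing extra needs to be synchronized between the two enqueues.
  int host[4];
  cudaMemcpyAsync(host, d_data, sizeof(host), cudaMemcpyDeviceToHost, stream);

  // Stream-unaware consumer: before `host` (or `d_data`) is read by code that
  // knows nothing about `stream`, the stream must be synchronized so the
  // kernel and the copy have actually finished.
  cudaStreamSynchronize(stream);

  cudaFreeAsync(d_data, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}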
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 2df71b77301..36a1d8377bf 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -22,6 +22,7 @@
 #include "arrow_schema_writer.hpp"
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
+#include "interop/decimal_conversion_utilities.cuh"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/parquet/parquet_gpu.hpp"
@@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck,
   return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size;
 }
 
-/**
- * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector
- *
- * @tparam DecimalType to convert from
- *
- * @param column A view of the input columns
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return A device vector containing the converted decimal128 data
- */
-template <typename DecimalType>
-rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column,
-                                                           rmm::cuda_stream_view stream)
-{
-  size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType);
-
-  rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream);
-
-  thrust::for_each(rmm::exec_policy_nosync(stream),
-                   thrust::make_counting_iterator(0),
-                   thrust::make_counting_iterator(column.size()),
-                   [in  = column.begin<DecimalType>(),
-                    out = reinterpret_cast<DecimalType*>(d128_buffer.data()),
-                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
-                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-                     // The lowest order bits are the value, the remainder
-                     // simply matches the sign bit to satisfy the two's
-                     // complement integer representation of negative numbers.
-                     out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-                     }
-                   });
-
-  return d128_buffer;
-}
-
 /**
  * @brief Function to convert decimal32 and decimal64 columns to decimal128 data,
  * update the input table metadata, and return a new vector of column views.
  *
  * @param[in,out] table_meta The table metadata
- * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers.
+ * @param[in,out] d128_buffers Buffers containing the converted decimal128 data.
 * @param input The input table
 * @param stream CUDA stream used for device memory operations and kernel launches
 *
@@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co
  */
 std::vector<column_view> convert_decimal_columns_and_metadata(
   table_input_metadata& table_meta,
-  std::vector<rmm::device_uvector<__int128_t>>& d128_vectors,
+  std::vector<std::unique_ptr<rmm::device_buffer>>& d128_buffers,
   table_view const& table,
   rmm::cuda_stream_view stream)
 {
@@ -1673,28 +1636,30 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
     switch (column.type().id()) {
       case type_id::DECIMAL32:
         // Convert data to decimal128 type
-        d128_vectors.emplace_back(convert_data_to_decimal128<int32_t>(column, stream));
+        d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int32_t>(
+          column, stream, rmm::mr::get_current_device_resource()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
         // Create a new column view from the d128 data vector
         return {data_type{type_id::DECIMAL128, column.type().scale()},
                 column.size(),
-                d128_vectors.back().data(),
+                d128_buffers.back()->data(),
                 column.null_mask(),
                 column.null_count(),
                 column.offset(),
                 converted_children};
       case type_id::DECIMAL64:
         // Convert data to decimal128 type
-        d128_vectors.emplace_back(convert_data_to_decimal128<int64_t>(column, stream));
+        d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int64_t>(
+          column, stream, rmm::mr::get_current_device_resource()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
         // Create a new column view from the d128 data vector
         return {data_type{type_id::DECIMAL128, column.type().scale()},
                 column.size(),
-                d128_vectors.back().data(),
+                d128_buffers.back()->data(),
                 column.null_mask(),
                 column.null_count(),
                 column.offset(),
@@ -1722,6 +1687,9 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
                  std::back_inserter(converted_column_views),
                  [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); });
 
+  // Synchronize stream here to ensure all decimal128 buffers are ready.
+  stream.synchronize();
+
   return converted_column_views;
 }
 
@@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                                    rmm::cuda_stream_view stream)
 {
   // Container to store decimal128 converted data if needed
-  std::vector<rmm::device_uvector<__int128_t>> d128_vectors;
+  std::vector<std::unique_ptr<rmm::device_buffer>> d128_buffers;
 
   // Convert decimal32/decimal64 data to decimal128 if writing arrow schema
   // and initialize LinkedColVector
   auto vec = table_to_linked_columns(
     (write_arrow_schema)
-      ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)})
+      ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)})
       : input);
 
   auto schema_tree = construct_parquet_schema_tree(
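Usage note: the writer-side pattern is convert, keep the owning device buffer alive, wrap the raw data in a non-owning DECIMAL128 column_view, and synchronize the stream once before the buffers are consumed. A condensed sketch of that pattern for the DECIMAL32 case (hypothetical helper name; assumes the usual cudf/rmm headers and that the internal decimal_conversion_utilities.cuh header is reachable, as in writer_impl.cu):

#include "interop/decimal_conversion_utilities.cuh"

#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>
#include <vector>

// Hypothetical helper mirroring the writer's DECIMAL32 case above.
cudf::column_view widen_decimal32_column(
  cudf::column_view const& col,
  std::vector<std::unique_ptr<rmm::device_buffer>>& d128_buffers,
  rmm::cuda_stream_view stream)
{
  // The buffer is owned by d128_buffers; the returned view is non-owning and
  // must not outlive that vector.
  d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int32_t>(
    col, stream, rmm::mr::get_current_device_resource()));

  return {cudf::data_type{cudf::type_id::DECIMAL128, col.type().scale()},
          col.size(),
          d128_buffers.back()->data(),
          col.null_mask(),
          col.null_count(),
          col.offset()};
}

// Callers synchronize `stream` after converting all columns and before the
// buffers are read, as convert_decimal_columns_and_metadata does above.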
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 77da4039103..51216a8512c 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -710,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn)
 template <typename T>
 using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
 
+TEST_F(ToArrowDeviceTest, FixedPoint32Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) {
+    auto const expect_data =
+      std::vector<int32_t>{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1,
+                           4650,  0,  0,  0,  5154, 0, 0, 0, 6800,  0,  0,  0};
+    auto col = fp_wrapper<int32_t>({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale});
+    std::vector<std::unique_ptr<cudf::column>> table_cols;
+    table_cols.emplace_back(col.release());
+    auto input = cudf::table(std::move(table_cols));
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int32_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    auto got_arrow_schema =
+      cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
+    compare_schemas(expected_schema.get(), got_arrow_schema.get());
+
+    auto result_dev_data = std::make_unique<rmm::device_uvector<int32_t>>(
+      expect_data.size(), cudf::get_default_stream());
+    cudaMemcpy(result_dev_data->data(),
+               expect_data.data(),
+               sizeof(int32_t) * expect_data.size(),
+               cudaMemcpyHostToDevice);
+
+    cudf::get_default_stream().synchronize();
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    expected_array->children[0]->length = input.num_rows();
+    NANOARROW_THROW_NOT_OK(
+      ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc));
+    ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data =
+      const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(input.view().column(0).null_mask()));
+
+    auto data_ptr = reinterpret_cast<uint8_t*>(result_dev_data->data());
+    NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(
+      ArrowArrayBuffer(expected_array->children[0], 1),
+      ArrowBufferDeallocator(
+        [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) {
+          auto buf =
+            reinterpret_cast<std::unique_ptr<rmm::device_uvector<int32_t>>*>(alloc->private_data);
+          delete buf;
+        },
+        new std::unique_ptr<rmm::device_uvector<int32_t>>(std::move(result_dev_data)))));
+    ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
+
+    auto got_arrow_array = cudf::to_arrow_device(input.view());
+    ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
+
+    got_arrow_array = cudf::to_arrow_device(std::move(input));
+    ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
+  }
+}
+
 TEST_F(ToArrowDeviceTest, FixedPoint64Table)
 {
   using namespace numeric;