From c7f35e8f13ae5e3404bc883418358ff4fd3e4e62 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 16 Mar 2021 10:30:07 -0400 Subject: [PATCH 01/40] Reuse initializer list in test --- cpp/tests/interop/from_arrow_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9f5bbe2dcb9..89c4e87a05c 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -86,7 +86,7 @@ TEST_F(FromArrowTest, DateTimeTable) std::shared_ptr arr; arrow::TimestampBuilder timestamp_builder(timestamp(arrow::TimeUnit::type::MILLI), arrow::default_memory_pool()); - timestamp_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}); + timestamp_builder.AppendValues(std::vector(data)); CUDF_EXPECTS(timestamp_builder.Finish(&arr).ok(), "Failed to build array"); std::vector> schema_vector({arrow::field("a", arr->type())}); From 09c70547f4f9048507ac1c4086aedbe7343111be Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 16 Mar 2021 11:04:06 -0400 Subject: [PATCH 02/40] Add fixed point from arrow test --- cpp/tests/interop/from_arrow_test.cpp | 35 +++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 89c4e87a05c..b12c534e98b 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -76,17 +76,17 @@ TEST_F(FromArrowTest, EmptyTable) TEST_F(FromArrowTest, DateTimeTable) { - auto data = {1, 2, 3, 4, 5, 6}; + auto data = std::vector{1, 2, 3, 4, 5, 6}; - auto col = - cudf::test::fixed_width_column_wrapper(data); + auto col = cudf::test::fixed_width_column_wrapper( + data.begin(), data.end()); cudf::table_view expected_table_view({col}); std::shared_ptr arr; - arrow::TimestampBuilder timestamp_builder(timestamp(arrow::TimeUnit::type::MILLI), + arrow::TimestampBuilder timestamp_builder(arrow::timestamp(arrow::TimeUnit::type::MILLI), arrow::default_memory_pool()); - timestamp_builder.AppendValues(std::vector(data)); + timestamp_builder.AppendValues(data); CUDF_EXPECTS(timestamp_builder.Finish(&arr).ok(), "Failed to build array"); std::vector> schema_vector({arrow::field("a", arr->type())}); @@ -350,6 +350,31 @@ TEST_P(FromArrowTestSlice, SliceTest) } } +TEST_F(FromArrowTest, FixedPointTable) +{ + auto data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + + auto col = + cudf::test::fixed_point_column_wrapper({1, 2, 3, 4, 5, 6}, numeric::scale_type{0}); + + cudf::table_view expected_table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + sizeof(int64_t) * data.size()); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + std::vector> schema_vector({arrow::field("a", arr->type())}); + auto schema = std::make_shared(schema_vector); + + auto arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); +} + INSTANTIATE_TEST_CASE_P(FromArrowTest, FromArrowTestSlice, ::testing::Values(std::make_tuple(0, 10000), From 18f77e7b56d8f5140795545457ab27cf38401439 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 16 Mar 2021 11:26:40 -0400 Subject: [PATCH 03/40] Support DECIMAL conversion + cleanup --- cpp/src/interop/from_arrow.cpp | 8 +++++-- cpp/tests/interop/from_arrow_test.cpp | 30 ++++++++++++++------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 729b98d85a8..64cc584763b 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -54,7 +54,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::DOUBLE: return data_type(type_id::FLOAT64); case arrow::Type::DATE32: return data_type(type_id::TIMESTAMP_DAYS); case arrow::Type::TIMESTAMP: { - arrow::TimestampType const* type = static_cast(&arrow_type); + auto type = static_cast(&arrow_type); switch (type->unit()) { case arrow::TimeUnit::type::SECOND: return data_type(type_id::TIMESTAMP_SECONDS); case arrow::TimeUnit::type::MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); @@ -64,7 +64,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) } } case arrow::Type::DURATION: { - arrow::DurationType const* type = static_cast(&arrow_type); + auto type = static_cast(&arrow_type); switch (type->unit()) { case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS); case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS); @@ -76,6 +76,10 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); + case arrow::Type::DECIMAL: { // DECIMAL128 ??? + auto type = static_cast(&arrow_type); + return data_type{type_id::DECIMAL64, type->scale()}; + } case arrow::Type::STRUCT: return data_type(type_id::STRUCT); default: CUDF_FAIL("Unsupported type_id conversion to cudf"); } diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index b12c534e98b..3e0d764d959 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -337,10 +337,10 @@ TEST_P(FromArrowTestSlice, SliceTest) auto start = std::get<0>(GetParam()); auto end = std::get<1>(GetParam()); - auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; - cudf::table expected_cudf_table{sliced_cudf_table}; - auto sliced_arrow_table = arrow_table->Slice(start, end - start); - auto got_cudf_table = cudf::from_arrow(*sliced_arrow_table); + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto expected_cudf_table = cudf::table{sliced_cudf_table}; + auto sliced_arrow_table = arrow_table->Slice(start, end - start); + auto got_cudf_table = cudf::from_arrow(*sliced_arrow_table); // This has been added to take-care of empty string column issue with no children if (got_cudf_table->num_rows() == 0 and expected_cudf_table.num_rows() == 0) { @@ -350,14 +350,16 @@ TEST_P(FromArrowTestSlice, SliceTest) } } +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + TEST_F(FromArrowTest, FixedPointTable) { - auto data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + using namespace numeric; - auto col = - cudf::test::fixed_point_column_wrapper({1, 2, 3, 4, 5, 6}, numeric::scale_type{0}); - - cudf::table_view expected_table_view({col}); + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{0}); + auto const expected = cudf::table_view({col}); std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); @@ -365,14 +367,14 @@ TEST_F(FromArrowTest, FixedPointTable) sizeof(int64_t) * data.size()); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - std::vector> schema_vector({arrow::field("a", arr->type())}); - auto schema = std::make_shared(schema_vector); - - auto arrow_table = arrow::Table::Make(schema, {arr}); + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); auto got_cudf_table = cudf::from_arrow(*arrow_table); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); } INSTANTIATE_TEST_CASE_P(FromArrowTest, From d460ab1f2aa79f63a9f5e7772794a0f59e3887cf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 16 Mar 2021 11:50:30 -0400 Subject: [PATCH 04/40] Fix test --- cpp/tests/interop/from_arrow_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 3e0d764d959..67ae6cefcf1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -363,8 +363,7 @@ TEST_F(FromArrowTest, FixedPointTable) std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), - sizeof(int64_t) * data.size()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); auto const field = arrow::field("a", arr->type()); From c4085addabd5e125a325111cebe4cbe1b5f3b87e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 17 Mar 2021 15:58:05 -0400 Subject: [PATCH 05/40] SFINAE + temporary/initial changes --- cpp/src/interop/from_arrow.cpp | 56 +++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 64cc584763b..a7604bee9cc 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,8 @@ #include +#include + namespace cudf { namespace detail { @@ -115,7 +118,7 @@ struct dispatch_to_cudf_column { return mask; } - template + template ()>* = nullptr> std::unique_ptr operator()(arrow::Array const& array, data_type type, bool skip_mask, @@ -150,6 +153,57 @@ struct dispatch_to_cudf_column { return col; } + + template ()>* = nullptr> + std::unique_ptr operator()(arrow::Array const& array, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using DeviceType = device_storage_type_t; + + auto data_buffer = array.data()->buffers[1]; + size_type const num_rows = array.length(); + auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; + auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_column_view = col->mutable_view(); + + // CUDA_TRY(cudaMemcpyAsync( + // mutable_column_view.data(), + // reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), + // sizeof(T) * num_rows, + // cudaMemcpyDefault, + // stream.value())); + + auto temp = reinterpret_cast(data_buffer->address()) + + array.offset() * sizeof(DeviceType); + auto data_64_ptr = reinterpret_cast(temp); + auto gather_map = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + + thrust::gather(gather_map, // + gather_map + num_rows / 2, + data_64_ptr, + mutable_column_view.begin()); + + if (has_nulls) { + auto tmp_mask = get_mask_buffer(array, stream, mr); + + // If array is sliced, we have to copy whole mask and then take copy. + auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) + ? *tmp_mask + : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + array.offset(), + array.offset() + num_rows, + stream, + mr); + + col->set_null_mask(std::move(out_mask)); + } + + return col; + } }; std::unique_ptr get_empty_type_column(size_type size) From 31b4a562fcc68e556ec2caa0dcb7662319b7b98d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 16:37:32 -0400 Subject: [PATCH 06/40] Progress --- cpp/CMakeLists.txt | 2 +- .../interop/{from_arrow.cpp => from_arrow.cu} | 59 ++++++++----------- 2 files changed, 27 insertions(+), 34 deletions(-) rename cpp/src/interop/{from_arrow.cpp => from_arrow.cu} (91%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2a51ad5e55a..c4e5fe2cca3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -201,7 +201,7 @@ add_library(cudf src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/interop/dlpack.cpp - src/interop/from_arrow.cpp + src/interop/from_arrow.cu src/interop/to_arrow.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cu similarity index 91% rename from cpp/src/interop/from_arrow.cpp rename to cpp/src/interop/from_arrow.cu index a7604bee9cc..02a9e119d07 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cu @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include @@ -154,6 +154,10 @@ struct dispatch_to_cudf_column { return col; } + struct every_other { + __device__ size_type operator()(size_type i) { return 2 * i; } + }; + template ()>* = nullptr> std::unique_ptr operator()(arrow::Array const& array, data_type type, @@ -165,44 +169,33 @@ struct dispatch_to_cudf_column { auto data_buffer = array.data()->buffers[1]; size_type const num_rows = array.length(); - auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - // CUDA_TRY(cudaMemcpyAsync( - // mutable_column_view.data(), - // reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), - // sizeof(T) * num_rows, - // cudaMemcpyDefault, - // stream.value())); + // auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; - auto temp = reinterpret_cast(data_buffer->address()) + - array.offset() * sizeof(DeviceType); - auto data_64_ptr = reinterpret_cast(temp); - auto gather_map = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + rmm::device_uvector buf(num_rows * 2, stream); + rmm::device_uvector out_buf(num_rows, stream); - thrust::gather(gather_map, // - gather_map + num_rows / 2, - data_64_ptr, - mutable_column_view.begin()); + std::cout << "before memcpy" << std::endl; + CUDA_TRY(cudaMemcpy(reinterpret_cast(buf.data()), + reinterpret_cast(data_buffer->address()) + + array.offset() * sizeof(DeviceType), + sizeof(DeviceType) * num_rows * 2, + cudaMemcpyDefault)); + std::cout << "after memcpy" << std::endl; - if (has_nulls) { - auto tmp_mask = get_mask_buffer(array, stream, mr); + auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); - // If array is sliced, we have to copy whole mask and then take copy. - auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) - ? *tmp_mask - : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); + std::cout << "after making gathermap" << std::endl; - col->set_null_mask(std::move(out_mask)); - } - - return col; + thrust::gather(rmm::exec_policy(stream), + gather_map, // + gather_map + num_rows, + buf.data(), + out_buf.data()); + std::cout << "made gathermap" << std::endl; + auto result = make_fixed_point_column(type, num_rows, out_buf.release()); + std::cout << "constructed ressullt " << std::endl; + return std::move(result); } }; From 64a15c95a3e15bce50181922b3ee4f5edf135e19 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 17 Mar 2021 17:06:07 -0400 Subject: [PATCH 07/40] Changes --- cpp/src/interop/from_arrow.cu | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 02a9e119d07..7216cfa0de9 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -167,35 +167,32 @@ struct dispatch_to_cudf_column { { using DeviceType = device_storage_type_t; - auto data_buffer = array.data()->buffers[1]; - size_type const num_rows = array.length(); + auto data_buffer = array.data()->buffers[1]; + auto const num_rows = static_cast(array.length()); + // TODO clean up this function (remove magic constants) + // TODO add back null logic // auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; rmm::device_uvector buf(num_rows * 2, stream); - rmm::device_uvector out_buf(num_rows, stream); + rmm::device_uvector out_buf(num_rows, stream, mr); - std::cout << "before memcpy" << std::endl; - CUDA_TRY(cudaMemcpy(reinterpret_cast(buf.data()), - reinterpret_cast(data_buffer->address()) + - array.offset() * sizeof(DeviceType), - sizeof(DeviceType) * num_rows * 2, - cudaMemcpyDefault)); - std::cout << "after memcpy" << std::endl; + CUDA_TRY(cudaMemcpyAsync(reinterpret_cast(buf.data()), + reinterpret_cast(data_buffer->address()) + + array.offset() * sizeof(DeviceType), + sizeof(DeviceType) * num_rows * 2, + cudaMemcpyDefault, + stream.value())); auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); - std::cout << "after making gathermap" << std::endl; - thrust::gather(rmm::exec_policy(stream), gather_map, // gather_map + num_rows, buf.data(), out_buf.data()); - std::cout << "made gathermap" << std::endl; - auto result = make_fixed_point_column(type, num_rows, out_buf.release()); - std::cout << "constructed ressullt " << std::endl; - return std::move(result); + + return std::make_unique(type, num_rows, out_buf.release()); } }; From ce6e756182413ccf6cc93ebc43f6ae75c9629ff2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 14:36:49 -0400 Subject: [PATCH 08/40] Add failing null test --- cpp/tests/interop/from_arrow_test.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 67ae6cefcf1..0003031bd60 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -376,6 +376,33 @@ TEST_F(FromArrowTest, FixedPointTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); } +TEST_F(FromArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{0}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); +} + INSTANTIATE_TEST_CASE_P(FromArrowTest, FromArrowTestSlice, ::testing::Values(std::make_tuple(0, 10000), From 9db4246cad96fb6bebfb5723c35d293d624104f1 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Mar 2021 14:47:26 -0400 Subject: [PATCH 09/40] Add null logic --- cpp/src/interop/from_arrow.cu | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 7216cfa0de9..a985c8835ca 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -192,7 +192,25 @@ struct dispatch_to_cudf_column { buf.data(), out_buf.data()); - return std::make_unique(type, num_rows, out_buf.release()); + auto null_mask = [&] { + // When C++17, use if statement with initialization + auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; + if (has_nulls) { + auto tmp_mask = get_mask_buffer(array, stream, mr); + + // If array is sliced, we have to copy whole mask and then take copy. + return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) + ? *tmp_mask + : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + array.offset(), + array.offset() + num_rows, + stream, + mr); + } + return rmm::device_buffer{}; + }(); + + return std::make_unique(type, num_rows, out_buf.release(), null_mask); } }; From 54d7164dfa630dc1a4fbacc4932bb9b1955afeb9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 14:50:52 -0400 Subject: [PATCH 10/40] Remove TODO --- cpp/src/interop/from_arrow.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index a985c8835ca..0bc99290930 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -171,7 +171,6 @@ struct dispatch_to_cudf_column { auto const num_rows = static_cast(array.length()); // TODO clean up this function (remove magic constants) - // TODO add back null logic // auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; rmm::device_uvector buf(num_rows * 2, stream); From 8dd82be0aad95e2c94146c5215939fa905e96f8c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 15:00:45 -0400 Subject: [PATCH 11/40] Add more scales to test --- cpp/tests/interop/from_arrow_test.cpp | 62 ++++++++++++++------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 0003031bd60..1ba8f9fd801 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -357,50 +357,54 @@ TEST_F(FromArrowTest, FixedPointTable) { using namespace numeric; - auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; - auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{0}); - auto const expected = cudf::table_view({col}); + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{i}); + auto const expected = cudf::table_view({col}); - std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); - CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - auto const field = arrow::field("a", arr->type()); - auto const schema_vector = std::vector>({field}); - auto const schema = std::make_shared(schema_vector); - auto const arrow_table = arrow::Table::Make(schema, {arr}); + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = cudf::from_arrow(*arrow_table); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } } TEST_F(FromArrowTest, FixedPointTableNulls) { using namespace numeric; - auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; - auto const col = - fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{0}); - auto const expected = cudf::table_view({col}); + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); + auto const expected = cudf::table_view({col}); - std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(1, 0), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); - decimal_builder.AppendNull(); - decimal_builder.AppendNull(); + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); - CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - auto const field = arrow::field("a", arr->type()); - auto const schema_vector = std::vector>({field}); - auto const schema = std::make_shared(schema_vector); - auto const arrow_table = arrow::Table::Make(schema, {arr}); + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); - auto got_cudf_table = cudf::from_arrow(*arrow_table); + auto got_cudf_table = cudf::from_arrow(*arrow_table); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } } INSTANTIATE_TEST_CASE_P(FromArrowTest, From 65e6b07051303272482e23f91c853a178d09e9dc Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Mar 2021 15:13:17 -0400 Subject: [PATCH 12/40] Add Large Test (no nulls) --- cpp/tests/interop/from_arrow_test.cpp | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 1ba8f9fd801..bcd510b6f65 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -32,6 +32,8 @@ #include #include +#include + #include std::unique_ptr get_cudf_table() @@ -378,6 +380,34 @@ TEST_F(FromArrowTest, FixedPointTable) } } +TEST_F(FromArrowTest, FixedPointTableLarge) +{ + using namespace numeric; + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto every_other = [](auto i) { return i % 2 ? 0 : i / 2; }; + auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); + auto const data = std::vector(transform, transform + 2000); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + 1000, scale_type{i}); // TODO + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + TEST_F(FromArrowTest, FixedPointTableNulls) { using namespace numeric; From 736fd153a539646e121c3a50868afc7aa242c534 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Mar 2021 15:50:28 -0400 Subject: [PATCH 13/40] Add Large Unit Test with Nulls + Cleanup --- cpp/src/interop/from_arrow.cu | 1 - cpp/tests/interop/from_arrow_test.cpp | 40 +++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 0bc99290930..795e6a6e416 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -171,7 +171,6 @@ struct dispatch_to_cudf_column { auto const num_rows = static_cast(array.length()); // TODO clean up this function (remove magic constants) - // auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; rmm::device_uvector buf(num_rows * 2, stream); rmm::device_uvector out_buf(num_rows, stream, mr); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index bcd510b6f65..e4d286b7fc2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -384,17 +384,18 @@ TEST_F(FromArrowTest, FixedPointTableLarge) { using namespace numeric; + int64_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto every_other = [](auto i) { return i % 2 ? 0 : i / 2; }; auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); - auto const data = std::vector(transform, transform + 2000); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * 2); auto iota = thrust::make_counting_iterator(1); - auto const col = fp_wrapper(iota, iota + 1000, scale_type{i}); // TODO + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); auto const expected = cudf::table_view({col}); std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + decimal_builder.AppendValues(reinterpret_cast(data.data()), NUM_ELEMENTS); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); auto const field = arrow::field("a", arr->type()); @@ -437,6 +438,39 @@ TEST_F(FromArrowTest, FixedPointTableNulls) } } +TEST_F(FromArrowTest, FixedPointTableNullsLarge) +{ + using namespace numeric; + + int64_t constexpr NUM_ELEMENTS = 1000; + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto every_other = [](auto i) { return i % 2 ? 0 : i / 2; }; + auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * 2); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, transform, scale_type{i}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + for (int64_t i = 0; i < NUM_ELEMENTS / 2; ++i) { + decimal_builder.Append(reinterpret_cast(data.data() + 4 * i)); + decimal_builder.AppendNull(); + } + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + INSTANTIATE_TEST_CASE_P(FromArrowTest, FromArrowTestSlice, ::testing::Values(std::make_tuple(0, 10000), From 54acb0ce1d7eb8b3708ef395030d305965b144fd Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 22 Mar 2021 23:43:18 -0400 Subject: [PATCH 14/40] Remove header --- cpp/src/interop/from_arrow.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 795e6a6e416..0ec0870edb1 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -31,7 +31,6 @@ #include #include #include -#include #include From 31358a27cbdbb5fad17be28781cafd75e4c83416 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 23 Mar 2021 15:49:56 -0400 Subject: [PATCH 15/40] Cleanup --- cpp/src/interop/from_arrow.cu | 16 +++++--------- cpp/tests/interop/from_arrow_test.cpp | 32 ++++++++++++++++----------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 0ec0870edb1..683b87f7612 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -166,18 +166,17 @@ struct dispatch_to_cudf_column { { using DeviceType = device_storage_type_t; - auto data_buffer = array.data()->buffers[1]; - auto const num_rows = static_cast(array.length()); + size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto data_buffer = array.data()->buffers[1]; + auto const num_rows = static_cast(array.length()); - // TODO clean up this function (remove magic constants) - - rmm::device_uvector buf(num_rows * 2, stream); + rmm::device_uvector buf(num_rows * BIT_WIDTH_RATIO, stream); rmm::device_uvector out_buf(num_rows, stream, mr); CUDA_TRY(cudaMemcpyAsync(reinterpret_cast(buf.data()), reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(DeviceType), - sizeof(DeviceType) * num_rows * 2, + sizeof(DeviceType) * num_rows * BIT_WIDTH_RATIO, cudaMemcpyDefault, stream.value())); @@ -190,11 +189,8 @@ struct dispatch_to_cudf_column { out_buf.data()); auto null_mask = [&] { - // When C++17, use if statement with initialization - auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; - if (has_nulls) { + if (not skip_mask and array.null_bitmap_data()) { auto tmp_mask = get_mask_buffer(array, stream, mr); - // If array is sliced, we have to copy whole mask and then take copy. return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) ? *tmp_mask diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index e4d286b7fc2..ec906a187fb 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -358,6 +358,7 @@ using fp_wrapper = cudf::test::fixed_point_column_wrapper; TEST_F(FromArrowTest, FixedPointTable) { using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; @@ -366,7 +367,8 @@ TEST_F(FromArrowTest, FixedPointTable) std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); auto const field = arrow::field("a", arr->type()); @@ -383,14 +385,15 @@ TEST_F(FromArrowTest, FixedPointTable) TEST_F(FromArrowTest, FixedPointTableLarge) { using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t int64_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - auto every_other = [](auto i) { return i % 2 ? 0 : i / 2; }; - auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); - auto const data = std::vector(transform, transform + NUM_ELEMENTS * 2); - auto iota = thrust::make_counting_iterator(1); - auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); + auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; + auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); auto const expected = cudf::table_view({col}); std::shared_ptr arr; @@ -412,6 +415,7 @@ TEST_F(FromArrowTest, FixedPointTableLarge) TEST_F(FromArrowTest, FixedPointTableNulls) { using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; @@ -421,7 +425,8 @@ TEST_F(FromArrowTest, FixedPointTableNulls) std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / 2); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); decimal_builder.AppendNull(); decimal_builder.AppendNull(); @@ -441,19 +446,20 @@ TEST_F(FromArrowTest, FixedPointTableNulls) TEST_F(FromArrowTest, FixedPointTableNullsLarge) { using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t int64_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - auto every_other = [](auto i) { return i % 2 ? 0 : i / 2; }; - auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); - auto const data = std::vector(transform, transform + NUM_ELEMENTS * 2); - auto iota = thrust::make_counting_iterator(1); - auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, transform, scale_type{i}); + auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; + auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, transform, scale_type{i}); auto const expected = cudf::table_view({col}); std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); - for (int64_t i = 0; i < NUM_ELEMENTS / 2; ++i) { + for (int64_t i = 0; i < NUM_ELEMENTS / BIT_WIDTH_RATIO; ++i) { decimal_builder.Append(reinterpret_cast(data.data() + 4 * i)); decimal_builder.AppendNull(); } From 8eaaed48debe9dce8d992335f4ce8447f785e938 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 15:59:29 -0400 Subject: [PATCH 16/40] Add failing to_arrow test --- cpp/tests/interop/to_arrow_test.cpp | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index c8e56711135..3c355f578e4 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -353,6 +353,36 @@ TEST_F(ToArrowTest, StructColumn) ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); } +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowTest, FixedPointTable) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), + expect_data.size() / BIT_WIDTH_RATIO); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + struct ToArrowTestSlice : public ToArrowTest, public ::testing::WithParamInterface> { From dd25a2e13822cc366e5cb78eb2dd0c07e9c38cc5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 16:00:44 -0400 Subject: [PATCH 17/40] Rename to_arrow.cpp -> to_arrow.cu --- cpp/CMakeLists.txt | 2 +- cpp/src/interop/{to_arrow.cpp => to_arrow.cu} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/interop/{to_arrow.cpp => to_arrow.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec9abf280b7..17a983017a7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -208,7 +208,7 @@ add_library(cudf src/hash/hashing.cu src/interop/dlpack.cpp src/interop/from_arrow.cu - src/interop/to_arrow.cpp + src/interop/to_arrow.cu src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cu similarity index 100% rename from cpp/src/interop/to_arrow.cpp rename to cpp/src/interop/to_arrow.cu From c4321070d9a0e20c7b836983f3766d19138c10b0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 16:20:43 -0400 Subject: [PATCH 18/40] Half baked to_arrow --- cpp/src/interop/to_arrow.cu | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 7daffc1a3c3..83cbb57e8ff 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -128,6 +128,42 @@ struct dispatch_to_arrow { } }; +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id id, + column_metadata const& metadata, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + using DeviceType = device_storage_type_t; + size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); + + thrust::gather(rmm::exec_policy(stream), // scatter values from input to buf + gather_map, // + gather_map + num_rows, + buf.data(), + out_buf.data()); + + auto result = arrow::AllocateBuffer(static_cast(buf->size()), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); + + std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + + CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + bitmask.first->data(), + bitmask.first->size(), + cudaMemcpyDeviceToHost, + stream.value())); + return to_arrow_array(id, + static_cast(input.size()), + data_buffer, + fetch_mask_buffer(input, ar_mr, stream), + static_cast(input.null_count())); +} + template <> std::shared_ptr dispatch_to_arrow::operator()(column_view input, cudf::type_id id, From e3f704e275c4de90ff3a43fbc48880d0c3cb1c9e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 23 Mar 2021 16:49:33 -0400 Subject: [PATCH 19/40] Initial to_arrow decimal64 work --- cpp/include/cudf/detail/interop.hpp | 2 ++ cpp/src/interop/to_arrow.cu | 31 ++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index cdc221dcdbe..5709df644d6 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -89,6 +89,8 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) case type_id::DURATION_NANOSECONDS: return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), std::forward(args)...); + case type_id::DECIMAL64: + return std::make_shared(std::forward(args)...); default: CUDF_FAIL("Unsupported type_id conversion to arrow"); } } diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 83cbb57e8ff..9860d4265f2 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include +#include + namespace cudf { namespace detail { namespace { @@ -128,6 +131,10 @@ struct dispatch_to_arrow { } }; +struct every_other { + __device__ size_type operator()(size_type i) { return 2 * i; } +}; + template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, @@ -136,27 +143,27 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - using DeviceType = device_storage_type_t; + using DeviceType = int64_t; size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - thrust::gather(rmm::exec_policy(stream), // scatter values from input to buf - gather_map, // - gather_map + num_rows, - buf.data(), - out_buf.data()); + auto scatter_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); - auto result = arrow::AllocateBuffer(static_cast(buf->size()), ar_mr); + thrust::scatter(rmm::exec_policy(stream), + input.begin(), + input.end(), + scatter_map, + buf.data()); + + auto result = arrow::AllocateBuffer(static_cast(buf.size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); std::shared_ptr data_buffer = std::move(result.ValueOrDie()); - CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - bitmask.first->data(), - bitmask.first->size(), - cudaMemcpyDeviceToHost, - stream.value())); + CUDA_TRY(cudaMemcpyAsync( + data_buffer->mutable_data(), buf.data(), buf.size(), cudaMemcpyDeviceToHost, stream.value())); + return to_arrow_array(id, static_cast(input.size()), data_buffer, From 3b5f0bee4f3411812c0f1c3fc577a1cc451fae6c Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Fri, 26 Mar 2021 10:46:10 -0400 Subject: [PATCH 20/40] Changes --- cpp/src/interop/to_arrow.cu | 11 ++++++----- cpp/tests/interop/arrow_utils.hpp | 22 +++++++++++++++++++++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 9860d4265f2..d8535345d94 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -164,11 +164,12 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf.size(), cudaMemcpyDeviceToHost, stream.value())); - return to_arrow_array(id, - static_cast(input.size()), - data_buffer, - fetch_mask_buffer(input, ar_mr, stream), - static_cast(input.null_count())); + auto type = std::make_shared(arrow::Type::DECIMAL); + auto mask = fetch_mask_buffer(input, ar_mr, stream); + auto buffers = std::vector>{mask, data_buffer}; + auto data = std::make_shared(type, buf.size(), buffers); + + return to_arrow_array(id, data); } template <> diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 935ef760e61..92089bef173 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -34,7 +34,8 @@ #pragma once template -std::enable_if_t() and !std::is_same::value, +std::enable_if_t() and not std::is_same::value and + not cudf::is_fixed_point(), std::shared_ptr> get_arrow_array(std::vector const& data, std::vector const& mask = {}) { @@ -49,6 +50,25 @@ get_arrow_array(std::vector const& data, std::vector const& mask = { return cudf::detail::to_arrow_array(cudf::type_to_id(), data.size(), data_buffer, mask_buffer); } +template +std::enable_if_t(), std::shared_ptr> get_arrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + std::shared_ptr data_buffer; + arrow::BufferBuilder buff_builder; + buff_builder.Append(data.data(), sizeof(T) * data.size()); + CUDF_EXPECTS(buff_builder.Finish(&data_buffer).ok(), "Failed to allocate buffer"); + + std::shared_ptr mask_buffer = + mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie(); + + auto type = std::make_shared(arrow::Type::DECIMAL); + auto buffers = std::vector>{mask_buffer, data_buffer}; + auto array_data = std::make_shared(type, data.size(), buffers); + + return cudf::detail::to_arrow_array(cudf::type_to_id(), array_data); +} + template std::enable_if_t() and !std::is_same::value, std::shared_ptr> From 04ed34be486b752aa1d372d044610dc7a0fee1f3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 11:31:52 -0400 Subject: [PATCH 21/40] Get things building again --- cpp/include/cudf/detail/interop.hpp | 2 -- cpp/src/interop/to_arrow.cu | 4 +-- cpp/tests/interop/arrow_utils.hpp | 40 +++++++++++++++-------------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 5709df644d6..cdc221dcdbe 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -89,8 +89,6 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) case type_id::DURATION_NANOSECONDS: return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), std::forward(args)...); - case type_id::DECIMAL64: - return std::make_shared(std::forward(args)...); default: CUDF_FAIL("Unsupported type_id conversion to arrow"); } } diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index d8535345d94..0103a158e03 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -164,12 +164,12 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf.size(), cudaMemcpyDeviceToHost, stream.value())); - auto type = std::make_shared(arrow::Type::DECIMAL); + auto type = arrow::decimal(18, input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, data_buffer}; auto data = std::make_shared(type, buf.size(), buffers); - return to_arrow_array(id, data); + return std::make_shared(data); } template <> diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 92089bef173..08ad74d3d85 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -50,27 +50,29 @@ get_arrow_array(std::vector const& data, std::vector const& mask = { return cudf::detail::to_arrow_array(cudf::type_to_id(), data.size(), data_buffer, mask_buffer); } -template -std::enable_if_t(), std::shared_ptr> get_arrow_array( - std::vector const& data, std::vector const& mask = {}) -{ - std::shared_ptr data_buffer; - arrow::BufferBuilder buff_builder; - buff_builder.Append(data.data(), sizeof(T) * data.size()); - CUDF_EXPECTS(buff_builder.Finish(&data_buffer).ok(), "Failed to allocate buffer"); - - std::shared_ptr mask_buffer = - mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie(); - - auto type = std::make_shared(arrow::Type::DECIMAL); - auto buffers = std::vector>{mask_buffer, data_buffer}; - auto array_data = std::make_shared(type, data.size(), buffers); - - return cudf::detail::to_arrow_array(cudf::type_to_id(), array_data); -} +// template +// std::shared_ptr get_arrow_array_fp( +// std::vector const& data, int32_t scale, +// std::vector const& mask = {}) +// { +// std::shared_ptr data_buffer; +// arrow::BufferBuilder buff_builder; +// buff_builder.Append(data.data(), sizeof(T) * data.size()); +// CUDF_EXPECTS(buff_builder.Finish(&data_buffer).ok(), "Failed to allocate buffer"); + +// std::shared_ptr mask_buffer = +// mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie(); + +// auto type = arrow::decimal(18, scale); +// auto buffers = std::vector>{mask_buffer, data_buffer}; +// auto array_data = std::make_shared(type, data.size(), buffers); + +// return cudf::detail::to_arrow_array(cudf::type_to_id(), array_data); +// } template -std::enable_if_t() and !std::is_same::value, +std::enable_if_t() and !std::is_same::value and + !cudf::is_fixed_point(), std::shared_ptr> get_arrow_array(std::initializer_list elements, std::initializer_list validity = {}) { From f29ff3aeb11b3fee60b8c7f0ea8463021622f6e9 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Fri, 26 Mar 2021 11:42:19 -0400 Subject: [PATCH 22/40] Changes --- cpp/src/interop/to_arrow.cu | 1 + cpp/tests/interop/arrow_utils.hpp | 26 ++------------------------ cpp/tests/interop/to_arrow_test.cpp | 2 +- 3 files changed, 4 insertions(+), 25 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 0103a158e03..b463542b3a9 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -147,6 +147,7 @@ std::shared_ptr dispatch_to_arrow::operator()( size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), buf.begin(), buf.end(), DeviceType{0}); auto scatter_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 08ad74d3d85..935ef760e61 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -34,8 +34,7 @@ #pragma once template -std::enable_if_t() and not std::is_same::value and - not cudf::is_fixed_point(), +std::enable_if_t() and !std::is_same::value, std::shared_ptr> get_arrow_array(std::vector const& data, std::vector const& mask = {}) { @@ -50,29 +49,8 @@ get_arrow_array(std::vector const& data, std::vector const& mask = { return cudf::detail::to_arrow_array(cudf::type_to_id(), data.size(), data_buffer, mask_buffer); } -// template -// std::shared_ptr get_arrow_array_fp( -// std::vector const& data, int32_t scale, -// std::vector const& mask = {}) -// { -// std::shared_ptr data_buffer; -// arrow::BufferBuilder buff_builder; -// buff_builder.Append(data.data(), sizeof(T) * data.size()); -// CUDF_EXPECTS(buff_builder.Finish(&data_buffer).ok(), "Failed to allocate buffer"); - -// std::shared_ptr mask_buffer = -// mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie(); - -// auto type = arrow::decimal(18, scale); -// auto buffers = std::vector>{mask_buffer, data_buffer}; -// auto array_data = std::make_shared(type, data.size(), buffers); - -// return cudf::detail::to_arrow_array(cudf::type_to_id(), array_data); -// } - template -std::enable_if_t() and !std::is_same::value and - !cudf::is_fixed_point(), +std::enable_if_t() and !std::is_same::value, std::shared_ptr> get_arrow_array(std::initializer_list elements, std::initializer_list validity = {}) { diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 3c355f578e4..ac2e7d2fe2c 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -367,7 +367,7 @@ TEST_F(ToArrowTest, FixedPointTable) auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), expect_data.size() / BIT_WIDTH_RATIO); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); From 1b81752b6098f46b6196252fc25d134b4f13b483 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 29 Mar 2021 10:27:26 -0400 Subject: [PATCH 23/40] to_arrow changes --- cpp/src/interop/to_arrow.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index b463542b3a9..66e52dca640 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -157,18 +157,21 @@ std::shared_ptr dispatch_to_arrow::operator()( scatter_map, buf.data()); - auto result = arrow::AllocateBuffer(static_cast(buf.size()), ar_mr); + auto result = arrow::AllocateBuffer(buf.size(), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); std::shared_ptr data_buffer = std::move(result.ValueOrDie()); - CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf.size(), cudaMemcpyDeviceToHost, stream.value())); + CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + buf.data(), + buf.size() * BIT_WIDTH_RATIO * sizeof(int64_t), + cudaMemcpyDeviceToHost, + stream.value())); auto type = arrow::decimal(18, input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, data_buffer}; - auto data = std::make_shared(type, buf.size(), buffers); + auto data = std::make_shared(type, buf.size() / BIT_WIDTH_RATIO, buffers); return std::make_shared(data); } From 5e4cb0366daa52fc4c9ab99a9fa8389663c6622d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 29 Mar 2021 10:43:42 -0400 Subject: [PATCH 24/40] Add failing null test --- cpp/tests/interop/to_arrow_test.cpp | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index ac2e7d2fe2c..0845f90fe10 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -383,6 +383,39 @@ TEST_F(ToArrowTest, FixedPointTable) } } +TEST_F(ToArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, {1, 0, 1, 0, 1, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + for (int64_t i = 0; i < 6; ++i) { + decimal_builder.Append( + reinterpret_cast(expect_data.data() + BIT_WIDTH_RATIO * i)); + decimal_builder.AppendNull(); + } + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + std::cout << got_arrow_table->ToString() << std::endl; + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + struct ToArrowTestSlice : public ToArrowTest, public ::testing::WithParamInterface> { From d64092f412c7083d2d1fd70075e3302b414ad3c0 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Mon, 29 Mar 2021 11:51:26 -0400 Subject: [PATCH 25/40] Unit tests --- cpp/tests/interop/to_arrow_test.cpp | 83 ++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 0845f90fe10..02c1bded684 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -383,21 +383,89 @@ TEST_F(ToArrowTest, FixedPointTable) } } -TEST_F(ToArrowTest, FixedPointTableNulls) +TEST_F(ToArrowTest, FixedPointTableLarge) { using namespace numeric; cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + int32_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, {1, 0, 1, 0, 1, 0}, scale_type{i}); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); auto const input = cudf::table_view({col}); - auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto every_other = [](auto i) { return i % 2 == 0 ? i / 2 : 0; }; + auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); + auto const expect_data = + std::vector{transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), + expect_data.size() / BIT_WIDTH_RATIO); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + // std::cout << got_arrow_table->ToString() << '\n'; + // std::cout << expected_arrow_table->ToString() << '\n'; + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableNullsSimple) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + // for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + for (auto const i : {0}) { + auto const col = fp_wrapper( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = + std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); - for (int64_t i = 0; i < 6; ++i) { - decimal_builder.Append( - reinterpret_cast(expect_data.data() + BIT_WIDTH_RATIO * i)); + for (int64_t i = 0; i < input.column(0).size() / BIT_WIDTH_RATIO; ++i) { + decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); decimal_builder.AppendNull(); } @@ -410,7 +478,8 @@ TEST_F(ToArrowTest, FixedPointTableNulls) auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - std::cout << got_arrow_table->ToString() << std::endl; + // std::cout << got_arrow_table->ToString() << std::endl; + // std::cout << expected_arrow_table->ToString() << std::endl; ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); } From 34b286dd23622b9681db56e31be984617fc27407 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 09:21:45 -0400 Subject: [PATCH 26/40] Fix FixPointTableLarge --- cpp/tests/interop/to_arrow_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 02c1bded684..b1126f8de38 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -400,8 +400,11 @@ TEST_F(ToArrowTest, FixedPointTableLarge) std::vector{transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO}; std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), - expect_data.size() / BIT_WIDTH_RATIO); + + // Note: For some reason, decimal_builder.AppendValues with NUM_ELEMENTS >= 1000 doesn't work + for (int i = 0; i < NUM_ELEMENTS; ++i) + decimal_builder.Append(reinterpret_cast(expect_data.data() + 2 * i)); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); auto const field = arrow::field("a", arr->type()); @@ -411,9 +414,6 @@ TEST_F(ToArrowTest, FixedPointTableLarge) auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - // std::cout << got_arrow_table->ToString() << '\n'; - // std::cout << expected_arrow_table->ToString() << '\n'; - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); } } From 4bd322d80b53d0baf37bdfe34e4c89491bd74d6b Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 09:22:41 -0400 Subject: [PATCH 27/40] Commenting out tests --- cpp/tests/interop/to_arrow_test.cpp | 134 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index b1126f8de38..0e47648aef7 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -418,72 +418,74 @@ TEST_F(ToArrowTest, FixedPointTableLarge) } } -TEST_F(ToArrowTest, FixedPointTableNullsSimple) -{ - using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - - for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; - auto const col = - fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); - auto const input = cudf::table_view({col}); - - std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); - decimal_builder.AppendValues(reinterpret_cast(data.data()), - data.size() / BIT_WIDTH_RATIO); - decimal_builder.AppendNull(); - decimal_builder.AppendNull(); - - CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - - auto const field = arrow::field("a", arr->type()); - auto const schema_vector = std::vector>({field}); - auto const schema = std::make_shared(schema_vector); - auto const arrow_table = arrow::Table::Make(schema, {arr}); - - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); - } -} - -TEST_F(ToArrowTest, FixedPointTableNulls) -{ - using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - - // for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - for (auto const i : {0}) { - auto const col = fp_wrapper( - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); - auto const input = cudf::table_view({col}); - - auto const expect_data = - std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; - std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); - for (int64_t i = 0; i < input.column(0).size() / BIT_WIDTH_RATIO; ++i) { - decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); - decimal_builder.AppendNull(); - } - - CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - - auto const field = arrow::field("a", arr->type()); - auto const schema_vector = std::vector>({field}); - auto const schema = std::make_shared(schema_vector); - auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - - auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - - // std::cout << got_arrow_table->ToString() << std::endl; - // std::cout << expected_arrow_table->ToString() << std::endl; - - ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); - } -} +// TEST_F(ToArrowTest, FixedPointTableNullsSimple) +// { +// using namespace numeric; +// cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + +// for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { +// auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; +// auto const col = +// fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); +// auto const input = cudf::table_view({col}); + +// std::shared_ptr arr; +// arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), +// arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), +// data.size() / BIT_WIDTH_RATIO); +// decimal_builder.AppendNull(); +// decimal_builder.AppendNull(); + +// CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + +// auto const field = arrow::field("a", arr->type()); +// auto const schema_vector = std::vector>({field}); +// auto const schema = std::make_shared(schema_vector); +// auto const arrow_table = arrow::Table::Make(schema, {arr}); + +// auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + +// ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); +// } +// } + +// TEST_F(ToArrowTest, FixedPointTableNulls) +// { +// using namespace numeric; +// cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + +// // for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { +// for (auto const i : {0}) { +// auto const col = fp_wrapper( +// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); +// auto const input = cudf::table_view({col}); + +// auto const expect_data = +// std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; +// std::shared_ptr arr; +// arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), +// arrow::default_memory_pool()); for (int64_t i = 0; i < input.column(0).size() / +// BIT_WIDTH_RATIO; ++i) { +// decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); +// decimal_builder.AppendNull(); +// } + +// CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + +// auto const field = arrow::field("a", arr->type()); +// auto const schema_vector = std::vector>({field}); +// auto const schema = std::make_shared(schema_vector); +// auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + +// auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + +// // std::cout << got_arrow_table->ToString() << std::endl; +// // std::cout << expected_arrow_table->ToString() << std::endl; + +// ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); +// } +// } struct ToArrowTestSlice : public ToArrowTest, From ab3c3d17c26d44dab664eb529a7840a9871bd3f9 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 12:41:03 -0400 Subject: [PATCH 28/40] Fix for failing to_arrow tests :D :D --- cpp/src/interop/from_arrow.cu | 2 +- cpp/src/interop/to_arrow.cu | 6 +- cpp/tests/interop/to_arrow_test.cpp | 130 +++++++++++++--------------- 3 files changed, 66 insertions(+), 72 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 683b87f7612..e56cdf73eef 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -176,7 +176,7 @@ struct dispatch_to_cudf_column { CUDA_TRY(cudaMemcpyAsync(reinterpret_cast(buf.data()), reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(DeviceType), - sizeof(DeviceType) * num_rows * BIT_WIDTH_RATIO, + buf.size() * sizeof(DeviceType), cudaMemcpyDefault, stream.value())); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 66e52dca640..147b1002133 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -157,21 +157,21 @@ std::shared_ptr dispatch_to_arrow::operator()( scatter_map, buf.data()); - auto result = arrow::AllocateBuffer(buf.size(), ar_mr); + auto result = arrow::AllocateBuffer(buf.size() * sizeof(DeviceType), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); std::shared_ptr data_buffer = std::move(result.ValueOrDie()); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), buf.data(), - buf.size() * BIT_WIDTH_RATIO * sizeof(int64_t), + buf.size() * sizeof(DeviceType), cudaMemcpyDeviceToHost, stream.value())); auto type = arrow::decimal(18, input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, data_buffer}; - auto data = std::make_shared(type, buf.size() / BIT_WIDTH_RATIO, buffers); + auto data = std::make_shared(type, input.size(), buffers); return std::make_shared(data); } diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 0e47648aef7..6b0b264ed59 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -418,74 +418,68 @@ TEST_F(ToArrowTest, FixedPointTableLarge) } } -// TEST_F(ToArrowTest, FixedPointTableNullsSimple) -// { -// using namespace numeric; -// cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - -// for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { -// auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; -// auto const col = -// fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); -// auto const input = cudf::table_view({col}); - -// std::shared_ptr arr; -// arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), -// arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), -// data.size() / BIT_WIDTH_RATIO); -// decimal_builder.AppendNull(); -// decimal_builder.AppendNull(); - -// CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - -// auto const field = arrow::field("a", arr->type()); -// auto const schema_vector = std::vector>({field}); -// auto const schema = std::make_shared(schema_vector); -// auto const arrow_table = arrow::Table::Make(schema, {arr}); - -// auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - -// ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); -// } -// } - -// TEST_F(ToArrowTest, FixedPointTableNulls) -// { -// using namespace numeric; -// cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - -// // for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { -// for (auto const i : {0}) { -// auto const col = fp_wrapper( -// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); -// auto const input = cudf::table_view({col}); - -// auto const expect_data = -// std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; -// std::shared_ptr arr; -// arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), -// arrow::default_memory_pool()); for (int64_t i = 0; i < input.column(0).size() / -// BIT_WIDTH_RATIO; ++i) { -// decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); -// decimal_builder.AppendNull(); -// } - -// CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); - -// auto const field = arrow::field("a", arr->type()); -// auto const schema_vector = std::vector>({field}); -// auto const schema = std::make_shared(schema_vector); -// auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - -// auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); - -// // std::cout << got_arrow_table->ToString() << std::endl; -// // std::cout << expected_arrow_table->ToString() << std::endl; - -// ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); -// } -// } +TEST_F(ToArrowTest, FixedPointTableNullsSimple) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = + std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + for (int64_t i = 0; i < input.column(0).size() / BIT_WIDTH_RATIO; ++i) { + decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); + decimal_builder.AppendNull(); + } + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} struct ToArrowTestSlice : public ToArrowTest, From 84b0f96a41b4e203523c9d0166cabeac75ae3ed2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 30 Mar 2021 12:55:49 -0400 Subject: [PATCH 29/40] Remove DecimalColumn.to/from_arrow. Patch precision of decimal result in ColumnBase.to_arrow --- python/cudf/cudf/core/column/column.py | 10 ++++- python/cudf/cudf/core/column/decimal.py | 58 +++---------------------- 2 files changed, 15 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..d07764becac 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -357,7 +357,7 @@ def to_arrow(self) -> pa.Array: pa.null(), len(self), [pa.py_buffer((b""))] ) - return libcudf.interop.to_arrow( + result = libcudf.interop.to_arrow( libcudf.table.Table( cudf.core.column_accessor.ColumnAccessor({"None": self}) ), @@ -365,6 +365,14 @@ def to_arrow(self) -> pa.Array: keep_index=False, )["None"].chunk(0) + if isinstance(self.dtype, cudf.Decimal64Dtype): + result = result.cast( + pa.decimal128( + scale=self.dtype.scale, precision=self.dtype.precision + ) + ) + return result + @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: """ diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7fbe602f07a..16fb9b25cbc 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,66 +1,20 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -import cudf -import cupy as cp -import numpy as np -import pyarrow as pa -from pandas.api.types import is_integer_dtype from typing import cast -from cudf import _lib as libcudf -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype -from cudf.utils.utils import pa_mask_buffer_to_mask +from pandas.api.types import is_integer_dtype -from cudf._typing import Dtype +import cudf +from cudf import _lib as libcudf from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.core.column import as_column +from cudf._typing import Dtype +from cudf.core.column import ColumnBase, as_column +from cudf.core.dtypes import Decimal64Dtype class DecimalColumn(ColumnBase): - @classmethod - def from_arrow(cls, data: pa.Array): - dtype = Decimal64Dtype.from_arrow(data.type) - mask_buf = data.buffers()[0] - mask = ( - mask_buf - if mask_buf is None - else pa_mask_buffer_to_mask(mask_buf, len(data)) - ) - data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) - data_64 = data_128[::2].copy() - return cls( - data=Buffer(data_64.view("uint8")), - size=len(data), - dtype=dtype, - mask=mask, - ) - - def to_arrow(self): - data_buf_64 = self.base_data.to_host_array().view("int64") - data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") - # use striding to set the first 64 bits of each 128-bit chunk: - data_buf_128[::2] = data_buf_64 - # use striding again to set the remaining bits of each 128-bit chunk: - # 0 for non-negative values, -1 for negative values: - data_buf_128[1::2] = np.piecewise( - data_buf_64, [data_buf_64 < 0], [-1, 0] - ) - data_buf = pa.py_buffer(data_buf_128) - mask_buf = ( - self.base_mask - if self.base_mask is None - else pa.py_buffer(self.base_mask.to_host_array()) - ) - return pa.Array.from_buffers( - type=self.dtype.to_arrow(), - length=self.size, - buffers=[mask_buf, data_buf], - ) - def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self From 161ec7f893bc1f671ab6772ba3d11b4f78ab0c0e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 14:04:40 -0400 Subject: [PATCH 30/40] Switch from SFINAE to template specialization --- cpp/src/interop/from_arrow.cu | 109 +++++++++++++++++----------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index b599fe6348e..e1529324f0e 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -117,7 +117,7 @@ struct dispatch_to_cudf_column { return mask; } - template ()>* = nullptr> + template std::unique_ptr operator()(arrow::Array const& array, data_type type, bool skip_mask, @@ -152,59 +152,6 @@ struct dispatch_to_cudf_column { return col; } - - struct every_other { - __device__ size_type operator()(size_type i) { return 2 * i; } - }; - - template ()>* = nullptr> - std::unique_ptr operator()(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - using DeviceType = device_storage_type_t; - - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - auto data_buffer = array.data()->buffers[1]; - auto const num_rows = static_cast(array.length()); - - rmm::device_uvector buf(num_rows * BIT_WIDTH_RATIO, stream); - rmm::device_uvector out_buf(num_rows, stream, mr); - - CUDA_TRY(cudaMemcpyAsync(reinterpret_cast(buf.data()), - reinterpret_cast(data_buffer->address()) + - array.offset() * sizeof(DeviceType), - buf.size() * sizeof(DeviceType), - cudaMemcpyDefault, - stream.value())); - - auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); - - thrust::gather(rmm::exec_policy(stream), - gather_map, // - gather_map + num_rows, - buf.data(), - out_buf.data()); - - auto null_mask = [&] { - if (not skip_mask and array.null_bitmap_data()) { - auto tmp_mask = get_mask_buffer(array, stream, mr); - // If array is sliced, we have to copy whole mask and then take copy. - return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? *tmp_mask - : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); - } - return rmm::device_buffer{}; - }(); - - return std::make_unique(type, num_rows, out_buf.release(), null_mask); - } }; std::unique_ptr get_empty_type_column(size_type size) @@ -223,6 +170,60 @@ std::unique_ptr get_column(arrow::Array const& array, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +struct every_other { + __device__ size_type operator()(size_type i) { return 2 * i; } +}; + +template <> +std::unique_ptr dispatch_to_cudf_column::operator()( + arrow::Array const& array, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using DeviceType = int64_t; + + size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto data_buffer = array.data()->buffers[1]; + auto const num_rows = static_cast(array.length()); + + rmm::device_uvector buf(num_rows * BIT_WIDTH_RATIO, stream); + rmm::device_uvector out_buf(num_rows, stream, mr); + + CUDA_TRY(cudaMemcpyAsync( + reinterpret_cast(buf.data()), + reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(DeviceType), + buf.size() * sizeof(DeviceType), + cudaMemcpyDefault, + stream.value())); + + auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); + + thrust::gather(rmm::exec_policy(stream), + gather_map, // + gather_map + num_rows, + buf.data(), + out_buf.data()); + + auto null_mask = [&] { + if (not skip_mask and array.null_bitmap_data()) { + auto tmp_mask = get_mask_buffer(array, stream, mr); + // If array is sliced, we have to copy whole mask and then take copy. + return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) + ? *tmp_mask + : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + array.offset(), + array.offset() + num_rows, + stream, + mr); + } + return rmm::device_buffer{}; + }(); + + return std::make_unique(type, num_rows, out_buf.release(), null_mask); +} + template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, From b68b703f7d1179f97b57c18aa9759ec4ce1384d8 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 15:03:15 -0400 Subject: [PATCH 31/40] Fix scale --- cpp/src/interop/to_arrow.cu | 2 +- cpp/tests/interop/to_arrow_test.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 147b1002133..ad86d8686f0 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -168,7 +168,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudaMemcpyDeviceToHost, stream.value())); - auto type = arrow::decimal(18, input.type().scale()); + auto type = arrow::decimal(18, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, data_buffer}; auto data = std::make_shared(type, input.size(), buffers); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 09a931d0913..a1f9ab5d13c 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -367,7 +367,7 @@ TEST_F(ToArrowTest, FixedPointTable) auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), expect_data.size() / BIT_WIDTH_RATIO); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); @@ -399,7 +399,7 @@ TEST_F(ToArrowTest, FixedPointTableLarge) auto const expect_data = std::vector{transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO}; std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); // Note: For some reason, decimal_builder.AppendValues with NUM_ELEMENTS >= 1000 doesn't work for (int i = 0; i < NUM_ELEMENTS; ++i) @@ -430,7 +430,7 @@ TEST_F(ToArrowTest, FixedPointTableNullsSimple) auto const input = cudf::table_view({col}); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / BIT_WIDTH_RATIO); decimal_builder.AppendNull(); @@ -462,7 +462,7 @@ TEST_F(ToArrowTest, FixedPointTableNulls) auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); for (int64_t i = 0; i < input.column(0).size() / BIT_WIDTH_RATIO; ++i) { decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); decimal_builder.AppendNull(); From 89a4c0cd5cf17aa093d73c5e91106e00093d6c91 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 18:58:21 -0400 Subject: [PATCH 32/40] Bye bye thrust::scatter, hello thrust::for_each --- cpp/src/interop/to_arrow.cu | 21 ++++++++++----------- cpp/tests/interop/to_arrow_test.cpp | 4 ++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index ad86d8686f0..2690be2dffd 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -31,7 +31,8 @@ #include #include -#include +#include +#include namespace cudf { namespace detail { @@ -131,10 +132,6 @@ struct dispatch_to_arrow { } }; -struct every_other { - __device__ size_type operator()(size_type i) { return 2 * i; } -}; - template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, @@ -149,13 +146,15 @@ std::shared_ptr dispatch_to_arrow::operator()( rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); thrust::uninitialized_fill(rmm::exec_policy(stream), buf.begin(), buf.end(), DeviceType{0}); - auto scatter_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); + auto count = thrust::make_counting_iterator(0); - thrust::scatter(rmm::exec_policy(stream), - input.begin(), - input.end(), - scatter_map, - buf.data()); + thrust::for_each(count, + count + input.size(), + [in = input.begin(), out = buf.data()] __device__(auto in_idx) { + auto const out_idx = in_idx * 2; + out[out_idx] = in[in_idx]; + out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; + }); auto result = arrow::AllocateBuffer(buf.size() * sizeof(DeviceType), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index a1f9ab5d13c..415cb65b14e 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -362,10 +362,10 @@ TEST_F(ToArrowTest, FixedPointTable) cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { - auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{i}); + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{i}); auto const input = cudf::table_view({col}); - auto const expect_data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const expect_data = std::vector{-1, -1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; std::shared_ptr arr; arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), From 398b4621353aeb862c1d43cce15fa15522d03378 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 30 Mar 2021 19:01:03 -0400 Subject: [PATCH 33/40] Patch precision in from_arrow --- python/cudf/cudf/core/column/column.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bddff4bd56c..93d69871116 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -435,10 +435,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) - return libcudf.interop.from_arrow(data, data.column_names)._data[ + result = libcudf.interop.from_arrow(data, data.column_names)._data[ "None" ] + if isinstance(result.dtype, cudf.Decimal64Dtype): + result.dtype.precision = array.type.precision + return result + def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( self.base_mask, self.offset, self.offset + len(self) From 4e937fb04b731c21a7b64ae27ecc5698a71e846f Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 19:03:15 -0400 Subject: [PATCH 34/40] Fix from_arrow scale & tests --- cpp/src/interop/from_arrow.cu | 4 ++-- cpp/tests/interop/from_arrow_test.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index e1529324f0e..60e67dbf4f3 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -78,9 +78,9 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); - case arrow::Type::DECIMAL: { // DECIMAL128 ??? + case arrow::Type::DECIMAL: { auto type = static_cast(&arrow_type); - return data_type{type_id::DECIMAL64, type->scale()}; + return data_type{type_id::DECIMAL64, -type->scale()}; } case arrow::Type::STRUCT: return data_type(type_id::STRUCT); default: CUDF_FAIL("Unsupported type_id conversion to cudf"); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index bbcd0be4219..15a35b95cd1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -366,7 +366,7 @@ TEST_F(FromArrowTest, FixedPointTable) auto const expected = cudf::table_view({col}); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / BIT_WIDTH_RATIO); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); @@ -397,7 +397,7 @@ TEST_F(FromArrowTest, FixedPointTableLarge) auto const expected = cudf::table_view({col}); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), NUM_ELEMENTS); CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); @@ -424,7 +424,7 @@ TEST_F(FromArrowTest, FixedPointTableNulls) auto const expected = cudf::table_view({col}); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); decimal_builder.AppendValues(reinterpret_cast(data.data()), data.size() / BIT_WIDTH_RATIO); decimal_builder.AppendNull(); @@ -458,7 +458,7 @@ TEST_F(FromArrowTest, FixedPointTableNullsLarge) auto const expected = cudf::table_view({col}); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(10, i), arrow::default_memory_pool()); + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); for (int64_t i = 0; i < NUM_ELEMENTS / BIT_WIDTH_RATIO; ++i) { decimal_builder.Append(reinterpret_cast(data.data() + 4 * i)); decimal_builder.AppendNull(); From 296c5df1ec55d985c6cbd229d33cc89a6963c4e8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 30 Mar 2021 19:36:08 -0400 Subject: [PATCH 35/40] xfail pytests with explanation --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/tests/test_decimal.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 93d69871116..77bf3c0b8ce 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -368,7 +368,7 @@ def to_arrow(self) -> pa.Array: if isinstance(self.dtype, cudf.Decimal64Dtype): result = result.cast( pa.decimal128( - scale=self.dtype.scale, precision=self.dtype.precision + scale=result.type.scale, precision=self.dtype.precision ) ) return result diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index ddf56828c3d..bfad7621d9b 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -5,15 +5,11 @@ import numpy as np import pyarrow as pa import pytest -import cudf -from cudf.core.dtypes import Decimal64Dtype +import cudf from cudf.core.column import DecimalColumn, NumericalColumn - -from cudf.tests.utils import ( - FLOAT_TYPES, - assert_eq, -) +from cudf.core.dtypes import Decimal64Dtype +from cudf.tests.utils import FLOAT_TYPES, assert_eq @pytest.mark.parametrize( @@ -24,7 +20,14 @@ [1], [-1], [1, 2, 3, 4], - [42, 1729, 4104], + pytest.param( + [42, 1729, 4104], + marks=pytest.mark.xfail( + reason="pyarrow bug that sometimes " + "doesn't allow casting to a higher precision " + "then back." + ), + ), [1, 2, None, 4], [None, None, None], [], From b66e83f8116093837d720d810dd4fdaec4931530 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 30 Mar 2021 19:42:24 -0400 Subject: [PATCH 36/40] C++ cleanup --- cpp/src/interop/from_arrow.cu | 15 ++++++--------- cpp/src/interop/to_arrow.cu | 6 +++--- cpp/tests/interop/from_arrow_test.cpp | 12 ++++++------ cpp/tests/interop/to_arrow_test.cpp | 10 +++++----- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 60e67dbf4f3..6723775f6c0 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -79,7 +79,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); case arrow::Type::DECIMAL: { - auto type = static_cast(&arrow_type); + auto const type = static_cast(&arrow_type); return data_type{type_id::DECIMAL64, -type->scale()}; } case arrow::Type::STRUCT: return data_type(type_id::STRUCT); @@ -170,10 +170,6 @@ std::unique_ptr get_column(arrow::Array const& array, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -struct every_other { - __device__ size_type operator()(size_type i) { return 2 * i; } -}; - template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, @@ -198,7 +194,8 @@ std::unique_ptr dispatch_to_cudf_column::operator()( cudaMemcpyDefault, stream.value())); - auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other{}); + auto every_other = [] __device__(size_type i) { return 2 * i; }; + auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other); thrust::gather(rmm::exec_policy(stream), gather_map, // @@ -208,11 +205,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto null_mask = [&] { if (not skip_mask and array.null_bitmap_data()) { - auto tmp_mask = get_mask_buffer(array, stream, mr); + auto temp_mask = get_mask_buffer(array, stream, mr); // If array is sliced, we have to copy whole mask and then take copy. return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? *tmp_mask - : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + ? *temp_mask + : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), array.offset(), array.offset() + num_rows, stream, diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2690be2dffd..2a369126eeb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -144,7 +144,6 @@ std::shared_ptr dispatch_to_arrow::operator()( size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), buf.begin(), buf.end(), DeviceType{0}); auto count = thrust::make_counting_iterator(0); @@ -156,14 +155,15 @@ std::shared_ptr dispatch_to_arrow::operator()( out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; }); - auto result = arrow::AllocateBuffer(buf.size() * sizeof(DeviceType), ar_mr); + auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + auto result = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); std::shared_ptr data_buffer = std::move(result.ValueOrDie()); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), buf.data(), - buf.size() * sizeof(DeviceType), + buf_size_in_bytes, cudaMemcpyDeviceToHost, stream.value())); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 15a35b95cd1..ae8808ba59d 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -358,7 +358,7 @@ using fp_wrapper = cudf::test::fixed_point_column_wrapper; TEST_F(FromArrowTest, FixedPointTable) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; @@ -385,9 +385,9 @@ TEST_F(FromArrowTest, FixedPointTable) TEST_F(FromArrowTest, FixedPointTableLarge) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; - int64_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); @@ -415,7 +415,7 @@ TEST_F(FromArrowTest, FixedPointTableLarge) TEST_F(FromArrowTest, FixedPointTableNulls) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; @@ -446,9 +446,9 @@ TEST_F(FromArrowTest, FixedPointTableNulls) TEST_F(FromArrowTest, FixedPointTableNullsLarge) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; - int64_t constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 415cb65b14e..00d625175d0 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -359,7 +359,7 @@ using fp_wrapper = cudf::test::fixed_point_column_wrapper; TEST_F(ToArrowTest, FixedPointTable) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{i}); @@ -386,8 +386,8 @@ TEST_F(ToArrowTest, FixedPointTable) TEST_F(ToArrowTest, FixedPointTableLarge) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - int32_t constexpr NUM_ELEMENTS = 1000; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto iota = thrust::make_counting_iterator(1); @@ -421,7 +421,7 @@ TEST_F(ToArrowTest, FixedPointTableLarge) TEST_F(ToArrowTest, FixedPointTableNullsSimple) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; @@ -452,7 +452,7 @@ TEST_F(ToArrowTest, FixedPointTableNullsSimple) TEST_F(ToArrowTest, FixedPointTableNulls) { using namespace numeric; - cudf::size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { auto const col = fp_wrapper( From 20c5f96bf5e8b8a4273aa0d0a529a1904c006d13 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 31 Mar 2021 10:29:09 -0400 Subject: [PATCH 37/40] Change cast->view and remove xfail --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/tests/test_decimal.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 77bf3c0b8ce..a975674cd09 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -366,7 +366,7 @@ def to_arrow(self) -> pa.Array: )["None"].chunk(0) if isinstance(self.dtype, cudf.Decimal64Dtype): - result = result.cast( + result = result.view( pa.decimal128( scale=result.type.scale, precision=self.dtype.precision ) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index bfad7621d9b..5d50dae7957 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -20,14 +20,7 @@ [1], [-1], [1, 2, 3, 4], - pytest.param( - [42, 1729, 4104], - marks=pytest.mark.xfail( - reason="pyarrow bug that sometimes " - "doesn't allow casting to a higher precision " - "then back." - ), - ), + [42, 1729, 4104], [1, 2, None, 4], [None, None, None], [], From b6c1f350738f426bd341061af0209cb3392c50a1 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 31 Mar 2021 17:46:13 -0400 Subject: [PATCH 38/40] Addressing PR comments --- cpp/src/interop/from_arrow.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 1b2cac8b07c..898bae38aa2 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -190,9 +190,9 @@ std::unique_ptr dispatch_to_cudf_column::operator()( { using DeviceType = int64_t; - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - auto data_buffer = array.data()->buffers[1]; - auto const num_rows = static_cast(array.length()); + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto data_buffer = array.data()->buffers[1]; + auto const num_rows = static_cast(array.length()); rmm::device_uvector buf(num_rows * BIT_WIDTH_RATIO, stream); rmm::device_uvector out_buf(num_rows, stream, mr); @@ -228,7 +228,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( return rmm::device_buffer{}; }(); - return std::make_unique(type, num_rows, out_buf.release(), null_mask); + return std::make_unique(type, num_rows, out_buf.release(), std::move(null_mask)); } template <> From 7503e6c0cb585a7e579f027ca55081416baf7a22 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 31 Mar 2021 17:49:05 -0400 Subject: [PATCH 39/40] Address PR comments --- cpp/src/interop/from_arrow.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 898bae38aa2..bd249ec2f5b 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -218,7 +218,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto temp_mask = get_mask_buffer(array, stream, mr); // If array is sliced, we have to copy whole mask and then take copy. return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? *temp_mask + ? std::move(*temp_mask.release()) : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), array.offset(), array.offset() + num_rows, From 133529cb69592c72deb336bf3104641a52a5682b Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 6 Apr 2021 11:23:21 -0400 Subject: [PATCH 40/40] Addressing PR comments --- cpp/src/interop/from_arrow.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index bd249ec2f5b..ee02fadc017 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -207,18 +207,15 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto every_other = [] __device__(size_type i) { return 2 * i; }; auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other); - thrust::gather(rmm::exec_policy(stream), - gather_map, // - gather_map + num_rows, - buf.data(), - out_buf.data()); + thrust::gather( + rmm::exec_policy(stream), gather_map, gather_map + num_rows, buf.data(), out_buf.data()); auto null_mask = [&] { if (not skip_mask and array.null_bitmap_data()) { auto temp_mask = get_mask_buffer(array, stream, mr); // If array is sliced, we have to copy whole mask and then take copy. return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? std::move(*temp_mask.release()) + ? *temp_mask.release() : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), array.offset(), array.offset() + num_rows,