From e57f0fe4edafb689ff468ae6336d47b3aea4772d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Jun 2024 09:19:48 -0500 Subject: [PATCH] Enable round-tripping of large strings in `cudf` (#15944) Fixes: #15922 This PR adds support for round-tripping `LargeStringArray` in `cudf` using 64 bit offsets. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15944 --- cpp/src/interop/from_arrow.cu | 42 ++++++++++++++++++++------ cpp/src/interop/to_arrow.cu | 18 ++++++++--- cpp/tests/interop/from_arrow_test.cpp | 42 ++++++++++++++++++++++++-- python/cudf/cudf/core/column/column.py | 6 ---- python/cudf/cudf/tests/test_series.py | 11 ++++--- 5 files changed, 91 insertions(+), 28 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index f100ca0cc2b..579820cbae3 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) } } case arrow::Type::STRING: return data_type(type_id::STRING); + case arrow::Type::LARGE_STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); case arrow::Type::DECIMAL: { @@ -276,21 +277,42 @@ std::unique_ptr dispatch_to_cudf_column::operator()( rmm::device_async_resource_ref mr) { if (array.length() == 0) { return make_empty_column(type_id::STRING); } - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); - auto char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); - auto chars_column = dispatch_to_cudf_column{}.operator()( - *char_array, data_type(type_id::INT8), true, stream, mr); + std::unique_ptr offsets_column; + std::unique_ptr char_array; + + if (array.type_id() == arrow::Type::LARGE_STRING) { + auto str_array = static_cast(&array); + auto offset_array = std::make_unique( + str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr); + offsets_column = dispatch_to_cudf_column{}.operator()( + *offset_array, data_type(type_id::INT64), true, stream, mr); + char_array = std::make_unique( + str_array->value_data()->size(), str_array->value_data(), nullptr); + } else if (array.type_id() == arrow::Type::STRING) { + auto str_array = static_cast(&array); + auto offset_array = std::make_unique( + str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); + offsets_column = dispatch_to_cudf_column{}.operator()( + *offset_array, data_type(type_id::INT32), true, stream, mr); + char_array = std::make_unique( + str_array->value_data()->size(), str_array->value_data(), nullptr); + } else { + throw std::runtime_error("Unsupported array type"); + } + + rmm::device_buffer chars(char_array->length(), stream, mr); + auto data_buffer = char_array->data()->buffers[1]; + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + reinterpret_cast(data_buffer->address()), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + std::move(chars), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index e871e656c48..47aee982c32 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -306,11 +306,19 @@ std::shared_ptr dispatch_to_arrow::operator()( static_cast(sview.chars_size(stream))}, ar_mr, stream); - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); + if (sview.offsets().type().id() == cudf::type_id::INT64) { + return std::make_shared(static_cast(input_view.size()), + offset_buffer, + data_buffer, + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); + } else { + return std::make_shared(static_cast(input_view.size()), + offset_buffer, + data_buffer, + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); + } } template <> diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index aec2bab7196..af20a5c772f 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -50,13 +50,36 @@ std::unique_ptr get_cudf_table() {true, false, true, true, true}); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); columns.emplace_back(cudf::test::fixed_width_column_wrapper( - {true, false, true, false, true}, {true, false, true, true, false}) + {true, false, true, false, true}, {true, false, true, true, false}).release()); + columns.emplace_back(cudf::test::strings_column_wrapper( + { + "", + "abc", + "def", + "1", + "2", + }, + {0, 1, 1, 1, 1}) .release()); // columns.emplace_back(cudf::test::lists_column_wrapper({{1, 2}, {3, 4}, {}, {6}, {7, 8, // 9}}).release()); return std::make_unique(std::move(columns)); } +std::shared_ptr get_arrow_large_string_array( + std::vector const& data, std::vector const& mask = {}) +{ + std::shared_ptr large_string_array; + arrow::LargeStringBuilder large_string_builder; + + CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(), + "Failed to append values to string builder"); + CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(), + "Failed to create arrow string array"); + + return large_string_array; +} + struct FromArrowTest : public cudf::test::BaseFixture {}; template @@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray) "ccc", }, {0, 1}); + auto large_string_array_1 = get_arrow_large_string_array( + { + "", + "abc", + "def", + "1", + "2", + }, + {0, 1, 1, 1, 1}); auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1}); auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3}); @@ -307,13 +339,16 @@ TEST_F(FromArrowTest, ChunkedArray) auto boolean_array = get_arrow_array({true, false, true, false, true}, {true, false, true, true, false}); auto boolean_chunked_array = std::make_shared(boolean_array); + auto large_string_chunked_array = std::make_shared( + std::vector>{large_string_array_1}); std::vector> schema_vector( {arrow::field("a", int32_chunked_array->type()), arrow::field("b", int64array->type()), arrow::field("c", string_array_1->type()), arrow::field("d", dict_chunked_array->type()), - arrow::field("e", boolean_chunked_array->type())}); + arrow::field("e", boolean_chunked_array->type()), + arrow::field("f", large_string_array_1->type())}); auto schema = std::make_shared(schema_vector); auto arrow_table = arrow::Table::Make(schema, @@ -321,7 +356,8 @@ TEST_F(FromArrowTest, ChunkedArray) int64_chunked_array, string_chunked_array, dict_chunked_array, - boolean_chunked_array}); + boolean_chunked_array, + large_string_chunked_array}); auto expected_cudf_table = get_cudf_table(); diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7abdbc85720..001e8996c19 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ) elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) - elif pa.types.is_large_string(array.type): - # Pandas-2.2+: Pandas defaults to `large_string` type - # instead of `string` without data-introspection. - # Temporary workaround until cudf has native - # support for `LARGE_STRING` i.e., 64 bit offsets - array = array.cast(pa.string()) data = pa.table([array], [None]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f47c42d9a1d..30189e1ac8a 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data): assert_eq(result, expected) -def test_series_from_large_string(): - pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string()) - got = cudf.Series(pa_large_string_array) - expected = pd.Series(pa_large_string_array) +@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string]) +def test_series_from_large_string(pa_type): + pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type()) + got = cudf.Series(pa_string_array) + expected = pd.Series(pa_string_array) assert_eq(expected, got) + assert pa_string_array.equals(got.to_arrow()) + @pytest.mark.parametrize( "scalar",