Skip to content

Commit

Permalink
Enable round-tripping of large strings in cudf (#15944)
Browse files Browse the repository at this point in the history
Fixes: #15922

This PR adds support for round-tripping `LargeStringArray` in `cudf` using 64 bit offsets.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: #15944
  • Loading branch information
galipremsagar authored Jun 12, 2024
1 parent 9fae8ab commit e57f0fe
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 28 deletions.
42 changes: 32 additions & 10 deletions cpp/src/interop/from_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
}
}
case arrow::Type::STRING: return data_type(type_id::STRING);
case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
case arrow::Type::LIST: return data_type(type_id::LIST);
case arrow::Type::DECIMAL: {
Expand Down Expand Up @@ -276,21 +277,42 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
rmm::device_async_resource_ref mr)
{
if (array.length() == 0) { return make_empty_column(type_id::STRING); }
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
auto char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);

auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
*char_array, data_type(type_id::INT8), true, stream, mr);
std::unique_ptr<column> offsets_column;
std::unique_ptr<arrow::Array> char_array;

if (array.type_id() == arrow::Type::LARGE_STRING) {
auto str_array = static_cast<arrow::LargeStringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int64Array>(
str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
*offset_array, data_type(type_id::INT64), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else if (array.type_id() == arrow::Type::STRING) {
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else {
throw std::runtime_error("Unsupported array type");
}

rmm::device_buffer chars(char_array->length(), stream, mr);
auto data_buffer = char_array->data()->buffers[1];
CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
reinterpret_cast<uint8_t const*>(data_buffer->address()),
chars.size(),
cudaMemcpyDefault,
stream.value()));

auto const num_rows = offsets_column->size() - 1;
auto out_col = make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
std::move(chars),
array.null_count(),
std::move(*get_mask_buffer(array, stream, mr)));

Expand Down
18 changes: 13 additions & 5 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
static_cast<std::size_t>(sview.chars_size(stream))},
ar_mr,
stream);
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
if (sview.offsets().type().id() == cudf::type_id::INT64) {
return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
} else {
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
}
}

template <>
Expand Down
42 changes: 39 additions & 3 deletions cpp/tests/interop/from_arrow_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,36 @@ std::unique_ptr<cudf::table> get_cudf_table()
{true, false, true, true, true});
columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
{true, false, true, false, true}, {true, false, true, true, false})
{true, false, true, false, true}, {true, false, true, true, false}).release());
columns.emplace_back(cudf::test::strings_column_wrapper(
{
"",
"abc",
"def",
"1",
"2",
},
{0, 1, 1, 1, 1})
.release());
// columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
// 9}}).release());
return std::make_unique<cudf::table>(std::move(columns));
}

std::shared_ptr<arrow::LargeStringArray> get_arrow_large_string_array(
std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
{
std::shared_ptr<arrow::LargeStringArray> large_string_array;
arrow::LargeStringBuilder large_string_builder;

CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(),
"Failed to append values to string builder");
CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(),
"Failed to create arrow string array");

return large_string_array;
}

struct FromArrowTest : public cudf::test::BaseFixture {};

template <typename T>
Expand Down Expand Up @@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray)
"ccc",
},
{0, 1});
auto large_string_array_1 = get_arrow_large_string_array(
{
"",
"abc",
"def",
"1",
"2",
},
{0, 1, 1, 1, 1});
auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1});
auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3});

Expand All @@ -307,21 +339,25 @@ TEST_F(FromArrowTest, ChunkedArray)
auto boolean_array =
get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});

std::vector<std::shared_ptr<arrow::Field>> schema_vector(
{arrow::field("a", int32_chunked_array->type()),
arrow::field("b", int64array->type()),
arrow::field("c", string_array_1->type()),
arrow::field("d", dict_chunked_array->type()),
arrow::field("e", boolean_chunked_array->type())});
arrow::field("e", boolean_chunked_array->type()),
arrow::field("f", large_string_array_1->type())});
auto schema = std::make_shared<arrow::Schema>(schema_vector);

auto arrow_table = arrow::Table::Make(schema,
{int32_chunked_array,
int64_chunked_array,
string_chunked_array,
dict_chunked_array,
boolean_chunked_array});
boolean_chunked_array,
large_string_chunked_array});

auto expected_cudf_table = get_cudf_table();

Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
)
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif pa.types.is_large_string(array.type):
# Pandas-2.2+: Pandas defaults to `large_string` type
# instead of `string` without data-introspection.
# Temporary workaround until cudf has native
# support for `LARGE_STRING` i.e., 64 bit offsets
array = array.cast(pa.string())

data = pa.table([array], [None])

Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
assert_eq(result, expected)


def test_series_from_large_string():
pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
got = cudf.Series(pa_large_string_array)
expected = pd.Series(pa_large_string_array)
@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
def test_series_from_large_string(pa_type):
    """Round-trip both 32-bit and 64-bit offset arrow string types through cudf."""
    arrow_array = pa.array(["a", "b", "c"]).cast(pa_type())
    result = cudf.Series(arrow_array)
    expected = pd.Series(arrow_array)

    assert_eq(expected, result)

    # Converting back to arrow must preserve the original (large_)string type.
    assert arrow_array.equals(result.to_arrow())


@pytest.mark.parametrize(
"scalar",
Expand Down

0 comments on commit e57f0fe

Please sign in to comment.