Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable round-tripping of large strings in cudf #15944

Merged
merged 24 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d81f2a2
return LargeStringArray if large strings are enabled
galipremsagar Jun 6, 2024
d33bb09
Update cpp/src/interop/to_arrow.cu
galipremsagar Jun 7, 2024
0ee3f78
revert
galipremsagar Jun 7, 2024
a3b6cfa
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 7, 2024
3decdd1
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 7, 2024
7c58693
Add from_arrow support
galipremsagar Jun 10, 2024
91bf4a9
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
9788dee
no overflow error
galipremsagar Jun 10, 2024
fc6fc9e
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
b02a43b
Add from arrow tests
galipremsagar Jun 10, 2024
7e59184
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
bf13c9d
revert pytest
galipremsagar Jun 10, 2024
6081a02
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
89c17e9
Use device_buffer
galipremsagar Jun 11, 2024
d113c30
Update cpp/src/interop/to_arrow.cu
galipremsagar Jun 11, 2024
f09765d
Merge branch 'arrow_interop' of https://github.com/galipremsagar/cudf…
galipremsagar Jun 11, 2024
38b0ac7
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
cea5714
Update cpp/tests/interop/arrow_utils.hpp
galipremsagar Jun 11, 2024
0f3b3a0
style
galipremsagar Jun 11, 2024
cabacd1
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
6a89e20
Update cpp/tests/interop/from_arrow_test.cpp
galipremsagar Jun 11, 2024
a60de7d
move get_arrow_large_string_array to cpp
galipremsagar Jun 11, 2024
631075f
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
31a60a8
Merge
galipremsagar Jun 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions cpp/src/interop/from_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
}
}
case arrow::Type::STRING: return data_type(type_id::STRING);
case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
case arrow::Type::LIST: return data_type(type_id::LIST);
case arrow::Type::DECIMAL: {
Expand Down Expand Up @@ -276,21 +277,42 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
rmm::device_async_resource_ref mr)
{
if (array.length() == 0) { return make_empty_column(type_id::STRING); }
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
auto char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);

auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
*char_array, data_type(type_id::INT8), true, stream, mr);
std::unique_ptr<column> offsets_column;
std::unique_ptr<arrow::Array> char_array;

if (array.type_id() == arrow::Type::LARGE_STRING) {
auto str_array = static_cast<arrow::LargeStringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int64Array>(
str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
*offset_array, data_type(type_id::INT64), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else if (array.type_id() == arrow::Type::STRING) {
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else {
throw std::runtime_error("Unsupported array type");
}

rmm::device_buffer chars(char_array->length(), stream, mr);
auto data_buffer = char_array->data()->buffers[1];
CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
reinterpret_cast<uint8_t const*>(data_buffer->address()),
chars.size(),
cudaMemcpyDefault,
stream.value()));

auto const num_rows = offsets_column->size() - 1;
auto out_col = make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
std::move(chars),
array.null_count(),
std::move(*get_mask_buffer(array, stream, mr)));

Expand Down
18 changes: 13 additions & 5 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
static_cast<std::size_t>(sview.chars_size(stream))},
ar_mr,
stream);
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
if (sview.offsets().type().id() == cudf::type_id::INT64) {
return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
} else {
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
}
}

template <>
Expand Down
42 changes: 39 additions & 3 deletions cpp/tests/interop/from_arrow_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,36 @@ std::unique_ptr<cudf::table> get_cudf_table()
{true, false, true, true, true});
columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
{true, false, true, false, true}, {true, false, true, true, false})
{true, false, true, false, true}, {true, false, true, true, false}).release());
columns.emplace_back(cudf::test::strings_column_wrapper(
{
"",
"abc",
"def",
"1",
"2",
},
{0, 1, 1, 1, 1})
.release());
// columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
// 9}}).release());
return std::make_unique<cudf::table>(std::move(columns));
}

std::shared_ptr<arrow::LargeStringArray> get_arrow_large_string_array(
std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
{
std::shared_ptr<arrow::LargeStringArray> large_string_array;
arrow::LargeStringBuilder large_string_builder;

CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(),
"Failed to append values to string builder");
CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(),
"Failed to create arrow string array");

return large_string_array;
}

struct FromArrowTest : public cudf::test::BaseFixture {};

template <typename T>
Expand Down Expand Up @@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray)
"ccc",
},
{0, 1});
auto large_string_array_1 = get_arrow_large_string_array(
{
"",
"abc",
"def",
"1",
"2",
},
{0, 1, 1, 1, 1});
auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1});
auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3});

Expand All @@ -307,21 +339,25 @@ TEST_F(FromArrowTest, ChunkedArray)
auto boolean_array =
get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});

std::vector<std::shared_ptr<arrow::Field>> schema_vector(
{arrow::field("a", int32_chunked_array->type()),
arrow::field("b", int64array->type()),
arrow::field("c", string_array_1->type()),
arrow::field("d", dict_chunked_array->type()),
arrow::field("e", boolean_chunked_array->type())});
arrow::field("e", boolean_chunked_array->type()),
arrow::field("f", large_string_array_1->type())});
auto schema = std::make_shared<arrow::Schema>(schema_vector);

auto arrow_table = arrow::Table::Make(schema,
{int32_chunked_array,
int64_chunked_array,
string_chunked_array,
dict_chunked_array,
boolean_chunked_array});
boolean_chunked_array,
large_string_chunked_array});

auto expected_cudf_table = get_cudf_table();

Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
)
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif pa.types.is_large_string(array.type):
# Pandas-2.2+: Pandas defaults to `large_string` type
# instead of `string` without data-introspection.
# Temporary workaround until cudf has native
# support for `LARGE_STRING` i.e., 64 bit offsets
array = array.cast(pa.string())

data = pa.table([array], [None])

Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
assert_eq(result, expected)


def test_series_from_large_string():
pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
got = cudf.Series(pa_large_string_array)
expected = pd.Series(pa_large_string_array)
@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
def test_series_from_large_string(pa_type):
pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type())
got = cudf.Series(pa_string_array)
expected = pd.Series(pa_string_array)

assert_eq(expected, got)

assert pa_string_array.equals(got.to_arrow())


@pytest.mark.parametrize(
"scalar",
Expand Down
Loading