Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable round-tripping of large strings in cudf #15944

Merged
merged 24 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d81f2a2
return LargeStringArray if large strings are enabled
galipremsagar Jun 6, 2024
d33bb09
Update cpp/src/interop/to_arrow.cu
galipremsagar Jun 7, 2024
0ee3f78
revert
galipremsagar Jun 7, 2024
a3b6cfa
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 7, 2024
3decdd1
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 7, 2024
7c58693
Add from_arrow support
galipremsagar Jun 10, 2024
91bf4a9
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
9788dee
no overflow error
galipremsagar Jun 10, 2024
fc6fc9e
Merge remote-tracking branch 'upstream/branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
b02a43b
Add from arrow tests
galipremsagar Jun 10, 2024
7e59184
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
bf13c9d
revert pytest
galipremsagar Jun 10, 2024
6081a02
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 10, 2024
89c17e9
Use device_buffer
galipremsagar Jun 11, 2024
d113c30
Update cpp/src/interop/to_arrow.cu
galipremsagar Jun 11, 2024
f09765d
Merge branch 'arrow_interop' of https://github.com/galipremsagar/cudf…
galipremsagar Jun 11, 2024
38b0ac7
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
cea5714
Update cpp/tests/interop/arrow_utils.hpp
galipremsagar Jun 11, 2024
0f3b3a0
style
galipremsagar Jun 11, 2024
cabacd1
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
6a89e20
Update cpp/tests/interop/from_arrow_test.cpp
galipremsagar Jun 11, 2024
a60de7d
move get_arrow_large_string_array to cpp
galipremsagar Jun 11, 2024
631075f
Merge branch 'branch-24.08' into arrow_interop
galipremsagar Jun 11, 2024
31a60a8
Merge
galipremsagar Jun 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions cpp/src/interop/from_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
}
}
case arrow::Type::STRING: return data_type(type_id::STRING);
case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
case arrow::Type::LIST: return data_type(type_id::LIST);
case arrow::Type::DECIMAL: {
Expand Down Expand Up @@ -276,14 +277,30 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
rmm::device_async_resource_ref mr)
{
if (array.length() == 0) { return make_empty_column(type_id::STRING); }
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
auto char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);

auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
std::unique_ptr<column> offsets_column;
std::unique_ptr<arrow::Array> char_array;

if (array.type_id() == arrow::Type::LARGE_STRING) {
auto str_array = static_cast<arrow::LargeStringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int64Array>(
str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
*offset_array, data_type(type_id::INT64), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else if (array.type_id() == arrow::Type::STRING) {
auto str_array = static_cast<arrow::StringArray const*>(&array);
auto offset_array = std::make_unique<arrow::Int32Array>(
str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
*offset_array, data_type(type_id::INT32), true, stream, mr);
char_array = std::make_unique<arrow::Int8Array>(
str_array->value_data()->size(), str_array->value_data(), nullptr);
} else {
throw std::runtime_error("Unsupported array type");
}

auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
*char_array, data_type(type_id::INT8), true, stream, mr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be updated to hold the chars data in an rmm::device_buffer, since building a column from it will trigger the size_type limit for large strings.


Expand All @@ -304,7 +321,6 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
stream,
mr);
}

galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
template <>
std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::dictionary32>(
arrow::Array const& array,
Expand Down
19 changes: 14 additions & 5 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/interop.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/detail/utilities.hpp>
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
Expand Down Expand Up @@ -306,11 +307,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
static_cast<std::size_t>(sview.chars_size(stream))},
ar_mr,
stream);
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
if (sview.offsets().type().id() == cudf::type_id::INT64) {
return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
} else {
return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
offset_buffer,
data_buffer,
fetch_mask_buffer(input_view, ar_mr, stream),
static_cast<int64_t>(input_view.null_count()));
}
}

template <>
Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,12 +338,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
)
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif pa.types.is_large_string(array.type):
# Pandas-2.2+: Pandas defaults to `large_string` type
# instead of `string` without data-introspection.
# Temporary workaround until cudf has native
# support for `LARGE_STRING` i.e., 64 bit offsets
array = array.cast(pa.string())

data = pa.table([array], [None])

Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,10 @@ def test_concatenate_large_column_strings():
s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings)
s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings)

with pytest.raises(OverflowError):
cudf.concat([s_1, s_2])
actual = cudf.concat([s_1, s_2])
expected = pd.concat([s_1.to_pandas(), s_2.to_pandas()])

assert_eq(actual, expected)


@pytest.mark.parametrize(
Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
assert_eq(result, expected)


def test_series_from_large_string():
pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
got = cudf.Series(pa_large_string_array)
expected = pd.Series(pa_large_string_array)
@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
def test_series_from_large_string(pa_type):
pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type())
got = cudf.Series(pa_string_array)
expected = pd.Series(pa_string_array)

assert_eq(expected, got)

assert pa_string_array.equals(got.to_arrow())


@pytest.mark.parametrize(
"scalar",
Expand Down
Loading