Skip to content

Commit

Permalink
Fix split_record for all empty strings column
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Jul 16, 2024
1 parent 04330f2 commit 06e7265
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 0 deletions.
6 changes: 6 additions & 0 deletions cpp/src/strings/split/split.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,12 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
auto const chars_bytes =
get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
get_offset_value(input.offsets(), input.offset(), stream);
if (chars_bytes == 0) {
auto offsets = cudf::make_column_from_scalar(
numeric_scalar<int32_t>(0, true, stream), strings_count + 1, stream, mr);
auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
return std::pair{std::move(offsets), std::move(tokens)};
}
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

Expand Down
20 changes: 20 additions & 0 deletions cpp/tests/strings/split_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
}

// Regression test: a strings column whose rows are all empty strings must yield
// one empty list per row from both split_record and rsplit_record, whether the
// delimiter is a non-empty string or an empty string.
TEST_F(StringsSplitTest, SplitRecordAllEmpty)
{
  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;

  auto strings_wrapper = cudf::test::strings_column_wrapper({"", "", "", ""});
  auto strings_view    = cudf::strings_column_view(strings_wrapper);
  auto char_delimiter  = cudf::string_scalar("s");
  auto empty_delimiter = cudf::string_scalar("");

  // Four input rows -> four empty lists.
  LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});

  // Forward split, first with the "s" delimiter and then with the empty one.
  auto const split_with_char = cudf::strings::split_record(strings_view, char_delimiter);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(split_with_char->view(), expected);
  auto const split_with_empty = cudf::strings::split_record(strings_view, empty_delimiter);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(split_with_empty->view(), expected);

  // Reverse split, same two delimiters in the same order.
  auto const rsplit_with_char = cudf::strings::rsplit_record(strings_view, char_delimiter);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(rsplit_with_char->view(), expected);
  auto const rsplit_with_empty = cudf::strings::rsplit_record(strings_view, empty_delimiter);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(rsplit_with_empty->view(), expected);
}

TEST_F(StringsSplitTest, MultiByteDelimiters)
{
// Overlapping delimiters
Expand Down

0 comments on commit 06e7265

Please sign in to comment.