diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 23614ac0733..4d7096c02ca 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -357,6 +357,12 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); + if (chars_bytes == 0) { /* Early return when the column holds zero character bytes: the offsets column is built from a scalar 0 with strings_count + 1 rows and the token vector is created with size 0. */ + auto offsets = cudf::make_column_from_scalar( + numeric_scalar(0, true, stream), strings_count + 1, stream, mr); + auto tokens = rmm::device_uvector(0, stream); + return std::pair{std::move(offsets), std::move(tokens)}; + } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index d53c64ed539..4c020cb4c29 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ /* Checks split_record and rsplit_record on a column of four empty strings, once with the one-character delimiter s and once with an empty delimiter. */ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar("s"); + auto empty = cudf::string_scalar(""); + /* Expected output: one empty list per input row. */ + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::split_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + /* Repeat both delimiter cases with rsplit_record; the same expected column is used. */ + result = cudf::strings::rsplit_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, MultiByteDelimiters) { // Overlapping 
delimiters