diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff4b3fefce..8dc871fa3c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - PR #5204 Concatenate strings columns using row separator as strings column - PR #5342 Add support for `StringMethods.__getitem__` - PR #5356 Use `size_type` instead of `scalar` in `cudf::repeat`. +- PR #5303 Add slice_strings functionality using delimiter string ## Improvements - PR #5245 Add column reduction benchmark diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 04387f047a8..9e21905f3db 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -104,6 +104,98 @@ std::unique_ptr slice_strings( column_view const& stops, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Slices a column of strings by using a delimiter as a slice point. + * + * Returns a column of strings after searching for @p delimiter @p count number of + * times in the source @p strings from left to right if @p count is positive or from + * right to left if @p count is negative. If @p count is positive, it returns a substring + * from the start of the source @p strings up until @p count occurrence of the @p delimiter + * not including the @p delimiter. If @p count is negative, it returns a substring from + * the start of the @p count occurrence of the @p delimiter in the source @p strings past + * the delimiter until the end of the string. + * + * The search for @p delimiter in @p strings is case sensitive. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the @p count is 0 or if @p delimiter is invalid or empty, every row in the output column + * will be an empty string. + * If the column value for a row is empty, the row value in the output column will be empty. 
+ * If @p count occurrences of @p delimiter aren't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] + * r = slice_strings(in_s, '.', 1) + * r = ['www', null, 'www', '', 'foo'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] + * r = slice_strings(in_s, '.', -2) + * r = ['nvidia.com', null, 'google.com', '', 'foo'] + * @endcode + * + * @param strings Strings instance for this operation. + * @param delimiter UTF-8 encoded string to search for in each string. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * delimiter is searched from left to right; else, it is searched from right to left. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. + */ +std::unique_ptr slice_strings( + strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/** + * @brief Slices a column of strings by using a delimiter column as slice points. + * + * Returns a column of strings after searching the delimiter defined per row from + * @p delimiter_strings @p count number of times in the source @p strings from left to right + * if @p count is positive or from right to left if @p count is negative. If @p count is + * positive, it returns a substring from the start of the source @p strings up until + * @p count occurrence of the delimiter for that row not including that delimiter. If @p count + * is negative, it returns a substring from the start of the @p count occurrence of the + * delimiter for that row in the source @p strings past the delimiter until the end of the string. + * + * The search for @p delimiter_strings in @p strings is case sensitive. 
+ * If the @p count is 0, every row in the output column will be an empty string. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the row value from @p delimiter_strings is invalid or null, the row value in the + * output column will be an empty string. + * If the row value from @p delimiter_strings or the column value for a row is empty, the + * row value in the output column will be empty. + * If @p count occurrences of the delimiter aren't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', 'bar', 'foo..bar....goo'] + * delimiters = ['.', '..', '', null, '..'] + * r = slice_strings(in_s, delimiters, 2) + * r = ['www.nvidia', null, '', '', 'foo..bar'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org'] + * delimiters = ['.', '..', '', null, '..', '.'] + * r = slice_strings(in_s, delimiters, -2) + * r = ['nvidia.com', null, '', '', '..goo', 'apache.org'] + * @endcode + * + * @throw cudf::logic_error if the number of rows in @p strings and @p delimiter_strings do not match. + * + * @param strings Strings instance for this operation. + * @param delimiter_strings UTF-8 encoded string for each row. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * delimiter is searched from left to right; else, it is searched from right to left. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. 
+ */ +std::unique_ptr slice_strings( + strings_column_view const& strings, + strings_column_view const& delimiter_strings, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index c4fa4a39297..14d04c1b257 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -198,37 +199,28 @@ struct substring_from_fn { } }; -/** - * Called by the type-dispatcher for resolving the position columns - * (starts_column and stops_column) to actual types. - */ -struct dispatch_substring_from_fn { +struct compute_substrings_from_fn { /** * @brief Returns strings column with substrings based on the ranges in the * individual starts and stops column position values. */ - template ::value>* = nullptr> - std::unique_ptr operator()(strings_column_view const& strings, - column_view const& starts_column, - column_view const& stops_column, + template + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + PositionType const* starts, + PositionType const* stops, rmm::mr::device_memory_resource* mr, cudaStream_t stream) const { - const PositionType* starts = starts_column.data(); - const PositionType* stops = stops_column.data(); - - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto strings_count = d_column.size(); - // copy the null mask + // Copy the null mask (only when the input column is nullable) rmm::device_buffer null_mask; - size_type null_count = strings.null_count(); if (d_column.nullable()) null_mask = rmm::device_buffer( d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - // build offsets column + + // Build offsets column auto 
offsets_transformer_itr = 
thrust::make_transform_iterator(thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); @@ -237,7 +229,7 @@ struct dispatch_substring_from_fn { auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); - // build chars column + // Build chars column; the final offsets entry gives the total number of bytes cudf::size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = cudf::strings::detail::create_chars_child_column( strings_count, null_count, bytes, mr, stream); @@ -248,7 +240,7 @@ struct dispatch_substring_from_fn { thrust::make_counting_iterator(0), strings_count, substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -257,30 +249,103 @@ struct dispatch_substring_from_fn { stream, mr); } - // +}; + +// This functor computes the substrings from the start and stop position columns. +// The element type of the position columns is resolved by invoking this functor through the +// type dispatcher; the fallback overload fails for position types that are not integral. 
+struct compute_substrings { + template ::value>* = nullptr> + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + column_view const& starts_column, + column_view const& stops_column, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const + { + return compute_substrings_from_fn{}(d_column, + null_count, + starts_column.data(), + stops_column.data(), + mr, + stream); + } + template ::value>* = nullptr> - std::unique_ptr operator()(strings_column_view const&, - column_view const&, - column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + column_view const& starts_column, + column_view const& stops_column, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const { CUDF_FAIL("Positions values must be an integral type."); } }; -template <> -std::unique_ptr dispatch_substring_from_fn::operator()( - strings_column_view const&, - column_view const&, - column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const -{ - CUDF_FAIL("Positions values must not be bool type."); -} +// When slice_strings is invoked with a delimiter and a delimiter count, the start and end +// positions of each row's substring must be computed first. This functor writes them per row. 
+struct compute_substring_indices { + template + void operator()(column_device_view const& d_column, + DelimiterItrT const delim_itr, + size_type delimiter_count, + size_type* start_char_pos, + size_type* end_char_pos, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const + { + auto strings_count = d_column.size(); + + thrust::for_each_n( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + strings_count, + [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__( + size_type idx) { + auto const& delim_val_pair = delim_itr[idx]; + auto const& delim_val = delim_val_pair.first; // Not read until the validity flag (.second) is checked below + + // If the column value for this row is null, result is null. + // A delimiter count of 0 (empty result) is handled by the caller before this functor runs. + // If the global delimiter or the row specific delimiter is invalid or if it is empty, row + // value is empty. + if (d_column.is_null(idx) || !delim_val_pair.second || delim_val.empty()) return; + auto const& col_val = d_column.element(idx); + // If the column value for the row is empty, the row value is empty. + if (!col_val.empty()) { + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; + bool const left_to_right = (delimiter_count > 0); + + size_type start_pos = start_char_pos[idx]; + size_type end_pos = col_val_len; + size_type char_pos = -1; + + end_char_pos[idx] = col_val_len; + + for (auto i = 0; i < nsearches; ++i) { + char_pos = left_to_right ? 
col_val.find(delim_val, start_pos) + : col_val.rfind(delim_val, 0, end_pos); + if (char_pos == -1) return; + if (left_to_right) + start_pos = char_pos + delimiter_len; + else + end_pos = char_pos; + } + if (left_to_right) + end_char_pos[idx] = char_pos; + else + start_char_pos[idx] = end_pos + delimiter_len; + } + }); + } +}; } // namespace // @@ -301,17 +366,55 @@ std::unique_ptr slice_strings( "Parameters starts and stops must be of the same type."); CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls."); CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls."); + CUDF_EXPECTS(starts_column.type().id() != data_type{BOOL8}.id(), + "Positions values must not be bool type."); + CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type."); + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; // perhaps another candidate for index-normalizer return cudf::type_dispatcher(starts_column.type(), - dispatch_substring_from_fn{}, - strings, + compute_substrings{}, + d_column, + strings.null_count(), starts_column, stops_column, mr, stream); } +template +std::unique_ptr slice_strings(strings_column_view const& strings, + DelimiterItrT const delimiter_itr, + size_type count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + // If there aren't any rows, return an empty strings column + if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + + // Allocate zero-initialized start/end position buffers, one entry per row + rmm::device_vector start_char_pos_vec(strings_count, 0); + rmm::device_vector end_char_pos_vec(strings_count, 0); + auto* start_char_pos = start_char_pos_vec.data().get(); + auto* end_char_pos = end_char_pos_vec.data().get(); + + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = 
*strings_column; + + // If 
delimiter count is 0, the output column will contain empty strings + if (count != 0) { + // Search for the delimiter and record each row's start/end positions + compute_substring_indices{}( + d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + } + + // Extract the substrings using the indices next + return compute_substrings_from_fn{}( + d_column, strings.null_count(), start_char_pos, end_char_pos, mr, stream); +} + } // namespace detail // external API @@ -325,5 +428,39 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return detail::slice_strings(strings, starts_column, stops_column, mr); } +std::unique_ptr slice_strings(strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::slice_strings( + strings, cudf::detail::make_pair_iterator(delimiter), count, mr, nullptr); +} + +std::unique_ptr slice_strings(strings_column_view const& strings, + strings_column_view const& delimiters, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(strings.size() == delimiters.size(), + "Strings and delimiters column sizes do not match"); + + CUDF_FUNC_RANGE(); + auto delimiters_dev_view_ptr = cudf::column_device_view::create(delimiters.parent(), 0); + auto delimiters_dev_view = *delimiters_dev_view_ptr; + return (delimiters_dev_view.nullable()) + ? 
detail::slice_strings( + strings, + cudf::detail::make_pair_iterator(delimiters_dev_view), + count, + mr) + : detail::slice_strings( + strings, + cudf::detail::make_pair_iterator(delimiters_dev_view), + count, + mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/substring_tests.cpp b/cpp/tests/strings/substring_tests.cpp index 99b4df39a87..5de87592033 100644 --- a/cpp/tests/strings/substring_tests.cpp +++ b/cpp/tests/strings/substring_tests.cpp @@ -270,3 +270,320 @@ TEST_F(StringsSubstringsTest, Error) auto strings_column = cudf::strings_column_view(strings); EXPECT_THROW(cudf::strings::slice_strings(strings_column, 0, 0, 0), cudf::logic_error); } + +struct StringsSubstringsScalarDelimiterTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringsScalarDelimiterTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, AllEmpty) +{ + auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + auto 
results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + { + auto exp_results = cudf::test::strings_column_wrapper({"H", "thes", "", "lease", "t", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto exp_results = cudf::test::strings_column_wrapper( + {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto col0 = 
cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); 
+ cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis...com", + "nvidia....com", + "google...........com", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis..com", + "nvidia....com", + "google...........com", + ".", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +} + +struct StringsSubstringsColumnDelimiterTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringsColumnDelimiterTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::slice_strings(strings_view, strings_view, 1); + // Check empty column + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, GenerateExceptions) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); + + EXPECT_THROW(cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, 
ColumnAllEmpty) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, DelimiterAllEmptyAndInvalid) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, ZeroDelimiterCount) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) +{ + { + auto col0 = cudf::test::strings_column_wrapper( + {"H™élloi ™◎oo™ff™", "thesé", "", "lease™", "tést strings", "™"}, + {true, true, false, true, 
true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"™", "™", "", "e", "t", "™"}); + + auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ffstri.nffgs", + "ffff ™ ffff ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "ff ", "t", "ff ™"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo", + "tffffh", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = 
cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™", + "ffhffesé", + "", + "lff fooff ffff eaffse™", + "ff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +}