diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index ab6afc82094..a9b27b7cfde 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -141,6 +141,94 @@ std::unique_ptr ends_with( string_scalar const& target, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Returns a column of strings that searches for the @p delimiter @p count number of + * times in the source @p strings forward if @p count is positive or backwards if @p count is + * negative. If @p count is positive, it returns a substring from the start of the source @p + * strings up until @p count occurrence of the @p delimiter not including the @p delimiter. + * If @p count is negative, it returns a substring from the start of the @p count occurrence of + * the @p delimiter in the source @p strings past the delimiter until the end of the string. + * + * The search for @p delimiter in @p strings is case sensitive. + * If the @p count is 0, every row in the output column will be null. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the @p delimiter is invalid or null, every row in the output column will be null. + * If the @p delimiter or the column value for a row is empty, the row value in the output + * column will be empty. + * If @p count occurrences of @p delimiter aren't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] + * r = substring_index(in_s, '.', 1) + * r is ['www', null, 'www', '', 'foo'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] + * r = substring_index(in_s, '.', -2) + * r is ['nvidia.com', null, 'google.com', '', 'foo'] + * @endcode + * + * @param strings Strings instance for this operation. + * @param delimiter UTF-8 encoded string to search for in each string. 
+ * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; if negative, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. + */ +std::unique_ptr substring_index( + strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/** + * @brief Returns a column of strings that searches the delimiter for each row from + * @p delimiter_strings @p count number of times in the source @p strings forward if @p count + * is positive or backwards if @p count is negative. If @p count is positive, it returns a + * substring from the start of the source @p strings up until @p count occurrence of the + * delimiter for that row not including that delimiter. If @p count is negative, it returns a + * substring from the start of the @p count occurrence of the delimiter for that row in the + * source @p strings past the delimiter until the end of the string. + * + * The search for @p delimiter_strings in @p strings is case sensitive. + * If the @p count is 0, every row in the output column will be null. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the row value from @p delimiter_strings is invalid or null, the row value in the + * output column will be null. + * If the row value from @p delimiter_strings or the column value for a row is empty, the + * row value in the output column will be empty. + * If @p count occurrences of delimiter aren't found, the row value in the output column will + * be the row value from the input @p strings column. 
+ * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo' ] + * delimiters = ['.', '..', '', null, '..'] + * r = substring_index(in_s, delimiters, 2) + * r is ['www.nvidia', null, '', null, 'foo..bar'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org' ] + * delimiters = ['.', '..', '', null, '..', '.'] + * r = substring_index(in_s, delimiters, -2) + * r is ['nvidia.com', null, '', null, '..goo', 'apache.org'] + * @endcode + * + * @throw cudf::logic_error if the number of rows in @p strings and @p delimiter_strings do not match. + * + * @param strings Strings instance for this operation. + * @param delimiter_strings UTF-8 encoded string for each row. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; if negative, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. 
+ */ +std::unique_ptr substring_index( + strings_column_view const& strings, + strings_column_view const& delimiter_strings, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index c791f8f7ab2..589bb976e08 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include #include @@ -286,5 +288,156 @@ std::unique_ptr ends_with(strings_column_view const& strings, return detail::ends_with(strings, target, mr); } +// For substring_index APIs +namespace detail { +// Internal helper class +namespace { + +struct substring_index_functor { + template + std::unique_ptr operator()(ColItrT const col_itr, + DelimiterItrT const delim_itr, + size_type delimiter_count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream, + size_type strings_count) const + { + // Shallow copy of the resultant strings + rmm::device_vector out_col_strings(strings_count); + + // Invalid output column strings - null rows + string_view const invalid_str{nullptr, 0}; + + thrust::transform( + rmm::exec_policy(stream)->on(stream), + col_itr, + col_itr + strings_count, + delim_itr, + out_col_strings.data().get(), + [delimiter_count, invalid_str] __device__(auto col_val_pair, auto delim_val_pair) { + // If the column value for this row or the delimiter is null or if the delimiter count is 0, + // result is null + if (!col_val_pair.second || !delim_val_pair.second || delimiter_count == 0) + return invalid_str; + auto col_val = col_val_pair.first; + + // If the global delimiter or the row specific delimiter or if the column value for the row + // is empty, value is empty. 
+ if (delim_val_pair.first.empty() || col_val.empty()) return string_view{}; + + auto delim_val = delim_val_pair.first; + + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; + size_type start_pos = 0; + size_type end_pos = col_val_len; + string_view out_str{}; + + for (auto i = 0; i < nsearches; ++i) { + if (delimiter_count < 0) { + end_pos = col_val.rfind(delim_val, 0, end_pos); + if (end_pos == -1) { + out_str = col_val; + break; + } + if (i + 1 == nsearches) + out_str = + col_val.substr(end_pos + delimiter_len, col_val_len - end_pos - delimiter_len); + } else { + auto char_pos = col_val.find(delim_val, start_pos); + if (char_pos == -1) { + out_str = col_val; + break; + } + if (i + 1 == nsearches) + out_str = col_val.substr(0, char_pos); + else + start_pos = char_pos + delimiter_len; + } + } + + return out_str.empty() ? string_view{} : out_str; + }); + + // Create an output column with the resultant strings + return make_strings_column(out_col_strings, invalid_str, stream, mr); + } +}; + +} // namespace + +template +std::unique_ptr substring_index(strings_column_view const& strings, + DelimiterItrT const delimiter_itr, + size_type count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + // If there aren't any rows, return an empty strings column + if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + + // Create device view of the column + auto colview_ptr = column_device_view::create(strings.parent(), stream); + auto colview = *colview_ptr; + if (colview.nullable()) { + return substring_index_functor{}( + experimental::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + stream, + strings_count); + } else { + return substring_index_functor{}( + experimental::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + 
stream, + strings_count); + } +} + +} // namespace detail + +// external APIs + +std::unique_ptr substring_index(strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_index( + strings, experimental::detail::make_pair_iterator(delimiter), count, mr); +} + +std::unique_ptr substring_index(strings_column_view const& strings, + strings_column_view const& delimiters, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(strings.size() == delimiters.size(), + "Strings and delimiters column sizes do not match"); + + CUDF_FUNC_RANGE(); + auto delimiters_dev_view_ptr = cudf::column_device_view::create(delimiters.parent(), 0); + auto delimiters_dev_view = *delimiters_dev_view_ptr; + return (delimiters_dev_view.nullable()) + ? detail::substring_index( + strings, + experimental::detail::make_pair_iterator(delimiters_dev_view), + count, + mr) + : detail::substring_index( + strings, + experimental::detail::make_pair_iterator(delimiters_dev_view), + count, + mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 6a0d39757f0..ab29a06c1d0 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -215,3 +216,321 @@ TEST_P(FindParmsTest, Find) INSTANTIATE_TEST_CASE_P(StringsFindTest, FindParmsTest, testing::ValuesIn(std::array{0, 1, 2, 3})); + +struct StringsSubstringIndexWithScalarTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringIndexWithScalarTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("foo"), 1); + 
cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringIndexWithScalarTest, AllEmpty) +{ + auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("e"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, EmptyDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar(""), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, ZeroCount) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {false, false, false, false, false, false}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, SearchDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + { + auto exp_results = cudf::test::strings_column_wrapper({"H", 
"thes", "", "lease", "t", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto exp_results = cudf::test::strings_column_wrapper( + {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", 
"", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis...com", + "nvidia....com", + "google...........com", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis..com", + "nvidia....com", + "google...........com", + ".", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); + 
cudf::test::expect_columns_equal(*results, exp_results, true); + } +} + +struct StringsSubstringIndexWithColumnTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringIndexWithColumnTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::substring_index(strings_view, strings_view, 1); + // Check empty column + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringIndexWithColumnTest, GenerateExceptions) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); + + EXPECT_THROW(cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); +} + +TEST_F(StringsSubstringIndexWithColumnTest, ColumnAllEmpty) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, DelimiterAllEmptyAndInvalid) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, false, false, true, false}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, 
cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, ZeroDelimiterCount) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {false, false, false, false, false, false}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, SearchDelimiter) +{ + { + auto col0 = cudf::test::strings_column_wrapper( + {"H™élloi ™◎oo™ff™", "thesé", "", "lease™", "tést strings", "™"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"™", "™", "", "e", "t", "™"}); + + auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ffstri.nffgs", + "ffff ™ ffff ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "ff ", "t", "ff ™"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, 
cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo", + "tffffh", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™", + "ffhffesé", + "", + "lff fooff ffff eaffse™", + "ff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +}