From 8d7bf34ff045fbe1760c90ef825e6719bab1ff80 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Tue, 4 Feb 2020 08:13:38 -0800 Subject: [PATCH 1/9] REL v0.12.0 release --- docs/cudf/source/conf.py | 2 +- docs/nvstrings/source/conf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 80ccad1c9fe..0e465b9efbc 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -71,7 +71,7 @@ # built documents. # # The short X.Y version. -version = "0.12" +version = '0.12' # The full version, including alpha/beta/rc tags. release = cudf.__version__ diff --git a/docs/nvstrings/source/conf.py b/docs/nvstrings/source/conf.py index 83795ef3660..3da2ce0c912 100644 --- a/docs/nvstrings/source/conf.py +++ b/docs/nvstrings/source/conf.py @@ -69,9 +69,9 @@ # built documents. # # The short X.Y version. -version = "0.12" +version = '0.12' # The full version, including alpha/beta/rc tags. -release = "0.12.0a" +release = '0.12.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 7178642ae8f353db1be87b65cc8315a5373833e6 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Wed, 27 May 2020 15:39:53 +0000 Subject: [PATCH 2/9] - compute substrings from beginning until delimiter or from a delimiter until end of string - this Closes #5158 - this emulates spark's `substring_index` function --- cpp/include/cudf/strings/find.hpp | 88 +++++++++ cpp/src/strings/find.cu | 153 ++++++++++++++ cpp/tests/strings/find_tests.cpp | 319 ++++++++++++++++++++++++++++++ 3 files changed, 560 insertions(+) diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index ab6afc82094..a9b27b7cfde 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -141,6 +141,94 @@ std::unique_ptr ends_with( string_scalar const& target, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Returns a column of strings that searches for the @p delimiter @p count number of + * times in the source @p strings forward if @p count is positive or backwards if @p count is + * negative. If @p count is positive, it returns a substring from the start of the source @p + * strings up until @p count occurrence of the @delimiter not including the @p delimiter. + * If @p count is negative, it returns a substring from the start of the @p count occurrence of + * the @delimiter in the source @p strings past the delimiter until the end of the string. + * + * The search for @delimiter in @p strings is case sensitive. + * If the @p count is 0, every row in the output column will be null. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the @p delimiter is invalid or null, every row in the output column will be null. + * If the @p delimiter or the column value for a row is empty, the row value in the output + * column will be empty. 
+ * If @p count occurrences of @p delimiter isn't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] + * r = substring_index(in_s, '.', 1) + * r is ['www', null, 'www', '', 'foo'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] + * r = substring_index(in_s, '.', -2) + * r is ['nvidia.com', null, 'google.com', '', 'foo'] + * @endcode + * + * @param strings Strings instance for this operation. + * @param delimiter UTF-8 encoded string to search for in each string. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; else, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. + */ +std::unique_ptr substring_index( + strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/** + * @brief Returns a column of strings that searches the delimiter for each row from + * @p delimiter_strings @p count number of times in the source @p strings forward if @p count + * is positive or backwards if @p count is negative. If @p count is positive, it returns a + * substring from the start of the source @p strings up until @p count occurrence of the + * delimiter for that row not including that delimiter. If @p count is negative, it returns a + * substring from the start of the @p count occurrence of the delimiter for that row in the + * source @p strings past the delimiter until the end of the string. + * + * The search for @p delimiter_strings in @p strings is case sensitive. + * If the @p count is 0, every row in the output column will be null. 
+ * If the row value of @p strings is null, the row value in the output column will be null. + * If the row value from @p delimiter_strings is invalid or null, the row value in the + * output column will be null. + * If the row value from @p delimiter_strings or the column value for a row is empty, the + * row value in the output column will be empty. + * If @p count occurrences of delimiter isn't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo' ] + * delimiters = ['.', '..', '', null, '..'] + * r = substring_index(in_s, delimiters, 2) + * r is ['www.nvidia', null, '', null, 'foo..bar'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org' ] + * delimiters = ['.', '..', '', null, '..', '.'] + * r = substring_index(in_s, delimiters, -2) + * r is ['nvidia.com', null, '', null, '..goo', 'apache.org'] + * @endcode + * + * @throw cudf::logic_error if the number of rows in @p strings and @delimiter_strings do not match. + * + * @param strings Strings instance for this operation. + * @param delimiter_strings UTF-8 encoded string for each row. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; else, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. 
+ */ +std::unique_ptr substring_index( + strings_column_view const& strings, + strings_column_view const& delimiter_strings, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index c791f8f7ab2..589bb976e08 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include #include @@ -286,5 +288,156 @@ std::unique_ptr ends_with(strings_column_view const& strings, return detail::ends_with(strings, target, mr); } +// For substring_index APIs +namespace detail { +// Internal helper class +namespace { + +struct substring_index_functor { + template + std::unique_ptr operator()(ColItrT const col_itr, + DelimiterItrT const delim_itr, + size_type delimiter_count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream, + size_type strings_count) const + { + // Shallow copy of the resultant strings + rmm::device_vector out_col_strings(strings_count); + + // Invalid output column strings - null rows + string_view const invalid_str{nullptr, 0}; + + thrust::transform( + rmm::exec_policy(stream)->on(stream), + col_itr, + col_itr + strings_count, + delim_itr, + out_col_strings.data().get(), + [delimiter_count, invalid_str] __device__(auto col_val_pair, auto delim_val_pair) { + // If the column value for this row or the delimiter is null or if the delimiter count is 0, + // result is null + if (!col_val_pair.second || !delim_val_pair.second || delimiter_count == 0) + return invalid_str; + auto col_val = col_val_pair.first; + + // If the global delimiter or the row specific delimiter or if the column value for the row + // is empty, value is empty. 
+ if (delim_val_pair.first.empty() || col_val.empty()) return string_view{}; + + auto delim_val = delim_val_pair.first; + + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; + size_type start_pos = 0; + size_type end_pos = col_val_len; + string_view out_str{}; + + for (auto i = 0; i < nsearches; ++i) { + if (delimiter_count < 0) { + end_pos = col_val.rfind(delim_val, 0, end_pos); + if (end_pos == -1) { + out_str = col_val; + break; + } + if (i + 1 == nsearches) + out_str = + col_val.substr(end_pos + delimiter_len, col_val_len - end_pos - delimiter_len); + } else { + auto char_pos = col_val.find(delim_val, start_pos); + if (char_pos == -1) { + out_str = col_val; + break; + } + if (i + 1 == nsearches) + out_str = col_val.substr(0, char_pos); + else + start_pos = char_pos + delimiter_len; + } + } + + return out_str.empty() ? string_view{} : out_str; + }); + + // Create an output column with the resultant strings + return make_strings_column(out_col_strings, invalid_str, stream, mr); + } +}; + +} // namespace + +template +std::unique_ptr substring_index(strings_column_view const& strings, + DelimiterItrT const delimiter_itr, + size_type count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + // If there aren't any rows, return an empty strings column + if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + + // Create device view of the column + auto colview_ptr = column_device_view::create(strings.parent(), stream); + auto colview = *colview_ptr; + if (colview.nullable()) { + return substring_index_functor{}( + experimental::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + stream, + strings_count); + } else { + return substring_index_functor{}( + experimental::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + 
stream, + strings_count); + } +} + +} // namespace detail + +// external APIs + +std::unique_ptr substring_index(strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_index( + strings, experimental::detail::make_pair_iterator(delimiter), count, mr); +} + +std::unique_ptr substring_index(strings_column_view const& strings, + strings_column_view const& delimiters, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(strings.size() == delimiters.size(), + "Strings and delimiters column sizes do not match"); + + CUDF_FUNC_RANGE(); + auto delimiters_dev_view_ptr = cudf::column_device_view::create(delimiters.parent(), 0); + auto delimiters_dev_view = *delimiters_dev_view_ptr; + return (delimiters_dev_view.nullable()) + ? detail::substring_index( + strings, + experimental::detail::make_pair_iterator(delimiters_dev_view), + count, + mr) + : detail::substring_index( + strings, + experimental::detail::make_pair_iterator(delimiters_dev_view), + count, + mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 6a0d39757f0..ab29a06c1d0 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -215,3 +216,321 @@ TEST_P(FindParmsTest, Find) INSTANTIATE_TEST_CASE_P(StringsFindTest, FindParmsTest, testing::ValuesIn(std::array{0, 1, 2, 3})); + +struct StringsSubstringIndexWithScalarTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringIndexWithScalarTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("foo"), 1); + 
cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringIndexWithScalarTest, AllEmpty) +{ + auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("e"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, EmptyDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar(""), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, ZeroCount) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {false, false, false, false, false, false}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithScalarTest, SearchDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + { + auto exp_results = cudf::test::strings_column_wrapper({"H", 
"thes", "", "lease", "t", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto exp_results = cudf::test::strings_column_wrapper( + {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", 
"", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis...com", + "nvidia....com", + "google...........com", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); + + auto results = + cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis..com", + "nvidia....com", + "google...........com", + ".", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); + 
cudf::test::expect_columns_equal(*results, exp_results, true); + } +} + +struct StringsSubstringIndexWithColumnTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringIndexWithColumnTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::substring_index(strings_view, strings_view, 1); + // Check empty column + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringIndexWithColumnTest, GenerateExceptions) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); + + EXPECT_THROW(cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); +} + +TEST_F(StringsSubstringIndexWithColumnTest, ColumnAllEmpty) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, DelimiterAllEmptyAndInvalid) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, false, false, true, false}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, 
cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, ZeroDelimiterCount) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {false, false, false, false, false, false}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringIndexWithColumnTest, SearchDelimiter) +{ + { + auto col0 = cudf::test::strings_column_wrapper( + {"H™élloi ™◎oo™ff™", "thesé", "", "lease™", "tést strings", "™"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"™", "™", "", "e", "t", "™"}); + + auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ffstri.nffgs", + "ffff ™ ffff ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "ff ", "t", "ff ™"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, 
cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo", + "tffffh", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™", + "ffhffesé", + "", + "lff fooff ffff eaffse™", + "ff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::substring_index( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +} From 41e2ea9225fac3ac6e95000589fb52105b8ede7a Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Wed, 27 May 2020 16:16:30 +0000 Subject: [PATCH 3/9] - updates after 
upstream merge --- cpp/src/strings/find.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 589bb976e08..c6a17f70a9d 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -384,7 +384,7 @@ std::unique_ptr substring_index(strings_column_view const& strings, auto colview = *colview_ptr; if (colview.nullable()) { return substring_index_functor{}( - experimental::detail::make_pair_iterator(colview), + cudf::detail::make_pair_iterator(colview), delimiter_itr, count, mr, @@ -392,7 +392,7 @@ std::unique_ptr substring_index(strings_column_view const& strings, strings_count); } else { return substring_index_functor{}( - experimental::detail::make_pair_iterator(colview), + cudf::detail::make_pair_iterator(colview), delimiter_itr, count, mr, @@ -412,7 +412,7 @@ std::unique_ptr substring_index(strings_column_view const& strings, { CUDF_FUNC_RANGE(); return detail::substring_index( - strings, experimental::detail::make_pair_iterator(delimiter), count, mr); + strings, cudf::detail::make_pair_iterator(delimiter), count, mr); } std::unique_ptr substring_index(strings_column_view const& strings, @@ -429,12 +429,12 @@ std::unique_ptr substring_index(strings_column_view const& strings, return (delimiters_dev_view.nullable()) ? 
detail::substring_index( strings, - experimental::detail::make_pair_iterator(delimiters_dev_view), + cudf::detail::make_pair_iterator(delimiters_dev_view), count, mr) : detail::substring_index( strings, - experimental::detail::make_pair_iterator(delimiters_dev_view), + cudf::detail::make_pair_iterator(delimiters_dev_view), count, mr); } From fe8d3ea8ee5a2e05437288f91cccf2b5f5a763df Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Wed, 27 May 2020 16:17:50 +0000 Subject: [PATCH 4/9] - fix code style --- cpp/src/strings/find.cu | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index c6a17f70a9d..a55a179daf8 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -383,21 +383,19 @@ std::unique_ptr substring_index(strings_column_view const& strings, auto colview_ptr = column_device_view::create(strings.parent(), stream); auto colview = *colview_ptr; if (colview.nullable()) { - return substring_index_functor{}( - cudf::detail::make_pair_iterator(colview), - delimiter_itr, - count, - mr, - stream, - strings_count); + return substring_index_functor{}(cudf::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + stream, + strings_count); } else { - return substring_index_functor{}( - cudf::detail::make_pair_iterator(colview), - delimiter_itr, - count, - mr, - stream, - strings_count); + return substring_index_functor{}(cudf::detail::make_pair_iterator(colview), + delimiter_itr, + count, + mr, + stream, + strings_count); } } From 3554406faee719ceec12fcfb327f03daa2f5bae4 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Wed, 27 May 2020 22:28:31 +0000 Subject: [PATCH 5/9] - add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79f18ee1b27..15f15f4627e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ - PR #5203 Add Java bindings for is_integer and is_float - 
PR #5205 Add ci test for libcudf, libnvstrings headers existence check in meta.yml - PR #5293 Add Java bindings for replace_with_backrefs +- PR #5303 Add substring_index functionality for strings ## Improvements From e9c5e6ae10cc48b1ba03ba7aa33387a8d004dde5 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Fri, 29 May 2020 18:37:48 +0000 Subject: [PATCH 6/9] - rename method to `slice_strings` - reuse some of the facility `slice_strings` already has to build the substrings --- CHANGELOG.md | 2 +- cpp/include/cudf/strings/find.hpp | 88 ------- cpp/include/cudf/strings/substring.hpp | 87 +++++++ cpp/src/strings/find.cu | 151 ------------ cpp/src/strings/substring.cu | 220 ++++++++++++++--- cpp/tests/strings/find_tests.cpp | 319 ------------------------- cpp/tests/strings/substring_tests.cpp | 317 ++++++++++++++++++++++++ 7 files changed, 587 insertions(+), 597 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9288e2018c6..cb7be405063 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - PR #5222 Adding clip feature support to DataFrame and Series +- PR #5303 Add substring_index functionality for strings ## Improvements @@ -56,7 +57,6 @@ - PR #5203 Add Java bindings for is_integer and is_float - PR #5205 Add ci test for libcudf, libnvstrings headers existence check in meta.yml - PR #5293 Add Java bindings for replace_with_backrefs -- PR #5303 Add substring_index functionality for strings ## Improvements diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index a9b27b7cfde..ab6afc82094 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -141,94 +141,6 @@ std::unique_ptr ends_with( string_scalar const& target, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); -/** - * @brief Returns a column of strings that searches for the @p delimiter @p count number of - * times in the source @p strings forward if @p count is positive or backwards if @p 
count is - * negative. If @p count is positive, it returns a substring from the start of the source @p - * strings up until @p count occurrence of the @delimiter not including the @p delimiter. - * If @p count is negative, it returns a substring from the start of the @p count occurrence of - * the @delimiter in the source @p strings past the delimiter until the end of the string. - * - * The search for @delimiter in @p strings is case sensitive. - * If the @p count is 0, every row in the output column will be null. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the @p delimiter is invalid or null, every row in the output column will be null. - * If the @p delimiter or the column value for a row is empty, the row value in the output - * column will be empty. - * If @p count occurrences of @p delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. - * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] - * r = substring_index(in_s, '.', 1) - * r is ['www', null, 'www', '', 'foo'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo' ] - * r = substring_index(in_s, '.', -2) - * r is ['nvidia.com', null, 'google.com', '', 'foo'] - * @endcode - * - * @param strings Strings instance for this operation. - * @param delimiter UTF-8 encoded string to search for in each string. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * forward search of delimiter is performed; else, a backward search is performed. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. 
- */ -std::unique_ptr substring_index( - strings_column_view const& strings, - string_scalar const& delimiter, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); - -/** - * @brief Returns a column of strings that searches the delimiter for each row from - * @p delimiter_strings @p count number of times in the source @p strings forward if @p count - * is positive or backwards if @p count is negative. If @p count is positive, it returns a - * substring from the start of the source @p strings up until @p count occurrence of the - * delimiter for that row not including that delimiter. If @p count is negative, it returns a - * substring from the start of the @p count occurrence of the delimiter for that row in the - * source @p strings past the delimiter until the end of the string. - * - * The search for @p delimiter_strings in @p strings is case sensitive. - * If the @p count is 0, every row in the output column will be null. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the row value from @p delimiter_strings is invalid or null, the row value in the - * output column will be null. - * If the row value from @p delimiter_strings or the column value for a row is empty, the - * row value in the output column will be empty. - * If @p count occurrences of delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. 
- * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo' ] - * delimiters = ['.', '..', '', null, '..'] - * r = substring_index(in_s, delimiters, 2) - * r is ['www.nvidia', null, '', null, 'foo..bar'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org' ] - * delimiters = ['.', '..', '', null, '..', '.'] - * r = substring_index(in_s, delimiters, -2) - * r is ['nvidia.com', null, '', null, '..goo', 'apache.org'] - * @endcode - * - * @throw cudf::logic_error if the number of rows in @p strings and @delimiter_strings do not match. - * - * @param strings Strings instance for this operation. - * @param delimiter_strings UTF-8 encoded string for each row. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * forward search of delimiter is performed; else, a backward search is performed. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. - */ -std::unique_ptr substring_index( - strings_column_view const& strings, - strings_column_view const& delimiter_strings, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index d438fd4cc29..db27154f7b7 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -104,6 +104,93 @@ std::unique_ptr slice_strings( column_view const& stops, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Returns a column of strings after searching for @p delimiter @p count number of + * times in the source @p strings forward if @p count is positive or backwards if @p count is + * negative. 
If @p count is positive, it returns a substring from the start of the source @p + * strings up until @p count occurrence of the @delimiter not including the @p delimiter. + * If @p count is negative, it returns a substring from the start of the @p count occurrence of + * the @delimiter in the source @p strings past the delimiter until the end of the string. + * + * The search for @delimiter in @p strings is case sensitive. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the @p count is 0 or if @p delimiter is invalid, output column will be an empty string. + * If the @p delimiter or the column value for a row is empty, the row value in the output + * column will be empty. + * If @p count occurrences of @p delimiter isn't found, the row value in the output column will + * be the row value from the input @p strings column. + * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] + * r = slice_strings(in_s, '.', 1) + * r = ['www', null, 'www', '', 'foo'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] + * r = slice_strings(in_s, '.', -2) + * r = ['nvidia.com', null, 'google.com', '', 'foo'] + * @endcode + * + * @param strings Strings instance for this operation. + * @param delimiter UTF-8 encoded string to search for in each string. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; else, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. 
+ */ +std::unique_ptr slice_strings( + strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/** + * @brief Returns a column of strings after searching the delimiter defined per row from + * @p delimiter_strings @p count number of times in the source @p strings forward if @p count + * is positive or backwards if @p count is negative. If @p count is positive, it returns a + * substring from the start of the source @p strings up until @p count occurrence of the + * delimiter for that row not including that delimiter. If @p count is negative, it returns a + * substring from the start of the @p count occurrence of the delimiter for that row in the + * source @p strings past the delimiter until the end of the string. + * + * The search for @p delimiter_strings in @p strings is case sensitive. + * If the @p count is 0, every row in the output column will be an empty string. + * If the row value of @p strings is null, the row value in the output column will be null. + * If the row value from @p delimiter_strings is invalid or null, the row value in the + * output column will an empty string. + * If the row value from @p delimiter_strings or the column value for a row is empty, the + * row value in the output column will be empty. + * If @p count occurrences of delimiter isn't found, the row value in the output column will + * be the row value from the input @p strings column. 
+ * + * @code{.pseudo} + * Example: + * in_s = ['www.nvidia.com', null, 'www.google.com', 'bar', 'foo..bar....goo'] + * delimiters = ['.', '..', '', null, '..'] + * r = slice_strings(in_s, delimiters, 2) + * r = ['www.nvidia', null, '', '', 'foo..bar'] + * + * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org'] + * delimiters = ['.', '..', '', null,'..', '.'] + * r = slice_strings(in_s, delimiters, -2) + * r = ['nvidia.com', null, '', '', '..goo', 'apache.org'] + * @endcode + * + * @throw cudf::logic_error if the number of rows in @p strings and @delimiter_strings do not match. + * + * @param strings Strings instance for this operation. + * @param delimiter_strings UTF-8 encoded string for each row. + * @param count Number of times to search for delimiter in each string. If the value is positive, + * forward search of delimiter is performed; else, a backward search is performed. + * @param mr Resource for allocating device memory. + * @return New strings column containing the substrings. 
+ */ +std::unique_ptr slice_strings( + strings_column_view const& strings, + strings_column_view const& delimiter_strings, + size_type count, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index a55a179daf8..c791f8f7ab2 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -16,9 +16,7 @@ #include #include -#include #include -#include #include #include #include @@ -288,154 +286,5 @@ std::unique_ptr ends_with(strings_column_view const& strings, return detail::ends_with(strings, target, mr); } -// For substring_index APIs -namespace detail { -// Internal helper class -namespace { - -struct substring_index_functor { - template - std::unique_ptr operator()(ColItrT const col_itr, - DelimiterItrT const delim_itr, - size_type delimiter_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream, - size_type strings_count) const - { - // Shallow copy of the resultant strings - rmm::device_vector out_col_strings(strings_count); - - // Invalid output column strings - null rows - string_view const invalid_str{nullptr, 0}; - - thrust::transform( - rmm::exec_policy(stream)->on(stream), - col_itr, - col_itr + strings_count, - delim_itr, - out_col_strings.data().get(), - [delimiter_count, invalid_str] __device__(auto col_val_pair, auto delim_val_pair) { - // If the column value for this row or the delimiter is null or if the delimiter count is 0, - // result is null - if (!col_val_pair.second || !delim_val_pair.second || delimiter_count == 0) - return invalid_str; - auto col_val = col_val_pair.first; - - // If the global delimiter or the row specific delimiter or if the column value for the row - // is empty, value is empty. 
- if (delim_val_pair.first.empty() || col_val.empty()) return string_view{}; - - auto delim_val = delim_val_pair.first; - - auto const col_val_len = col_val.length(); - auto const delimiter_len = delim_val.length(); - - auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; - size_type start_pos = 0; - size_type end_pos = col_val_len; - string_view out_str{}; - - for (auto i = 0; i < nsearches; ++i) { - if (delimiter_count < 0) { - end_pos = col_val.rfind(delim_val, 0, end_pos); - if (end_pos == -1) { - out_str = col_val; - break; - } - if (i + 1 == nsearches) - out_str = - col_val.substr(end_pos + delimiter_len, col_val_len - end_pos - delimiter_len); - } else { - auto char_pos = col_val.find(delim_val, start_pos); - if (char_pos == -1) { - out_str = col_val; - break; - } - if (i + 1 == nsearches) - out_str = col_val.substr(0, char_pos); - else - start_pos = char_pos + delimiter_len; - } - } - - return out_str.empty() ? string_view{} : out_str; - }); - - // Create an output column with the resultant strings - return make_strings_column(out_col_strings, invalid_str, stream, mr); - } -}; - -} // namespace - -template -std::unique_ptr substring_index(strings_column_view const& strings, - DelimiterItrT const delimiter_itr, - size_type count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) -{ - auto strings_count = strings.size(); - // If there aren't any rows, return an empty strings column - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); - - // Create device view of the column - auto colview_ptr = column_device_view::create(strings.parent(), stream); - auto colview = *colview_ptr; - if (colview.nullable()) { - return substring_index_functor{}(cudf::detail::make_pair_iterator(colview), - delimiter_itr, - count, - mr, - stream, - strings_count); - } else { - return substring_index_functor{}(cudf::detail::make_pair_iterator(colview), - delimiter_itr, - count, - mr, - stream, - 
strings_count); - } -} - -} // namespace detail - -// external APIs - -std::unique_ptr substring_index(strings_column_view const& strings, - string_scalar const& delimiter, - size_type count, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::substring_index( - strings, cudf::detail::make_pair_iterator(delimiter), count, mr); -} - -std::unique_ptr substring_index(strings_column_view const& strings, - strings_column_view const& delimiters, - size_type count, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(strings.size() == delimiters.size(), - "Strings and delimiters column sizes do not match"); - - CUDF_FUNC_RANGE(); - auto delimiters_dev_view_ptr = cudf::column_device_view::create(delimiters.parent(), 0); - auto delimiters_dev_view = *delimiters_dev_view_ptr; - return (delimiters_dev_view.nullable()) - ? detail::substring_index( - strings, - cudf::detail::make_pair_iterator(delimiters_dev_view), - count, - mr) - : detail::substring_index( - strings, - cudf::detail::make_pair_iterator(delimiters_dev_view), - count, - mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index c4fa4a39297..13b7a965953 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -198,37 +199,28 @@ struct substring_from_fn { } }; -/** - * Called by the type-dispatcher for resolving the position columns - * (starts_column and stops_column) to actual types. - */ struct dispatch_substring_from_fn { /** * @brief Returns strings column with substrings based on the ranges in the * individual starts and stops column position values. 
*/ - template ::value>* = nullptr> - std::unique_ptr operator()(strings_column_view const& strings, - column_view const& starts_column, - column_view const& stops_column, + template + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + PositionType const* starts, + PositionType const* stops, rmm::mr::device_memory_resource* mr, cudaStream_t stream) const { - const PositionType* starts = starts_column.data(); - const PositionType* stops = stops_column.data(); + auto strings_count = d_column.size(); - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // copy the null mask + // Copy the null mask rmm::device_buffer null_mask; - size_type null_count = strings.null_count(); if (d_column.nullable()) null_mask = rmm::device_buffer( d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - // build offsets column + + // Build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); @@ -237,7 +229,7 @@ struct dispatch_substring_from_fn { auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); - // build chars column + // Build chars column cudf::size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = cudf::strings::detail::create_chars_child_column( strings_count, null_count, bytes, mr, stream); @@ -248,7 +240,7 @@ struct dispatch_substring_from_fn { thrust::make_counting_iterator(0), strings_count, substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -257,30 +249,114 @@ struct dispatch_substring_from_fn { stream, mr); } - // +}; + +// This functor is invoked to compute the substrings using start and end 
positional indices. +// The type used to store the indices is inferred by invoking this functor through the +// type dispatcher. +struct compute_substrings { + template ::value>* = nullptr> + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + column_view const& starts_column, + column_view const& stops_column, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const + { + return dispatch_substring_from_fn{}(d_column, + null_count, + starts_column.data(), + stops_column.data(), + mr, + stream); + } + template ::value>* = nullptr> - std::unique_ptr operator()(strings_column_view const&, - column_view const&, - column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + std::unique_ptr operator()(column_device_view const& d_column, + size_type null_count, + column_view const& starts_column, + column_view const& stops_column, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const { CUDF_FAIL("Positions values must be an integral type."); } }; -template <> -std::unique_ptr dispatch_substring_from_fn::operator()( - strings_column_view const&, - column_view const&, - column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const -{ - CUDF_FAIL("Positions values must not be bool type."); -} +// When slice_strings is invoked with a delimiter string and a delimiter count, we need to +// compute the start and end indices of the substring. This functor accomplishes that. 
+struct compute_substring_indices { + template + void operator()(column_device_view const& d_column, + DelimiterItrT const delim_itr, + size_type delimiter_count, + size_type* start_char_pos, + size_type* end_char_pos, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) const + { + auto strings_count = d_column.size(); + + thrust::for_each_n( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + strings_count, + [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__( + size_type idx) { + // If the column value for this row is null, result is null. + // If the delimiter count is 0, result is empty string. + if (d_column.is_null(idx) || !delimiter_count) { return; } + auto const& delim_val_pair = delim_itr[idx]; + auto const& col_val = d_column.element(idx); + + // If the global delimiter or the row specific delimiter is invalid or if it is empty, row + // value is empty. + // If the column value for the row is empty, the row value is empty. + if (!delim_val_pair.second || delim_val_pair.first.empty() || col_val.empty()) { return; } + + auto const& delim_val = delim_val_pair.first; + + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? 
-delimiter_count : delimiter_count; + size_type start_pos = 0; + size_type end_pos = col_val_len; + + for (auto i = 0; i < nsearches; ++i) { + if (delimiter_count < 0) { + end_pos = col_val.rfind(delim_val, 0, end_pos); + if (end_pos == -1) { + start_char_pos[idx] = 0; + end_char_pos[idx] = col_val_len; + return; + } + if (i + 1 == nsearches) { + start_char_pos[idx] = end_pos + delimiter_len; + end_char_pos[idx] = col_val_len; + return; + } + } else { + auto char_pos = col_val.find(delim_val, start_pos); + if (char_pos == -1) { + start_char_pos[idx] = 0; + end_char_pos[idx] = col_val_len; + return; + } + if (i + 1 == nsearches) { + start_char_pos[idx] = 0; + end_char_pos[idx] = char_pos; + return; + } else + start_pos = char_pos + delimiter_len; + } + } + }); + } +}; } // namespace // @@ -301,17 +377,51 @@ std::unique_ptr slice_strings( "Parameters starts and stops must be of the same type."); CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls."); CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls."); + CUDF_EXPECTS(starts_column.type().id() != data_type{BOOL8}.id(), + "Positions values must not be bool type."); + CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be an integral type."); + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; // perhaps another candidate for index-normalizer return cudf::type_dispatcher(starts_column.type(), - dispatch_substring_from_fn{}, - strings, + compute_substrings{}, + d_column, + strings.null_count(), starts_column, stops_column, mr, stream); } +template +std::unique_ptr slice_strings(strings_column_view const& strings, + DelimiterItrT const delimiter_itr, + size_type count, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream = 0) +{ + auto strings_count = strings.size(); + // If there aren't any rows, return an empty strings column + if (strings_count == 0) return 
strings::detail::make_empty_strings_column(mr, stream); + + // Compute the substring indices first + rmm::device_vector start_char_pos_vec(strings_count, 0); + rmm::device_vector end_char_pos_vec(strings_count, 0); + auto* start_char_pos = start_char_pos_vec.data().get(); + auto* end_char_pos = end_char_pos_vec.data().get(); + + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // Compute the substring indices first + compute_substring_indices{}( + d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + + // Extract the substrings using the indices next + return dispatch_substring_from_fn{}( + d_column, strings.null_count(), start_char_pos, end_char_pos, mr, stream); +} + } // namespace detail // external API @@ -325,5 +435,39 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return detail::slice_strings(strings, starts_column, stops_column, mr); } +std::unique_ptr slice_strings(strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::slice_strings( + strings, cudf::detail::make_pair_iterator(delimiter), count, mr, nullptr); +} + +std::unique_ptr slice_strings(strings_column_view const& strings, + strings_column_view const& delimiters, + size_type count, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(strings.size() == delimiters.size(), + "Strings and delimiters column sizes do not match"); + + CUDF_FUNC_RANGE(); + auto delimiters_dev_view_ptr = cudf::column_device_view::create(delimiters.parent(), 0); + auto delimiters_dev_view = *delimiters_dev_view_ptr; + return (delimiters_dev_view.nullable()) + ? 
detail::slice_strings( + strings, + cudf::detail::make_pair_iterator(delimiters_dev_view), + count, + mr) + : detail::slice_strings( + strings, + cudf::detail::make_pair_iterator(delimiters_dev_view), + count, + mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index ab29a06c1d0..6a0d39757f0 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -216,321 +215,3 @@ TEST_P(FindParmsTest, Find) INSTANTIATE_TEST_CASE_P(StringsFindTest, FindParmsTest, testing::ValuesIn(std::array{0, 1, 2, 3})); - -struct StringsSubstringIndexWithScalarTest : public cudf::test::BaseFixture { -}; - -TEST_F(StringsSubstringIndexWithScalarTest, ZeroSizeStringsColumn) -{ - cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); - - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("foo"), 1); - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsSubstringIndexWithScalarTest, AllEmpty) -{ - auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("e"), -1); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithScalarTest, EmptyDelimiter) -{ - auto strings_col = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - ; - auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - auto 
results = cudf::strings::substring_index(strings_view, cudf::string_scalar(""), 1); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithScalarTest, ZeroCount) -{ - auto strings_col = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - ; - auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {false, false, false, false, false, false}); - - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 0); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithScalarTest, SearchDelimiter) -{ - auto strings_col = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - ; - auto strings_view = cudf::strings_column_view(strings_col); - - { - auto exp_results = cudf::test::strings_column_wrapper({"H", "thes", "", "lease", "t", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 1); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto exp_results = cudf::test::strings_column_wrapper( - {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -1); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), 2); - cudf::test::expect_columns_equal(*results, strings_view.parent(), true); - } - - { - auto results = cudf::strings::substring_index(strings_view, cudf::string_scalar("é"), -2); - cudf::test::expect_columns_equal(*results, strings_view.parent(), true); - } - - { - auto col0 
= cudf::test::strings_column_wrapper( - {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""}, - {true, true, false, true, true, true}); - - auto results = - cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper( - {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""}, - {true, true, false, true, true, true}); - - auto results = - cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper( - {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""}, - {true, true, false, true, true, true}); - - auto results = - cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper( - {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 
-3); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", - "www.apache..org", - "tennis...com", - "nvidia....com", - "google...........com", - "microsoft...c.....co..m"}); - - auto exp_results = cudf::test::strings_column_wrapper( - {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); - - auto results = - cudf::strings::substring_index(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", - "www.apache..org", - "tennis..com", - "nvidia....com", - "google...........com", - ".", - "microsoft...c.....co..m"}); - - auto exp_results = cudf::test::strings_column_wrapper( - {"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); - cudf::test::expect_columns_equal(*results, exp_results, true); - } -} - -struct StringsSubstringIndexWithColumnTest : public cudf::test::BaseFixture { -}; - -TEST_F(StringsSubstringIndexWithColumnTest, ZeroSizeStringsColumn) -{ - cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); - - auto results = cudf::strings::substring_index(strings_view, strings_view, 1); - // Check empty column - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsSubstringIndexWithColumnTest, GenerateExceptions) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); - - EXPECT_THROW(cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), - cudf::logic_error); -} - 
-TEST_F(StringsSubstringIndexWithColumnTest, ColumnAllEmpty) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithColumnTest, DelimiterAllEmptyAndInvalid) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, false, false, true, false}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithColumnTest, ZeroDelimiterCount) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {false, false, false, false, false, false}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); - cudf::test::expect_columns_equal(*results, exp_results, true); -} - -TEST_F(StringsSubstringIndexWithColumnTest, SearchDelimiter) -{ - { - auto col0 = cudf::test::strings_column_wrapper( - {"H™élloi ™◎oo™ff™", "thesé", "", 
"lease™", "tést strings", "™"}, - {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"™", "™", "", "e", "t", "™"}); - - auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™", - "tffffhffesé", - "", - "lff fooff ffff eaffse™", - "tést ffstri.nffgs", - "ffff ™ ffff ff"}, - {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "ff ", "t", "ff ™"}); - - auto exp_results = cudf::test::strings_column_wrapper( - {"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", - "tffffhffesé", - "", - "lff fooff ffff eaffse™", - "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", - "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, - {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo", - "tffffh", - "", - "lff fooff ffff eaffse™", - "tést ff™ffff™ff™ffffstri.", - "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); - 
cudf::test::expect_columns_equal(*results, exp_results, true); - } - - { - auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", - "tffffhffesé", - "", - "lff fooff ffff eaffse™", - "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", - "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); - auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, - {true, true, false, true, true, true}); - - auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™", - "ffhffesé", - "", - "lff fooff ffff eaffse™", - "ff™ff™ffffstri.ff™ffff™nffgs", - "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::substring_index( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); - cudf::test::expect_columns_equal(*results, exp_results, true); - } -} diff --git a/cpp/tests/strings/substring_tests.cpp b/cpp/tests/strings/substring_tests.cpp index 99b4df39a87..5de87592033 100644 --- a/cpp/tests/strings/substring_tests.cpp +++ b/cpp/tests/strings/substring_tests.cpp @@ -270,3 +270,320 @@ TEST_F(StringsSubstringsTest, Error) auto strings_column = cudf::strings_column_view(strings); EXPECT_THROW(cudf::strings::slice_strings(strings_column, 0, 0, 0), cudf::logic_error); } + +struct StringsSubstringsScalarDelimiterTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringsScalarDelimiterTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, AllEmpty) +{ + auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = 
cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) +{ + auto strings_col = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + ; + auto strings_view = cudf::strings_column_view(strings_col); + + { + auto exp_results = cudf::test::strings_column_wrapper({"H", "thes", "", "lease", "t", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto exp_results = 
cudf::test::strings_column_wrapper( + {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2); + cudf::test::expect_columns_equal(*results, strings_view.parent(), true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""}, + {true, true, false, true, true, true}); + + 
auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""}, + {true, true, false, true, true, true}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis...com", + "nvidia....com", + "google...........com", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis..com", + "nvidia....com", + "google...........com", + ".", + "microsoft...c.....co..m"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"}); + + auto results = + cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +} + +struct StringsSubstringsColumnDelimiterTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsSubstringsColumnDelimiterTest, ZeroSizeStringsColumn) +{ + cudf::column_view col0(cudf::data_type{cudf::STRING}, 0, nullptr, 
nullptr, 0); + auto strings_view = cudf::strings_column_view(col0); + + auto results = cudf::strings::slice_strings(strings_view, strings_view, 1); + // Check empty column + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, GenerateExceptions) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); + + EXPECT_THROW(cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, ColumnAllEmpty) +{ + auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, DelimiterAllEmptyAndInvalid) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, ZeroDelimiterCount) +{ + auto col0 = cudf::test::strings_column_wrapper( + {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); + auto 
delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); + cudf::test::expect_columns_equal(*results, exp_results, true); +} + +TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) +{ + { + auto col0 = cudf::test::strings_column_wrapper( + {"H™élloi ™◎oo™ff™", "thesé", "", "lease™", "tést strings", "™"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"™", "™", "", "e", "t", "™"}); + + auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ffstri.nffgs", + "ffff ™ ffff ff"}, + {true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "ff ", "t", "ff ™"}); + + auto exp_results = cudf::test::strings_column_wrapper( + {"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + 
{true, true, false, true, true, true}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo", + "tffffh", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}, + {true, true, false, true, true, true}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } + + { + auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™", + "tffffhffesé", + "", + "lff fooff ffff eaffse™", + "tést ff™ffff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + auto delim_col = cudf::test::strings_column_wrapper({"ff™", "ff", "", "e ", "ff™ff", "ff™ff™"}, + {true, true, false, true, true, true}); + + auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™", + "ffhffesé", + "", + "lff fooff ffff eaffse™", + "ff™ff™ffffstri.ff™ffff™nffgs", + "ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"}); + + auto results = cudf::strings::slice_strings( + cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); + cudf::test::expect_columns_equal(*results, exp_results, true); + } +} From 81d8b6229d93acfb077adb2e449fc85ae68183a4 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Fri, 29 May 2020 19:48:35 +0000 Subject: [PATCH 7/9] - minor cleanup to substring index computing functor --- cpp/src/strings/substring.cu | 82 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 13b7a965953..7388ae94dee 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -305,53 +305,51 @@ struct compute_substring_indices { 
strings_count, [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__( size_type idx) { - // If the column value for this row is null, result is null. - // If the delimiter count is 0, result is empty string. - if (d_column.is_null(idx) || !delimiter_count) { return; } - auto const& delim_val_pair = delim_itr[idx]; - auto const& col_val = d_column.element(idx); + auto const& delim_val = delim_val_pair.first; // Don't use it yet + // If the column value for this row is null, result is null. + // If the delimiter count is 0, result is empty string. // If the global delimiter or the row specific delimiter is invalid or if it is empty, row // value is empty. - // If the column value for the row is empty, the row value is empty. - if (!delim_val_pair.second || delim_val_pair.first.empty() || col_val.empty()) { return; } - - auto const& delim_val = delim_val_pair.first; - - auto const col_val_len = col_val.length(); - auto const delimiter_len = delim_val.length(); - - auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; - size_type start_pos = 0; - size_type end_pos = col_val_len; - - for (auto i = 0; i < nsearches; ++i) { - if (delimiter_count < 0) { - end_pos = col_val.rfind(delim_val, 0, end_pos); - if (end_pos == -1) { - start_char_pos[idx] = 0; - end_char_pos[idx] = col_val_len; - return; - } - if (i + 1 == nsearches) { - start_char_pos[idx] = end_pos + delimiter_len; - end_char_pos[idx] = col_val_len; - return; - } - } else { - auto char_pos = col_val.find(delim_val, start_pos); - if (char_pos == -1) { - start_char_pos[idx] = 0; - end_char_pos[idx] = col_val_len; - return; + if (!d_column.is_null(idx) && delimiter_count && delim_val_pair.second && + !delim_val.empty()) { + auto const& col_val = d_column.element(idx); + + // If the column value for the row is empty, the row value is empty. 
+ if (!col_val.empty()) { + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; + size_type start_pos = 0; + size_type end_pos = col_val_len; + bool keep_searching = true; + + for (auto i = 0; keep_searching && i < nsearches; ++i) { + if (delimiter_count < 0) { + end_pos = col_val.rfind(delim_val, 0, end_pos); + if (end_pos == -1) { + start_char_pos[idx] = 0; + end_char_pos[idx] = col_val_len; + keep_searching = false; + } else if (i + 1 == nsearches) { + start_char_pos[idx] = end_pos + delimiter_len; + end_char_pos[idx] = col_val_len; + } + } else { + auto char_pos = col_val.find(delim_val, start_pos); + if (char_pos == -1) { + start_char_pos[idx] = 0; + end_char_pos[idx] = col_val_len; + keep_searching = false; + } else if (i + 1 == nsearches) { + start_char_pos[idx] = 0; + end_char_pos[idx] = char_pos; + } else + start_pos = char_pos + delimiter_len; + } } - if (i + 1 == nsearches) { - start_char_pos[idx] = 0; - end_char_pos[idx] = char_pos; - return; - } else - start_pos = char_pos + delimiter_len; } } }); From b02244bb54c6e4547dab19aef08b12a500b52044 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Mon, 1 Jun 2020 14:44:26 +0000 Subject: [PATCH 8/9] - incorporate review comments --- CHANGELOG.md | 2 +- cpp/include/cudf/strings/substring.hpp | 8 ++++++-- cpp/src/strings/substring.cu | 21 ++++++++++++--------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 160481f08f1..e07b6e9ae46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## New Features - PR #5222 Adding clip feature support to DataFrame and Series -- PR #5303 Add substring_index functionality for strings +- PR #5303 Add slice_strings functionality using delimiter string ## Improvements - PR #5245 Add column reduction benchmark diff --git a/cpp/include/cudf/strings/substring.hpp 
b/cpp/include/cudf/strings/substring.hpp index 7e3fcda5d6f..2139bb63af5 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -105,7 +105,9 @@ std::unique_ptr slice_strings( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Returns a column of strings after searching for @p delimiter @p count number of + * @brief Slices a column of strings by using a delimiter as a slice point. + * + * Returns a column of strings after searching for @p delimiter @p count number of * times in the source @p strings forward if @p count is positive or backwards if @p count is * negative. If @p count is positive, it returns a substring from the start of the source @p * strings up until @p count occurrence of the @delimiter not including the @p delimiter. @@ -145,7 +147,9 @@ std::unique_ptr slice_strings( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Returns a column of strings after searching the delimiter defined per row from + * @brief Slices a column of strings by using a delimiter column as slice points. + * + * Returns a column of strings after searching the delimiter defined per row from * @p delimiter_strings @p count number of times in the source @p strings forward if @p count * is positive or backwards if @p count is negative. If @p count is positive, it returns a * substring from the start of the source @p strings up until @p count occurrence of the diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 7388ae94dee..cae52115a65 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -199,7 +199,7 @@ struct substring_from_fn { } }; -struct dispatch_substring_from_fn { +struct compute_substrings_from_fn { /** * @brief Returns strings column with substrings based on the ranges in the * individual starts and stops column position values. 
@@ -264,7 +264,7 @@ struct compute_substrings { rmm::mr::device_memory_resource* mr, cudaStream_t stream) const { - return dispatch_substring_from_fn{}(d_column, + return compute_substrings_from_fn{}(d_column, null_count, starts_column.data(), stops_column.data(), @@ -312,8 +312,7 @@ struct compute_substring_indices { // If the delimiter count is 0, result is empty string. // If the global delimiter or the row specific delimiter is invalid or if it is empty, row // value is empty. - if (!d_column.is_null(idx) && delimiter_count && delim_val_pair.second && - !delim_val.empty()) { + if (!d_column.is_null(idx) && delim_val_pair.second && !delim_val.empty()) { auto const& col_val = d_column.element(idx); // If the column value for the row is empty, the row value is empty. @@ -377,7 +376,7 @@ std::unique_ptr slice_strings( CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls."); CUDF_EXPECTS(starts_column.type().id() != data_type{BOOL8}.id(), "Positions values must not be bool type."); - CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be an integral type."); + CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type."); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -411,12 +410,16 @@ std::unique_ptr slice_strings(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - // Compute the substring indices first - compute_substring_indices{}( - d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + + // If delimiter count is 0, the output column will contain empty strings + if (count) { + // Compute the substring indices first + compute_substring_indices{}( + d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + } // Extract the substrings using the indices next - return 
dispatch_substring_from_fn{}( + return compute_substrings_from_fn{}( + d_column, strings.null_count(), start_char_pos, end_char_pos, mr, stream); } From 9d2fc8bf70d1d5b86d737f8decfe85737dbd9c70 Mon Sep 17 00:00:00 2001 From: Sriram Chandramouli Date: Tue, 2 Jun 2020 01:37:15 +0000 Subject: [PATCH 9/9] - incorporate review comments --- cpp/include/cudf/strings/substring.hpp | 35 ++++++------- cpp/src/strings/substring.cu | 68 ++++++++++++-------------- 2 files changed, 48 insertions(+), 55 deletions(-) diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 2139bb63af5..9e21905f3db 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -108,17 +108,18 @@ std::unique_ptr slice_strings( * @brief Slices a column of strings by using a delimiter as a slice point. * * Returns a column of strings after searching for @p delimiter @p count number of - * times in the source @p strings forward if @p count is positive or backwards if @p count is - * negative. If @p count is positive, it returns a substring from the start of the source @p - * strings up until @p count occurrence of the @delimiter not including the @p delimiter. - * If @p count is negative, it returns a substring from the start of the @p count occurrence of - * the @delimiter in the source @p strings past the delimiter until the end of the string. + * times in the source @p strings from left to right if @p count is positive or from + * right to left if @p count is negative. If @p count is positive, it returns a substring + * from the start of the source @p strings up until @p count occurrence of the @p delimiter + * not including the @p delimiter. If @p count is negative, it returns a substring from + * the start of the @p count occurrence of the @p delimiter in the source @p strings past + * the delimiter until the end of the string. * * The search for @delimiter in @p strings is case sensitive.
* If the row value of @p strings is null, the row value in the output column will be null. - * If the @p count is 0 or if @p delimiter is invalid, output column will be an empty string. - * If the @p delimiter or the column value for a row is empty, the row value in the output - * column will be empty. + * If the @p count is 0 or if @p delimiter is invalid or empty, every row in the output column + * will be an empty string. + * If the column value for a row is empty, the row value in the output column will be empty. * If @p count occurrences of @p delimiter isn't found, the row value in the output column will * be the row value from the input @p strings column. * @@ -136,7 +137,7 @@ std::unique_ptr slice_strings( * @param strings Strings instance for this operation. * @param delimiter UTF-8 encoded string to search for in each string. * @param count Number of times to search for delimiter in each string. If the value is positive, - * forward search of delimiter is performed; else, a backward search is performed. + * delimiter is searched from left to right; else, it is searched from right to left. * @param mr Resource for allocating device memory. * @return New strings column containing the substrings. */ @@ -150,18 +151,18 @@ std::unique_ptr slice_strings( * @brief Slices a column of strings by using a delimiter column as slice points. * * Returns a column of strings after searching the delimiter defined per row from - * @p delimiter_strings @p count number of times in the source @p strings forward if @p count - * is positive or backwards if @p count is negative. If @p count is positive, it returns a - * substring from the start of the source @p strings up until @p count occurrence of the - * delimiter for that row not including that delimiter. If @p count is negative, it returns a - * substring from the start of the @p count occurrence of the delimiter for that row in the - * source @p strings past the delimiter until the end of the string. 
+ * @p delimiter_strings @p count number of times in the source @p strings from left to right + * if @p count is positive or from right to left if @p count is negative. If @p count is + * positive, it returns a substring from the start of the source @p strings up until + * @p count occurrence of the delimiter for that row not including that delimiter. If @p count + * is negative, it returns a substring from the start of the @p count occurrence of the + * delimiter for that row in the source @p strings past the delimiter until the end of the string. * * The search for @p delimiter_strings in @p strings is case sensitive. * If the @p count is 0, every row in the output column will be an empty string. * If the row value of @p strings is null, the row value in the output column will be null. * If the row value from @p delimiter_strings is invalid or null, the row value in the - * output column will an empty string. + * output column will be an empty string. * If the row value from @p delimiter_strings or the column value for a row is empty, the * row value in the output column will be empty. * If @p count occurrences of delimiter isn't found, the row value in the output column will @@ -185,7 +186,7 @@ std::unique_ptr slice_strings( * @param strings Strings instance for this operation. * @param delimiter_strings UTF-8 encoded string for each row. * @param count Number of times to search for delimiter in each string. If the value is positive, - * forward search of delimiter is performed; else, a backward search is performed. + * delimiter is searched from left to right; else, it is searched from right to left. * @param mr Resource for allocating device memory. * @return New strings column containing the substrings. 
*/ diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index cae52115a65..14d04c1b257 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -312,44 +312,36 @@ struct compute_substring_indices { // If the delimiter count is 0, result is empty string. // If the global delimiter or the row specific delimiter is invalid or if it is empty, row // value is empty. - if (!d_column.is_null(idx) && delim_val_pair.second && !delim_val.empty()) { - auto const& col_val = d_column.element(idx); - - // If the column value for the row is empty, the row value is empty. - if (!col_val.empty()) { - auto const col_val_len = col_val.length(); - auto const delimiter_len = delim_val.length(); - - auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; - size_type start_pos = 0; - size_type end_pos = col_val_len; - bool keep_searching = true; - - for (auto i = 0; keep_searching && i < nsearches; ++i) { - if (delimiter_count < 0) { - end_pos = col_val.rfind(delim_val, 0, end_pos); - if (end_pos == -1) { - start_char_pos[idx] = 0; - end_char_pos[idx] = col_val_len; - keep_searching = false; - } else if (i + 1 == nsearches) { - start_char_pos[idx] = end_pos + delimiter_len; - end_char_pos[idx] = col_val_len; - } - } else { - auto char_pos = col_val.find(delim_val, start_pos); - if (char_pos == -1) { - start_char_pos[idx] = 0; - end_char_pos[idx] = col_val_len; - keep_searching = false; - } else if (i + 1 == nsearches) { - start_char_pos[idx] = 0; - end_char_pos[idx] = char_pos; - } else - start_pos = char_pos + delimiter_len; - } - } + if (d_column.is_null(idx) || !delim_val_pair.second || delim_val.empty()) return; + auto const& col_val = d_column.element(idx); + + // If the column value for the row is empty, the row value is empty. + if (!col_val.empty()) { + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? 
-delimiter_count : delimiter_count; + bool const left_to_right = (delimiter_count > 0); + + size_type start_pos = start_char_pos[idx]; + size_type end_pos = col_val_len; + size_type char_pos = -1; + + end_char_pos[idx] = col_val_len; + + for (auto i = 0; i < nsearches; ++i) { + char_pos = left_to_right ? col_val.find(delim_val, start_pos) + : col_val.rfind(delim_val, 0, end_pos); + if (char_pos == -1) return; + if (left_to_right) + start_pos = char_pos + delimiter_len; + else + end_pos = char_pos; } + if (left_to_right) + end_char_pos[idx] = char_pos; + else + start_char_pos[idx] = end_pos + delimiter_len; } }); } @@ -412,7 +404,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, auto d_column = *strings_column; // If delimiter count is 0, the output column will contain empty strings - if (count) { + if (count != 0) { // Compute the substring indices first compute_substring_indices{}( d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream);