Skip to content

Commit

Permalink
Deprecate cudf::strings::slice_strings APIs that accept delimiters (#…
Browse files Browse the repository at this point in the history
…13373)

Deprecating `cudf::strings::slice_strings` functions:
https://docs.rapids.ai/api/libcudf/stable/group__strings__slice.html#gaf1504116d31b0ec4f119f1477bb87ee1
```
std::unique_ptr<column> slice_strings(
  strings_column_view const& strings,
  string_scalar const& delimiter,
  size_type count,
  rmm::mr::device_memory_resource* mr);
```
and 
https://docs.rapids.ai/api/libcudf/stable/group__strings__slice.html#ga21f01493d15c18d67b66a94f20a24389
```
std::unique_ptr<column> slice_strings(
  strings_column_view const& strings,
  strings_column_view const& delimiter_strings,
  size_type count,
  rmm::mr::device_memory_resource* mr);
```
These are not being used by cuDF (Cython) or Spark (JNI) and are quite convoluted and difficult to maintain.
Marking these as `deprecated` in 23.06 to be removed in 23.08 if possible.
Due to the warning/error compile settings, the benchmarks and the gtests are removed for these functions.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13373
  • Loading branch information
davidwendt authored May 24, 2023
1 parent 43c04e2 commit 132540e
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 265 deletions.
10 changes: 1 addition & 9 deletions cpp/benchmarks/string/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

class StringSlice : public cudf::benchmark {};

enum slice_type { position, multi_position, delimiter, multi_delimiter };
enum slice_type { position, multi_position };

static void BM_slice(benchmark::State& state, slice_type rt)
{
Expand All @@ -47,8 +47,6 @@ static void BM_slice(benchmark::State& state, slice_type rt)
auto stops_itr = thrust::constant_iterator<cudf::size_type>(max_str_length / 2);
cudf::test::fixed_width_column_wrapper<int32_t> starts(starts_itr, starts_itr + n_rows);
cudf::test::fixed_width_column_wrapper<int32_t> stops(stops_itr, stops_itr + n_rows);
auto delim_itr = thrust::constant_iterator<std::string>(" ");
cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows);

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
Expand All @@ -57,10 +55,6 @@ static void BM_slice(benchmark::State& state, slice_type rt)
cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2);
break;
case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break;
case multi_delimiter:
cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1);
break;
}
}

Expand Down Expand Up @@ -88,5 +82,3 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)

STRINGS_BENCHMARK_DEFINE(position)
STRINGS_BENCHMARK_DEFINE(multi_position)
STRINGS_BENCHMARK_DEFINE(delimiter)
STRINGS_BENCHMARK_DEFINE(multi_delimiter)
8 changes: 6 additions & 2 deletions cpp/include/cudf/strings/slice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ std::unique_ptr<column> slice_strings(
/**
* @brief Slices a column of strings by using a delimiter as a slice point.
*
* @deprecated Since 23.06
*
* Returns a column of strings after searching for @p delimiter @p count number of
* times in the source @p strings from left to right if @p count is positive or from
* right to left if @p count is negative. If @p count is positive, it returns a substring
Expand Down Expand Up @@ -144,7 +146,7 @@ std::unique_ptr<column> slice_strings(
* @param mr Resource for allocating device memory.
* @return New strings column containing the substrings.
*/
std::unique_ptr<column> slice_strings(
[[deprecated]] std::unique_ptr<column> slice_strings(
strings_column_view const& strings,
string_scalar const& delimiter,
size_type count,
Expand All @@ -153,6 +155,8 @@ std::unique_ptr<column> slice_strings(
/**
* @brief Slices a column of strings by using a delimiter column as slice points.
*
* @deprecated Since 23.06
*
* Returns a column of strings after searching the delimiter defined per row from
* @p delimiter_strings @p count number of times in the source @p strings from left to right
* if @p count is positive or from right to left if @p count is negative. If @p count is
Expand Down Expand Up @@ -194,7 +198,7 @@ std::unique_ptr<column> slice_strings(
* @param mr Resource for allocating device memory.
* @return New strings column containing the substrings.
*/
std::unique_ptr<column> slice_strings(
[[deprecated]] std::unique_ptr<column> slice_strings(
strings_column_view const& strings,
strings_column_view const& delimiter_strings,
size_type count,
Expand Down
256 changes: 2 additions & 254 deletions cpp/tests/strings/slice_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,6 @@ TEST_F(StringsSliceTest, Error)
auto strings_view = cudf::strings_column_view(strings);
EXPECT_THROW(cudf::strings::slice_strings(strings_view, 0, 0, 0), cudf::logic_error);

auto delim_col = cudf::test::strings_column_wrapper({"", ""});
EXPECT_THROW(cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, -1),
cudf::logic_error);

auto indexes = cudf::test::fixed_width_column_wrapper<int32_t>({1, 2});
EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes, indexes), cudf::logic_error);

Expand All @@ -299,16 +295,10 @@ TEST_F(StringsSliceTest, ZeroSizeStringsColumn)
auto results = cudf::strings::slice_strings(strings_view, 1, 2);
cudf::test::expect_column_empty(results->view());

results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1);
cudf::test::expect_column_empty(results->view());

cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0);
cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0);
results = cudf::strings::slice_strings(strings_view, starts_column, stops_column);
cudf::test::expect_column_empty(results->view());

results = cudf::strings::slice_strings(strings_view, strings_view, 1);
cudf::test::expect_column_empty(results->view());
}

TEST_F(StringsSliceTest, AllEmpty)
Expand All @@ -317,250 +307,8 @@ TEST_F(StringsSliceTest, AllEmpty)
auto strings_view = cudf::strings_column_view(strings_col);
auto exp_results = cudf::column_view(strings_col);

auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
results = cudf::strings::slice_strings(strings_view, strings_view, -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

TEST_F(StringsSliceTest, EmptyDelimiter)
{
auto strings_col = cudf::test::strings_column_wrapper(
{"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true});
;
auto strings_view = cudf::strings_column_view(strings_col);

auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""},
{true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);

auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""},
{true, false, true, false, true, false});

results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 1);
auto results = cudf::strings::slice_strings(strings_view, 0, -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

TEST_F(StringsSliceTest, ZeroCount)
{
auto strings_col = cudf::test::strings_column_wrapper(
{"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true});
;
auto strings_view = cudf::strings_column_view(strings_col);

auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""},
{true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0);
results = cudf::strings::slice_strings(strings_view, 0, -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);

auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""},
{true, false, true, false, true, false});

results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 0);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

TEST_F(StringsSliceTest, SearchScalarDelimiter)
{
auto strings_col = cudf::test::strings_column_wrapper(
{"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true});
;
auto strings_view = cudf::strings_column_view(strings_col);

{
auto exp_results = cudf::test::strings_column_wrapper({"H", "thes", "", "lease", "t", ""},
{true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto exp_results = cudf::test::strings_column_wrapper(
{"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col);
}

{
auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col);
}

{
auto col0 = cudf::test::strings_column_wrapper(
{"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper({"Hello LL", "o", "", "opp", "pol", ""},
{true, true, false, true, true, true});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper(
{"Hello LLollooogh", "oopppllo", "", "oppollo", "polo lop apploo po", ""},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper({"ogh", "pppllo", "", "llo", " po", ""},
{true, true, false, true, true, true});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper(
{"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper(
{"Héllo HélloHéllo", "Hélloééééé", "", "éééé", "poloéé lopéé apploo", ""},
{true, true, false, true, true, true});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper(
{"Héllo HélloHéllo", "Hélloééééé", "", "éééééé", "poloéé lopéé applooéé po", ""},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper(
{"Héllo HélloHéllo", "Hélloééééé", "", "éééé", " lopéé applooéé po", ""},
{true, true, false, true, true, true});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com",
"www.apache..org",
"tennis...com",
"nvidia....com",
"google...........com",
"microsoft...c.....co..m"});

auto exp_results = cudf::test::strings_column_wrapper(
{"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper({"www.yahoo.com",
"www.apache..org",
"tennis..com",
"nvidia....com",
"google...........com",
".",
"microsoft...c.....co..m"});

auto exp_results = cudf::test::strings_column_wrapper(
{"www.yahoo.com", "www.apache..org", "tennis..com", "..com", "..com", ".", "co..m"});

auto results =
cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}
}

TEST_F(StringsSliceTest, SearchColumnDelimiter)
{
{
auto col0 = cudf::test::strings_column_wrapper(
{"H™élloi ™◎oo™ff™", "thesé", "", "lease™", "tést strings", ""},
{true, true, false, true, true, true});
auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "e", "t", ""});

auto exp_results = cudf::test::strings_column_wrapper({"H", "thesé", "", "l", "", ""},
{true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(
cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi ™◎ooff™ff™",
"tffffhffesé",
"",
"lff fooff ffff eaffse™",
"tést ffstri.nffgs",
"ffff ™ ffff ff"},
{true, true, false, true, true, true});
auto delim_col = cudf::test::strings_column_wrapper({"ff™", "", "", "", "t", "ff ™"});

auto exp_results = cudf::test::strings_column_wrapper(
{"ff™", "esé", "", "eaffse™", "ri.nffgs", " ffff ff"}, {true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(
cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™",
"tffffhffesé",
"",
"lff fooff ffff eaffse™",
"tést ff™ffff™ff™ffffstri.ff™ffff™nffgs",
"ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"},
{true, true, false, true, true, true});
auto delim_col = cudf::test::strings_column_wrapper({"ff™", "", "", "e ", "ff™ff", "ff™ff™"},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ goo",
"tffffh",
"",
"lff fooff ffff eaffse™",
"tést ff™ffff™ff™ffffstri.",
"ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"},
{true, true, false, true, true, true});

auto results = cudf::strings::slice_strings(
cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}

{
auto col0 = cudf::test::strings_column_wrapper({"H™élloff ffffi fooff™ barff™ gooff™ ™◎ooff™ff™",
"tffffhffesé",
"",
"lff fooff ffff eaffse™",
"tést ff™ffff™ff™ffffstri.ff™ffff™nffgs",
"ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"});
auto delim_col = cudf::test::strings_column_wrapper({"ff™", "", "", "e ", "ff™ff", "ff™ff™"},
{true, true, false, true, true, true});

auto exp_results = cudf::test::strings_column_wrapper({" gooff™ ™◎ooff™ff™",
"ffhffesé",
"",
"lff fooff ffff eaffse™",
"ff™ff™ffffstri.ff™ffff™nffgs",
"ffff ™ ffff ff™ ff™ff™ff™ ff™ff™ ff"});

auto results = cudf::strings::slice_strings(
cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results);
}
}

0 comments on commit 132540e

Please sign in to comment.