From a583c97ca977041e3cc3399739e29962982d6aad Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:45:38 -0400 Subject: [PATCH] Fix cudf::strings::replace_multiple hang on empty target (#16167) Fixes logic in `cudf::strings::replace_multiple` to ignore empty targets correctly in the `replace_multi_fn` functor. Also updated the doxygen and added a gtest for this case. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16167 --- cpp/include/cudf/strings/replace.hpp | 2 +- cpp/src/strings/replace/multi.cu | 9 ++++----- cpp/tests/strings/replace_tests.cpp | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. * - * This does not use regex to match targets in the string. + * This does not use regex to match targets in the string. Empty string targets are ignored. * * Null string entries will return null output string entries. * diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 43a3d69091a..2ca22f0e017 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -451,8 +451,8 @@ struct replace_multi_fn { while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match { auto const d_repl = (d_repls.size() == 1) ? d_repls.element(0) : d_repls.element(tgt_idx); @@ -468,9 +468,8 @@ struct replace_multi_fn { } ++spos; } - if (out_ptr) // copy remainder - { - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + if (out_ptr) { + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // copy remainder } else { d_sizes[idx] = bytes; } diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 3aa7467d156..6c4afbb435a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) } } +TEST_F(StringsReplaceTest, EmptyTarget) +{ + auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"}); + auto const sv = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"}); + auto const tv = cudf::strings_column_view(targets); + + auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"}); + auto const rv = cudf::strings_column_view(repls); + + // empty target should be ignored + auto results = cudf::strings::replace_multiple(sv, tv, rv); + auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();