From 960cc42c5a5a85f48b91814b2bee9e0cc9afcc0d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 25 May 2023 07:41:53 -0400 Subject: [PATCH] Fix cudf::strings::replace_with_backrefs hang on empty match result (#13418) Fixes bug where the `cudf::strings::replace_with_backrefs` goes into an infinite loop when an match results in an empty string. After each replace occurs, the logic continues to search for matches on the remainder of the string. Each new starting point must account for the previous match being empty. Also included a gtest for this case. Closes #13404 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Gregory Kimball (https://github.com/GregoryKimball) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13418 --- cpp/src/strings/replace/backref_re.cuh | 17 +++++++++-------- cpp/tests/strings/replace_regex_tests.cpp | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index eb34ab67839..a5f3ace2141 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -61,14 +61,15 @@ struct backrefs_fn { auto out_ptr = d_chars ? (d_chars + d_offsets[idx]) : nullptr; size_type lpos = 0; // last byte position processed in d_str size_type begin = 0; // first character position matching regex - size_type end = nchars; // last character position (exclusive) + size_type end = -1; // match through the end of the string // copy input to output replacing strings as we go - while (prog.find(prog_idx, d_str, begin, end) > 0) // inits the begin/end vars + while ((begin <= nchars) && + (prog.find(prog_idx, d_str, begin, end) > 0)) // inits the begin/end vars { - auto spos = d_str.byte_offset(begin); // get offset for the - auto epos = d_str.byte_offset(end); // character position values; - nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size + auto spos = d_str.byte_offset(begin); // get offset for the + auto epos = d_str.byte_offset(end); // character position values; + nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size // copy the string data before the matched section if (out_ptr) { out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); } @@ -84,7 +85,7 @@ struct backrefs_fn { } // extract the specific group's string for this backref's index auto extracted = prog.extract(prog_idx, d_str, begin, end, backref.first - 1); - if (!extracted || (extracted.value().second <= extracted.value().first)) { + if (!extracted || (extracted.value().second < extracted.value().first)) { return; // no value for this backref number; that is ok } auto spos_extract = d_str.byte_offset(extracted.value().first); // convert @@ -104,8 +105,8 @@ struct backrefs_fn { // setup to match the next section lpos = epos; - begin = end; - end = nchars; + begin = end + (begin == end); + end = -1; } // finally, copy remainder of input string diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 0b25a026bd1..6b5d3936a9a 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -354,6 +354,21 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +// https://github.com/rapidsai/cudf/issues/13404 +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsWithEmptyCapture) +{ + cudf::test::strings_column_wrapper input({"one\ntwo", "three\n\n", "four\r\n"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(\r\n|\r)?$"); + auto repl_template = std::string("[\\1]"); + + cudf::test::strings_column_wrapper expected({"one\ntwo[]", "three\n[]\n[]", "four[\r\n][]"}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"});