Skip to content

Commit

Permalink
Fix cudf::strings::replace_with_backrefs hang on empty match result (#…
Browse files Browse the repository at this point in the history
…13418)

Fixes bug where the `cudf::strings::replace_with_backrefs` goes into an infinite loop when an match results in an empty string. After each replace occurs, the logic continues to search for matches on the remainder of the string. Each new starting point must account for the previous match being empty.
Also included a gtest for this case.

Closes #13404

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Gregory Kimball (https://github.com/GregoryKimball)
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #13418
  • Loading branch information
davidwendt authored May 25, 2023
1 parent c4f471d commit 960cc42
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
17 changes: 9 additions & 8 deletions cpp/src/strings/replace/backref_re.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,15 @@ struct backrefs_fn {
auto out_ptr = d_chars ? (d_chars + d_offsets[idx]) : nullptr;
size_type lpos = 0; // last byte position processed in d_str
size_type begin = 0; // first character position matching regex
size_type end = nchars; // last character position (exclusive)
size_type end = -1; // match through the end of the string

// copy input to output replacing strings as we go
while (prog.find(prog_idx, d_str, begin, end) > 0) // inits the begin/end vars
while ((begin <= nchars) &&
(prog.find(prog_idx, d_str, begin, end) > 0)) // inits the begin/end vars
{
auto spos = d_str.byte_offset(begin); // get offset for the
auto epos = d_str.byte_offset(end); // character position values;
nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size
auto spos = d_str.byte_offset(begin); // get offset for the
auto epos = d_str.byte_offset(end); // character position values;
nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size

// copy the string data before the matched section
if (out_ptr) { out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); }
Expand All @@ -84,7 +85,7 @@ struct backrefs_fn {
}
// extract the specific group's string for this backref's index
auto extracted = prog.extract(prog_idx, d_str, begin, end, backref.first - 1);
if (!extracted || (extracted.value().second <= extracted.value().first)) {
if (!extracted || (extracted.value().second < extracted.value().first)) {
return; // no value for this backref number; that is ok
}
auto spos_extract = d_str.byte_offset(extracted.value().first); // convert
Expand All @@ -104,8 +105,8 @@ struct backrefs_fn {

// setup to match the next section
lpos = epos;
begin = end;
end = nchars;
begin = end + (begin == end);
end = -1;
}

// finally, copy remainder of input string
Expand Down
15 changes: 15 additions & 0 deletions cpp/tests/strings/replace_regex_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,21 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

// https://github.com/rapidsai/cudf/issues/13404
TEST_F(StringsReplaceRegexTest, ReplaceBackrefsWithEmptyCapture)
{
cudf::test::strings_column_wrapper input({"one\ntwo", "three\n\n", "four\r\n"});
auto sv = cudf::strings_column_view(input);

auto pattern = std::string("(\r\n|\r)?$");
auto repl_template = std::string("[\\1]");

cudf::test::strings_column_wrapper expected({"one\ntwo[]", "three\n[]\n[]", "four[\r\n][]"});
auto prog = cudf::strings::regex_program::create(pattern);
auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest)
{
cudf::test::strings_column_wrapper strings({"this string left intentionally blank"});
Expand Down

0 comments on commit 960cc42

Please sign in to comment.