Skip to content

Commit

Permalink
Fix replace error when regex has only zero match quantifiers (#10760)
Browse files Browse the repository at this point in the history
Closes #10753 

Fixes the `cudf::strings::replace_re` logic, which was reading past the end of a string when given a regex made up only of quantifiers that allow zero matches (e.g. `D*` or `D?s?`, both of which can match the empty string).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)

URL: #10760
  • Loading branch information
davidwendt authored May 3, 2022
1 parent 0e32624 commit ad12606
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 7 deletions.
15 changes: 8 additions & 7 deletions cpp/src/strings/replace/replace_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,19 @@ struct replace_regex_fn {
return;
}

auto const d_str = d_strings.element<string_view>(idx);
auto nbytes = d_str.size_bytes(); // number of bytes in input string
auto mxn = maxrepl < 0 ? d_str.length() + 1 : maxrepl; // max possible replaces for this string
auto in_ptr = d_str.data(); // input pointer (i)
auto out_ptr = d_chars ? d_chars + d_offsets[idx] // output pointer (o)
: nullptr;
auto const d_str = d_strings.element<string_view>(idx);
auto const nchars = d_str.length();
auto nbytes = d_str.size_bytes(); // number of bytes in input string
auto mxn = maxrepl < 0 ? nchars + 1 : maxrepl; // max possible replaces for this string
auto in_ptr = d_str.data(); // input pointer (i)
auto out_ptr = d_chars ? d_chars + d_offsets[idx] // output pointer (o)
: nullptr;
size_type last_pos = 0;
int32_t begin = 0; // these are for calling prog.find
int32_t end = -1; // matches final word-boundary if at the end of the string

// copy input to output replacing strings as we go
while (mxn-- > 0) { // maximum number of replaces
while (mxn-- > 0 && begin <= nchars) { // maximum number of replaces

if (prog.is_empty() || prog.find<stack_size>(idx, d_str, begin, end) <= 0) {
break; // no more matches
Expand Down
12 changes: 12 additions & 0 deletions cpp/tests/strings/replace_regex_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,18 @@ TEST_F(StringsReplaceRegexTest, WordBoundary)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

// Exercises replace_re with patterns that are able to match the empty string
// ("D*" and "D?s?"). The input rows include a row containing a non-ASCII
// character and an empty row.
TEST_F(StringsReplaceRegexTest, ZeroLengthMatch)
{
  cudf::test::strings_column_wrapper input({"DD", "zéz", "DsDs", ""});
  auto const strings_view = cudf::strings_column_view(input);
  auto const replacement  = cudf::string_scalar("_");

  // Pattern made of a single zero-or-more quantifier.
  {
    auto const replaced = cudf::strings::replace_re(strings_view, "D*", replacement);
    cudf::test::strings_column_wrapper expected({"__", "_z_é_z_", "__s__s_", "_"});
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*replaced, expected);
  }

  // Pattern made of two consecutive zero-or-one quantifiers.
  {
    auto const replaced = cudf::strings::replace_re(strings_view, "D?s?", replacement);
    cudf::test::strings_column_wrapper expected({"___", "_z_é_z_", "___", "_"});
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*replaced, expected);
  }
}

TEST_F(StringsReplaceRegexTest, Multiline)
{
auto const multiline = cudf::strings::regex_flags::MULTILINE;
Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,16 @@ def test_string_replace(
assert_eq(expect, got)


@pytest.mark.parametrize("pat", ["A*", "F?H?"])
def test_string_replace_zero_length(ps_gs, pat):
    """Compare regex replace results between the two series in ``ps_gs``.

    Each parametrized pattern consists only of quantifiers that permit zero
    occurrences, so it can match an empty string.
    """
    # NOTE(review): presumably ps_gs yields a (pandas Series, cudf Series)
    # pair built from the same data -- confirm against the fixture definition.
    reference_series, tested_series = ps_gs

    replace_kwargs = {"regex": True}
    expected = reference_series.str.replace(pat, "_", **replace_kwargs)
    actual = tested_series.str.replace(pat, "_", **replace_kwargs)

    assert_eq(expected, actual)


def test_string_lower(ps_gs):
ps, gs = ps_gs

Expand Down

0 comments on commit ad12606

Please sign in to comment.