From ad126065109aaa72b6eb324ba5abd555b70bb4ae Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 3 May 2022 07:44:19 -0400 Subject: [PATCH] Fix replace error when regex has only zero match quantifiers (#10760) Closes #10753 Fixes `cudf::strings::replace_re` logic that was reading past the end of a string when given a regex pattern made up only of quantifiers that can match zero characters (e.g. 'D*' or 'D?s?', both of which can match an empty string). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/10760 --- cpp/src/strings/replace/replace_re.cu | 15 ++++++++------- cpp/tests/strings/replace_regex_tests.cpp | 12 ++++++++++++ python/cudf/cudf/tests/test_string.py | 10 ++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index d42359deeac..af74d8bdb92 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -54,18 +54,19 @@ struct replace_regex_fn { return; } - auto const d_str = d_strings.element(idx); - auto nbytes = d_str.size_bytes(); // number of bytes in input string - auto mxn = maxrepl < 0 ? d_str.length() + 1 : maxrepl; // max possible replaces for this string - auto in_ptr = d_str.data(); // input pointer (i) - auto out_ptr = d_chars ? d_chars + d_offsets[idx] // output pointer (o) - : nullptr; + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); + auto nbytes = d_str.size_bytes(); // number of bytes in input string + auto mxn = maxrepl < 0 ? nchars + 1 : maxrepl; // max possible replaces for this string + auto in_ptr = d_str.data(); // input pointer (i) + auto out_ptr = d_chars ? 
d_chars + d_offsets[idx] // output pointer (o) + : nullptr; size_type last_pos = 0; int32_t begin = 0; // these are for calling prog.find int32_t end = -1; // matches final word-boundary if at the end of the string // copy input to output replacing strings as we go - while (mxn-- > 0) { // maximum number of replaces + while (mxn-- > 0 && begin <= nchars) { // maximum number of replaces if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) { break; // no more matches diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 2b9e8b7aae7..1ccbc6fa676 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -157,6 +157,18 @@ TEST_F(StringsReplaceRegexTest, WordBoundary) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) +{ + cudf::test::strings_column_wrapper input({"DD", "zéz", "DsDs", ""}); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(cudf::strings_column_view(input), "D*", repl); + auto expected = cudf::test::strings_column_wrapper({"__", "_z_é_z_", "__s__s_", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = cudf::strings::replace_re(cudf::strings_column_view(input), "D?s?", repl); + expected = cudf::test::strings_column_wrapper({"___", "_z_é_z_", "___", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsReplaceRegexTest, Multiline) { auto const multiline = cudf::strings::regex_flags::MULTILINE; diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index d600fdeee27..d212c6b2072 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -923,6 +923,16 @@ def test_string_replace( assert_eq(expect, got) +@pytest.mark.parametrize("pat", ["A*", "F?H?"]) +def test_string_replace_zero_length(ps_gs, pat): + ps, gs = ps_gs + + expect = 
ps.str.replace(pat, "_", regex=True) + got = gs.str.replace(pat, "_", regex=True) + + assert_eq(expect, got) + + def test_string_lower(ps_gs): ps, gs = ps_gs