From 483318dcd46b7b876e8ae0f5f922349976084165 Mon Sep 17 00:00:00 2001 From: David Wendt <dwendt@nvidia.com> Date: Tue, 26 Jul 2022 14:57:23 -0400 Subject: [PATCH 1/4] Fix regex quantifier check to include capture groups --- cpp/src/strings/regex/regcomp.cpp | 24 ++++++++++++++++++++++-- cpp/tests/strings/contains_tests.cpp | 15 +++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 50d641c9a74..7a1a018fcfd 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S); // \S static reclass cclass_D(NCCLASS_D); // \D // Tables for analyzing quantifiers -const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}}; +const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}}; const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}}; // Valid regex characters that can be escaped and used as literals const std::array<char, 33> escapable_chars{ @@ -466,9 +466,29 @@ class regex_parser { // are treated as regex expressions and sometimes they are not. if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); } + // Check that the previous item can be used with quantifiers. + // If the previous item is a capture group, we need to check items inside the + // capture group can be used with quantifiers. + auto previous_type = _items.back().type; + if (previous_type == RBRA) { // previous item is a capture group + // look for matching LBRA + auto lbra_itr = std::find_if(_items.rbegin(), _items.rend(), [](auto const& item) { + return item.type == LBRA || item.type == LBRA_NC; + }); + // search for the first valid item within the LBRA-RBRA range + auto first_valid = std::find_first_of( + _items.rbegin() + 1, + lbra_itr, + valid_preceding_inst_types.begin(), + valid_preceding_inst_types.end(), + [](auto const item, auto const valid_type) { return item.type == valid_type; }); + // set previous_type for error report + previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type; + } + if (std::find(valid_preceding_inst_types.begin(), valid_preceding_inst_types.end(), - _items.back().type) == valid_preceding_inst_types.end()) { + previous_type) == valid_preceding_inst_types.end()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position " + std::to_string(_expr_ptr - _pattern_begin - 1)); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 70f28aa139d..c5ca9cabdeb 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -424,6 +424,21 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, QuantifierErrors) +{ + auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"}); + auto sv = cudf::strings_column_view(input); + + EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error); +} + TEST_F(StringsContainsTests, OverlappedClasses) { auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"}); From f98693dee031c9aa4012b7ad43b3eb42310a6935 Mon Sep 17 00:00:00 2001 From: David Wendt <dwendt@nvidia.com> Date: Fri, 29 Jul 2022 10:54:09 -0400 Subject: [PATCH 2/4] add support for checking nested capture groups --- cpp/src/strings/regex/regcomp.cpp | 16 ++++++++++------ cpp/tests/strings/contains_tests.cpp | 4 ++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7a1a018fcfd..c1ea53d57d4 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -459,7 +459,7 @@ class regex_parser { } // The quantifiers require at least one "real" previous item. - // We are throwing an error in these two if-checks for invalid quantifiers. + // We are throwing an errors for invalid quantifiers. // Another option is to just return CHAR silently here which effectively // treats the chr character as a literal instead as a quantifier. // This could lead to confusion where sometimes unescaped quantifier characters @@ -468,13 +468,17 @@ class regex_parser { // Check that the previous item can be used with quantifiers. // If the previous item is a capture group, we need to check items inside the - // capture group can be used with quantifiers. + // capture group can be used with quantifiers too. + // (Note that capture groups can be nested). auto previous_type = _items.back().type; if (previous_type == RBRA) { // previous item is a capture group // look for matching LBRA - auto lbra_itr = std::find_if(_items.rbegin(), _items.rend(), [](auto const& item) { - return item.type == LBRA || item.type == LBRA_NC; - }); + auto nested_count = 0; + auto lbra_itr = + std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable { + nested_count += (item.type == RBRA); + return (item.type == LBRA || item.type == LBRA_NC) && (nested_count-- == 0); + }); // search for the first valid item within the LBRA-RBRA range auto first_valid = std::find_first_of( _items.rbegin() + 1, @@ -482,7 +486,7 @@ class regex_parser { valid_preceding_inst_types.begin(), valid_preceding_inst_types.end(), [](auto const item, auto const valid_type) { return item.type == valid_type; }); - // set previous_type for error report + // set previous_type to be checked in next if-statement previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c5ca9cabdeb..d725f3d5dd0 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -437,6 +437,10 @@ TEST_F(StringsContainsTests, QuantifierErrors) EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error); EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error); EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error); + + EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error); + EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+")); + EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+")); } TEST_F(StringsContainsTests, OverlappedClasses) From fe59df86a9b4406c9fa427eb1249dbc32c52cb26 Mon Sep 17 00:00:00 2001 From: David Wendt <dwendt@nvidia.com> Date: Wed, 3 Aug 2022 10:53:25 -0400 Subject: [PATCH 3/4] improve clarity of find-if lambda expression --- cpp/src/strings/regex/regcomp.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index c1ea53d57d4..89348e57f45 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S); // \S static reclass cclass_D(NCCLASS_D); // \D // Tables for analyzing quantifiers -const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}}; +const std::array<int, 5> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}}; const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}}; // Valid regex characters that can be escaped and used as literals const std::array<char, 33> escapable_chars{ @@ -473,11 +473,13 @@ class regex_parser { auto previous_type = _items.back().type; if (previous_type == RBRA) { // previous item is a capture group // look for matching LBRA - auto nested_count = 0; + auto nested_count = 1; auto lbra_itr = std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable { - nested_count += (item.type == RBRA); - return (item.type == LBRA || item.type == LBRA_NC) && (nested_count-- == 0); + auto const is_closing = (item.type == RBRA); + auto const is_opening = (item.type == LBRA || item.type == LBRA_NC); + nested_count += is_closing - is_opening; + return is_opening && (nested_count == 0); }); // search for the first valid item within the LBRA-RBRA range auto first_valid = std::find_first_of( @@ -487,7 +489,7 @@ class regex_parser { valid_preceding_inst_types.end(), [](auto const item, auto const valid_type) { return item.type == valid_type; }); // set previous_type to be checked in next if-statement - previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type; + previous_type = (first_valid == lbra_itr) ? (--lbra_itr)->type : first_valid->type; } if (std::find(valid_preceding_inst_types.begin(), From 67f07827af54097037435d88716ac36a571d0c03 Mon Sep 17 00:00:00 2001 From: David Wendt <dwendt@nvidia.com> Date: Wed, 3 Aug 2022 12:50:41 -0400 Subject: [PATCH 4/4] fix comment wording --- cpp/src/strings/regex/regcomp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 89348e57f45..bc6bdd9dc7b 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -459,7 +459,7 @@ class regex_parser { } // The quantifiers require at least one "real" previous item. - // We are throwing an errors for invalid quantifiers. + // We are throwing errors for invalid quantifiers. // Another option is to just return CHAR silently here which effectively // treats the chr character as a literal instead as a quantifier. // This could lead to confusion where sometimes unescaped quantifier characters