From 483318dcd46b7b876e8ae0f5f922349976084165 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 26 Jul 2022 14:57:23 -0400
Subject: [PATCH 1/4] Fix regex quantifier check to include capture groups

---
 cpp/src/strings/regex/regcomp.cpp    | 24 ++++++++++++++++++++++--
 cpp/tests/strings/contains_tests.cpp | 15 +++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 50d641c9a74..7a1a018fcfd 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S);  // \S
 static reclass cclass_D(NCCLASS_D);  // \D
 
 // Tables for analyzing quantifiers
-const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
+const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
 const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
 // Valid regex characters that can be escaped and used as literals
 const std::array<char, 33> escapable_chars{
@@ -466,9 +466,29 @@ class regex_parser {
     // are treated as regex expressions and sometimes they are not.
     if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); }
 
+    // Check that the previous item can be used with quantifiers.
+    // If the previous item is a capture group, we need to check items inside the
+    // capture group can be used with quantifiers.
+    auto previous_type = _items.back().type;
+    if (previous_type == RBRA) {  // previous item is a capture group
+      // look for matching LBRA
+      auto lbra_itr = std::find_if(_items.rbegin(), _items.rend(), [](auto const& item) {
+        return item.type == LBRA || item.type == LBRA_NC;
+      });
+      // search for the first valid item within the LBRA-RBRA range
+      auto first_valid = std::find_first_of(
+        _items.rbegin() + 1,
+        lbra_itr,
+        valid_preceding_inst_types.begin(),
+        valid_preceding_inst_types.end(),
+        [](auto const item, auto const valid_type) { return item.type == valid_type; });
+      // set previous_type for error report
+      previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type;
+    }
+
     if (std::find(valid_preceding_inst_types.begin(),
                   valid_preceding_inst_types.end(),
-                  _items.back().type) == valid_preceding_inst_types.end()) {
+                  previous_type) == valid_preceding_inst_types.end()) {
       CUDF_FAIL("invalid regex pattern: nothing to repeat at position " +
                 std::to_string(_expr_ptr - _pattern_begin - 1));
     }
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 70f28aa139d..c5ca9cabdeb 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -424,6 +424,21 @@ TEST_F(StringsContainsTests, FixedQuantifier)
   }
 }
 
+TEST_F(StringsContainsTests, QuantifierErrors)
+{
+  auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"});
+  auto sv    = cudf::strings_column_view(input);
+
+  EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error);
+}
+
 TEST_F(StringsContainsTests, OverlappedClasses)
 {
   auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"});

From f98693dee031c9aa4012b7ad43b3eb42310a6935 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 29 Jul 2022 10:54:09 -0400
Subject: [PATCH 2/4] add support for checking nested capture groups

---
 cpp/src/strings/regex/regcomp.cpp    | 16 ++++++++++------
 cpp/tests/strings/contains_tests.cpp |  4 ++++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 7a1a018fcfd..c1ea53d57d4 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -459,7 +459,7 @@ class regex_parser {
     }
 
     // The quantifiers require at least one "real" previous item.
-    // We are throwing an error in these two if-checks for invalid quantifiers.
+    // We are throwing an errors for invalid quantifiers.
     // Another option is to just return CHAR silently here which effectively
     // treats the chr character as a literal instead as a quantifier.
     // This could lead to confusion where sometimes unescaped quantifier characters
@@ -468,13 +468,17 @@ class regex_parser {
 
     // Check that the previous item can be used with quantifiers.
     // If the previous item is a capture group, we need to check items inside the
-    // capture group can be used with quantifiers.
+    // capture group can be used with quantifiers too.
+    // (Note that capture groups can be nested).
     auto previous_type = _items.back().type;
     if (previous_type == RBRA) {  // previous item is a capture group
       // look for matching LBRA
-      auto lbra_itr = std::find_if(_items.rbegin(), _items.rend(), [](auto const& item) {
-        return item.type == LBRA || item.type == LBRA_NC;
-      });
+      auto nested_count = 0;
+      auto lbra_itr =
+        std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable {
+          nested_count += (item.type == RBRA);
+          return (item.type == LBRA || item.type == LBRA_NC) && (nested_count-- == 0);
+        });
       // search for the first valid item within the LBRA-RBRA range
       auto first_valid = std::find_first_of(
         _items.rbegin() + 1,
@@ -482,7 +486,7 @@ class regex_parser {
         valid_preceding_inst_types.begin(),
         valid_preceding_inst_types.end(),
         [](auto const item, auto const valid_type) { return item.type == valid_type; });
-      // set previous_type for error report
+      // set previous_type to be checked in next if-statement
       previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type;
     }
 
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index c5ca9cabdeb..d725f3d5dd0 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -437,6 +437,10 @@ TEST_F(StringsContainsTests, QuantifierErrors)
   EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error);
   EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error);
   EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error);
+
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error);
+  EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+"));
+  EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+"));
 }
 
 TEST_F(StringsContainsTests, OverlappedClasses)

From fe59df86a9b4406c9fa427eb1249dbc32c52cb26 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 3 Aug 2022 10:53:25 -0400
Subject: [PATCH 3/4] improve clarity of find-if lambda expression

---
 cpp/src/strings/regex/regcomp.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index c1ea53d57d4..89348e57f45 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S);  // \S
 static reclass cclass_D(NCCLASS_D);  // \D
 
 // Tables for analyzing quantifiers
-const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
+const std::array<int, 5> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
 const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
 // Valid regex characters that can be escaped and used as literals
 const std::array<char, 33> escapable_chars{
@@ -473,11 +473,13 @@ class regex_parser {
     auto previous_type = _items.back().type;
     if (previous_type == RBRA) {  // previous item is a capture group
       // look for matching LBRA
-      auto nested_count = 0;
+      auto nested_count = 1;
       auto lbra_itr =
         std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable {
-          nested_count += (item.type == RBRA);
-          return (item.type == LBRA || item.type == LBRA_NC) && (nested_count-- == 0);
+          auto const is_closing = (item.type == RBRA);
+          auto const is_opening = (item.type == LBRA || item.type == LBRA_NC);
+          nested_count += is_closing - is_opening;
+          return is_opening && (nested_count == 0);
         });
       // search for the first valid item within the LBRA-RBRA range
       auto first_valid = std::find_first_of(
@@ -487,7 +489,7 @@ class regex_parser {
         valid_preceding_inst_types.end(),
         [](auto const item, auto const valid_type) { return item.type == valid_type; });
       // set previous_type to be checked in next if-statement
-      previous_type = (first_valid != lbra_itr) ? first_valid->type : (--lbra_itr)->type;
+      previous_type = (first_valid == lbra_itr) ? (--lbra_itr)->type : first_valid->type;
     }
 
     if (std::find(valid_preceding_inst_types.begin(),

From 67f07827af54097037435d88716ac36a571d0c03 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 3 Aug 2022 12:50:41 -0400
Subject: [PATCH 4/4] fix comment wording

---
 cpp/src/strings/regex/regcomp.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 89348e57f45..bc6bdd9dc7b 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -459,7 +459,7 @@ class regex_parser {
     }
 
     // The quantifiers require at least one "real" previous item.
-    // We are throwing an errors for invalid quantifiers.
+    // We are throwing errors for invalid quantifiers.
     // Another option is to just return CHAR silently here which effectively
     // treats the chr character as a literal instead as a quantifier.
     // This could lead to confusion where sometimes unescaped quantifier characters