rapidsai · rapids-bot · May 25, 2022 · May 12, 2022 · May 12, 2022 · May 13, 2022
@@ -98,16 +98,16 @@ The details are based on features documented at https://www.regular-expressions.
 | Feature  | Syntax | Description | Example |
 | ---------- | ------------- | ------------- | ------------- |
 | Greedy quantifier | `?` (question mark) | Makes the preceding item optional. Greedy, so the optional item is included in the match if possible. | `abc?` matches `abc` or `ab` |
-| Greedy quantifier | `*` (star) | Repeats the previous item zero or more times. Greedy, so as many items as possible will be matched before trying permutations with less matches of the preceding item, up to the point where the preceding item is not matched at all. | `".*"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
-| Greedy quantifier | `+` (plus)　| Repeats the previous item once or more. Greedy, so as many items as possible will be matched before trying permutations with less matches of the preceding item, up to the point where the preceding item is matched only once. | `".+"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
+| Greedy quantifier | `*` (star) | Repeats the previous item zero or more times. Greedy, so as many items as possible will be matched before trying permutations with fewer matches of the preceding item, up to the point where the preceding item is not matched at all. | `".*"` matches `"def" "ghi"` in `abc "def" "ghi" jkl` |
+| Greedy quantifier | `+` (plus) | Repeats the previous item once or more. Greedy, so as many items as possible will be matched before trying permutations with fewer matches of the preceding item, up to the point where the preceding item is matched only once. | `".+"` matches `"def" "ghi"` in `abc "def" "ghi" jkl` |
 | Lazy quantifier | `??` | Makes the preceding item optional. Lazy, so the optional item is excluded in the match if possible. | `abc??` matches `ab` or `abc` |
 | Lazy quantifier | `*?` | Repeats the previous item zero or more times. Lazy, so the engine first attempts to skip the previous item, before trying permutations with ever increasing matches of the preceding item. | `".*?"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
 | Lazy quantifier | `+?` | Repeats the previous item once or more. Lazy, so the engine first matches the previous item only once, before trying permutations with ever increasing matches of the preceding item. | `".+?"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
-| Fixed quantifier | `{n}` where `n is an integer >= 1` | Repeats the previous item exactly `n` times. | `a{5}` matches `aaaaa` |
-| Greedy quantifier | `{n,m}` where `n >= 0` and `m >= n` | Repeats the previous item between `n` and `m` times. Greedy, so repeating `m` times is tried before reducing the repetition to `n` times. | `a{2,4}` matches `aaaa`, `aaa` or `aa` |
-| Greedy quantifier | `{n,}` where `n >= 0` | Repeats the previous item at least `n` times. Greedy, so as many items as possible will be matched before trying permutations with less matches of the preceding item, up to the point where the preceding item is matched only `n` times. | `a{2,}` matches `aaaaa` in `aaaaa` |
-| Lazy quantifier | `{n,m}?` where `n >= 0` and `m >= n` | Repeats the previous item between `n` and `m` times. Lazy, so repeating `n` times is tried before increasing the repetition to `m` times. | `a{2,4}?` matches `aa`, `aaa` or `aaaa` |
-| Lazy quantifier | `{n,}?` where `n >= 0` | Repeats the previous item `n` or more times. Lazy, so the engine first matches the previous item `n` times, before trying permutations with ever increasing matches of the preceding item. | `a{2,}?` matches `aa` in `aaaaa` |
+| Fixed quantifier | `{n}` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item exactly `n` times. | `a{5}` matches `aaaaa` |
+| Greedy quantifier | `{n,m}` where `n` and `m` are integers: `0 ≤ n ≤ m ≤ 999` | Repeats the previous item between `n` and `m` times. Greedy, so repeating `m` times is tried before reducing the repetition to `n` times. | `a{2,4}` matches `aaaa`, `aaa` or `aa` |
+| Greedy quantifier | `{n,}` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item at least `n` times. Greedy, so as many items as possible will be matched before trying permutations with fewer matches of the preceding item, up to the point where the preceding item is matched only `n` times. | `a{2,}` matches `aaaaa` in `aaaaa` |
+| Lazy quantifier | `{n,m}?` where `n` and `m` are integers: `0 ≤ n ≤ m ≤ 999` | Repeats the previous item between `n` and `m` times. Lazy, so repeating `n` times is tried before increasing the repetition to `m` times. | `a{2,4}?` matches `aa`, `aaa`, or `aaaa` |
+| Lazy quantifier | `{n,}?` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item `n` or more times. Lazy, so the engine first matches the previous item `n` times, before trying permutations with ever increasing matches of the preceding item. | `a{2,}?` matches `aa` in `aaaaa` |
 
 ### Groups
 

@@ -21,9 +21,11 @@
 
 #include <algorithm>
 #include <array>
+#include <cctype>
 #include <numeric>
 #include <stack>
 #include <string>
+#include <vector>
 
 namespace cudf {
 namespace strings {
@@ -45,6 +47,7 @@ enum OperatorType {
   COUNTED_LAZY = 0215,
   NOP          = 0302,  // No operation, internal use only
 };
+#define ITEM_MASK 0300
 
 static reclass ccls_w(CCLASS_W);   // \w
 static reclass ccls_s(CCLASS_S);   // \s
@@ -152,10 +155,10 @@ class regex_parser {
   int id_ccls_d = -1;  // digit
   int id_ccls_D = -1;  // not digit
 
-  char32_t yy;    /* last lex'd Char */
-  int yyclass_id; /* last lex'd class */
-  short yy_min_count;
-  short yy_max_count;
+  char32_t yy{};    /* last lex'd Char */
+  int yyclass_id{}; /* last lex'd class */
+  int16_t yy_min_count{};
+  int16_t yy_max_count{};
 
   bool nextc(char32_t& c)  // return "quoted" == backslash-escape prefix
   {
@@ -454,41 +457,69 @@ class regex_parser {
           return PLUS_LAZY;
         }
         return PLUS;
-      case '{':  // counted repetition
+      case '{':  // counted repetition: {n,m}
       {
-        if (*exprp < '0' || *exprp > '9') break;
-        const char32_t* exprp_backup = exprp;  // in case '}' is not found
-        char buff[8]                 = {0};
-        for (int i = 0; i < 7 && *exprp != '}' && *exprp != ',' && *exprp != 0; i++, exprp++) {
-          buff[i]     = *exprp;
-          buff[i + 1] = 0;
-        }
-        if (*exprp != '}' && *exprp != ',') {
-          exprp = exprp_backup;
-          break;
-        }
-        sscanf(buff, "%hd", &yy_min_count);
-        if (*exprp != ',')
-          yy_max_count = yy_min_count;
-        else {
-          yy_max_count = -1;
-          exprp++;
-          buff[0] = 0;
-          for (int i = 0; i < 7 && *exprp != '}' && *exprp != 0; i++, exprp++) {
-            buff[i]     = *exprp;
-            buff[i + 1] = 0;
+        if (!std::isdigit(*exprp)) { break; }
+
+        // transform char32 to char until null, delimiter, non-digit or end is reached;
+        // returns the number of chars read/transformed
+        auto transform_until = [](char32_t const* input,
+                                  char32_t const* end,
+                                  char* output,
+                                  std::string_view const delimiters) -> int32_t {
+          int32_t count = 0;
+          while (*input != 0 && input < end) {
+            auto const ch = static_cast<char>(*input++);
+            // if ch not a digit or ch is a delimiter, we are done
+            if (!std::isdigit(ch) || delimiters.find(ch) != delimiters.npos) { break; }
+            output[count] = ch;
+            ++count;
           }
-          if (*exprp != '}') {
-            exprp = exprp_backup;
-            break;
+          output[count] = 0;  // null-terminate (for the atoi call)
+          return count;
+        };
+
+        constexpr auto max_read               = 4;    // 3 digits plus the delimiter
+        constexpr auto max_value              = 999;  // support only 3 digits
+        std::array<char, max_read + 1> buffer = {0};  //(max_read + 1);
+
+        // get left-side (n) value => min_count
+        auto bytes_read = transform_until(exprp, exprp + max_read, buffer.data(), "},");
+        if (exprp[bytes_read] != '}' && exprp[bytes_read] != ',') {
+          break;  // re-interpret as CHAR
+        }
+        auto count = std::atoi(buffer.data());
+        CUDF_EXPECTS(count <= max_value,
+                     "unsupported repeat value at " + std::to_string(exprp - pattern - 1));
+        yy_min_count = static_cast<int16_t>(count);
+
+        auto const exprp_backup = exprp;  // save in case ending '}' is not found
+        exprp += bytes_read;
+
+        // get optional right-side (m) value => max_count
+        yy_max_count = yy_min_count;
+        if (*exprp++ == ',') {
+          bytes_read = transform_until(exprp, exprp + max_read, buffer.data(), "}");
+          if (exprp[bytes_read] != '}') {
+            exprp = exprp_backup;  // abort, rollback and
+            break;                 // re-interpret as CHAR
           }
-          if (buff[0] != 0) sscanf(buff, "%hd", &yy_max_count);
+
+          count = std::atoi(buffer.data());
+          CUDF_EXPECTS(count <= max_value,
+                       "unsupported repeat value at " + std::to_string(exprp - pattern - 1));
+
+          // {n,m} and {n,} are both valid
+          yy_max_count = buffer[0] == 0 ? -1 : static_cast<int16_t>(count);
+          exprp += bytes_read + 1;
         }
-        exprp++;
+
+        // {n,m}? pattern is lazy counted quantifier
         if (*exprp == '?') {
           exprp++;
           return COUNTED_LAZY;
         }
+        // otherwise, greedy counted quantifier
         return COUNTED;
       }
       case '|': return OR;
@@ -562,6 +593,9 @@ class regex_compiler {
 
   regex_flags flags;
 
+  char32_t yy;
+  int yyclass_id;
+
   inline void pushand(int f, int l) { andstack.push_back({f, l}); }
 
   inline Node popand(int op)
@@ -714,97 +748,70 @@ class regex_compiler {
     lastwasand = true;
   }
 
-  char32_t yy;
-  int yyclass_id;
-
-  void expand_counted(const std::vector<regex_parser::Item>& in,
-                      std::vector<regex_parser::Item>& out)
+  std::vector<regex_parser::Item> expand_counted(std::vector<regex_parser::Item> const& in)
   {
-    std::vector<int> lbra_stack;
-    int rep_start = -1;
-
-    out.clear();
-    for (std::size_t i = 0; i < in.size(); i++) {
-      if (in[i].t != COUNTED && in[i].t != COUNTED_LAZY) {
-        out.push_back(in[i]);
-        if (in[i].t == LBRA || in[i].t == LBRA_NC) {
-          lbra_stack.push_back(i);
-          rep_start = -1;
-        } else if (in[i].t == RBRA) {
-          rep_start = lbra_stack[lbra_stack.size() - 1];
-          lbra_stack.pop_back();
-        } else if ((in[i].t & 0300) != OPERATOR_MASK) {
-          rep_start = i;
+    std::vector<regex_parser::Item> out;
+    std::stack<int> lbra_stack;
+    auto repeat_start_index = -1;
+
+    for (std::size_t index = 0; index < in.size(); index++) {
+      auto const item = in[index];
+
+      if (item.t != COUNTED && item.t != COUNTED_LAZY) {
+        out.push_back(item);
+        if (item.t == LBRA || item.t == LBRA_NC) {
+          lbra_stack.push(index);
+          repeat_start_index = -1;
+        } else if (item.t == RBRA) {
+          repeat_start_index = lbra_stack.top();
+          lbra_stack.pop();
+        } else if ((item.t & ITEM_MASK) != OPERATOR_MASK) {
+          repeat_start_index = index;
         }
       } else {
-        if (rep_start < 0)  // broken regex
-          return;
+        // item is of type COUNTED or COUNTED_LAZY
+        // here we repeat the previous item(s) based on the count range in item
 
-        regex_parser::Item item = in[i];
-        if (item.d.yycount.n <= 0) {
-          // need to erase
-          for (std::size_t j = 0; j < i - rep_start; j++)
-            out.pop_back();
-        } else {
-          // repeat
-          for (int j = 1; j < item.d.yycount.n; j++)
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
+        CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location");
+
+        // range of affected item(s) to repeat
+        auto const begin = in.begin() + repeat_start_index;
+        auto const end   = in.begin() + index;
+        // count range values
+        auto const n = item.d.yycount.n;  // minimum count
+        auto const m = item.d.yycount.m;  // maximum count
+
+        assert(n >= 0 && "invalid repeat count value n");
+        // zero-repeat edge-case: need to erase the previous items
+        if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); }
+
+        // minimum repeats (n)
+        for (int j = 1; j < n; j++) {
+          out.insert(out.end(), begin, end);
         }
 
-        // optional repeats
-        if (item.d.yycount.m >= 0) {
-          for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) {
-            regex_parser::Item o_item;
-            o_item.t    = LBRA_NC;
-            o_item.d.yy = 0;
-            out.push_back(o_item);
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
+        // optional maximum repeats (m)
+        if (m >= 0) {
+          for (int j = n; j < m; j++) {
+            out.push_back(regex_parser::Item{LBRA_NC, 0});
+            out.insert(out.end(), begin, end);
           }
-          for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) {
-            regex_parser::Item o_item;
-            o_item.t    = RBRA;
-            o_item.d.yy = 0;
-            out.push_back(o_item);
-            if (item.t == COUNTED) {
-              o_item.t = QUEST;
-              out.push_back(o_item);
-            } else {
-              o_item.t = QUEST_LAZY;
-              out.push_back(o_item);
-            }
+          for (int j = n; j < m; j++) {
+            out.push_back(regex_parser::Item{RBRA, 0});
+            out.push_back(regex_parser::Item{item.t == COUNTED ? QUEST : QUEST_LAZY, 0});
           }
-        } else  // infinite repeat
-        {
-          regex_parser::Item o_item;
-          o_item.d.yy = 0;
-
-          if (item.d.yycount.n > 0)  // put '+' after last repetition
-          {
-            if (item.t == COUNTED) {
-              o_item.t = PLUS;
-              out.push_back(o_item);
-            } else {
-              o_item.t = PLUS_LAZY;
-              out.push_back(o_item);
-            }
-          } else  // copy it once then put '*'
-          {
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
-
-            if (item.t == COUNTED) {
-              o_item.t = STAR;
-              out.push_back(o_item);
-            } else {
-              o_item.t = STAR_LAZY;
-              out.push_back(o_item);
-            }
+        } else {
+          // infinite repeats
+          if (n > 0) {  // append '+' after last repetition
+            out.push_back(regex_parser::Item{item.t == COUNTED ? PLUS : PLUS_LAZY, 0});
+          } else {  // copy it once then append '*'
+            out.insert(out.end(), begin, end);
+            out.push_back(regex_parser::Item{item.t == COUNTED ? STAR : STAR_LAZY, 0});
           }
         }
       }
     }
+    return out;
   }
 
  public:
@@ -819,23 +826,17 @@ class regex_compiler {
       yyclass_id(0)
   {
     // Parse
-    std::vector<regex_parser::Item> items;
-    {
+    std::vector<regex_parser::Item> const items = [&] {
       regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, m_prog);
-
-      // Expand counted repetitions
-      if (parser.m_has_counted)
-        expand_counted(parser.m_items, items);
-      else
-        items = parser.m_items;
-    }
+      return parser.m_has_counted ? expand_counted(parser.m_items) : parser.m_items;
+    }();
 
     /* Start with a low priority operator to prime parser */
     pushator(START - 1);
 
     for (int i = 0; i < static_cast<int>(items.size()); i++) {
-      regex_parser::Item item = items[i];
-      int token               = item.t;
+      auto const item = items[i];
+      int token       = item.t;
       if (token == CCLASS || token == NCCLASS)
         yyclass_id = item.d.yyclass_id;
       else
@@ -1109,12 +1110,12 @@ void reprog::print(regex_flags const flags)
     if (cls.builtins) {
       int mask = cls.builtins;
       printf("   builtins(x%02X):", static_cast<unsigned>(mask));
-      if (mask & 1) printf(" \\w");
-      if (mask & 2) printf(" \\s");
-      if (mask & 4) printf(" \\d");
-      if (mask & 8) printf(" \\W");
-      if (mask & 16) printf(" \\S");
-      if (mask & 32) printf(" \\D");
+      if (mask & CCLASS_W) printf(" \\w");
+      if (mask & CCLASS_S) printf(" \\s");
+      if (mask & CCLASS_D) printf(" \\d");
+      if (mask & NCCLASS_W) printf(" \\W");
+      if (mask & NCCLASS_S) printf(" \\S");
+      if (mask & NCCLASS_D) printf(" \\D");
     }
     printf("\n");
   }