rapidsai · rapids-bot · May 25, 2022 · May 12, 2022 · May 12, 2022 · May 13, 2022
@@ -103,11 +103,11 @@ The details are based on features documented at https://www.regular-expressions.
 | Lazy quantifier | `??` | Makes the preceding item optional. Lazy, so the optional item is excluded in the match if possible. | `abc??` matches `ab` or `abc` |
 | Lazy quantifier | `*?` | Repeats the previous item zero or more times. Lazy, so the engine first attempts to skip the previous item, before trying permutations with ever increasing matches of the preceding item. | `".*?"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
 | Lazy quantifier | `+?` | Repeats the previous item once or more. Lazy, so the engine first matches the previous item only once, before trying permutations with ever increasing matches of the preceding item. | `".+?"` matches `"def"` and `"ghi"` in `abc "def" "ghi" jkl` |
-| Fixed quantifier | `{n}` where `n is an integer >= 1` | Repeats the previous item exactly `n` times. | `a{5}` matches `aaaaa` |
-| Greedy quantifier | `{n,m}` where `n >= 0` and `m >= n` | Repeats the previous item between `n` and `m` times. Greedy, so repeating `m` times is tried before reducing the repetition to `n` times. | `a{2,4}` matches `aaaa`, `aaa` or `aa` |
-| Greedy quantifier | `{n,}` where `n >= 0` | Repeats the previous item at least `n` times. Greedy, so as many items as possible will be matched before trying permutations with less matches of the preceding item, up to the point where the preceding item is matched only `n` times. | `a{2,}` matches `aaaaa` in `aaaaa` |
-| Lazy quantifier | `{n,m}?` where `n >= 0` and `m >= n` | Repeats the previous item between `n` and `m` times. Lazy, so repeating `n` times is tried before increasing the repetition to `m` times. | `a{2,4}?` matches `aa`, `aaa` or `aaaa` |
-| Lazy quantifier | `{n,}?` where `n >= 0` | Repeats the previous item `n` or more times. Lazy, so the engine first matches the previous item `n` times, before trying permutations with ever increasing matches of the preceding item. | `a{2,}?` matches `aa` in `aaaaa` |
+| Fixed quantifier | `{n}` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item exactly `n` times. | `a{5}` matches `aaaaa` |
+| Greedy quantifier | `{n,m}` where `n` and `m` are integers: `0 ≤ n ≤ 999` and `n ≤ m ≤ 999` | Repeats the previous item between `n` and `m` times. Greedy, so repeating `m` times is tried before reducing the repetition to `n` times. | `a{2,4}` matches `aaaa`, `aaa` or `aa` |
+| Greedy quantifier | `{n,}` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item at least `n` times. Greedy, so as many items as possible will be matched before trying permutations with less matches of the preceding item, up to the point where the preceding item is matched only `n` times. | `a{2,}` matches `aaaaa` in `aaaaa` |
+| Lazy quantifier | `{n,m}?` where `n` and `m` are integers `0 ≤ n ≤ 999` and `n ≤ m ≤ 999` | Repeats the previous item between `n` and `m` times. Lazy, so repeating `n` times is tried before increasing the repetition to `m` times. | `a{2,4}?` matches `aa`, `aaa` or `aaaa` |
+| Lazy quantifier | `{n,}?` where `n` is an integer: `0 ≤ n ≤ 999` | Repeats the previous item `n` or more times. Lazy, so the engine first matches the previous item `n` times, before trying permutations with ever increasing matches of the preceding item. | `a{2,}?` matches `aa` in `aaaaa` |
 
 ### Groups
 

@@ -21,9 +21,11 @@
 
 #include <algorithm>
 #include <array>
+#include <cctype>
 #include <numeric>
 #include <stack>
 #include <string>
+#include <vector>
 
 namespace cudf {
 namespace strings {
@@ -45,6 +47,7 @@ enum OperatorType {
   COUNTED_LAZY = 0215,
   NOP          = 0302,  // No operation, internal use only
 };
+#define ITEM_MASK 0300
 
 static reclass ccls_w(CCLASS_W);   // \w
 static reclass ccls_s(CCLASS_S);   // \s
@@ -152,10 +155,10 @@ class regex_parser {
   int id_ccls_d = -1;  // digit
   int id_ccls_D = -1;  // not digit
 
-  char32_t yy;    /* last lex'd Char */
-  int yyclass_id; /* last lex'd class */
-  short yy_min_count;
-  short yy_max_count;
+  char32_t yy{};    /* last lex'd Char */
+  int yyclass_id{}; /* last lex'd class */
+  int16_t yy_min_count{};
+  int16_t yy_max_count{};
 
   bool nextc(char32_t& c)  // return "quoted" == backslash-escape prefix
   {
@@ -454,41 +457,62 @@ class regex_parser {
           return PLUS_LAZY;
         }
         return PLUS;
-      case '{':  // counted repetition
+      case '{':  // counted repetition: {n,m}
       {
-        if (*exprp < '0' || *exprp > '9') break;
-        const char32_t* exprp_backup = exprp;  // in case '}' is not found
-        char buff[8]                 = {0};
-        for (int i = 0; i < 7 && *exprp != '}' && *exprp != ',' && *exprp != 0; i++, exprp++) {
-          buff[i]     = *exprp;
-          buff[i + 1] = 0;
-        }
-        if (*exprp != '}' && *exprp != ',') {
-          exprp = exprp_backup;
-          break;
-        }
-        sscanf(buff, "%hd", &yy_min_count);
-        if (*exprp != ',')
-          yy_max_count = yy_min_count;
-        else {
-          yy_max_count = -1;
-          exprp++;
-          buff[0] = 0;
-          for (int i = 0; i < 7 && *exprp != '}' && *exprp != 0; i++, exprp++) {
-            buff[i]     = *exprp;
-            buff[i + 1] = 0;
+        if (!std::isdigit(*exprp)) { break; }
+
+        // transform char32 to char until null, delimiter, non-digit or end is reached;
+        // returns the number of chars read/transformed
+        auto transform_until = [](char32_t const* input,
+                                  char32_t const* end,
+                                  char* output,
+                                  std::string_view const delimiters) -> int32_t {
+          int32_t count = 0;
+          while (*input != 0 && input < end) {
+            auto const ch = static_cast<char>(*input++);
+            // if ch not a digit or ch is a delimiter, we are done
+            if (!std::isdigit(ch) || delimiters.find(ch) != delimiters.npos) { break; }
+            output[count] = ch;
+            ++count;
           }
-          if (*exprp != '}') {
-            exprp = exprp_backup;
-            break;
+          output[count] = 0;  // null-terminate (for the atoi call)
+          return count;
+        };
+
+        auto const exprp_backup               = exprp;  // save in case matching '}' is not found
+        constexpr auto max_read               = 4;      // 3 digits plus the delimiter
+        constexpr auto max_value              = 999;    // support only 3 digits
+        std::array<char, max_read + 1> buffer = {0};    //(max_read + 1);
+
+        // get left-side (n) value => min_count
+        exprp += transform_until(exprp, exprp + max_read, buffer.data(), "},");
+        auto count = std::atoi(buffer.data());
+        if ((*exprp != '}' && *exprp != ',') || (count > max_value)) {
+          exprp = exprp_backup;  // abort, rollback and
+          break;                 // re-interpret as CHAR
+        }
+        yy_min_count = static_cast<int16_t>(count);
+
+        // get optional right-side (m) value => max_count
+        yy_max_count = yy_min_count;
+        if (*exprp++ == ',') {
+          exprp += transform_until(exprp, exprp + max_read, buffer.data(), "}");
+          count = std::atoi(buffer.data());
+          if ((*exprp != '}') || (count > max_value)) {
+            exprp = exprp_backup;  // abort, rollback and
+            break;                 // re-interpret as CHAR
           }
-          if (buff[0] != 0) sscanf(buff, "%hd", &yy_max_count);
+          // {n,m} and {n,} are both valid
+          yy_max_count = buffer[0] == 0 ? -1 : static_cast<int16_t>(count);
+          ++exprp;
         }
-        exprp++;
+
+        // {n,m}? pattern is lazy counted quantifier
         if (*exprp == '?') {
           exprp++;
           return COUNTED_LAZY;
         }
+        // otherwise, fixed counted quantifier
         return COUNTED;
       }
       case '|': return OR;
@@ -562,6 +586,9 @@ class regex_compiler {
 
   regex_flags flags;
 
+  char32_t yy;
+  int yyclass_id;
+
   inline void pushand(int f, int l) { andstack.push_back({f, l}); }
 
   inline Node popand(int op)
@@ -714,97 +741,70 @@ class regex_compiler {
     lastwasand = true;
   }
 
-  char32_t yy;
-  int yyclass_id;
-
-  void expand_counted(const std::vector<regex_parser::Item>& in,
-                      std::vector<regex_parser::Item>& out)
+  std::vector<regex_parser::Item> expand_counted(std::vector<regex_parser::Item> const& in)
   {
-    std::vector<int> lbra_stack;
-    int rep_start = -1;
-
-    out.clear();
-    for (std::size_t i = 0; i < in.size(); i++) {
-      if (in[i].t != COUNTED && in[i].t != COUNTED_LAZY) {
-        out.push_back(in[i]);
-        if (in[i].t == LBRA || in[i].t == LBRA_NC) {
-          lbra_stack.push_back(i);
-          rep_start = -1;
-        } else if (in[i].t == RBRA) {
-          rep_start = lbra_stack[lbra_stack.size() - 1];
-          lbra_stack.pop_back();
-        } else if ((in[i].t & 0300) != OPERATOR_MASK) {
-          rep_start = i;
+    std::vector<regex_parser::Item> out;
+    std::stack<int> lbra_stack;
+    auto repeat_start_index = -1;
+
+    for (std::size_t index = 0; index < in.size(); index++) {
+      auto const item = in[index];
+
+      if (item.t != COUNTED && item.t != COUNTED_LAZY) {
+        out.push_back(item);
+        if (item.t == LBRA || item.t == LBRA_NC) {
+          lbra_stack.push(index);
+          repeat_start_index = -1;
+        } else if (item.t == RBRA) {
+          repeat_start_index = lbra_stack.top();
+          lbra_stack.pop();
+        } else if ((item.t & ITEM_MASK) != OPERATOR_MASK) {
+          repeat_start_index = index;
         }
       } else {
-        if (rep_start < 0)  // broken regex
-          return;
+        // item is of type COUNTED or COUNTED_LAZY
+        // here we repeat the previous item(s) based on the count range in item
 
-        regex_parser::Item item = in[i];
-        if (item.d.yycount.n <= 0) {
-          // need to erase
-          for (std::size_t j = 0; j < i - rep_start; j++)
-            out.pop_back();
-        } else {
-          // repeat
-          for (int j = 1; j < item.d.yycount.n; j++)
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
+        CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location");
+
+        // range of affected item(s) to repeat
+        auto const begin = in.begin() + repeat_start_index;
+        auto const end   = in.begin() + index;
+        // count range values
+        auto const n = item.d.yycount.n;  // minimum count
+        auto const m = item.d.yycount.m;  // maximum count
+
+        assert(n >= 0 && "invalid repeat count value n");
+        // zero-repeat edge-case: need to erase the previous items
+        if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); }
+
+        // minimum repeats (n)
+        for (int j = 1; j < n; j++) {
+          out.insert(out.end(), begin, end);
         }
 
-        // optional repeats
-        if (item.d.yycount.m >= 0) {
-          for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) {
-            regex_parser::Item o_item;
-            o_item.t    = LBRA_NC;
-            o_item.d.yy = 0;
-            out.push_back(o_item);
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
+        // optional maximum repeats (m)
+        if (m >= 0) {
+          for (int j = n; j < m; j++) {
+            out.push_back(regex_parser::Item{LBRA_NC, 0});
+            out.insert(out.end(), begin, end);
           }
-          for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) {
-            regex_parser::Item o_item;
-            o_item.t    = RBRA;
-            o_item.d.yy = 0;
-            out.push_back(o_item);
-            if (item.t == COUNTED) {
-              o_item.t = QUEST;
-              out.push_back(o_item);
-            } else {
-              o_item.t = QUEST_LAZY;
-              out.push_back(o_item);
-            }
+          for (int j = n; j < m; j++) {
+            out.push_back(regex_parser::Item{RBRA, 0});
+            out.push_back(regex_parser::Item{item.t == COUNTED ? QUEST : QUEST_LAZY, 0});
           }
-        } else  // infinite repeat
-        {
-          regex_parser::Item o_item;
-          o_item.d.yy = 0;
-
-          if (item.d.yycount.n > 0)  // put '+' after last repetition
-          {
-            if (item.t == COUNTED) {
-              o_item.t = PLUS;
-              out.push_back(o_item);
-            } else {
-              o_item.t = PLUS_LAZY;
-              out.push_back(o_item);
-            }
-          } else  // copy it once then put '*'
-          {
-            for (std::size_t k = rep_start; k < i; k++)
-              out.push_back(in[k]);
-
-            if (item.t == COUNTED) {
-              o_item.t = STAR;
-              out.push_back(o_item);
-            } else {
-              o_item.t = STAR_LAZY;
-              out.push_back(o_item);
-            }
+        } else {
+          // infinite repeats
+          if (n > 0) {  // append '+' after last repetition
+            out.push_back(regex_parser::Item{item.t == COUNTED ? PLUS : PLUS_LAZY, 0});
+          } else {  // copy it once then append '*'
+            out.insert(out.end(), begin, end);
+            out.push_back(regex_parser::Item{item.t == COUNTED ? STAR : STAR_LAZY, 0});
           }
         }
       }
     }
+    return out;
   }
 
  public:
@@ -819,23 +819,17 @@ class regex_compiler {
       yyclass_id(0)
   {
     // Parse
-    std::vector<regex_parser::Item> items;
-    {
+    std::vector<regex_parser::Item> const items = [&] {
       regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, m_prog);
-
-      // Expand counted repetitions
-      if (parser.m_has_counted)
-        expand_counted(parser.m_items, items);
-      else
-        items = parser.m_items;
-    }
+      return parser.m_has_counted ? expand_counted(parser.m_items) : parser.m_items;
+    }();
 
     /* Start with a low priority operator to prime parser */
     pushator(START - 1);
 
     for (int i = 0; i < static_cast<int>(items.size()); i++) {
-      regex_parser::Item item = items[i];
-      int token               = item.t;
+      auto const item = items[i];
+      int token       = item.t;
       if (token == CCLASS || token == NCCLASS)
         yyclass_id = item.d.yyclass_id;
       else
@@ -1109,12 +1103,12 @@ void reprog::print(regex_flags const flags)
     if (cls.builtins) {
       int mask = cls.builtins;
       printf("   builtins(x%02X):", static_cast<unsigned>(mask));
-      if (mask & 1) printf(" \\w");
-      if (mask & 2) printf(" \\s");
-      if (mask & 4) printf(" \\d");
-      if (mask & 8) printf(" \\W");
-      if (mask & 16) printf(" \\S");
-      if (mask & 32) printf(" \\D");
+      if (mask & CCLASS_W) printf(" \\w");
+      if (mask & CCLASS_S) printf(" \\s");
+      if (mask & CCLASS_D) printf(" \\d");
+      if (mask & NCCLASS_W) printf(" \\W");
+      if (mask & NCCLASS_S) printf(" \\S");
+      if (mask & NCCLASS_D) printf(" \\D");
     }
     printf("\n");
   }