From 9a57a4d9c752070d498c462bb5b75498bf267760 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 3 Jun 2022 15:17:17 -0400
Subject: [PATCH 1/7] Cleanup regex reclass and reclass_device classes

---
 cpp/src/strings/regex/regcomp.cpp    | 125 +++++++++++++--------------
 cpp/src/strings/regex/regcomp.h      |  24 +++--
 cpp/src/strings/regex/regex.cuh      |   2 +-
 cpp/src/strings/regex/regex.inl      |   7 +-
 cpp/src/strings/regex/regexec.cu     |  23 +++--
 cpp/tests/strings/contains_tests.cpp |  17 ++++
 6 files changed, 112 insertions(+), 86 deletions(-)
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index fdf4609e336..e5a4f7e0fa1 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -19,6 +19,9 @@
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
 #include <algorithm>
 #include <array>
 #include <cctype>
@@ -106,16 +109,16 @@ int32_t reprog::add_inst(int32_t t)
   return add_inst(inst);
 }
 
-int32_t reprog::add_inst(reinst inst)
+int32_t reprog::add_inst(reinst const& inst)
 {
   _insts.push_back(inst);
-  return static_cast<int>(_insts.size() - 1);
+  return static_cast<int32_t>(_insts.size() - 1);
 }
 
-int32_t reprog::add_class(reclass cls)
+int32_t reprog::add_class(reclass const& cls)
 {
   _classes.push_back(cls);
-  return static_cast<int>(_classes.size() - 1);
+  return static_cast<int32_t>(_classes.size() - 1);
 }
 
 reinst& reprog::inst_at(int32_t id) { return _insts[id]; }
@@ -134,9 +137,11 @@ void reprog::set_groups_count(int32_t groups) { _num_capturing_groups = groups;
 
 int32_t reprog::groups_count() const { return _num_capturing_groups; }
 
-const reinst* reprog::insts_data() const { return _insts.data(); }
+reinst const* reprog::insts_data() const { return _insts.data(); }
+
+reclass const* reprog::classes_data() const { return _classes.data(); }
 
-const int32_t* reprog::starts_data() const { return _startinst_ids.data(); }
+int32_t const* reprog::starts_data() const { return _startinst_ids.data(); }
 
 int32_t reprog::starts_count() const { return static_cast<int>(_startinst_ids.size()); }
 
@@ -178,28 +183,25 @@ class regex_parser {
   int bldcclass()
   {
     int type = CCLASS;
-    std::vector<char32_t> cls;
+    std::vector<char32_t> literals;
     int builtins = 0;
 
-    /* look ahead for negation */
-    /* SPECIAL CASE!!! negated classes don't match \n */
-    char32_t c = 0;
-    int quoted = nextc(c);
+    // look ahead for negation
+    char32_t c  = 0;
+    auto quoted = nextc(c);
     if (!quoted && c == '^') {
       type   = NCCLASS;
       quoted = nextc(c);
-      cls.push_back('\n');
-      cls.push_back('\n');
+      // negated classes also don't match '\n'
+      literals.push_back('\n');
+      literals.push_back('\n');
     }
 
-    /* parse class into a set of spans */
+    // parse class into a set of spans
     int count_char = 0;
     while (true) {
       count_char++;
-      if (c == 0) {
-        // malformed '[]'
-        return 0;
-      }
+      if (c == 0) { return 0; }  // malformed '[]'
       if (quoted) {
         switch (c) {
           case 'n': c = '\n'; break;
@@ -236,55 +238,47 @@ class regex_parser {
       }
       if (!quoted && c == ']' && count_char > 1) break;
       if (!quoted && c == '-') {
-        if (cls.size() < 1) {
-          // malformed '[]'
-          return 0;
-        }
+        if (literals.empty()) { return 0; }  // malformed '[]'
         quoted = nextc(c);
-        if ((!quoted && c == ']') || c == 0) {
-          // malformed '[]'
-          return 0;
-        }
-        cls[cls.size() - 1] = c;
+        if ((!quoted && c == ']') || c == 0) { return 0; }  // malformed '[]'
+        literals.back() = c;
       } else {
-        cls.push_back(c);
-        cls.push_back(c);
+        literals.push_back(c);
+        literals.push_back(c);
       }
       quoted = nextc(c);
     }
 
-    /* sort on span start */
-    for (std::size_t p = 0; p < cls.size(); p += 2)
-      for (std::size_t np = p + 2; np < cls.size(); np += 2)
-        if (cls[np] < cls[p]) {
-          c           = cls[np];
-          cls[np]     = cls[p];
-          cls[p]      = c;
-          c           = cls[np + 1];
-          cls[np + 1] = cls[p + 1];
-          cls[p + 1]  = c;
-        }
-
-    /* merge spans */
-    reclass yycls{builtins};
-    if (cls.size() >= 2) {
-      int np        = 0;
-      std::size_t p = 0;
-      yycls.literals += cls[p++];
-      yycls.literals += cls[p++];
-      for (; p < cls.size(); p += 2) {
-        /* overlapping or adjacent ranges? */
-        if (cls[p] <= yycls.literals[np + 1] + 1) {
-          if (cls[p + 1] >= yycls.literals[np + 1])
-            yycls.literals.replace(np + 1, 1, 1, cls[p + 1]); /* coalesce */
-        } else {
-          np += 2;
-          yycls.literals += cls[p];
-          yycls.literals += cls[p + 1];
+    // transform pairs of literals to spans
+    std::vector<reclass_span> spans;
+    auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                                       [](auto i) { return i * 2; });
+    std::transform(
+      evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) {
+        return reclass_span{literals[idx], literals[idx + 1]};
+      });
+    // sort the spans to help with detecting overlapping entries
+    std::sort(spans.begin(), spans.end(), [](auto l, auto r) {
+      return l.first == r.first ? l.last < r.last : l.first < r.first;
+    });
+    // combine overlapping entries
+    if (spans.size() > 1) {
+      for (auto itr = spans.begin() + 1; itr < spans.end(); ++itr) {
+        auto const prev = *(itr - 1);
+        auto const curr = *itr;
+        if (curr.first <= prev.last + 1) {
+          // if these 2 spans intersect, expand the current one
+          *itr = reclass_span{prev.first, std::max(prev.last, curr.last)};
         }
       }
     }
-    yyclass_id = m_prog.add_class(yycls);
+    // remove duplicates
+    std::reverse(spans.begin(), spans.end());  // moves larger overlaps forward
+    auto const end =  // this relies on std::unique keeping the first entry in a repeated sequence
+      std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; });
+    spans.erase(end, spans.end());
+
+    yyclass_id = m_prog.add_class(reclass{builtins, std::move(spans)});
     return type;
   }
 
@@ -340,8 +334,7 @@ class regex_parser {
           case 'W': {
             if (id_ccls_W < 0) {
               reclass cls = ccls_w;
-              cls.literals += '\n';
-              cls.literals += '\n';
+              cls.literals.push_back({'\n', '\n'});
               yyclass_id = m_prog.add_class(cls);
               id_ccls_W  = yyclass_id;
             } else
@@ -375,8 +368,7 @@ class regex_parser {
           case 'D': {
             if (id_ccls_D < 0) {
               reclass cls = ccls_d;
-              cls.literals += '\n';
-              cls.literals += '\n';
+              cls.literals.push_back({'\n', '\n'});
               yyclass_id = m_prog.add_class(cls);
               id_ccls_D  = yyclass_id;
             } else
@@ -1086,15 +1078,16 @@ void reprog::print(regex_flags const flags)
     const reclass& cls = _classes[i];
     auto const size    = static_cast<int>(cls.literals.size());
     printf("%2d: ", i);
-    for (int j = 0; j < size; j += 2) {
-      char32_t c1 = cls.literals[j];
-      char32_t c2 = cls.literals[j + 1];
+    for (int j = 0; j < size; ++j) {
+      auto const l = cls.literals[j];
+      char32_t c1  = l.first;
+      char32_t c2  = l.last;
       if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) {
         printf("0x%02x-0x%02x", static_cast<unsigned>(c1), static_cast<unsigned>(c2));
       } else {
         printf("%c-%c", static_cast<char>(c1), static_cast<char>(c2));
       }
-      if ((j + 2) < size) { printf(", "); }
+      if ((j + 1) < size) { printf(", "); }
     }
     printf("\n");
     if (cls.builtins) {
diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h
index 48395e8cf1f..3d7ba114000 100644
--- a/cpp/src/strings/regex/regcomp.h
+++ b/cpp/src/strings/regex/regcomp.h
@@ -47,14 +47,23 @@ enum InstType {
   END     = 0377   // Terminate: match found
 };
 
+/**
+ * @brief Range used for literals in reclass classes.
+ */
+struct reclass_span {
+  char32_t first{};  /// first character in span
+  char32_t last{};   /// last character in span (inclusive)
+};
+
 /**
  * @brief Class type for regex compiler instruction.
  */
 struct reclass {
-  int32_t builtins{0};      // bit mask identifying builtin classes
-  std::u32string literals;  // ranges as pairs of utf-8 characters
+  int32_t builtins{0};  // bit mask identifying builtin classes
+  std::vector<reclass_span> literals;
   reclass() {}
   reclass(int m) : builtins(m) {}
+  reclass(int m, std::vector<reclass_span>&& l) : builtins(m), literals(std::move(l)) {}
 };
 
 constexpr int32_t CCLASS_W{1 << 0};   // [a-z], [A-Z], [0-9], and '_'
@@ -105,18 +114,19 @@ class reprog {
   static reprog create_from(std::string_view pattern, regex_flags const flags);
 
   int32_t add_inst(int32_t type);
-  int32_t add_inst(reinst inst);
-  int32_t add_class(reclass cls);
+  int32_t add_inst(reinst const& inst);
+  int32_t add_class(reclass const& cls);
 
   void set_groups_count(int32_t groups);
   [[nodiscard]] int32_t groups_count() const;
 
-  [[nodiscard]] const reinst* insts_data() const;
   [[nodiscard]] int32_t insts_count() const;
-  reinst& inst_at(int32_t id);
+  [[nodiscard]] reinst& inst_at(int32_t id);
+  [[nodiscard]] reinst const* insts_data() const;
 
-  reclass& class_at(int32_t id);
   [[nodiscard]] int32_t classes_count() const;
+  [[nodiscard]] reclass& class_at(int32_t id);
+  [[nodiscard]] reclass const* classes_data() const;
 
   [[nodiscard]] const int32_t* starts_data() const;
   [[nodiscard]] int32_t starts_count() const;
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 2ee195a2c5e..29a3d7e70ec 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -51,7 +51,7 @@ constexpr int32_t MINIMUM_THREADS     = 256;  // Minimum threads for computing w
 struct alignas(16) reclass_device {
   int32_t builtins{};
   int32_t count{};
-  char32_t const* literals{};
+  reclass_span const* literals{};
 
   __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const;
 };
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 8e2194f2094..a8d13e236db 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -141,7 +141,8 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch,
                                                          uint8_t const* codepoint_flags) const
 {
   for (int i = 0; i < count; ++i) {
-    if ((ch >= literals[i * 2]) && (ch <= literals[(i * 2) + 1])) { return true; }
+    auto const literal = literals[i];
+    if ((ch >= literal.first) && (ch <= literal.last)) { return true; }
   }
 
   if (!builtins) return false;
@@ -204,11 +205,11 @@ __device__ __forceinline__ void reprog_device::store(void* buffer) const
   auto classes     = reinterpret_cast<reclass_device*>(ptr);
   result->_classes = classes;
   // fill in each class
-  auto d_ptr = reinterpret_cast<char32_t*>(classes + _classes_count);
+  auto d_ptr = reinterpret_cast<reclass_span*>(classes + _classes_count);
   for (int idx = 0; idx < _classes_count; ++idx) {
     classes[idx]          = _classes[idx];
     classes[idx].literals = d_ptr;
-    for (int jdx = 0; jdx < _classes[idx].count * 2; ++jdx)
+    for (int jdx = 0; jdx < _classes[idx].count; ++jdx)
       *d_ptr++ = _classes[idx].literals[jdx];
   }
 }
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 16f5b6fa03d..d504e469c44 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -25,6 +25,8 @@
 #include <rmm/device_buffer.hpp>
 
 #include <algorithm>
+#include <functional>
+#include <numeric>
 
 namespace cudf {
 namespace strings {
@@ -63,9 +65,12 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   // compute size of each section
   auto insts_size    = insts_count * sizeof(_insts[0]);
   auto startids_size = starts_count * sizeof(_startinst_ids[0]);
-  auto classes_size  = classes_count * sizeof(_classes[0]);
-  for (auto idx = 0; idx < classes_count; ++idx)
-    classes_size += static_cast<int32_t>((h_prog.class_at(idx).literals.size()) * sizeof(char32_t));
+  auto classes_size  = std::transform_reduce(
+    h_prog.classes_data(),
+    h_prog.classes_data() + h_prog.classes_count(),
+    classes_count * sizeof(_classes[0]),
+    std::plus<std::size_t>{},
+    [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_span); });
   // make sure each section is aligned for the subsequent section's data type
   auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) +
                        cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) +
@@ -104,14 +109,14 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   auto d_end = d_ptr + (classes_count * sizeof(reclass_device));
   // place each class and append the variable length data
   for (int32_t idx = 0; idx < classes_count; ++idx) {
-    reclass& h_class = h_prog.class_at(idx);
+    auto h_class = h_prog.class_at(idx);
     reclass_device d_class{h_class.builtins,
-                           static_cast<int32_t>(h_class.literals.size() / 2),
-                           reinterpret_cast<char32_t*>(d_end)};
+                           static_cast<int32_t>(h_class.literals.size()),
+                           reinterpret_cast<reclass_span*>(d_end)};
     *classes++ = d_class;
-    memcpy(h_end, h_class.literals.c_str(), h_class.literals.size() * sizeof(char32_t));
-    h_end += h_class.literals.size() * sizeof(char32_t);
-    d_end += h_class.literals.size() * sizeof(char32_t);
+    memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_span));
+    h_end += h_class.literals.size() * sizeof(reclass_span);
+    d_end += h_class.literals.size() * sizeof(reclass_span);
   }
 
   // initialize the rest of the elements
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 9df22503c07..21c18977746 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -406,6 +406,23 @@ TEST_F(StringsContainsTests, FixedQuantifier)
   }
 }
 
+TEST_F(StringsContainsTests, OverlappedClasses)
+{
+  auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"});
+  auto sv = cudf::strings_column_view(input);
+
+  {
+    auto results = cudf::strings::count_re(sv, "[e-gb-da-c]");
+    cudf::test::fixed_width_column_wrapper<int32_t> expected({7, 4, 0, 0, 1});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  }
+  {
+    auto results = cudf::strings::count_re(sv, "[á-éê-ú]");
+    cudf::test::fixed_width_column_wrapper<int32_t> expected({0, 1, 0, 6, 0});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  }
+}
+
 TEST_F(StringsContainsTests, MultiLine)
 {
   auto input =

From f11ebe1836a3a7aa814ce55d4ce8a0c571210986 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 6 Jun 2022 11:26:37 -0400
Subject: [PATCH 2/7] rename reclass_span reclass_range

---
 cpp/src/strings/regex/regcomp.cpp |  6 +++---
 cpp/src/strings/regex/regcomp.h   |  6 +++---
 cpp/src/strings/regex/regex.cuh   |  2 +-
 cpp/src/strings/regex/regex.inl   |  2 +-
 cpp/src/strings/regex/regexec.cu  | 10 +++++-----
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index e5a4f7e0fa1..5133a167072 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -250,12 +250,12 @@ class regex_parser {
     }
 
     // transform pairs of literals to spans
-    std::vector<reclass_span> spans;
+    std::vector<reclass_range> spans;
     auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                                        [](auto i) { return i * 2; });
     std::transform(
       evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) {
-        return reclass_span{literals[idx], literals[idx + 1]};
+        return reclass_range{literals[idx], literals[idx + 1]};
       });
     // sort the spans to help with detecting overlapping entries
     std::sort(spans.begin(), spans.end(), [](auto l, auto r) {
@@ -268,7 +268,7 @@ class regex_parser {
         auto const curr = *itr;
         if (curr.first <= prev.last + 1) {
           // if these 2 spans intersect, expand the current one
-          *itr = reclass_span{prev.first, std::max(prev.last, curr.last)};
+          *itr = reclass_range{prev.first, std::max(prev.last, curr.last)};
         }
       }
     }
diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h
index 3d7ba114000..10092137c77 100644
--- a/cpp/src/strings/regex/regcomp.h
+++ b/cpp/src/strings/regex/regcomp.h
@@ -50,7 +50,7 @@ enum InstType {
 /**
  * @brief Range used for literals in reclass classes.
  */
-struct reclass_span {
+struct reclass_range {
   char32_t first{};  /// first character in span
   char32_t last{};   /// last character in span (inclusive)
 };
@@ -60,10 +60,10 @@ struct reclass_span {
  */
 struct reclass {
   int32_t builtins{0};  // bit mask identifying builtin classes
-  std::vector<reclass_span> literals;
+  std::vector<reclass_range> literals;
   reclass() {}
   reclass(int m) : builtins(m) {}
-  reclass(int m, std::vector<reclass_span>&& l) : builtins(m), literals(std::move(l)) {}
+  reclass(int m, std::vector<reclass_range>&& l) : builtins(m), literals(std::move(l)) {}
 };
 
 constexpr int32_t CCLASS_W{1 << 0};   // [a-z], [A-Z], [0-9], and '_'
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 29a3d7e70ec..e899c84a48d 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -51,7 +51,7 @@ constexpr int32_t MINIMUM_THREADS     = 256;  // Minimum threads for computing w
 struct alignas(16) reclass_device {
   int32_t builtins{};
   int32_t count{};
-  reclass_span const* literals{};
+  reclass_range const* literals{};
 
   __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const;
 };
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index a8d13e236db..fb2db86ab0b 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -205,7 +205,7 @@ __device__ __forceinline__ void reprog_device::store(void* buffer) const
   auto classes     = reinterpret_cast<reclass_device*>(ptr);
   result->_classes = classes;
   // fill in each class
-  auto d_ptr = reinterpret_cast<reclass_span*>(classes + _classes_count);
+  auto d_ptr = reinterpret_cast<reclass_range*>(classes + _classes_count);
   for (int idx = 0; idx < _classes_count; ++idx) {
     classes[idx]          = _classes[idx];
     classes[idx].literals = d_ptr;
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index d504e469c44..99f6692f023 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -70,7 +70,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
     h_prog.classes_data() + h_prog.classes_count(),
     classes_count * sizeof(_classes[0]),
     std::plus<std::size_t>{},
-    [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_span); });
+    [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_range); });
   // make sure each section is aligned for the subsequent section's data type
   auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) +
                        cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) +
@@ -112,11 +112,11 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
     auto h_class = h_prog.class_at(idx);
     reclass_device d_class{h_class.builtins,
                            static_cast<int32_t>(h_class.literals.size()),
-                           reinterpret_cast<reclass_span*>(d_end)};
+                           reinterpret_cast<reclass_range*>(d_end)};
     *classes++ = d_class;
-    memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_span));
-    h_end += h_class.literals.size() * sizeof(reclass_span);
-    d_end += h_class.literals.size() * sizeof(reclass_span);
+    memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_range));
+    h_end += h_class.literals.size() * sizeof(reclass_range);
+    d_end += h_class.literals.size() * sizeof(reclass_range);
   }
 
   // initialize the rest of the elements

From 6914f4d267c0fe4be34d724a4a187d00a79331b1 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 16 Jun 2022 10:42:13 -0400
Subject: [PATCH 3/7] expand static reclass var names

---
 cpp/src/strings/regex/regcomp.cpp | 58 +++++++++++++++----------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 47e29e04aad..4eb040ba526 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -53,12 +53,12 @@ enum OperatorType {
 };
 #define ITEM_MASK 0300
 
-static reclass ccls_w(CCLASS_W);   // \w
-static reclass ccls_s(CCLASS_S);   // \s
-static reclass ccls_d(CCLASS_D);   // \d
-static reclass ccls_W(NCCLASS_W);  // \W
-static reclass ccls_S(NCCLASS_S);  // \S
-static reclass ccls_D(NCCLASS_D);  // \D
+static reclass cclass_w(CCLASS_W);   // \w
+static reclass cclass_s(CCLASS_S);   // \s
+static reclass cclass_d(CCLASS_D);   // \d
+static reclass cclass_W(NCCLASS_W);  // \W
+static reclass cclass_S(NCCLASS_S);  // \S
+static reclass cclass_D(NCCLASS_D);  // \D
 
 // Tables for analyzing quantifiers
 const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
@@ -241,32 +241,32 @@ class regex_parser {
           case 'b': chr = 0x08; break;
           case 'f': chr = 0x0C; break;
           case 'w':
-            builtins |= ccls_w.builtins;
+            builtins |= cclass_w.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
           case 's':
-            builtins |= ccls_s.builtins;
+            builtins |= cclass_s.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
           case 'd':
-            builtins |= ccls_d.builtins;
+            builtins |= cclass_d.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
           case 'W':
-            builtins |= ccls_W.builtins;
+            builtins |= cclass_W.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
           case 'S':
-            builtins |= ccls_S.builtins;
+            builtins |= cclass_S.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
           case 'D':
-            builtins |= ccls_D.builtins;
+            builtins |= cclass_D.builtins;
             std::tie(is_quoted, chr) = next_char();
             continue;
         }
       }
-      if (!is_quoted && chr == ']' && count_char > 1) break;
+      if (!is_quoted && chr == ']' && count_char > 1) { break; }  // done
       if (!is_quoted && chr == '-') {
         if (literals.empty()) { return 0; }  // malformed '[]'
         std::tie(is_quoted, chr) = next_char();
@@ -280,18 +280,16 @@ class regex_parser {
     }
 
     // transform pairs of literals to spans
-    std::vector<reclass_range> spans;
-    auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
-                                                       [](auto i) { return i * 2; });
-    std::transform(
-      evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) {
-        return reclass_range{literals[idx], literals[idx + 1]};
-      });
+    std::vector<reclass_range> spans(literals.size() / 2);
+    auto const counter = thrust::make_counting_iterator(0);
+    std::transform(counter, counter + spans.size(), spans.begin(), [&literals](auto idx) {
+      return reclass_range{literals[idx * 2], literals[idx * 2 + 1]};
+    });
     // sort the spans to help with detecting overlapping entries
     std::sort(spans.begin(), spans.end(), [](auto l, auto r) {
       return l.first == r.first ? l.last < r.last : l.first < r.first;
     });
-    // combine overlapping entries
+    // combine overlapping entries: [a-f][c-g] => [a-g]
     for (auto itr = spans.begin() + static_cast<int>(!spans.empty()); itr < spans.end(); ++itr) {
       auto const prev = *(itr - 1);
       auto const curr = *itr;
@@ -300,11 +298,11 @@ class regex_parser {
         *itr = reclass_range{prev.first, std::max(prev.last, curr.last)};
       }
     }
-    // remove duplicates
+    // remove any duplicates
     std::reverse(spans.begin(), spans.end());  // moves larger overlaps forward
-    auto const end =  // this relies on std::unique keeping the first entry in a repeated sequence
+    auto const end =  // std::unique specifies keeping the first entry in a repeated sequence
       std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; });
-    spans.erase(end, spans.end());
+    spans.erase(end, spans.end());  // clear the remainder
 
     _cclass_id = _prog.add_class(reclass{builtins, std::move(spans)});
     return type;
@@ -355,13 +353,13 @@ class regex_parser {
             break;
           }
           case 'w': {
-            if (_id_cclass_w < 0) { _id_cclass_w = _prog.add_class(ccls_w); }
+            if (_id_cclass_w < 0) { _id_cclass_w = _prog.add_class(cclass_w); }
             _cclass_id = _id_cclass_w;
             return CCLASS;
           }
           case 'W': {
             if (_id_cclass_W < 0) {
-              reclass cls = ccls_w;
+              reclass cls = cclass_w;
               cls.literals.push_back({'\n', '\n'});
               _id_cclass_W = _prog.add_class(cls);
             }
@@ -369,23 +367,23 @@ class regex_parser {
             return NCCLASS;
           }
           case 's': {
-            if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(ccls_s); }
+            if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(cclass_s); }
             _cclass_id = _id_cclass_s;
             return CCLASS;
           }
           case 'S': {
-            if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(ccls_s); }
+            if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(cclass_s); }
             _cclass_id = _id_cclass_s;
             return NCCLASS;
           }
           case 'd': {
-            if (_id_cclass_d < 0) { _id_cclass_d = _prog.add_class(ccls_d); }
+            if (_id_cclass_d < 0) { _id_cclass_d = _prog.add_class(cclass_d); }
             _cclass_id = _id_cclass_d;
             return CCLASS;
           }
           case 'D': {
             if (_id_cclass_D < 0) {
-              reclass cls = ccls_d;
+              reclass cls = cclass_d;
               cls.literals.push_back({'\n', '\n'});
               _id_cclass_D = _prog.add_class(cls);
             }

From 21fa745931e17c9391b81475b6a01fe7ccffb6c3 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 16 Jun 2022 13:06:30 -0400
Subject: [PATCH 4/7] remove unneeded include

---
 cpp/src/strings/regex/regcomp.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 4eb040ba526..dbde7700672 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -20,7 +20,6 @@
 #include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 
 #include <algorithm>
 #include <array>

From aad80306075b5c8fca0d57a777c68b3f36d81e91 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 16 Jun 2022 14:27:15 -0400
Subject: [PATCH 5/7] rename var spans to ranges

---
 cpp/src/strings/regex/regcomp.cpp | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index dbde7700672..0bf2693c708 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -278,32 +278,31 @@ class regex_parser {
       std::tie(is_quoted, chr) = next_char();
     }
 
-    // transform pairs of literals to spans
-    std::vector<reclass_range> spans(literals.size() / 2);
+    // transform pairs of literals to ranges
+    std::vector<reclass_range> ranges(literals.size() / 2);
     auto const counter = thrust::make_counting_iterator(0);
-    std::transform(counter, counter + spans.size(), spans.begin(), [&literals](auto idx) {
+    std::transform(counter, counter + ranges.size(), ranges.begin(), [&literals](auto idx) {
       return reclass_range{literals[idx * 2], literals[idx * 2 + 1]};
     });
-    // sort the spans to help with detecting overlapping entries
-    std::sort(spans.begin(), spans.end(), [](auto l, auto r) {
+    // sort the ranges to help with detecting overlapping entries
+    std::sort(ranges.begin(), ranges.end(), [](auto l, auto r) {
       return l.first == r.first ? l.last < r.last : l.first < r.first;
     });
     // combine overlapping entries: [a-f][c-g] => [a-g]
-    for (auto itr = spans.begin() + static_cast<int>(!spans.empty()); itr < spans.end(); ++itr) {
+    for (auto itr = ranges.begin() + static_cast<int>(!ranges.empty()); itr < ranges.end(); ++itr) {
       auto const prev = *(itr - 1);
-      auto const curr = *itr;
-      if (curr.first <= prev.last + 1) {
-        // if these 2 spans intersect, expand the current one
-        *itr = reclass_range{prev.first, std::max(prev.last, curr.last)};
+      if (itr->first <= prev.last + 1) {
+        // if these 2 ranges intersect, expand the current one
+        *itr = reclass_range{prev.first, std::max(prev.last, itr->last)};
       }
     }
     // remove any duplicates
-    std::reverse(spans.begin(), spans.end());  // moves larger overlaps forward
+    std::reverse(ranges.begin(), ranges.end());  // moves larger overlaps forward
     auto const end =  // std::unique specifies keeping the first entry in a repeated sequence
-      std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; });
-    spans.erase(end, spans.end());  // clear the remainder
+      std::unique(ranges.begin(), ranges.end(), [](auto l, auto r) { return l.first == r.first; });
+    ranges.erase(end, ranges.end());  // clear the remaining items
 
-    _cclass_id = _prog.add_class(reclass{builtins, std::move(spans)});
+    _cclass_id = _prog.add_class(reclass{builtins, std::move(ranges)});
     return type;
   }
 

From a12cea633c2bf6c9518e10114ea92136231b287c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 16 Jun 2022 18:37:08 -0400
Subject: [PATCH 6/7] use rbegin/rend with std::unique

---
 cpp/src/strings/regex/regcomp.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 0bf2693c708..992d66a5ff4 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -289,18 +289,19 @@ class regex_parser {
       return l.first == r.first ? l.last < r.last : l.first < r.first;
     });
     // combine overlapping entries: [a-f][c-g] => [a-g]
-    for (auto itr = ranges.begin() + static_cast<int>(!ranges.empty()); itr < ranges.end(); ++itr) {
-      auto const prev = *(itr - 1);
-      if (itr->first <= prev.last + 1) {
-        // if these 2 ranges intersect, expand the current one
-        *itr = reclass_range{prev.first, std::max(prev.last, itr->last)};
+    if (ranges.size() > 1) {
+      for (auto itr = ranges.begin() + 1; itr < ranges.end(); ++itr) {
+        auto const prev = *(itr - 1);
+        if (itr->first <= prev.last + 1) {
+          // if these 2 ranges intersect, expand the current one
+          *itr = reclass_range{prev.first, std::max(prev.last, itr->last)};
+        }
       }
     }
     // remove any duplicates
-    std::reverse(ranges.begin(), ranges.end());  // moves larger overlaps forward
-    auto const end =  // std::unique specifies keeping the first entry in a repeated sequence
-      std::unique(ranges.begin(), ranges.end(), [](auto l, auto r) { return l.first == r.first; });
-    ranges.erase(end, ranges.end());  // clear the remaining items
+    auto const end = std::unique(
+      ranges.rbegin(), ranges.rend(), [](auto l, auto r) { return l.first == r.first; });
+    ranges.erase(ranges.begin(), ranges.begin() + std::distance(end, ranges.rend()));
 
     _cclass_id = _prog.add_class(reclass{builtins, std::move(ranges)});
     return type;

From 7d3539b10619ab1e3ca17abfe48da6898fab27bb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 23 Jun 2022 13:45:59 -0400
Subject: [PATCH 7/7] fix h_class var declaration to prevent unneeded copy

---
 cpp/src/strings/regex/regexec.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 99f6692f023..2ee1901c32d 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -109,7 +109,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   auto d_end = d_ptr + (classes_count * sizeof(reclass_device));
   // place each class and append the variable length data
   for (int32_t idx = 0; idx < classes_count; ++idx) {
-    auto h_class = h_prog.class_at(idx);
+    auto const& h_class = h_prog.class_at(idx);
     reclass_device d_class{h_class.builtins,
                            static_cast<int32_t>(h_class.literals.size()),
                            reinterpret_cast<reclass_range*>(d_end)};