From 2d913892ea0c432c9452b7f20295ff275e4753da Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 9 May 2022 13:47:01 -0400 Subject: [PATCH 1/2] Refactor regex builtin class identifiers --- cpp/src/strings/regex/regcomp.cpp | 14 +++++++------- cpp/src/strings/regex/regcomp.h | 29 ++++++++++++++++++++--------- cpp/src/strings/regex/regex.inl | 12 ++++++------ 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 829230d0842..f99acc3448a 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -44,17 +44,17 @@ enum OperatorType { NOP = 0302, // No operation, internal use only }; -static reclass ccls_w(1); // [a-z], [A-Z], [0-9], and '_' -static reclass ccls_W(8); // now ccls_w plus '\n' -static reclass ccls_s(2); // all spaces or ctrl characters -static reclass ccls_S(16); // not ccls_s -static reclass ccls_d(4); // digits [0-9] -static reclass ccls_D(32); // not ccls_d plus '\n' +static reclass ccls_w(CCLASS_W); // \w +static reclass ccls_s(CCLASS_S); // \s +static reclass ccls_d(CCLASS_D); // \d +static reclass ccls_W(NCCLASS_W); // \W +static reclass ccls_S(NCCLASS_S); // \S +static reclass ccls_D(NCCLASS_D); // \D // Tables for analyzing quantifiers const std::array valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}}; const std::array quantifiers{{'*', '?', '+', '{', '|'}}; -// Valid regex characters that can be escaping to be used as literals +// Valid regex characters that can be escaped and used as literals const std::array escapable_chars{ {'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>', '"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}}; diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 798b43830b4..c486a23cbbf 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -41,9 +41,9 @@ enum 
InstType { BOL = 0303, // Beginning of line, ^ EOL = 0304, // End of line, $ CCLASS = 0305, // Character class, [] - NCCLASS = 0306, // Negated character class, [] + NCCLASS = 0306, // Negated character class, [^ ] BOW = 0307, // Boundary of word, /b - NBOW = 0310, // Not boundary of word, /b + NBOW = 0310, // Not boundary of word, /B END = 0377 // Terminate: match found }; @@ -57,6 +57,13 @@ struct reclass { reclass(int m) : builtins(m) {} }; +constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' +constexpr int32_t CCLASS_S{1 << 1}; // all spaces or ctrl characters +constexpr int32_t CCLASS_D{1 << 2}; // digits [0-9] +constexpr int32_t NCCLASS_W{1 << 3}; // not cclass_w plus '\n' +constexpr int32_t NCCLASS_S{1 << 4}; // not cclass_s +constexpr int32_t NCCLASS_D{1 << 5}; // not cclass_d plus '\n' + /** * @brief Structure of an encoded regex instruction */ @@ -76,12 +83,11 @@ struct reinst { }; /** - * @brief Regex program handles parsing a pattern in to individual set + * @brief Regex program handles parsing a pattern into a vector * of chained instructions. */ class reprog { public: - reprog() = default; reprog(const reprog&) = default; reprog(reprog&&) = default; ~reprog() = default; @@ -89,8 +95,12 @@ class reprog { reprog& operator=(reprog&&) = default; /** - * @brief Parses the given regex pattern and compiles - * into a list of chained instructions. 
+ * @brief Parses the given regex pattern and produces an instance + * of this object + * + * @param pattern Regex pattern encoded as UTF-8 + * @param flags For interpreting certain `pattern` characters + * @return Instance of reprog */ static reprog create_from(std::string_view pattern, regex_flags const flags); @@ -122,12 +132,13 @@ class reprog { #endif private: - std::vector _insts; - std::vector _classes; - int32_t _startinst_id; + std::vector _insts; // instructions + std::vector _classes; // data for CCLASS instructions + int32_t _startinst_id{}; // id of first instruction std::vector _startinst_ids; // short-cut to speed-up ORs int32_t _num_capturing_groups{}; + reprog() = default; void check_for_errors(int32_t id, int32_t next_id); }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 9fe4440d7ec..8bb12187d72 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -149,17 +149,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00FFFF) return false; int8_t fl = codepoint_flags[codept]; - if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w + if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w return true; - if ((builtins & 2) && IS_SPACE(fl)) // \s + if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s return true; - if ((builtins & 4) && IS_DIGIT(fl)) // \d + if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d return true; - if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W + if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W return true; - if ((builtins & 16) && !IS_SPACE(fl)) // \S + if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S return true; - if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D + if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D return true; // return false; From 
60dcc65a1a3ff6ec290208617af7614f374b4252 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 10 May 2022 12:42:50 -0400 Subject: [PATCH 2/2] fix comment wording NCCLASS types --- cpp/src/strings/regex/regcomp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index c486a23cbbf..162a2090268 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -42,8 +42,8 @@ enum InstType { EOL = 0304, // End of line, $ CCLASS = 0305, // Character class, [] NCCLASS = 0306, // Negated character class, [^ ] - BOW = 0307, // Boundary of word, /b - NBOW = 0310, // Not boundary of word, /B + BOW = 0307, // Boundary of word, \b + NBOW = 0310, // Not boundary of word, \B END = 0377 // Terminate: match found }; @@ -60,9 +60,9 @@ struct reclass { constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' constexpr int32_t CCLASS_S{1 << 1}; // all spaces or ctrl characters constexpr int32_t CCLASS_D{1 << 2}; // digits [0-9] -constexpr int32_t NCCLASS_W{1 << 3}; // not cclass_w plus '\n' -constexpr int32_t NCCLASS_S{1 << 4}; // not cclass_s -constexpr int32_t NCCLASS_D{1 << 5}; // not cclass_d plus '\n' +constexpr int32_t NCCLASS_W{1 << 3}; // not CCLASS_W or '\n' +constexpr int32_t NCCLASS_S{1 << 4}; // not CCLASS_S +constexpr int32_t NCCLASS_D{1 << 5}; // not CCLASS_D or '\n' /** * @brief Structure of an encoded regex instruction