diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 829230d0842..f99acc3448a 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -44,17 +44,17 @@ enum OperatorType { NOP = 0302, // No operation, internal use only }; -static reclass ccls_w(1); // [a-z], [A-Z], [0-9], and '_' -static reclass ccls_W(8); // now ccls_w plus '\n' -static reclass ccls_s(2); // all spaces or ctrl characters -static reclass ccls_S(16); // not ccls_s -static reclass ccls_d(4); // digits [0-9] -static reclass ccls_D(32); // not ccls_d plus '\n' +static reclass ccls_w(CCLASS_W); // \w +static reclass ccls_s(CCLASS_S); // \s +static reclass ccls_d(CCLASS_D); // \d +static reclass ccls_W(NCCLASS_W); // \W +static reclass ccls_S(NCCLASS_S); // \S +static reclass ccls_D(NCCLASS_D); // \D // Tables for analyzing quantifiers const std::array valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}}; const std::array quantifiers{{'*', '?', '+', '{', '|'}}; -// Valid regex characters that can be escaping to be used as literals +// Valid regex characters that can be escaped and used as literals const std::array escapable_chars{ {'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>', '"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}}; diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 798b43830b4..162a2090268 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -41,9 +41,9 @@ enum InstType { BOL = 0303, // Beginning of line, ^ EOL = 0304, // End of line, $ CCLASS = 0305, // Character class, [] - NCCLASS = 0306, // Negated character class, [] - BOW = 0307, // Boundary of word, /b - NBOW = 0310, // Not boundary of word, /b + NCCLASS = 0306, // Negated character class, [^ ] + BOW = 0307, // Boundary of word, \b + NBOW = 0310, // Not boundary of word, \B END = 0377 // Terminate: match found }; 
@@ -57,6 +57,13 @@ struct reclass { reclass(int m) : builtins(m) {} }; +constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' +constexpr int32_t CCLASS_S{1 << 1}; // all spaces or ctrl characters +constexpr int32_t CCLASS_D{1 << 2}; // digits [0-9] +constexpr int32_t NCCLASS_W{1 << 3}; // neither CCLASS_W nor '\n' +constexpr int32_t NCCLASS_S{1 << 4}; // not CCLASS_S +constexpr int32_t NCCLASS_D{1 << 5}; // neither CCLASS_D nor '\n' + /** * @brief Structure of an encoded regex instruction */ @@ -76,12 +83,11 @@ struct reinst { }; /** - * @brief Regex program handles parsing a pattern in to individual set + * @brief Regex program handles parsing a pattern into a vector * of chained instructions. */ class reprog { public: - reprog() = default; reprog(const reprog&) = default; reprog(reprog&&) = default; ~reprog() = default; @@ -89,8 +95,12 @@ class reprog { reprog& operator=(reprog&&) = default; /** - * @brief Parses the given regex pattern and compiles - * into a list of chained instructions. 
+ * @brief Parses the given regex pattern and produces an instance + * of this object + * + * @param pattern Regex pattern encoded as UTF-8 + * @param flags For interpreting certain `pattern` characters + * @return Instance of reprog */ static reprog create_from(std::string_view pattern, regex_flags const flags); @@ -122,12 +132,13 @@ class reprog { #endif private: - std::vector _insts; - std::vector _classes; - int32_t _startinst_id; + std::vector _insts; // instructions + std::vector _classes; // data for CCLASS instructions + int32_t _startinst_id{}; // id of first instruction std::vector _startinst_ids; // short-cut to speed-up ORs int32_t _num_capturing_groups{}; + reprog() = default; void check_for_errors(int32_t id, int32_t next_id); }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index bae6fb275f6..8e2194f2094 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -148,17 +148,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00FFFF) return false; int8_t fl = codepoint_flags[codept]; - if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w + if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w return true; - if ((builtins & 2) && IS_SPACE(fl)) // \s + if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s return true; - if ((builtins & 4) && IS_DIGIT(fl)) // \d + if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d return true; - if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W + if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W return true; - if ((builtins & 16) && !IS_SPACE(fl)) // \S + if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S return true; - if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D + if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D return true; // return false;