Skip to content

Commit

Permalink
Refactor regex builtin character-class identifiers (#10814)
Browse files Browse the repository at this point in the history
Refactors the builtin regex class integer ids to a common header for the compiler and executor.
The builtin regex character classes like `\s, \d, \W` have integer identifiers (bit values that can be combined) but were defined in separate source files. This PR refactors the declarations to the common header file `regcomp.h` to ensure the same value is used when parsing/compiling the instructions in `regcomp.cpp` and when evaluating the instructions in `regex.inl`.

This is just a cleanup of the code and does not affect behavior or performance.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #10814
  • Loading branch information
davidwendt authored May 11, 2022
1 parent 2aaa863 commit efd2c39
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 23 deletions.
14 changes: 7 additions & 7 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,17 @@ enum OperatorType {
NOP = 0302, // No operation, internal use only
};

static reclass ccls_w(1); // [a-z], [A-Z], [0-9], and '_'
static reclass ccls_W(8); // now ccls_w plus '\n'
static reclass ccls_s(2); // all spaces or ctrl characters
static reclass ccls_S(16); // not ccls_s
static reclass ccls_d(4); // digits [0-9]
static reclass ccls_D(32); // not ccls_d plus '\n'
static reclass ccls_w(CCLASS_W); // \w
static reclass ccls_s(CCLASS_S); // \s
static reclass ccls_d(CCLASS_D); // \d
static reclass ccls_W(NCCLASS_W); // \W
static reclass ccls_S(NCCLASS_S); // \S
static reclass ccls_D(NCCLASS_D); // \D

// Tables for analyzing quantifiers
const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
// Valid regex characters that can be escaping to be used as literals
// Valid regex characters that can be escaped and used as literals
const std::array<char, 33> escapable_chars{
{'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>',
'"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}};
Expand Down
31 changes: 21 additions & 10 deletions cpp/src/strings/regex/regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ enum InstType {
BOL = 0303, // Beginning of line, ^
EOL = 0304, // End of line, $
CCLASS = 0305, // Character class, []
NCCLASS = 0306, // Negated character class, []
BOW = 0307, // Boundary of word, /b
NBOW = 0310, // Not boundary of word, /b
NCCLASS = 0306, // Negated character class, [^ ]
BOW = 0307, // Boundary of word, \b
NBOW = 0310, // Not boundary of word, \B
END = 0377 // Terminate: match found
};

Expand All @@ -57,6 +57,13 @@ struct reclass {
reclass(int m) : builtins(m) {}
};

// Identifiers for the builtin regex character classes. Each identifier owns a
// distinct bit, so several classes can be OR-ed together into a single mask.
constexpr int32_t CCLASS_W  = 0x01;  // \w : [a-z], [A-Z], [0-9], and '_'
constexpr int32_t CCLASS_S  = 0x02;  // \s : all spaces or ctrl characters
constexpr int32_t CCLASS_D  = 0x04;  // \d : digits [0-9]
constexpr int32_t NCCLASS_W = 0x08;  // \W : neither a CCLASS_W character nor '\n'
constexpr int32_t NCCLASS_S = 0x10;  // \S : any character that is not CCLASS_S
constexpr int32_t NCCLASS_D = 0x20;  // \D : neither a CCLASS_D character nor '\n'

/**
* @brief Structure of an encoded regex instruction
*/
Expand All @@ -76,21 +83,24 @@ struct reinst {
};

/**
* @brief Regex program handles parsing a pattern in to individual set
* @brief Regex program handles parsing a pattern into a vector
* of chained instructions.
*/
class reprog {
public:
reprog() = default;
reprog(const reprog&) = default;
reprog(reprog&&) = default;
~reprog() = default;
reprog& operator=(const reprog&) = default;
reprog& operator=(reprog&&) = default;

/**
* @brief Parses the given regex pattern and compiles
* into a list of chained instructions.
* @brief Parses the given regex pattern and produces an instance
* of this object
*
* @param pattern Regex pattern encoded as UTF-8
* @param flags For interpreting certain `pattern` characters
* @return Instance of reprog
*/
static reprog create_from(std::string_view pattern, regex_flags const flags);

Expand Down Expand Up @@ -122,12 +132,13 @@ class reprog {
#endif

private:
std::vector<reinst> _insts;
std::vector<reclass> _classes;
int32_t _startinst_id;
std::vector<reinst> _insts; // instructions
std::vector<reclass> _classes; // data for CCLASS instructions
int32_t _startinst_id{}; // id of first instruction
std::vector<int32_t> _startinst_ids; // short-cut to speed-up ORs
int32_t _num_capturing_groups{};

reprog() = default;
void check_for_errors(int32_t id, int32_t next_id);
};

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -148,17 +148,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch,
uint32_t codept = utf8_to_codepoint(ch);
if (codept > 0x00FFFF) return false;
int8_t fl = codepoint_flags[codept];
if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
return true;
if ((builtins & 2) && IS_SPACE(fl)) // \s
if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s
return true;
if ((builtins & 4) && IS_DIGIT(fl)) // \d
if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d
return true;
if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
return true;
if ((builtins & 16) && !IS_SPACE(fl)) // \S
if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S
return true;
if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
return true;
//
return false;
Expand Down

0 comments on commit efd2c39

Please sign in to comment.