Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor regex builtin character-class identifiers #10814

Merged
merged 3 commits into from
May 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,17 @@ enum OperatorType {
NOP = 0302, // No operation, internal use only
};

// Built-in character classes identified by bare bit values (1, 2, 4, 8, 16, 32).
static reclass ccls_w(1); // \w: [a-z], [A-Z], [0-9], and '_'
static reclass ccls_W(8); // \W: neither a ccls_w character nor '\n'
static reclass ccls_s(2); // \s: all spaces or ctrl characters
static reclass ccls_S(16); // \S: not ccls_s
static reclass ccls_d(4); // \d: digits [0-9]
static reclass ccls_D(32); // \D: neither a ccls_d character nor '\n'
// The same built-in classes, constructed from the named CCLASS_* / NCCLASS_* bit flags.
static reclass ccls_w(CCLASS_W); // \w
static reclass ccls_s(CCLASS_S); // \s
static reclass ccls_d(CCLASS_D); // \d
static reclass ccls_W(NCCLASS_W); // \W
static reclass ccls_S(NCCLASS_S); // \S
static reclass ccls_D(NCCLASS_D); // \D

// Tables for analyzing quantifiers
// NOTE(review): presumably the instruction types allowed to directly precede a quantifier;
// confirm against the code that consults this table.
const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
// Characters this table treats as quantifiers; note it lists '|' alongside '*', '?', '+' and '{'.
const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
// Valid regex characters that can be escaped and used as literals
const std::array<char, 33> escapable_chars{
{'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>',
'"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}};
Expand Down
31 changes: 21 additions & 10 deletions cpp/src/strings/regex/regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ enum InstType {
BOL = 0303, // Beginning of line, ^
EOL = 0304, // End of line, $
CCLASS = 0305, // Character class, []
NCCLASS = 0306, // Negated character class, []
BOW = 0307, // Boundary of word, /b
NBOW = 0310, // Not boundary of word, /b
NCCLASS = 0306, // Negated character class, [^ ]
BOW = 0307, // Boundary of word, \b
NBOW = 0310, // Not boundary of word, \B
END = 0377 // Terminate: match found
};

Expand All @@ -57,6 +57,13 @@ struct reclass {
reclass(int m) : builtins(m) {}
};

// Bit flags naming the built-in character classes. Each flag is a distinct
// single bit, so a mask can carry several of them and be tested with `&`.
constexpr int32_t CCLASS_W  = 0x01;  // \w: [a-z], [A-Z], [0-9], and '_'
constexpr int32_t CCLASS_S  = 0x02;  // \s: all spaces or ctrl characters
constexpr int32_t CCLASS_D  = 0x04;  // \d: digits [0-9]
constexpr int32_t NCCLASS_W = 0x08;  // \W: neither a CCLASS_W character nor '\n'
constexpr int32_t NCCLASS_S = 0x10;  // \S: not CCLASS_S
constexpr int32_t NCCLASS_D = 0x20;  // \D: neither a CCLASS_D character nor '\n'

/**
* @brief Structure of an encoded regex instruction
*/
Expand All @@ -76,21 +83,24 @@ struct reinst {
};

/**
* @brief Regex program handles parsing a pattern in to individual set
* @brief Regex program handles parsing a pattern into a vector
* of chained instructions.
*/
class reprog {
public:
reprog() = default;
reprog(const reprog&) = default;
reprog(reprog&&) = default;
~reprog() = default;
reprog& operator=(const reprog&) = default;
reprog& operator=(reprog&&) = default;

/**
* @brief Parses the given regex pattern and compiles
* into a list of chained instructions.
* @brief Parses the given regex pattern and produces an instance
* of this object
*
* @param pattern Regex pattern encoded as UTF-8
* @param flags For interpreting certain `pattern` characters
* @return Instance of reprog
*/
static reprog create_from(std::string_view pattern, regex_flags const flags);

Expand Down Expand Up @@ -122,12 +132,13 @@ class reprog {
#endif

private:
std::vector<reinst> _insts;
std::vector<reclass> _classes;
int32_t _startinst_id;
std::vector<reinst> _insts; // instructions
std::vector<reclass> _classes; // data for CCLASS instructions
int32_t _startinst_id{}; // id of first instruction
std::vector<int32_t> _startinst_ids; // short-cut to speed-up ORs
int32_t _num_capturing_groups{};

reprog() = default;
void check_for_errors(int32_t id, int32_t next_id);
};

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -148,17 +148,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch,
uint32_t codept = utf8_to_codepoint(ch);
if (codept > 0x00FFFF) return false;
int8_t fl = codepoint_flags[codept];
if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
return true;
if ((builtins & 2) && IS_SPACE(fl)) // \s
if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s
return true;
if ((builtins & 4) && IS_DIGIT(fl)) // \d
if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d
return true;
if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
return true;
if ((builtins & 16) && !IS_SPACE(fl)) // \S
if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S
return true;
if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
return true;
//
return false;
Expand Down