Skip to content

Commit

Permalink
fix merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed May 11, 2022
2 parents 25aab2d + efd2c39 commit 7982923
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 23 deletions.
14 changes: 7 additions & 7 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,17 @@ enum OperatorType {
NOP = 0302, // No operation, internal use only
};

static reclass ccls_w(1); // [a-z], [A-Z], [0-9], and '_'
static reclass ccls_W(8); // now ccls_w plus '\n'
static reclass ccls_s(2); // all spaces or ctrl characters
static reclass ccls_S(16); // not ccls_s
static reclass ccls_d(4); // digits [0-9]
static reclass ccls_D(32); // not ccls_d plus '\n'
static reclass ccls_w(CCLASS_W); // \w
static reclass ccls_s(CCLASS_S); // \s
static reclass ccls_d(CCLASS_D); // \d
static reclass ccls_W(NCCLASS_W); // \W
static reclass ccls_S(NCCLASS_S); // \S
static reclass ccls_D(NCCLASS_D); // \D

// Tables for analyzing quantifiers
const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
// Valid regex characters that can be escaping to be used as literals
// Valid regex characters that can be escaped and used as literals
const std::array<char, 33> escapable_chars{
{'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>',
'"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}};
Expand Down
31 changes: 21 additions & 10 deletions cpp/src/strings/regex/regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ enum InstType {
BOL = 0303, // Beginning of line, ^
EOL = 0304, // End of line, $
CCLASS = 0305, // Character class, []
NCCLASS = 0306, // Negated character class, []
BOW = 0307, // Boundary of word, /b
NBOW = 0310, // Not boundary of word, /b
NCCLASS = 0306, // Negated character class, [^ ]
BOW = 0307, // Boundary of word, \b
NBOW = 0310, // Not boundary of word, \B
END = 0377 // Terminate: match found
};

Expand All @@ -57,6 +57,13 @@ struct reclass {
reclass(int m) : builtins(m) {}
};

// Bit flags for the built-in character classes. A reclass stores a
// combination of these in `builtins`, and each flag is tested individually
// with a bitwise AND when matching a character.
constexpr int32_t CCLASS_W  = 0x01;  // \w: [a-z], [A-Z], [0-9], and '_'
constexpr int32_t CCLASS_S  = 0x02;  // \s: all spaces or ctrl characters
constexpr int32_t CCLASS_D  = 0x04;  // \d: digits [0-9]
constexpr int32_t NCCLASS_W = 0x08;  // \W: neither a \w character nor '\n'
constexpr int32_t NCCLASS_S = 0x10;  // \S: any character that is not \s
constexpr int32_t NCCLASS_D = 0x20;  // \D: neither a digit nor '\n'

/**
* @brief Structure of an encoded regex instruction
*/
Expand All @@ -76,21 +83,24 @@ struct reinst {
};

/**
* @brief Regex program handles parsing a pattern in to individual set
* @brief Regex program handles parsing a pattern into a vector
* of chained instructions.
*/
class reprog {
public:
reprog() = default;
reprog(const reprog&) = default;
reprog(reprog&&) = default;
~reprog() = default;
reprog& operator=(const reprog&) = default;
reprog& operator=(reprog&&) = default;

/**
* @brief Parses the given regex pattern and compiles
* into a list of chained instructions.
* @brief Parses the given regex pattern and produces an instance
* of this object
*
* @param pattern Regex pattern encoded as UTF-8
* @param flags For interpreting certain `pattern` characters
* @return Instance of reprog
*/
static reprog create_from(std::string_view pattern, regex_flags const flags);

Expand Down Expand Up @@ -121,12 +131,13 @@ class reprog {
#endif

private:
std::vector<reinst> _insts;
std::vector<reclass> _classes;
int32_t _startinst_id;
std::vector<reinst> _insts; // instructions
std::vector<reclass> _classes; // data for CCLASS instructions
int32_t _startinst_id{}; // id of first instruction
std::vector<int32_t> _startinst_ids; // short-cut to speed-up ORs
int32_t _num_capturing_groups{};

reprog() = default;
void collapse_nops();
void build_start_ids();
void check_for_errors(int32_t id, int32_t next_id);
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -148,17 +148,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch,
uint32_t codept = utf8_to_codepoint(ch);
if (codept > 0x00FFFF) return false;
int8_t fl = codepoint_flags[codept];
if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w
return true;
if ((builtins & 2) && IS_SPACE(fl)) // \s
if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s
return true;
if ((builtins & 4) && IS_DIGIT(fl)) // \d
if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d
return true;
if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W
return true;
if ((builtins & 16) && !IS_SPACE(fl)) // \S
if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S
return true;
if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D
return true;
//
return false;
Expand Down

0 comments on commit 7982923

Please sign in to comment.