From 1179e4698922e353934d3d819bb4741cfeca4807 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 May 2022 12:22:22 -0400 Subject: [PATCH] Cleanup regex compiler operators and operands source (#10879) Cleans up the `regcomp.cpp` source to fix class names, comments, and simplify logic around processing operators and operands returned by the parser. Several class member variables used for state are moved or eliminated. Some member functions and variables are renamed. Cleanup of the parser logic will be in a follow-on PR. Reference #3582 Follow on to #10843 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10879 --- cpp/src/strings/regex/regcomp.cpp | 330 +++++++++++++++--------------- 1 file changed, 160 insertions(+), 170 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index aaecb56bf4b..fdf4609e336 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -570,182 +570,180 @@ class regex_parser { * @brief The compiler converts class list into instructions. */ class regex_compiler { - reprog& m_prog; - - struct Node { + struct and_node { int id_first; int id_last; }; - int cursubid; - int pushsubid; - std::vector andstack; - - struct Ator { + struct re_operator { int t; int subid; }; - std::vector atorstack; - - bool lastwasand; - int nbra; + reprog& _prog; + std::stack _and_stack; + std::stack _operator_stack; + bool _last_was_and; + int _bracket_count; + regex_flags _flags; - regex_flags flags; + inline void push_and(int first, int last) { _and_stack.push({first, last}); } - char32_t yy; - int yyclass_id; - - inline void pushand(int f, int l) { andstack.push_back({f, l}); } - - inline Node popand(int op) + inline and_node pop_and() { - if (andstack.size() < 1) { - // missing operand for op - int inst_id = m_prog.add_inst(NOP); - pushand(inst_id, inst_id); + if (_and_stack.empty()) { + auto const inst_id = _prog.add_inst(NOP); + push_and(inst_id, inst_id); } - Node node = andstack[andstack.size() - 1]; - andstack.pop_back(); + auto const node = _and_stack.top(); + _and_stack.pop(); return node; } - inline void pushator(int t) + inline void push_operator(int token, int subid = 0) { - Ator ator; - ator.t = t; - ator.subid = pushsubid; - atorstack.push_back(ator); + _operator_stack.push(re_operator{token, subid}); } - inline Ator popator() + inline re_operator const pop_operator() { - Ator ator = atorstack[atorstack.size() - 1]; - atorstack.pop_back(); - return ator; + auto const op = _operator_stack.top(); + _operator_stack.pop(); + return op; } - void evaluntil(int pri) + void eval_until(int min_token) { - Node op1; - Node op2; - int id_inst1 = -1; - int id_inst2 = -1; - while (pri == RBRA || atorstack[atorstack.size() - 1].t >= pri) { - Ator ator = popator(); - switch (ator.t) { + while (min_token == RBRA || _operator_stack.top().t >= min_token) { + auto const op = pop_operator(); + switch (op.t) { default: - // unknown operator in evaluntil + // unknown operator break; - case LBRA: /* must have been RBRA */ - op1 = popand('('); - id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; - m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; - id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; - m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; - pushand(id_inst1, id_inst2); + case LBRA: // expects matching RBRA + { + auto const operand = pop_and(); + auto const id_inst2 = _prog.add_inst(RBRA); + _prog.inst_at(id_inst2).u1.subid = op.subid; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(LBRA); + _prog.inst_at(id_inst1).u1.subid = op.subid; + _prog.inst_at(id_inst1).u2.next_id = operand.id_first; + push_and(id_inst1, id_inst2); return; - case OR: - op2 = popand('|'); - op1 = popand('|'); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; - id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(id_inst1).u1.right_id = op1.id_first; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - pushand(id_inst1, id_inst2); + } + case OR: { + auto const operand2 = pop_and(); + auto const operand1 = pop_and(); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand2.id_last).u2.next_id = id_inst2; + _prog.inst_at(operand1.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(id_inst1).u1.right_id = operand1.id_first; + _prog.inst_at(id_inst1).u2.left_id = operand2.id_first; + push_and(id_inst1, id_inst2); break; - case CAT: - op2 = popand(0); - op1 = popand(0); - m_prog.inst_at(op1.id_last).u2.next_id = op2.id_first; - pushand(op1.id_first, op2.id_last); + } + case CAT: { + auto const operand2 = pop_and(); + auto const operand1 = pop_and(); + _prog.inst_at(operand1.id_last).u2.next_id = operand2.id_first; + push_and(operand1.id_first, operand2.id_last); break; - case STAR: - op2 = popand('*'); - id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - pushand(id_inst1, id_inst1); + } + case STAR: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; + push_and(id_inst1, id_inst1); break; - case STAR_LAZY: - op2 = popand('*'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case STAR_LAZY: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + push_and(id_inst1, id_inst2); break; - case PLUS: - op2 = popand('+'); - id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - pushand(op2.id_first, id_inst1); + } + case PLUS: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; + push_and(operand.id_first, id_inst1); break; - case PLUS_LAZY: - op2 = popand('+'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - pushand(op2.id_first, id_inst2); + } + case PLUS_LAZY: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + push_and(operand.id_first, id_inst2); break; - case QUEST: - op2 = popand('?'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = id_inst2; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case QUEST: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = id_inst2; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; + push_and(id_inst1, id_inst2); break; - case QUEST_LAZY: - op2 = popand('?'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case QUEST_LAZY: { + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; + push_and(id_inst1, id_inst2); break; + } } } } - void Operator(int t) + void handle_operator(int token, int subid = 0) { - if (t == RBRA && --nbra < 0) + if (token == RBRA && --_bracket_count < 0) { // unmatched right paren return; - if (t == LBRA) { - nbra++; - if (lastwasand) Operator(CAT); - } else - evaluntil(t); - if (t != RBRA) pushator(t); - lastwasand = (t == STAR || t == QUEST || t == PLUS || t == STAR_LAZY || t == QUEST_LAZY || - t == PLUS_LAZY || t == RBRA); + } + if (token == LBRA) { + _bracket_count++; + if (_last_was_and) { handle_operator(CAT, subid); } + } else { + eval_until(token); + } + if (token != RBRA) { push_operator(token, subid); } + + static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; + _last_was_and = + std::any_of(tokens.cbegin(), tokens.cend(), [token](auto t) { return t == token; }); } - void Operand(int t) + void handle_operand(int token, int subid = 0, char32_t yy = 0, int class_id = 0) { - if (lastwasand) Operator(CAT); /* catenate is implicit */ - int inst_id = m_prog.add_inst(t); - if (t == CCLASS || t == NCCLASS) { - m_prog.inst_at(inst_id).u1.cls_id = yyclass_id; - } else if (t == CHAR) { - m_prog.inst_at(inst_id).u1.c = yy; - } else if (t == BOL || t == EOL) { - m_prog.inst_at(inst_id).u1.c = is_multiline(flags) ? yy : '\n'; + if (_last_was_and) { handle_operator(CAT, subid); } // catenate is implicit + + auto const inst_id = _prog.add_inst(token); + if (token == CCLASS || token == NCCLASS) { + _prog.inst_at(inst_id).u1.cls_id = class_id; + } else if (token == CHAR) { + _prog.inst_at(inst_id).u1.c = yy; + } else if (token == BOL || token == EOL) { + _prog.inst_at(inst_id).u1.c = is_multiline(_flags) ? yy : '\n'; } - pushand(inst_id, inst_id); - lastwasand = true; + push_and(inst_id, inst_id); + _last_was_and = true; } std::vector expand_counted(std::vector const& in) @@ -816,58 +814,50 @@ class regex_compiler { public: regex_compiler(const char32_t* pattern, regex_flags const flags, reprog& prog) - : m_prog(prog), - cursubid(0), - pushsubid(0), - lastwasand(false), - nbra(0), - flags(flags), - yy(0), - yyclass_id(0) + : _prog(prog), _last_was_and(false), _bracket_count(0), _flags(flags) { // Parse std::vector const items = [&] { - regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, m_prog); + regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, _prog); return parser.m_has_counted ? expand_counted(parser.m_items) : parser.m_items; }(); - /* Start with a low priority operator to prime parser */ - pushator(START - 1); + int cur_subid{}; + int push_subid{}; - for (int i = 0; i < static_cast(items.size()); i++) { - auto const item = items[i]; - int token = item.t; - if (token == CCLASS || token == NCCLASS) - yyclass_id = item.d.yyclass_id; - else - yy = item.d.yy; + // Start with a low priority operator + push_operator(START - 1); + + for (auto const item : items) { + auto token = item.t; if (token == LBRA) { - ++cursubid; - pushsubid = cursubid; + ++cur_subid; + push_subid = cur_subid; } else if (token == LBRA_NC) { - pushsubid = 0; - token = LBRA; + push_subid = 0; + token = LBRA; } - if ((token & 0300) == OPERATOR_MASK) - Operator(token); - else - Operand(token); + if ((token & ITEM_MASK) == OPERATOR_MASK) { + handle_operator(token, push_subid); + } else { + handle_operand(token, push_subid, item.d.yy, item.d.yyclass_id); + } } - /* Close with a low priority operator */ - evaluntil(START); - /* Force END */ - Operand(END); - evaluntil(START); - if (nbra) - ; // "unmatched left paren"; - /* points to first and only operand */ - m_prog.set_start_inst(andstack[andstack.size() - 1].id_first); - m_prog.finalize(); - m_prog.check_for_errors(); - m_prog.set_groups_count(cursubid); + // Close with a low priority operator + eval_until(START); + // Force END + handle_operand(END, push_subid); + eval_until(START); + + CUDF_EXPECTS(_bracket_count == 0, "unmatched left parenthesis"); + + _prog.set_start_inst(_and_stack.top().id_first); + _prog.finalize(); + _prog.check_for_errors(); + _prog.set_groups_count(cur_subid); } };