From 9a57a4d9c752070d498c462bb5b75498bf267760 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 3 Jun 2022 15:17:17 -0400 Subject: [PATCH 1/7] Cleanup regex reclass and reclass_device classes --- cpp/src/strings/regex/regcomp.cpp | 125 +++++++++++++-------------- cpp/src/strings/regex/regcomp.h | 24 +++-- cpp/src/strings/regex/regex.cuh | 2 +- cpp/src/strings/regex/regex.inl | 7 +- cpp/src/strings/regex/regexec.cu | 23 +++-- cpp/tests/strings/contains_tests.cpp | 17 ++++ 6 files changed, 112 insertions(+), 86 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index fdf4609e336..e5a4f7e0fa1 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -19,6 +19,9 @@ #include #include +#include +#include + #include #include #include @@ -106,16 +109,16 @@ int32_t reprog::add_inst(int32_t t) return add_inst(inst); } -int32_t reprog::add_inst(reinst inst) +int32_t reprog::add_inst(reinst const& inst) { _insts.push_back(inst); - return static_cast(_insts.size() - 1); + return static_cast(_insts.size() - 1); } -int32_t reprog::add_class(reclass cls) +int32_t reprog::add_class(reclass const& cls) { _classes.push_back(cls); - return static_cast(_classes.size() - 1); + return static_cast(_classes.size() - 1); } reinst& reprog::inst_at(int32_t id) { return _insts[id]; } @@ -134,9 +137,11 @@ void reprog::set_groups_count(int32_t groups) { _num_capturing_groups = groups; int32_t reprog::groups_count() const { return _num_capturing_groups; } -const reinst* reprog::insts_data() const { return _insts.data(); } +reinst const* reprog::insts_data() const { return _insts.data(); } + +reclass const* reprog::classes_data() const { return _classes.data(); } -const int32_t* reprog::starts_data() const { return _startinst_ids.data(); } +int32_t const* reprog::starts_data() const { return _startinst_ids.data(); } int32_t reprog::starts_count() const { return static_cast(_startinst_ids.size()); } @@ -178,28 +183,25 @@ class regex_parser { int bldcclass() { int type = CCLASS; - std::vector cls; + std::vector literals; int builtins = 0; - /* look ahead for negation */ - /* SPECIAL CASE!!! negated classes don't match \n */ - char32_t c = 0; - int quoted = nextc(c); + // look ahead for negation + char32_t c = 0; + auto quoted = nextc(c); if (!quoted && c == '^') { type = NCCLASS; quoted = nextc(c); - cls.push_back('\n'); - cls.push_back('\n'); + // negated classes also don't match '\n' + literals.push_back('\n'); + literals.push_back('\n'); } - /* parse class into a set of spans */ + // parse class into a set of spans int count_char = 0; while (true) { count_char++; - if (c == 0) { - // malformed '[]' - return 0; - } + if (c == 0) { return 0; } // malformed '[]' if (quoted) { switch (c) { case 'n': c = '\n'; break; @@ -236,55 +238,47 @@ class regex_parser { } if (!quoted && c == ']' && count_char > 1) break; if (!quoted && c == '-') { - if (cls.size() < 1) { - // malformed '[]' - return 0; - } + if (literals.empty()) { return 0; } // malformed '[]' quoted = nextc(c); - if ((!quoted && c == ']') || c == 0) { - // malformed '[]' - return 0; - } - cls[cls.size() - 1] = c; + if ((!quoted && c == ']') || c == 0) { return 0; } // malformed '[]' + literals.back() = c; } else { - cls.push_back(c); - cls.push_back(c); + literals.push_back(c); + literals.push_back(c); } quoted = nextc(c); } - /* sort on span start */ - for (std::size_t p = 0; p < cls.size(); p += 2) - for (std::size_t np = p + 2; np < cls.size(); np += 2) - if (cls[np] < cls[p]) { - c = cls[np]; - cls[np] = cls[p]; - cls[p] = c; - c = cls[np + 1]; - cls[np + 1] = cls[p + 1]; - cls[p + 1] = c; - } - - /* merge spans */ - reclass yycls{builtins}; - if (cls.size() >= 2) { - int np = 0; - std::size_t p = 0; - yycls.literals += cls[p++]; - yycls.literals += cls[p++]; - for (; p < cls.size(); p += 2) { - /* overlapping or adjacent ranges? */ - if (cls[p] <= yycls.literals[np + 1] + 1) { - if (cls[p + 1] >= yycls.literals[np + 1]) - yycls.literals.replace(np + 1, 1, 1, cls[p + 1]); /* coalesce */ - } else { - np += 2; - yycls.literals += cls[p]; - yycls.literals += cls[p + 1]; + // transform pairs of literals to spans + std::vector spans; + auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [](auto i) { return i * 2; }); + std::transform( + evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) { + return reclass_span{literals[idx], literals[idx + 1]}; + }); + // sort the spans to help with detecting overlapping entries + std::sort(spans.begin(), spans.end(), [](auto l, auto r) { + return l.first == r.first ? l.last < r.last : l.first < r.first; + }); + // combine overlapping entries + if (spans.size() > 1) { + for (auto itr = spans.begin() + 1; itr < spans.end(); ++itr) { + auto const prev = *(itr - 1); + auto const curr = *itr; + if (curr.first <= prev.last + 1) { + // if these 2 spans intersect, expand the current one + *itr = reclass_span{prev.first, std::max(prev.last, curr.last)}; } } } - yyclass_id = m_prog.add_class(yycls); + // remove duplicates + std::reverse(spans.begin(), spans.end()); // moves larger overlaps forward + auto const end = // this relies on std::unique keeping the first entry in a repeated sequence + std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; }); + spans.erase(end, spans.end()); + + yyclass_id = m_prog.add_class(reclass{builtins, std::move(spans)}); return type; } @@ -340,8 +334,7 @@ class regex_parser { case 'W': { if (id_ccls_W < 0) { reclass cls = ccls_w; - cls.literals += '\n'; - cls.literals += '\n'; + cls.literals.push_back({'\n', '\n'}); yyclass_id = m_prog.add_class(cls); id_ccls_W = yyclass_id; } else @@ -375,8 +368,7 @@ class regex_parser { case 'D': { if (id_ccls_D < 0) { reclass cls = ccls_d; - cls.literals += '\n'; - cls.literals += '\n'; + cls.literals.push_back({'\n', '\n'}); yyclass_id = m_prog.add_class(cls); id_ccls_D = yyclass_id; } else @@ -1086,15 +1078,16 @@ void reprog::print(regex_flags const flags) const reclass& cls = _classes[i]; auto const size = static_cast(cls.literals.size()); printf("%2d: ", i); - for (int j = 0; j < size; j += 2) { - char32_t c1 = cls.literals[j]; - char32_t c2 = cls.literals[j + 1]; + for (int j = 0; j < size; ++j) { + auto const l = cls.literals[j]; + char32_t c1 = l.first; + char32_t c2 = l.last; if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) { printf("0x%02x-0x%02x", static_cast(c1), static_cast(c2)); } else { printf("%c-%c", static_cast(c1), static_cast(c2)); } - if ((j + 2) < size) { printf(", "); } + if ((j + 1) < size) { printf(", "); } } printf("\n"); if (cls.builtins) { diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 48395e8cf1f..3d7ba114000 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -47,14 +47,23 @@ enum InstType { END = 0377 // Terminate: match found }; +/** + * @brief Range used for literals in reclass classes. + */ +struct reclass_span { + char32_t first{}; /// first character in span + char32_t last{}; /// last character in span (inclusive) +}; + /** * @brief Class type for regex compiler instruction. */ struct reclass { - int32_t builtins{0}; // bit mask identifying builtin classes - std::u32string literals; // ranges as pairs of utf-8 characters + int32_t builtins{0}; // bit mask identifying builtin classes + std::vector literals; reclass() {} reclass(int m) : builtins(m) {} + reclass(int m, std::vector&& l) : builtins(m), literals(std::move(l)) {} }; constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' @@ -105,18 +114,19 @@ class reprog { static reprog create_from(std::string_view pattern, regex_flags const flags); int32_t add_inst(int32_t type); - int32_t add_inst(reinst inst); - int32_t add_class(reclass cls); + int32_t add_inst(reinst const& inst); + int32_t add_class(reclass const& cls); void set_groups_count(int32_t groups); [[nodiscard]] int32_t groups_count() const; - [[nodiscard]] const reinst* insts_data() const; [[nodiscard]] int32_t insts_count() const; - reinst& inst_at(int32_t id); + [[nodiscard]] reinst& inst_at(int32_t id); + [[nodiscard]] reinst const* insts_data() const; - reclass& class_at(int32_t id); [[nodiscard]] int32_t classes_count() const; + [[nodiscard]] reclass& class_at(int32_t id); + [[nodiscard]] reclass const* classes_data() const; [[nodiscard]] const int32_t* starts_data() const; [[nodiscard]] int32_t starts_count() const; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 2ee195a2c5e..29a3d7e70ec 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -51,7 +51,7 @@ constexpr int32_t MINIMUM_THREADS = 256; // Minimum threads for computing w struct alignas(16) reclass_device { int32_t builtins{}; int32_t count{}; - char32_t const* literals{}; + reclass_span const* literals{}; __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const; }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 8e2194f2094..a8d13e236db 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -141,7 +141,8 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint8_t const* codepoint_flags) const { for (int i = 0; i < count; ++i) { - if ((ch >= literals[i * 2]) && (ch <= literals[(i * 2) + 1])) { return true; } + auto const literal = literals[i]; + if ((ch >= literal.first) && (ch <= literal.last)) { return true; } } if (!builtins) return false; @@ -204,11 +205,11 @@ __device__ __forceinline__ void reprog_device::store(void* buffer) const auto classes = reinterpret_cast(ptr); result->_classes = classes; // fill in each class - auto d_ptr = reinterpret_cast(classes + _classes_count); + auto d_ptr = reinterpret_cast(classes + _classes_count); for (int idx = 0; idx < _classes_count; ++idx) { classes[idx] = _classes[idx]; classes[idx].literals = d_ptr; - for (int jdx = 0; jdx < _classes[idx].count * 2; ++jdx) + for (int jdx = 0; jdx < _classes[idx].count; ++jdx) *d_ptr++ = _classes[idx].literals[jdx]; } } diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 16f5b6fa03d..d504e469c44 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -25,6 +25,8 @@ #include #include +#include +#include namespace cudf { namespace strings { @@ -63,9 +65,12 @@ std::unique_ptr> reprog_devic // compute size of each section auto insts_size = insts_count * sizeof(_insts[0]); auto startids_size = starts_count * sizeof(_startinst_ids[0]); - auto classes_size = classes_count * sizeof(_classes[0]); - for (auto idx = 0; idx < classes_count; ++idx) - classes_size += static_cast((h_prog.class_at(idx).literals.size()) * sizeof(char32_t)); + auto classes_size = std::transform_reduce( + h_prog.classes_data(), + h_prog.classes_data() + h_prog.classes_count(), + classes_count * sizeof(_classes[0]), + std::plus{}, + [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_span); }); // make sure each section is aligned for the subsequent section's data type auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) + cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) + @@ -104,14 +109,14 @@ std::unique_ptr> reprog_devic auto d_end = d_ptr + (classes_count * sizeof(reclass_device)); // place each class and append the variable length data for (int32_t idx = 0; idx < classes_count; ++idx) { - reclass& h_class = h_prog.class_at(idx); + auto h_class = h_prog.class_at(idx); reclass_device d_class{h_class.builtins, - static_cast(h_class.literals.size() / 2), - reinterpret_cast(d_end)}; + static_cast(h_class.literals.size()), + reinterpret_cast(d_end)}; *classes++ = d_class; - memcpy(h_end, h_class.literals.c_str(), h_class.literals.size() * sizeof(char32_t)); - h_end += h_class.literals.size() * sizeof(char32_t); - d_end += h_class.literals.size() * sizeof(char32_t); + memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_span)); + h_end += h_class.literals.size() * sizeof(reclass_span); + d_end += h_class.literals.size() * sizeof(reclass_span); } // initialize the rest of the elements diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 9df22503c07..21c18977746 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -406,6 +406,23 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, OverlappedClasses) +{ + auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"}); + auto sv = cudf::strings_column_view(input); + + { + auto results = cudf::strings::count_re(sv, "[e-gb-da-c]"); + cudf::test::fixed_width_column_wrapper expected({7, 4, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = cudf::strings::count_re(sv, "[á-éê-ú]"); + cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 6, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, MultiLine) { auto input = From f11ebe1836a3a7aa814ce55d4ce8a0c571210986 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 6 Jun 2022 11:26:37 -0400 Subject: [PATCH 2/7] rename reclass_span reclass_range --- cpp/src/strings/regex/regcomp.cpp | 6 +++--- cpp/src/strings/regex/regcomp.h | 6 +++--- cpp/src/strings/regex/regex.cuh | 2 +- cpp/src/strings/regex/regex.inl | 2 +- cpp/src/strings/regex/regexec.cu | 10 +++++----- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index e5a4f7e0fa1..5133a167072 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -250,12 +250,12 @@ class regex_parser { } // transform pairs of literals to spans - std::vector spans; + std::vector spans; auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [](auto i) { return i * 2; }); std::transform( evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) { - return reclass_span{literals[idx], literals[idx + 1]}; + return reclass_range{literals[idx], literals[idx + 1]}; }); // sort the spans to help with detecting overlapping entries std::sort(spans.begin(), spans.end(), [](auto l, auto r) { @@ -268,7 +268,7 @@ class regex_parser { auto const curr = *itr; if (curr.first <= prev.last + 1) { // if these 2 spans intersect, expand the current one - *itr = reclass_span{prev.first, std::max(prev.last, curr.last)}; + *itr = reclass_range{prev.first, std::max(prev.last, curr.last)}; } } } diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 3d7ba114000..10092137c77 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -50,7 +50,7 @@ enum InstType { /** * @brief Range used for literals in reclass classes. */ -struct reclass_span { +struct reclass_range { char32_t first{}; /// first character in span char32_t last{}; /// last character in span (inclusive) }; @@ -60,10 +60,10 @@ struct reclass_span { */ struct reclass { int32_t builtins{0}; // bit mask identifying builtin classes - std::vector literals; + std::vector literals; reclass() {} reclass(int m) : builtins(m) {} - reclass(int m, std::vector&& l) : builtins(m), literals(std::move(l)) {} + reclass(int m, std::vector&& l) : builtins(m), literals(std::move(l)) {} }; constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 29a3d7e70ec..e899c84a48d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -51,7 +51,7 @@ constexpr int32_t MINIMUM_THREADS = 256; // Minimum threads for computing w struct alignas(16) reclass_device { int32_t builtins{}; int32_t count{}; - reclass_span const* literals{}; + reclass_range const* literals{}; __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const; }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index a8d13e236db..fb2db86ab0b 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -205,7 +205,7 @@ __device__ __forceinline__ void reprog_device::store(void* buffer) const auto classes = reinterpret_cast(ptr); result->_classes = classes; // fill in each class - auto d_ptr = reinterpret_cast(classes + _classes_count); + auto d_ptr = reinterpret_cast(classes + _classes_count); for (int idx = 0; idx < _classes_count; ++idx) { classes[idx] = _classes[idx]; classes[idx].literals = d_ptr; diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index d504e469c44..99f6692f023 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -70,7 +70,7 @@ std::unique_ptr> reprog_devic h_prog.classes_data() + h_prog.classes_count(), classes_count * sizeof(_classes[0]), std::plus{}, - [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_span); }); + [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_range); }); // make sure each section is aligned for the subsequent section's data type auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) + cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) + @@ -112,11 +112,11 @@ std::unique_ptr> reprog_devic auto h_class = h_prog.class_at(idx); reclass_device d_class{h_class.builtins, static_cast(h_class.literals.size()), - reinterpret_cast(d_end)}; + reinterpret_cast(d_end)}; *classes++ = d_class; - memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_span)); - h_end += h_class.literals.size() * sizeof(reclass_span); - d_end += h_class.literals.size() * sizeof(reclass_span); + memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_range)); + h_end += h_class.literals.size() * sizeof(reclass_range); + d_end += h_class.literals.size() * sizeof(reclass_range); } // initialize the rest of the elements From 6914f4d267c0fe4be34d724a4a187d00a79331b1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Jun 2022 10:42:13 -0400 Subject: [PATCH 3/7] expand static reclass var names --- cpp/src/strings/regex/regcomp.cpp | 58 +++++++++++++++---------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 47e29e04aad..4eb040ba526 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -53,12 +53,12 @@ enum OperatorType { }; #define ITEM_MASK 0300 -static reclass ccls_w(CCLASS_W); // \w -static reclass ccls_s(CCLASS_S); // \s -static reclass ccls_d(CCLASS_D); // \d -static reclass ccls_W(NCCLASS_W); // \W -static reclass ccls_S(NCCLASS_S); // \S -static reclass ccls_D(NCCLASS_D); // \D +static reclass cclass_w(CCLASS_W); // \w +static reclass cclass_s(CCLASS_S); // \s +static reclass cclass_d(CCLASS_D); // \d +static reclass cclass_W(NCCLASS_W); // \W +static reclass cclass_S(NCCLASS_S); // \S +static reclass cclass_D(NCCLASS_D); // \D // Tables for analyzing quantifiers const std::array valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}}; @@ -241,32 +241,32 @@ class regex_parser { case 'b': chr = 0x08; break; case 'f': chr = 0x0C; break; case 'w': - builtins |= ccls_w.builtins; + builtins |= cclass_w.builtins; std::tie(is_quoted, chr) = next_char(); continue; case 's': - builtins |= ccls_s.builtins; + builtins |= cclass_s.builtins; std::tie(is_quoted, chr) = next_char(); continue; case 'd': - builtins |= ccls_d.builtins; + builtins |= cclass_d.builtins; std::tie(is_quoted, chr) = next_char(); continue; case 'W': - builtins |= ccls_W.builtins; + builtins |= cclass_W.builtins; std::tie(is_quoted, chr) = next_char(); continue; case 'S': - builtins |= ccls_S.builtins; + builtins |= cclass_S.builtins; std::tie(is_quoted, chr) = next_char(); continue; case 'D': - builtins |= ccls_D.builtins; + builtins |= cclass_D.builtins; std::tie(is_quoted, chr) = next_char(); continue; } } - if (!is_quoted && chr == ']' && count_char > 1) break; + if (!is_quoted && chr == ']' && count_char > 1) { break; } // done if (!is_quoted && chr == '-') { if (literals.empty()) { return 0; } // malformed '[]' std::tie(is_quoted, chr) = next_char(); @@ -280,18 +280,16 @@ class regex_parser { } // transform pairs of literals to spans - std::vector spans; - auto const evens = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [](auto i) { return i * 2; }); - std::transform( - evens, evens + (literals.size() / 2), std::back_inserter(spans), [&literals](auto idx) { - return reclass_range{literals[idx], literals[idx + 1]}; - }); + std::vector spans(literals.size() / 2); + auto const counter = thrust::make_counting_iterator(0); + std::transform(counter, counter + spans.size(), spans.begin(), [&literals](auto idx) { + return reclass_range{literals[idx * 2], literals[idx * 2 + 1]}; + }); // sort the spans to help with detecting overlapping entries std::sort(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first ? l.last < r.last : l.first < r.first; }); - // combine overlapping entries + // combine overlapping entries: [a-f][c-g] => [a-g] for (auto itr = spans.begin() + static_cast(!spans.empty()); itr < spans.end(); ++itr) { auto const prev = *(itr - 1); auto const curr = *itr; @@ -300,11 +298,11 @@ class regex_parser { *itr = reclass_range{prev.first, std::max(prev.last, curr.last)}; } } - // remove duplicates + // remove any duplicates std::reverse(spans.begin(), spans.end()); // moves larger overlaps forward - auto const end = // this relies on std::unique keeping the first entry in a repeated sequence + auto const end = // std::unique specifies keeping the first entry in a repeated sequence std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; }); - spans.erase(end, spans.end()); + spans.erase(end, spans.end()); // clear the remainder _cclass_id = _prog.add_class(reclass{builtins, std::move(spans)}); return type; @@ -355,13 +353,13 @@ class regex_parser { break; } case 'w': { - if (_id_cclass_w < 0) { _id_cclass_w = _prog.add_class(ccls_w); } + if (_id_cclass_w < 0) { _id_cclass_w = _prog.add_class(cclass_w); } _cclass_id = _id_cclass_w; return CCLASS; } case 'W': { if (_id_cclass_W < 0) { - reclass cls = ccls_w; + reclass cls = cclass_w; cls.literals.push_back({'\n', '\n'}); _id_cclass_W = _prog.add_class(cls); } @@ -369,23 +367,23 @@ class regex_parser { return NCCLASS; } case 's': { - if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(ccls_s); } + if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(cclass_s); } _cclass_id = _id_cclass_s; return CCLASS; } case 'S': { - if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(ccls_s); } + if (_id_cclass_s < 0) { _id_cclass_s = _prog.add_class(cclass_s); } _cclass_id = _id_cclass_s; return NCCLASS; } case 'd': { - if (_id_cclass_d < 0) { _id_cclass_d = _prog.add_class(ccls_d); } + if (_id_cclass_d < 0) { _id_cclass_d = _prog.add_class(cclass_d); } _cclass_id = _id_cclass_d; return CCLASS; } case 'D': { if (_id_cclass_D < 0) { - reclass cls = ccls_d; + reclass cls = cclass_d; cls.literals.push_back({'\n', '\n'}); _id_cclass_D = _prog.add_class(cls); } From 21fa745931e17c9391b81475b6a01fe7ccffb6c3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Jun 2022 13:06:30 -0400 Subject: [PATCH 4/7] remove unneeded include --- cpp/src/strings/regex/regcomp.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 4eb040ba526..dbde7700672 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include From aad80306075b5c8fca0d57a777c68b3f36d81e91 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Jun 2022 14:27:15 -0400 Subject: [PATCH 5/7] rename var spans to ranges --- cpp/src/strings/regex/regcomp.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index dbde7700672..0bf2693c708 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -278,32 +278,31 @@ class regex_parser { std::tie(is_quoted, chr) = next_char(); } - // transform pairs of literals to spans - std::vector spans(literals.size() / 2); + // transform pairs of literals to ranges + std::vector ranges(literals.size() / 2); auto const counter = thrust::make_counting_iterator(0); - std::transform(counter, counter + spans.size(), spans.begin(), [&literals](auto idx) { + std::transform(counter, counter + ranges.size(), ranges.begin(), [&literals](auto idx) { return reclass_range{literals[idx * 2], literals[idx * 2 + 1]}; }); - // sort the spans to help with detecting overlapping entries - std::sort(spans.begin(), spans.end(), [](auto l, auto r) { + // sort the ranges to help with detecting overlapping entries + std::sort(ranges.begin(), ranges.end(), [](auto l, auto r) { return l.first == r.first ? l.last < r.last : l.first < r.first; }); // combine overlapping entries: [a-f][c-g] => [a-g] - for (auto itr = spans.begin() + static_cast(!spans.empty()); itr < spans.end(); ++itr) { + for (auto itr = ranges.begin() + static_cast(!ranges.empty()); itr < ranges.end(); ++itr) { auto const prev = *(itr - 1); - auto const curr = *itr; - if (curr.first <= prev.last + 1) { - // if these 2 spans intersect, expand the current one - *itr = reclass_range{prev.first, std::max(prev.last, curr.last)}; + if (itr->first <= prev.last + 1) { + // if these 2 ranges intersect, expand the current one + *itr = reclass_range{prev.first, std::max(prev.last, itr->last)}; } } // remove any duplicates - std::reverse(spans.begin(), spans.end()); // moves larger overlaps forward + std::reverse(ranges.begin(), ranges.end()); // moves larger overlaps forward auto const end = // std::unique specifies keeping the first entry in a repeated sequence - std::unique(spans.begin(), spans.end(), [](auto l, auto r) { return l.first == r.first; }); - spans.erase(end, spans.end()); // clear the remainder + std::unique(ranges.begin(), ranges.end(), [](auto l, auto r) { return l.first == r.first; }); + ranges.erase(end, ranges.end()); // clear the remaining items - _cclass_id = _prog.add_class(reclass{builtins, std::move(spans)}); + _cclass_id = _prog.add_class(reclass{builtins, std::move(ranges)}); return type; } From a12cea633c2bf6c9518e10114ea92136231b287c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Jun 2022 18:37:08 -0400 Subject: [PATCH 6/7] use rbegin/rend with std::unique --- cpp/src/strings/regex/regcomp.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 0bf2693c708..992d66a5ff4 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -289,18 +289,19 @@ class regex_parser { return l.first == r.first ? l.last < r.last : l.first < r.first; }); // combine overlapping entries: [a-f][c-g] => [a-g] - for (auto itr = ranges.begin() + static_cast(!ranges.empty()); itr < ranges.end(); ++itr) { - auto const prev = *(itr - 1); - if (itr->first <= prev.last + 1) { - // if these 2 ranges intersect, expand the current one - *itr = reclass_range{prev.first, std::max(prev.last, itr->last)}; + if (ranges.size() > 1) { + for (auto itr = ranges.begin() + 1; itr < ranges.end(); ++itr) { + auto const prev = *(itr - 1); + if (itr->first <= prev.last + 1) { + // if these 2 ranges intersect, expand the current one + *itr = reclass_range{prev.first, std::max(prev.last, itr->last)}; + } } } // remove any duplicates - std::reverse(ranges.begin(), ranges.end()); // moves larger overlaps forward - auto const end = // std::unique specifies keeping the first entry in a repeated sequence - std::unique(ranges.begin(), ranges.end(), [](auto l, auto r) { return l.first == r.first; }); - ranges.erase(end, ranges.end()); // clear the remaining items + auto const end = std::unique( + ranges.rbegin(), ranges.rend(), [](auto l, auto r) { return l.first == r.first; }); + ranges.erase(ranges.begin(), ranges.begin() + std::distance(end, ranges.rend())); _cclass_id = _prog.add_class(reclass{builtins, std::move(ranges)}); return type; From 7d3539b10619ab1e3ca17abfe48da6898fab27bb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 23 Jun 2022 13:45:59 -0400 Subject: [PATCH 7/7] fix h_class var declaration to prevent unneeded copy --- cpp/src/strings/regex/regexec.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 99f6692f023..2ee1901c32d 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -109,7 +109,7 @@ std::unique_ptr> reprog_devic auto d_end = d_ptr + (classes_count * sizeof(reclass_device)); // place each class and append the variable length data for (int32_t idx = 0; idx < classes_count; ++idx) { - auto h_class = h_prog.class_at(idx); + auto const& h_class = h_prog.class_at(idx); reclass_device d_class{h_class.builtins, static_cast(h_class.literals.size()), reinterpret_cast(d_end)};