diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 090f613a9d1..9b8b87a677f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -362,8 +362,6 @@ add_library(cudf src/strings/regex/regexec.cu src/strings/repeat_strings.cu src/strings/replace/backref_re.cu - src/strings/replace/backref_re_large.cu - src/strings/replace/backref_re_medium.cu src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/replace/replace_re.cu diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 082e6655cef..e2188365785 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -45,7 +45,7 @@ namespace { * Small to medium instruction lengths can use the stack effectively though smaller executes faster. * Longer patterns require global memory. */ -template +template struct contains_fn { reprog_device prog; column_device_view d_strings; @@ -163,7 +163,7 @@ namespace { /** * @brief This counts the number of times the regex pattern matches in each string. */ -template +template struct count_fn { reprog_device prog; column_device_view d_strings; diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu index 438f031d3b8..423bfff0cbc 100644 --- a/cpp/src/strings/extract.cu +++ b/cpp/src/strings/extract.cu @@ -22,17 +22,20 @@ #include #include #include +#include #include #include #include #include +#include #include +#include + namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; namespace { /** @@ -42,26 +45,27 @@ namespace { * @tparam stack_size Correlates to the regex instructions state to maintain for each string. * Each instruction requires a fixed amount of overhead data. 
*/ -template +template struct extract_fn { reprog_device prog; column_device_view d_strings; - size_type column_index; + cudf::detail::device_2dspan d_indices; - __device__ string_index_pair operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return string_index_pair{nullptr, 0}; - string_view d_str = d_strings.element(idx); - string_index_pair result{nullptr, 0}; - int32_t begin = 0; - int32_t end = -1; // handles empty strings automatically - if ((prog.find(idx, d_str, begin, end) > 0) && - (prog.extract(idx, d_str, begin, end, column_index) > 0)) { - auto offset = d_str.byte_offset(begin); - // build index-pair - result = string_index_pair{d_str.data() + offset, d_str.byte_offset(end) - offset}; + auto groups = prog.group_counts(); + auto d_output = d_indices[idx]; + if (d_strings.is_valid(idx)) { + string_view d_str = d_strings.element(idx); + int32_t begin = 0; + int32_t end = -1; // handles empty strings automatically + if ((prog.find(idx, d_str, begin, end) > 0) && + prog.extract(idx, d_str, begin, end, d_output)) { + return; + } } - return result; + // fill output with null entries + thrust::fill(thrust::seq, d_output.begin(), d_output.end(), string_index_pair{nullptr, 0}); } }; @@ -82,43 +86,48 @@ std::unique_ptr extract( auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // extract should include groups - int groups = d_prog.group_counts(); + auto const groups = d_prog.group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); // build a result column for each group std::vector> results; auto regex_insts = d_prog.insts_counts(); + rmm::device_uvector indices(strings_count * groups, stream); + cudf::detail::device_2dspan d_indices(indices.data(), strings_count, groups); + + if (regex_insts <= RX_SMALL_INSTS) { + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + 
thrust::make_counting_iterator(strings_count), + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_LARGE_INSTS) { + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + extract_fn{d_prog, d_strings, d_indices}); + } else { // supports any number of instructions + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + extract_fn{d_prog, d_strings, d_indices}); + } + for (int32_t column_index = 0; column_index < groups; ++column_index) { - rmm::device_uvector indices(strings_count, stream); - - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - - results.emplace_back(make_strings_column(indices, stream, mr)); + auto indices_itr = thrust::make_permutation_iterator( + indices.begin(), + 
thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [column_index, groups] __device__(size_type idx) { + return (idx * groups) + column_index; + })); + results.emplace_back(make_strings_column(indices_itr, indices_itr + strings_count, stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 5e9811d6897..66b90abc393 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -17,8 +17,13 @@ #include +#include +#include + #include +#include + #include #include @@ -33,6 +38,8 @@ struct reljunk; struct reinst; class reprog; +using string_index_pair = thrust::pair; + constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size constexpr int32_t RX_STACK_LARGE = 10128; ///< fast stack size @@ -99,6 +106,7 @@ class reprog_device { const uint8_t* cp_flags, int32_t strings_count, rmm::cuda_stream_view stream); + /** * @brief Called automatically by the unique_ptr returned from create(). */ @@ -157,21 +165,26 @@ class reprog_device { * @brief Does an extract evaluation using the compiled expression on the given string. * * This will find a specific match within the string when more than match occurs. + * The find() function should be called first to locate the begin/end bounds of the + * matched section. * * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`. * @param idx The string index used for mapping the state memory for this string in global memory * (if necessary). * @param d_str The string to search. - * @param[in,out] begin Position index to begin the search. If found, returns the position found + * @param begin Position index to begin the search. If found, returns the position found * in the string. - * @param[in,out] end Position index to end the search. If found, returns the last position + * @param end Position index to end the search. If found, returns the last position * matching in the string. - * @param group_id The specific instance to return if more than one match is found. - * @return Returns 0 if no match is found. + * @param indices All extracted groups + * @return Returns true if successful. 
*/ template - __device__ inline int32_t extract( - int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t group_id); + __device__ inline bool extract(int32_t idx, + string_view const& d_str, + int32_t begin, + int32_t end, + device_span indices); private: int32_t _startinst_id, _num_capturing_groups; @@ -185,15 +198,21 @@ class reprog_device { /** * @brief Executes the regex pattern on the given string. */ - __device__ inline int32_t regexec( - string_view const& d_str, reljunk& jnk, int32_t& begin, int32_t& end, int32_t groupid = 0); + __device__ inline int32_t regexec(string_view const& d_str, + reljunk& jnk, + int32_t& begin, + int32_t& end, + string_index_pair* indices = nullptr); /** * @brief Utility wrapper to setup state memory structures for calling regexec */ template - __device__ inline int32_t call_regexec( - int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t groupid = 0); + __device__ inline int32_t call_regexec(int32_t idx, + string_view const& d_str, + int32_t& begin, + int32_t& end, + string_index_pair* indices = nullptr); reprog_device(reprog&); // must use create() }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index caa9550b9d1..5c9f389827f 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -194,7 +194,7 @@ __device__ inline int32_t* reprog_device::startinst_ids() const { return _starti * @return >0 if match found */ __device__ inline int32_t reprog_device::regexec( - string_view const& dstr, reljunk& jnk, int32_t& begin, int32_t& end, int32_t group_id) + string_view const& dstr, reljunk& jnk, int32_t& begin, int32_t& end, string_index_pair* indices) { int32_t match = 0; auto checkstart = jnk.starttype; @@ -229,10 +229,9 @@ __device__ inline int32_t reprog_device::regexec( } if (((eos < 0) || (pos < eos)) && match == 0) { - // jnk.list1->activate(startinst_id, pos, 0); int32_t i = 0; auto ids = startinst_ids(); - while (ids[i] >= 
0) jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); + while (ids[i] >= 0) jnk.list1->activate(ids[i++], (indices == nullptr ? pos : -1), -1); } c = static_cast(pos >= txtlen ? 0 : *itr); @@ -257,14 +256,20 @@ __device__ inline int32_t reprog_device::regexec( case NCCLASS: case END: id_activate = inst_id; break; case LBRA: - if (inst->u1.subid == group_id) range.x = pos; + if (indices && inst->u1.subid == _num_capturing_groups) range.x = pos; id_activate = inst->u2.next_id; expanded = true; + if (indices) { indices[inst->u1.subid - 1].first = dstr.data() + itr.byte_offset(); } break; case RBRA: - if (inst->u1.subid == group_id) range.y = pos; + if (indices && inst->u1.subid == _num_capturing_groups) range.y = pos; id_activate = inst->u2.next_id; expanded = true; + if (indices) { + auto const ptr_offset = indices[inst->u1.subid - 1].first - dstr.data(); + indices[inst->u1.subid - 1].second = + itr.byte_offset() - static_cast(ptr_offset); + } break; case BOL: if ((pos == 0) || @@ -318,8 +323,9 @@ __device__ inline int32_t reprog_device::regexec( } while (expanded); // execute + bool continue_execute = true; jnk.list2->reset(); - for (int16_t i = 0; i < jnk.list1->size; i++) { + for (int16_t i = 0; continue_execute && i < jnk.list1->size; i++) { int32_t inst_id = static_cast(jnk.list1->inst_ids[i]); int2& range = jnk.list1->ranges[i]; const reinst* inst = get_inst(inst_id); @@ -346,18 +352,21 @@ __device__ inline int32_t reprog_device::regexec( case END: match = 1; begin = range.x; - end = group_id == 0 ? pos : range.y; - goto BreakFor; + end = indices == nullptr ? pos : range.y; + + continue_execute = false; + break; } - if (id_activate >= 0) jnk.list2->activate(id_activate, range.x, range.y); + if (continue_execute && (id_activate >= 0)) + jnk.list2->activate(id_activate, range.x, range.y); } - BreakFor: ++pos; ++itr; swaplist(jnk.list1, jnk.list2); checkstart = jnk.list1->size > 0 ? 
0 : 1; } while (c && (jnk.list1->size > 0 || match == 0)); + return match; } @@ -373,16 +382,19 @@ __device__ inline int32_t reprog_device::find(int32_t idx, } template -__device__ inline int32_t reprog_device::extract( - int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id) +__device__ inline bool reprog_device::extract(int32_t idx, + string_view const& dstr, + int32_t begin, + int32_t end, + device_span indices) { end = begin + 1; - return call_regexec(idx, dstr, begin, end, group_id + 1); + return call_regexec(idx, dstr, begin, end, indices.data()) > 0; } template __device__ inline int32_t reprog_device::call_regexec( - int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id) + int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, string_index_pair* indices) { u_char data1[stack_size], data2[stack_size]; @@ -393,12 +405,12 @@ __device__ inline int32_t reprog_device::call_regexec( relist list2(static_cast(_insts_count), data2); reljunk jnk(&list1, &list2, stype, schar); - return regexec(dstr, jnk, begin, end, group_id); + return regexec(dstr, jnk, begin, end, indices); } template <> __device__ inline int32_t reprog_device::call_regexec( - int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id) + int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, string_index_pair* indices) { auto const stype = get_inst(_startinst_id)->type; auto const schar = get_inst(_startinst_id)->u1.c; @@ -407,12 +419,11 @@ __device__ inline int32_t reprog_device::call_regexec( u_char* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; listmem += (idx * relists_size * 2); // two relist ptrs in reljunk: - // run ctor on assigned memory buffer relist* list1 = new (listmem) relist(static_cast(_insts_count)); relist* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); reljunk jnk(list1, list2, stype, schar); - return regexec(dstr, jnk, 
begin, end, group_id); + return regexec(dstr, jnk, begin, end, indices); } } // namespace detail diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 2de5141bb00..99a2f4f78c7 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -110,26 +110,36 @@ std::unique_ptr replace_with_backrefs( using BackRefIterator = decltype(backrefs.begin()); // create child columns - children_pair children = [&] { - // Each invocation is predicated on the stack size - // which is dependent on the number of regex instructions + auto [offsets, chars] = [&] { + rmm::device_uvector indices(strings.size() * d_prog->group_counts(), stream); + cudf::detail::device_2dspan d_indices( + indices.data(), strings.size(), d_prog->group_counts()); + if (regex_insts <= RX_SMALL_INSTS) { return make_strings_children( backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end(), d_indices}, strings.size(), stream, mr); } else if (regex_insts <= RX_MEDIUM_INSTS) { - return replace_with_backrefs_medium( - *d_strings, *d_prog, d_repl_template, backrefs, stream, mr); + return make_strings_children( + backrefs_fn{ + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end(), d_indices}, + strings.size(), + stream, + mr); } else if (regex_insts <= RX_LARGE_INSTS) { - return replace_with_backrefs_large( - *d_strings, *d_prog, d_repl_template, backrefs, stream, mr); + return make_strings_children( + backrefs_fn{ + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end(), d_indices}, + strings.size(), + stream, + mr); } else { return make_strings_children( backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end(), d_indices}, strings.size(), stream, mr); @@ -137,8 +147,8 @@ std::unique_ptr 
replace_with_backrefs( }(); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets), + std::move(chars), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index 9c14e5acaa9..0456f9a5998 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -18,6 +18,8 @@ #include #include #include +#include + #include #include @@ -33,14 +35,6 @@ using backref_type = thrust::pair; * and inserting the at the backref position indicated in the replacement template. * * The logic includes computing the size of each string and also writing the output. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. Shorter patterns are common in data cleaning. */ template struct backrefs_fn { @@ -49,6 +43,7 @@ struct backrefs_fn { string_view const d_repl; // string replacement template Iterator backrefs_begin; Iterator backrefs_end; + cudf::detail::device_2dspan d_indices; int32_t* d_offsets{}; char* d_chars{}; @@ -59,73 +54,67 @@ struct backrefs_fn { return; } auto const d_str = d_strings.element(idx); + auto const in_ptr = d_str.data(); auto const nchars = d_str.length(); // number of characters in input string - auto nbytes = d_str.size_bytes(); // number of bytes in input string - auto in_ptr = d_str.data(); + auto nbytes = d_str.size_bytes(); // number of bytes for the output string auto out_ptr = d_chars ? 
(d_chars + d_offsets[idx]) : nullptr; size_type lpos = 0; // last byte position processed in d_str size_type begin = 0; // first character position matching regex size_type end = nchars; // last character position (exclusive) + + // working memory for extract on this string + auto d_extracts = d_indices[idx]; + // copy input to output replacing strings as we go while (prog.find(idx, d_str, begin, end) > 0) // inits the begin/end vars { - auto spos = d_str.byte_offset(begin); // get offset for these - auto epos = d_str.byte_offset(end); // character position values - nbytes += d_repl.size_bytes() - (epos - spos); // compute new size + auto spos = d_str.byte_offset(begin); // get offset for the + auto epos = d_str.byte_offset(end); // character position values; + nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size + + // copy the string data before the matched section if (out_ptr) out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); size_type lpos_template = 0; // last end pos of replace template auto const repl_ptr = d_repl.data(); // replace template pattern + + // extracts all groups for this string into d_extracts + prog.extract(idx, d_str, begin, end, d_extracts); + thrust::for_each( thrust::seq, backrefs_begin, backrefs_end, [&] __device__(backref_type backref) { + // copy the static data at the beginning of the template if (out_ptr) { auto const copy_length = backref.second - lpos_template; out_ptr = copy_and_increment(out_ptr, repl_ptr + lpos_template, copy_length); lpos_template += copy_length; } - // extract the specific group's string for this backref's index - int32_t spos_extract = begin; // these are modified - int32_t epos_extract = end; // by extract() - if ((prog.extract( - idx, d_str, spos_extract, epos_extract, backref.first - 1) <= 0) || - (epos_extract <= spos_extract)) - return; // no value for this backref number; that is ok - spos_extract = d_str.byte_offset(spos_extract); // convert - epos_extract = 
d_str.byte_offset(epos_extract); // to bytes - nbytes += epos_extract - spos_extract; - if (out_ptr) - out_ptr = - copy_and_increment(out_ptr, in_ptr + spos_extract, (epos_extract - spos_extract)); + // retrieve the string for this backref + auto const extracted_string = d_extracts[backref.first - 1]; + nbytes += extracted_string.second; + if (out_ptr) { + out_ptr = copy_and_increment(out_ptr, extracted_string.first, extracted_string.second); + } }); - if (out_ptr && (lpos_template < d_repl.size_bytes())) // copy remainder of template + + // copy remainder of template + if (out_ptr && (lpos_template < d_repl.size_bytes())) out_ptr = copy_and_increment( out_ptr, repl_ptr + lpos_template, d_repl.size_bytes() - lpos_template); + + // setup to match the next section lpos = epos; begin = end; end = nchars; } - if (out_ptr && (lpos < d_str.size_bytes())) // copy remainder of input string + + // finally, copy remainder of input string + if (out_ptr && (lpos < d_str.size_bytes())) memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); else if (!out_ptr) d_offsets[idx] = static_cast(nbytes); } }; -using children_pair = std::pair, std::unique_ptr>; - -children_pair replace_with_backrefs_medium(column_device_view const& d_strings, - reprog_device& d_prog, - string_view const& d_repl_template, - device_span backrefs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -children_pair replace_with_backrefs_large(column_device_view const& d_strings, - reprog_device& d_prog, - string_view const& d_repl_template, - device_span backrefs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu deleted file mode 100644 index 4f3c2fb3e1d..00000000000 --- a/cpp/src/strings/replace/backref_re_large.cu +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA 
CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backref_re.cuh" - -#include - -#include - -namespace cudf { -namespace strings { -namespace detail { - -children_pair replace_with_backrefs_large(column_device_view const& d_strings, - reprog_device& d_prog, - string_view const& d_repl_template, - device_span backrefs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using Iterator = decltype(backrefs.begin()); - return make_strings_children( - backrefs_fn{ - d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - d_strings.size(), - stream, - mr); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu deleted file mode 100644 index 277c75930a6..00000000000 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backref_re.cuh" - -#include - -#include - -namespace cudf { -namespace strings { -namespace detail { - -children_pair replace_with_backrefs_medium(column_device_view const& d_strings, - reprog_device& d_prog, - string_view const& d_repl_template, - device_span backrefs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using Iterator = decltype(backrefs.begin()); - return make_strings_children( - backrefs_fn{ - d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - d_strings.size(), - stream, - mr); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 12bf0810a64..2d9d40e2d68 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -169,7 +169,6 @@ std::unique_ptr replace_re( auto d_found_ranges = found_ranges.data(); // create child columns - // std::pair, std::unique_ptr> children(nullptr, nullptr); auto children = [&] { // Each invocation is predicated on the stack size which is dependent on the number of regex // instructions diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index bcd0bb7714f..c7543e29b0a 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -167,22 +167,22 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest2) +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); auto strings_view = cudf::strings_column_view(strings); std::string pattern = "([a-z])-([a-zé])"; - std::string 
repl_template = "X\\1+\\2Z"; + std::string repl_template = "X\\2+\\1Z"; auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); cudf::test::strings_column_wrapper expected({"A543", "Z756", "", - "tésXt+sZtring", - "twXo+tZhréé fouXr+fZivé", - "abcXd+éZfgh", - "tésXt+sZtrinXg+aZgain"}); + "tésXs+tZtring", + "twXt+oZhréé fouXf+rZivé", + "abcXé+dZfgh", + "tésXs+tZtrinXa+gZgain"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); }