diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 682f1ac5fca..cd083ebec7a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -189,6 +189,7 @@ ConfigureBench(STRINGS_BENCH string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp + string/replace_re_benchmark.cpp string/split_benchmark.cpp string/substring_benchmark.cpp string/url_decode_benchmark.cpp) diff --git a/cpp/benchmarks/string/replace_re_benchmark.cpp b/cpp/benchmarks/string/replace_re_benchmark.cpp new file mode 100644 index 00000000000..616e2c0f22c --- /dev/null +++ b/cpp/benchmarks/string/replace_re_benchmark.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +class StringReplace : public cudf::benchmark { +}; + +enum replace_type { replace_re, replace_re_multi, replace_backref }; + +static void BM_replace(benchmark::State& state, replace_type rt) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + cudf::test::strings_column_wrapper repls({"#", ""}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (rt) { + case replace_type::replace_re: // single pattern; the repl argument is left at its default + cudf::strings::replace_re(input, "\\d+"); + break; + case replace_type::replace_re_multi: // several patterns with a column of repl strings + cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls)); + break; + case replace_type::replace_backref: // repl template containing a back-reference + cudf::strings::replace_with_backrefs(input, "(\\d+)", "#\\1X"); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringReplace, name) \ + (::benchmark::State & st) { BM_replace(st, name); } \ + BENCHMARK_REGISTER_F(StringReplace, name) \ + 
->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(replace_re) +STRINGS_BENCHMARK_DEFINE(replace_re_multi) +STRINGS_BENCHMARK_DEFINE(replace_backref) diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 95f9ecbe2ef..cac774ef43e 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ namespace { * * The backref numbers are expected to be 1-based. * - * Returns a modified string without back-ref indicators. + * Returns a modified string without back-ref indicators and a vector of backref + * byte position pairs. * ``` * Example: * for input string: 'hello \2 and \1' @@ -51,8 +52,9 @@ namespace { * returned string is: 'hello and ' * ``` */ -std::string parse_backrefs(std::string const& repl, std::vector& backrefs) +std::pair> parse_backrefs(std::string const& repl) { + std::vector backrefs; std::string str = repl; // make a modifiable copy std::smatch m; std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1" @@ -60,21 +62,19 @@ std::string parse_backrefs(std::string const& repl, std::vector& b size_type byte_offset = 0; while (std::regex_search(str, m, ex)) { if (m.size() == 0) break; - backref_type item; - std::string bref = m[0]; - size_type position = static_cast(m.position(0)); - size_type length = static_cast(bref.length()); + std::string const backref = m[0]; + size_type const position = static_cast(m.position(0)); + size_type const length = static_cast(backref.length()); byte_offset += position; - item.first = std::atoi(bref.c_str() + 1); // back-ref index number - CUDF_EXPECTS(item.first > 0, "Back-reference numbers must be 
greater than 0"); - item.second = byte_offset; // position within the string + size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number + CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0"); rtn += str.substr(0, position); str = str.substr(position + length); - backrefs.push_back(item); + backrefs.push_back({index, byte_offset}); } if (!str.empty()) // add the remainder rtn += str; // of the string - return rtn; + return {rtn, backrefs}; } } // namespace @@ -87,54 +87,54 @@ std::unique_ptr replace_with_backrefs( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto regex_insts = d_prog.insts_counts(); + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); + auto const regex_insts = d_prog->insts_counts(); // parse the repl string for backref indicators - std::vector h_backrefs; - std::string repl_template = parse_backrefs(repl, h_backrefs); - rmm::device_vector backrefs(h_backrefs); - string_scalar repl_scalar(repl_template); - string_view d_repl_template{repl_scalar.data(), repl_scalar.size()}; - - // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto null_count = strings.null_count(); + auto const parse_result = 
parse_backrefs(repl); + rmm::device_uvector backrefs(parse_result.second.size(), stream); + CUDA_TRY(cudaMemcpyAsync(backrefs.data(), + parse_result.second.data(), + sizeof(backref_type) * backrefs.size(), + cudaMemcpyHostToDevice, + stream.value())); + string_scalar repl_scalar(parse_result.first, true, stream); + string_view const d_repl_template = repl_scalar.value(); + + using BackRefIterator = decltype(backrefs.begin()); // create child columns - children_pair children(nullptr, nullptr); - // Each invocation is predicated on the stack size - // which is dependent on the number of regex instructions - if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) { - children = make_strings_children( - backrefs_fn{ - d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings_count, - null_count, - stream, - mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) - children = replace_with_backrefs_medium( - d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); - else - children = replace_with_backrefs_large( - d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); - - return make_strings_column(strings_count, + children_pair children = [&] { + // Each invocation is predicated on the stack size + // which is dependent on the number of regex instructions + if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) { + return make_strings_children( + backrefs_fn{ + *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, + strings.size(), + strings.null_count(), + stream, + mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) + return replace_with_backrefs_medium( + *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + else + return replace_with_backrefs_large( + *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + }(); + + return make_strings_column(strings.size(), std::move(children.first), std::move(children.second), 
- null_count, - std::move(null_mask), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index f13d84cf9ca..529b91a98e5 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,13 +42,13 @@ using backref_type = thrust::pair; * Small to medium instruction lengths can use the stack effectively though smaller executes faster. * Longer patterns require global memory. Shorter patterns are common in data cleaning. */ -template +template struct backrefs_fn { column_device_view const d_strings; reprog_device prog; string_view const d_repl; // string replacement template - rmm::device_vector::iterator backrefs_begin; - rmm::device_vector::iterator backrefs_end; + Iterator backrefs_begin; + Iterator backrefs_end; int32_t* d_offsets{}; char* d_chars{}; @@ -117,7 +117,7 @@ using children_pair = std::pair, std::unique_ptr children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -125,7 +125,7 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/strings/replace/backref_re_large.cu 
b/cpp/src/strings/replace/backref_re_large.cu index 0b078132623..56bd8941b8a 100644 --- a/cpp/src/strings/replace/backref_re_large.cu +++ b/cpp/src/strings/replace/backref_re_large.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,17 +24,17 @@ namespace cudf { namespace strings { namespace detail { -// children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using Iterator = decltype(backrefs.begin()); return make_strings_children( - backrefs_fn{ + backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu index 899e0cb2a3e..8b1dd6c5999 100644 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ b/cpp/src/strings/replace/backref_re_medium.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,17 +24,17 @@ namespace cudf { namespace strings { namespace detail { -// children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, - rmm::device_vector& backrefs, + device_span backrefs, size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using Iterator = decltype(backrefs.begin()); return make_strings_children( - backrefs_fn{ + backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3eb551ead18..39725361741 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -139,15 +139,13 @@ std::unique_ptr replace_re( auto strings_count = strings.size(); if (strings_count == 0) return make_empty_strings_column(stream, mr); if (patterns.empty()) // no patterns; just return a copy - return std::make_unique(strings.parent()); + return std::make_unique(strings.parent(), stream, mr); CUDF_EXPECTS(!repls.has_nulls(), "Parameter repls must not have any nulls"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - auto repls_column = column_device_view::create(repls.parent(), stream); - auto d_repls = *repls_column; - auto d_flags = get_character_flags_table(); + auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_repls = column_device_view::create(repls.parent(), stream); + auto d_flags = get_character_flags_table(); // compile regexes into device objects size_type regex_insts = 0; @@ -170,37 +168,39 @@ std::unique_ptr replace_re( reprog_device* d_progs = reinterpret_cast(progs_buffer.data()); // create working buffer for ranges pairs - rmm::device_vector found_ranges(patterns.size() * strings_count); - auto d_found_ranges = found_ranges.data().get(); + rmm::device_uvector 
found_ranges(patterns.size() * strings_count, stream); + auto d_found_ranges = found_ranges.data(); // create child columns - std::pair, std::unique_ptr> children(nullptr, nullptr); - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); - else - children = make_strings_children( - replace_multi_regex_fn{ - d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, - strings_count, - strings.null_count(), - stream, - mr); + // the lambda below is invoked immediately so children is initialized by the selected branch + auto children = [&] { + // Each invocation is predicated on the stack size which is dependent on the number of regex + // instructions + if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + else if (regex_insts <= RX_MEDIUM_INSTS) + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + else + return make_strings_children( + replace_multi_regex_fn{ + *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, + strings_count, + strings.null_count(), + stream, + mr); + }(); return make_strings_column(strings_count, std::move(children.first),