Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gbenchmarks for strings replace regex functions #7541

Merged
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ ConfigureBench(STRINGS_BENCH
string/filter_benchmark.cpp
string/find_benchmark.cpp
string/replace_benchmark.cpp
string/replace_re_benchmark.cpp
string/split_benchmark.cpp
string/substring_benchmark.cpp
string/url_decode_benchmark.cpp)
84 changes: 84 additions & 0 deletions cpp/benchmarks/string/replace_re_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

/**
 * @brief Benchmark fixture under which the regex-replace string benchmarks in this
 * file are defined and registered (see STRINGS_BENCHMARK_DEFINE below).
 *
 * Declares no members of its own; all behavior is inherited from cudf::benchmark.
 */
class StringReplace : public cudf::benchmark {
};

/// Selects which regex-replace API a benchmark run exercises.
/// Kept as an unscoped enum: the registration macro passes the enumerator unqualified.
enum replace_type {
  replace_re,        // single-pattern cudf::strings::replace_re
  replace_re_multi,  // multi-pattern cudf::strings::replace_re
  replace_backref    // cudf::strings::replace_with_backrefs
};

/**
 * @brief Shared body for the regex-replace benchmarks.
 *
 * Builds a single random strings column sized by the benchmark arguments and then,
 * once per benchmark iteration, runs the replace API selected by `rt` on it.
 *
 * @param state Benchmark state; range(0) is the row count and range(1) is the
 *              maximum string length passed to the data profile.
 * @param rt    Which replace API to exercise.
 */
static void BM_replace(benchmark::State& state, replace_type rt)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
  auto const max_length = static_cast<cudf::size_type>(state.range(1));

  // Random input: one STRING column whose lengths follow a NORMAL distribution
  // configured with the parameters 0 and max_length.
  data_profile profile;
  profile.set_distribution_params(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_length);
  auto const random_table =
    create_random_table({cudf::type_id::STRING}, 1, row_count{num_rows}, profile);
  cudf::strings_column_view input(random_table->view().column(0));

  // Replacement strings for the multi-pattern case, one per pattern.
  cudf::test::strings_column_wrapper repls({"#", ""});

  for (auto _ : state) {
    // Scoped timer created fresh for every iteration.
    // NOTE(review): presumably reports the elapsed time to `state` when it goes
    // out of scope (the benchmarks are registered with UseManualTime) -- confirm.
    cuda_event_timer timer(state, true, 0);

    if (rt == replace_type::replace_re) {
      // Single pattern (runs of digits) using the API's default replacement.
      cudf::strings::replace_re(input, "\\d+");
    } else if (rt == replace_type::replace_re_multi) {
      // Two patterns (digit runs, whitespace runs), each paired with an entry of repls.
      cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls));
    } else if (rt == replace_type::replace_backref) {
      // One capture group re-emitted through the \1 back-reference in the template.
      cudf::strings::replace_with_backrefs(input, "(\\d+)", "#\\1X");
    }
  }

  // Report throughput in terms of the input column's character bytes.
  state.SetBytesProcessed(state.iterations() * input.chars_size());
}

/**
 * @brief Supplies the (row count, max string length) argument grid for the benchmarks.
 *
 * Forwards fixed bounds and multipliers to generate_string_bench_args.
 * NOTE(review): presumably each dimension is stepped from its minimum to its maximum
 * by the given multiplier -- confirm against string_bench_args.hpp.
 *
 * @param b Benchmark being registered; receives the generated arguments.
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count dimension: 4,096 up to 16,777,216 with multiplier 8.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // String-length dimension: 32 up to 8,192 with multiplier 4.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 4;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines and registers one StringReplace fixture benchmark.
//
// `name` is used twice: as the benchmark's name within the fixture and, unqualified,
// as the replace_type enumerator handed to BM_replace -- so it must be spelled exactly
// like one of the replace_type enumerators.
//
// Each registered benchmark takes its arguments from generate_bench_args, uses manual
// timing, and reports in milliseconds. The expansion already ends in ';', so the
// invocations below are written without one.
#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringReplace, name) \
(::benchmark::State & st) { BM_replace(st, name); } \
BENCHMARK_REGISTER_F(StringReplace, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// One benchmark per replace_type enumerator.
STRINGS_BENCHMARK_DEFINE(replace_re)
STRINGS_BENCHMARK_DEFINE(replace_re_multi)
STRINGS_BENCHMARK_DEFINE(replace_backref)
98 changes: 49 additions & 49 deletions cpp/src/strings/replace/backref_re.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,38 +43,38 @@ namespace {
*
* The backref numbers are expected to be 1-based.
*
* Returns a modified string without back-ref indicators.
* Returns a modified string without back-ref indicators and a vector of backref
* byte position pairs.
* ```
* Example:
* for input string: 'hello \2 and \1'
* the returned pairs: (2,6),(1,11)
* returned string is: 'hello  and '
* ```
*/
std::string parse_backrefs(std::string const& repl, std::vector<backref_type>& backrefs)
std::pair<std::string, std::vector<backref_type>> parse_backrefs(std::string const& repl)
{
std::vector<backref_type> backrefs;
std::string str = repl; // make a modifiable copy
std::smatch m;
std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1"
std::string rtn; // result without refs
size_type byte_offset = 0;
while (std::regex_search(str, m, ex)) {
if (m.size() == 0) break;
backref_type item;
std::string bref = m[0];
size_type position = static_cast<size_type>(m.position(0));
size_type length = static_cast<size_type>(bref.length());
std::string const backref = m[0];
size_type const position = static_cast<size_type>(m.position(0));
size_type const length = static_cast<size_type>(backref.length());
byte_offset += position;
item.first = std::atoi(bref.c_str() + 1); // back-ref index number
CUDF_EXPECTS(item.first > 0, "Back-reference numbers must be greater than 0");
item.second = byte_offset; // position within the string
size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number
CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0");
rtn += str.substr(0, position);
str = str.substr(position + length);
backrefs.push_back(item);
backrefs.push_back({index, byte_offset});
}
if (!str.empty()) // add the remainder
rtn += str; // of the string
return rtn;
return {rtn, backrefs};
}

} // namespace
Expand All @@ -87,54 +87,54 @@ std::unique_ptr<column> replace_with_backrefs(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);
if (strings.is_empty()) return make_empty_strings_column(stream, mr);

CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty");

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
auto d_strings = column_device_view::create(strings.parent(), stream);
// compile regex into device object
auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
auto d_prog = *prog;
auto regex_insts = d_prog.insts_counts();
auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream);
auto const regex_insts = d_prog->insts_counts();

// parse the repl string for backref indicators
std::vector<backref_type> h_backrefs;
std::string repl_template = parse_backrefs(repl, h_backrefs);
rmm::device_vector<backref_type> backrefs(h_backrefs);
string_scalar repl_scalar(repl_template);
string_view d_repl_template{repl_scalar.data(), repl_scalar.size()};

// copy null mask
auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
auto null_count = strings.null_count();
auto const parse_result = parse_backrefs(repl);
rmm::device_uvector<backref_type> backrefs(parse_result.second.size(), stream);
CUDA_TRY(cudaMemcpyAsync(backrefs.data(),
parse_result.second.data(),
sizeof(backref_type) * backrefs.size(),
cudaMemcpyHostToDevice,
stream.value()));
string_scalar repl_scalar(parse_result.first, true, stream);
string_view const d_repl_template = repl_scalar.value();

using BackRefIterator = decltype(backrefs.begin());

// create child columns
children_pair children(nullptr, nullptr);
// Each invocation is predicated on the stack size
// which is dependent on the number of regex instructions
if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) {
children = make_strings_children(
backrefs_fn<RX_STACK_SMALL>{
d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
strings_count,
null_count,
stream,
mr);
} else if (regex_insts <= RX_MEDIUM_INSTS)
children = replace_with_backrefs_medium(
d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr);
else
children = replace_with_backrefs_large(
d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr);

return make_strings_column(strings_count,
children_pair children = [&] {
// Each invocation is predicated on the stack size
// which is dependent on the number of regex instructions
if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) {
return make_strings_children(
backrefs_fn<BackRefIterator, RX_STACK_SMALL>{
*d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
strings.size(),
strings.null_count(),
stream,
mr);
} else if (regex_insts <= RX_MEDIUM_INSTS)
return replace_with_backrefs_medium(
*d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr);
else
return replace_with_backrefs_large(
*d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr);
}();

return make_strings_column(strings.size(),
std::move(children.first),
std::move(children.second),
null_count,
std::move(null_mask),
strings.null_count(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/strings/replace/backref_re.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,13 +42,13 @@ using backref_type = thrust::pair<size_type, size_type>;
* Small to medium instruction lengths can use the stack effectively though smaller executes faster.
* Longer patterns require global memory. Shorter patterns are common in data cleaning.
*/
template <size_t stack_size>
template <typename Iterator, size_t stack_size>
struct backrefs_fn {
column_device_view const d_strings;
reprog_device prog;
string_view const d_repl; // string replacement template
rmm::device_vector<backref_type>::iterator backrefs_begin;
rmm::device_vector<backref_type>::iterator backrefs_end;
Iterator backrefs_begin;
Iterator backrefs_end;
int32_t* d_offsets{};
char* d_chars{};

Expand Down Expand Up @@ -117,15 +117,15 @@ using children_pair = std::pair<std::unique_ptr<column>, std::unique_ptr<column>
children_pair replace_with_backrefs_medium(column_device_view const& d_strings,
reprog_device& d_prog,
string_view const& d_repl_template,
rmm::device_vector<backref_type>& backrefs,
device_span<backref_type> backrefs,
size_type null_count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

children_pair replace_with_backrefs_large(column_device_view const& d_strings,
reprog_device& d_prog,
string_view const& d_repl_template,
rmm::device_vector<backref_type>& backrefs,
device_span<backref_type> backrefs,
size_type null_count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/strings/replace/backref_re_large.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,17 +24,17 @@ namespace cudf {
namespace strings {
namespace detail {

//
children_pair replace_with_backrefs_large(column_device_view const& d_strings,
reprog_device& d_prog,
string_view const& d_repl_template,
rmm::device_vector<backref_type>& backrefs,
device_span<backref_type> backrefs,
size_type null_count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Iterator = decltype(backrefs.begin());
return make_strings_children(
backrefs_fn<RX_STACK_LARGE>{
backrefs_fn<Iterator, RX_STACK_LARGE>{
d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
d_strings.size(),
null_count,
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/strings/replace/backref_re_medium.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,17 +24,17 @@ namespace cudf {
namespace strings {
namespace detail {

//
children_pair replace_with_backrefs_medium(column_device_view const& d_strings,
reprog_device& d_prog,
string_view const& d_repl_template,
rmm::device_vector<backref_type>& backrefs,
device_span<backref_type> backrefs,
size_type null_count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Iterator = decltype(backrefs.begin());
return make_strings_children(
backrefs_fn<RX_STACK_MEDIUM>{
backrefs_fn<Iterator, RX_STACK_MEDIUM>{
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
d_strings.size(),
null_count,
Expand Down
Loading