From 6456f5f0df06ad749239921a27dd4c1d94323f61 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 14:08:32 -0500 Subject: [PATCH 1/3] Add gbenchmark for strings contains_re/count_re functions --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/contains_benchmark.cpp | 54 ++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 cpp/benchmarks/string/contains_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7b5c092f9c6..e0ce4157ded 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -306,6 +306,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}") set(STRINGS_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/string/case_benchmark.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/string/contains_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp") ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}") diff --git a/cpp/benchmarks/string/contains_benchmark.cpp b/cpp/benchmarks/string/contains_benchmark.cpp new file mode 100644 index 00000000000..1593b1cebb1 --- /dev/null +++ b/cpp/benchmarks/string/contains_benchmark.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +class StringContains : public cudf::benchmark { +}; + +static void BM_contains(benchmark::State& state, bool contains) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + // contains_re(), matches_re(), and count_re() all have similar functions + // with count_re() being the most regex intensive + contains ? cudf::strings::contains_re(input, "\\d+") : cudf::strings::count_re(input, "\\d+"); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +#define STRINGS_BENCHMARK_DEFINE(name, b) \ + BENCHMARK_DEFINE_F(StringContains, name) \ + (::benchmark::State & st) { BM_contains(st, b); } \ + BENCHMARK_REGISTER_F(StringContains, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 12, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(contains_re, true) +STRINGS_BENCHMARK_DEFINE(count_re, false) From 0bec86105fe9ae68557b9048bd0363a7240655a8 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 14:09:32 -0500 Subject: [PATCH 2/3] increment update dstr iter instead of creating a new one --- cpp/src/strings/regex/regex.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index d1aafcf0729..5c9d1152cc6 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -233,7 +233,7 @@ __device__ inline int32_t reprog_device::regexec( break; } } - itr = string_view::const_iterator(dstr, pos); + itr += (pos - itr.position()); // faster to increment position } if (((eos < 0) || (pos < eos)) && match == 0) { From 289af40788771e97edde8e0a15f328e0667e2189 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 11 Feb 2021 18:07:43 -0500 Subject: [PATCH 3/3] add findall_re to benchmark as well --- cpp/benchmarks/string/contains_benchmark.cpp | 36 ++++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/string/contains_benchmark.cpp b/cpp/benchmarks/string/contains_benchmark.cpp index 1593b1cebb1..1a2ac8ad602 100644 --- a/cpp/benchmarks/string/contains_benchmark.cpp +++ b/cpp/benchmarks/string/contains_benchmark.cpp @@ -20,12 +20,15 @@ #include #include +#include #include class StringContains : public cudf::benchmark { }; -static void BM_contains(benchmark::State& state, bool contains) +enum contains_type { contains, count, findall }; + +static void BM_contains(benchmark::State& state, contains_type ct) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); @@ -35,20 +38,31 @@ static void BM_contains(benchmark::State& state, bool contains) cuda_event_timer raii(state, true, 0); // contains_re(), matches_re(), and count_re() all have similar functions // with count_re() being the most regex intensive - contains ? cudf::strings::contains_re(input, "\\d+") : cudf::strings::count_re(input, "\\d+"); + switch (ct) { + case contains_type::contains: // contains_re and matches_re use the same main logic + cudf::strings::contains_re(input, "\\d+"); + break; + case contains_type::count: // counts occurrences of pattern + cudf::strings::count_re(input, "\\d+"); + break; + case contains_type::findall: // returns occurrences of matches + cudf::strings::findall_re(input, "\\d+"); + break; + } } state.SetBytesProcessed(state.iterations() * input.chars_size()); } -#define STRINGS_BENCHMARK_DEFINE(name, b) \ - BENCHMARK_DEFINE_F(StringContains, name) \ - (::benchmark::State & st) { BM_contains(st, b); } \ - BENCHMARK_REGISTER_F(StringContains, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 12, 1 << 24}}) \ - ->UseManualTime() \ +#define STRINGS_BENCHMARK_DEFINE(name, b) \ + BENCHMARK_DEFINE_F(StringContains, name) \ + (::benchmark::State & st) { BM_contains(st, contains_type::b); } \ + BENCHMARK_REGISTER_F(StringContains, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 12, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -STRINGS_BENCHMARK_DEFINE(contains_re, true) -STRINGS_BENCHMARK_DEFINE(count_re, false) +STRINGS_BENCHMARK_DEFINE(contains_re, contains) +STRINGS_BENCHMARK_DEFINE(count_re, count) +STRINGS_BENCHMARK_DEFINE(findall_re, findall)