From b7c1c9e72643aff82cec4cc37443d728c892e3bc Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 8 Apr 2021 23:24:29 -0400 Subject: [PATCH] Update strings extract gbenchmark to measure multiple groups (#7789) Reference https://github.com/rapidsai/custreamz/issues/2 Updates the gbenchmark for `cudf::strings::extract` to measure a varying number of groups. Previously the regex pattern had only 1 group. This aligns the benchmark more closely with the referenced issue, which includes more than 20 groups. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/7789 --- cpp/benchmarks/string/extract_benchmark.cpp | 68 +++++++++++++-------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index f53ea60cdf9..aa1e59a22bf 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "string_bench_args.hpp" + #include #include #include @@ -23,43 +25,55 @@ #include #include -#include "string_bench_args.hpp" +#include class StringExtract : public cudf::benchmark { }; -static void BM_extract(benchmark::State& state, int re_instructions) +static void BM_extract(benchmark::State& state, int groups) /* times cudf::strings::extract over state.range(0) rows of at least state.range(1) characters, using a regex with `groups` capture groups */ { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile table_profile; - table_profile.set_distribution_params( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); - cudf::strings_column_view input(table->view().column(0)); - std::string const raw_pattern = - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" - "5678901234567890123456789012345678901234567890"; - std::string const pattern = "(" + raw_pattern.substr(0, re_instructions) + ")"; + auto const n_rows = static_cast(state.range(0)); + auto const n_length = static_cast(state.range(1)); + + std::default_random_engine generator; + std::uniform_int_distribution words_dist(0, 999); + + std::vector samples(100); // 100 unique rows of data to reuse + std::generate(samples.begin(), samples.end(), [&]() { + std::string row; // build a row of random tokens + while (static_cast(row.size()) < n_length) { + row += std::to_string(words_dist(generator)) + " "; + } + return row; + }); + + std::string pattern; /* one "(\d+) " capture per requested group; count iterations, not characters, since each append adds 6 characters */ + for (int group = 0; group < groups; ++group) { pattern += "(\\d+) "; } + + std::uniform_int_distribution distribution(0, samples.size() - 1); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto idx) { return samples.at(distribution(generator)); }); + cudf::test::strings_column_wrapper input(elements, elements + n_rows); + cudf::strings_column_view view(input); for (auto _ : state) { - cuda_event_timer 
raii(state, true, rmm::cuda_stream_default); - auto results = cudf::strings::extract(input, pattern); + cuda_event_timer raii(state, true); + auto results = cudf::strings::extract(view, pattern); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * view.chars_size()); } static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_multiplier = 8; + int const min_row_length = 1 << 5; + int const max_row_length = 1 << 13; + int const length_multiplier = 4; + generate_string_bench_args( + b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); } #define STRINGS_BENCHMARK_DEFINE(name, instructions) \ @@ -70,6 +84,6 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -STRINGS_BENCHMARK_DEFINE(small, 4) -STRINGS_BENCHMARK_DEFINE(medium, 48) -STRINGS_BENCHMARK_DEFINE(large, 128) +STRINGS_BENCHMARK_DEFINE(small, 2) +STRINGS_BENCHMARK_DEFINE(medium, 10) +STRINGS_BENCHMARK_DEFINE(large, 30)