Skip to content

Commit

Permalink
Update strings extract gbenchmark to measure multiple groups (#7789)
Browse files Browse the repository at this point in the history
Reference rapidsai/custreamz#2
Updates the gbenchmark for `cudf::strings::extract` to measure a varying number of capture groups. Previously the regex pattern had only one group. This change aligns the benchmark more closely with the referenced issue, which involves more than 20 groups.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)
  - Conor Hoekstra (https://github.com/codereport)

URL: #7789
  • Loading branch information
davidwendt authored Apr 9, 2021
1 parent c026556 commit b7c1c9e
Showing 1 changed file with 41 additions and 27 deletions.
68 changes: 41 additions & 27 deletions cpp/benchmarks/string/extract_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
Expand All @@ -23,43 +25,55 @@
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include "string_bench_args.hpp"
#include <random>

// Named benchmark type for the strings extract benchmarks. It declares no members of its own and
// relies entirely on what it inherits from cudf::benchmark.
class StringExtract : public cudf::benchmark {};

// BM_extract: times cudf::strings::extract on a strings column of space-separated random numbers.
//
// NOTE: this region comes from a diff view, so lines of the previous implementation (random table
// input, single-group pattern) and of the updated implementation (sampled rows, multi-group
// pattern) appear together, including both signatures directly below.
//
// state  - benchmark state; range(0) is the number of rows and range(1) is the minimum number of
//          characters generated per sample row
// groups - number of "(\d+) " capture groups placed in the regex pattern
static void BM_extract(benchmark::State& state, int re_instructions)
static void BM_extract(benchmark::State& state, int groups)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile table_profile;
table_profile.set_distribution_params(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const table =
create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
std::string const raw_pattern =
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234"
"5678901234567890123456789012345678901234567890";
std::string const pattern = "(" + raw_pattern.substr(0, re_instructions) + ")";
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const n_length = static_cast<cudf::size_type>(state.range(1));

// Default-constructed engine: the same token sequence is produced on every run.
std::default_random_engine generator;
std::uniform_int_distribution<int> words_dist(0, 999);

std::vector<std::string> samples(100); // 100 unique rows of data to reuse
std::generate(samples.begin(), samples.end(), [&]() {
std::string row; // build a row of random tokens
// Keep appending "<number> " tokens until the row holds at least n_length characters.
while (static_cast<int>(row.size()) < n_length) {
row += std::to_string(words_dist(generator)) + " ";
}
return row;
});

std::string pattern;
// Append exactly one "(\d+) " per requested group. Comparing pattern.size() (a character count,
// six per group) against `groups` would stop after only ceil(groups / 6) groups.
for (int group = 0; group < groups; ++group) { pattern += "(\\d+) "; }

// Each column element is one of the sample rows, chosen at random; the element index is unused.
std::uniform_int_distribution<int> distribution(0, samples.size() - 1);
auto elements = cudf::detail::make_counting_transform_iterator(
0, [&](auto idx) { return samples.at(distribution(generator)); });
cudf::test::strings_column_wrapper input(elements, elements + n_rows);
cudf::strings_column_view view(input);

// Only the extract call sits inside the timed loop; input and pattern are built once above.
// NOTE(review): cuda_event_timer presumably reports the elapsed time of its scope to `state`
// -- confirm against its definition.
for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
auto results = cudf::strings::extract(input, pattern);
cuda_event_timer raii(state, true);
auto results = cudf::strings::extract(view, pattern);
}

// Throughput is reported as the column's character bytes multiplied by the iteration count.
state.SetBytesProcessed(state.iterations() * input.chars_size());
state.SetBytesProcessed(state.iterations() * view.chars_size());
}

// Supplies the row-count and row-length bounds for benchmark `b` by forwarding them to
// generate_string_bench_args (declared in string_bench_args.hpp).
//
// NOTE: this region comes from a diff view; the first group of constants and the first call are
// the previous spelling, and the second group and call are the renamed version with the same
// values.
//
// Rows range from 1 << 12 (4096) to 1 << 24 (16777216) with a multiplier of 8; row lengths range
// from 1 << 5 (32) to 1 << 13 (8192) with a multiplier of 4.
// NOTE(review): how the helper combines the bounds and multipliers into argument pairs is not
// visible here -- confirm in string_bench_args.hpp.
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_multiplier = 8;
int const min_row_length = 1 << 5;
int const max_row_length = 1 << 13;
int const length_multiplier = 4;
generate_string_bench_args(
b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier);
}

#define STRINGS_BENCHMARK_DEFINE(name, instructions) \
Expand All @@ -70,6 +84,6 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Benchmark instantiations. The first three lines carry the previous values (4, 48, 128) and the
// last three carry the updated values (2, 10, 30), as shown in the diff view.
// NOTE(review): the second macro argument is presumably passed to BM_extract as its second
// parameter (`groups` in the updated signature); the macro body is not fully visible here, and its
// parameter is still named `instructions` -- confirm and consider renaming.
STRINGS_BENCHMARK_DEFINE(small, 4)
STRINGS_BENCHMARK_DEFINE(medium, 48)
STRINGS_BENCHMARK_DEFINE(large, 128)
STRINGS_BENCHMARK_DEFINE(small, 2)
STRINGS_BENCHMARK_DEFINE(medium, 10)
STRINGS_BENCHMARK_DEFINE(large, 30)

0 comments on commit b7c1c9e

Please sign in to comment.