From 6ed360c57dbd441c1b54b0b4d3a2b4dcaf841c27 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 Mar 2021 06:50:12 -0400 Subject: [PATCH] Add gbenchmark for nvtext tokenize functions (#7684) Reference #5696 Creates gbenchmarks for `nvtext::tokenize()`, `nvtext::count_tokens()` and `nvtext::ngrams_tokenize()` functions. The benchmarks measure various string lengths and numbers of rows. These functions use the `make_strings_column` factory optimized in #7576 Authors: - David (@davidwendt) Approvers: - Conor Hoekstra (@codereport) - Nghia Truong (@ttnghia) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7684 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/text/tokenize_benchmark.cpp | 92 ++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 cpp/benchmarks/text/tokenize_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d59e582b1fb..7fd84b508ac 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,6 +177,7 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) ConfigureBench(TEXT_BENCH text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp + text/tokenize_benchmark.cpp text/subword_benchmark.cpp) ################################################################################################### diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize_benchmark.cpp new file mode 100644 index 00000000000..f9e742f0f31 --- /dev/null +++ b/cpp/benchmarks/text/tokenize_benchmark.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* NOTE(review): every #include below has no header name in this copy; the names were presumably lost in extraction. Restore them from the upstream file before applying this patch. */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +/** Empty benchmark fixture derived from cudf::benchmark. NVTEXT_BENCHMARK_DEFINE defines and registers every tokenize benchmark in this file under this fixture. */ +class TextTokenize : public cudf::benchmark { +}; +/** Selects the call that BM_tokenize makes inside its timed loop: single = nvtext::tokenize(input); multi = nvtext::tokenize(input, delimiters); count = nvtext::count_tokens(input); count_multi = nvtext::count_tokens(input, delimiters); ngrams = nvtext::ngrams_tokenize(input). */ +enum class tokenize_type { single, multi, count, count_multi, ngrams }; +/** Runs one nvtext tokenize variant, chosen by tt, over a single randomly generated STRING column. state.range(0) is read as the row count (n_rows) and state.range(1) as max_str_length, which is passed to set_distribution_params together with a NORMAL distribution and the value 0. After the loop, bytes processed is reported as state.iterations() * input.chars_size(). NOTE(review): both static_cast calls have no template argument in this copy; the target type was presumably stripped together with the header names. Confirm against the upstream file. */ +static void BM_tokenize(benchmark::State& state, tokenize_type tt) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});/* Built once outside the loop; read only by the multi and count_multi cases. */ + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0);/* Constructed anew on every iteration. Presumably it records the iteration time into state, since the benchmarks are registered with UseManualTime; confirm in the timer's own header. */ + switch (tt) { + case tokenize_type::single: nvtext::tokenize(input); break; + case tokenize_type::multi: + nvtext::tokenize(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::count: nvtext::count_tokens(input); break; + case tokenize_type::count_multi: + nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::ngrams: + // default is bigrams + nvtext::ngrams_tokenize(input); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} +/** Argument generator handed to Apply() by NVTEXT_BENCHMARK_DEFINE. Forwards a row range of 1 << 12 to 1 << 24 with multiplier 8 and a row-length range of 1 << 5 to 1 << 13 with multiplier 4 to generate_string_bench_args. How that helper expands the ranges is not visible in this file. */ +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; 
+ int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} +/* For a tokenize_type enumerator given as name, defines a TextTokenize fixture benchmark whose body calls BM_tokenize(st, tokenize_type::name), then registers it with generate_bench_args as the argument generator, manual timing, and millisecond units. */ +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextTokenize, name) \ + (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \ + BENCHMARK_REGISTER_F(TextTokenize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); +/* One benchmark per tokenize_type enumerator. */ +NVTEXT_BENCHMARK_DEFINE(single) +NVTEXT_BENCHMARK_DEFINE(multi) +NVTEXT_BENCHMARK_DEFINE(count) +NVTEXT_BENCHMARK_DEFINE(count_multi) +NVTEXT_BENCHMARK_DEFINE(ngrams)