From 54aceedb79dbf7499c6d83e61fb79afa38f13bc1 Mon Sep 17 00:00:00 2001
From: David Wendt <davidwendt@nvidia.com>
Date: Mon, 2 May 2022 09:07:25 -0400
Subject: [PATCH] Add multiple rows to subword tokenizer benchmark

---
 cpp/benchmarks/text/subword.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index d8357dcf92c..2c430868341 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include <benchmark/benchmark.h>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -53,9 +54,9 @@ static std::string create_hash_vocab_file()
   return hash_file;
 }
 
-static void BM_cuda_tokenizer_cudf(benchmark::State& state)
+static void BM_subword_tokenizer(benchmark::State& state)
 {
-  uint32_t nrows = 1000;
+  auto const nrows = static_cast<cudf::size_type>(state.range(0));
   std::vector<const char*> h_strings(nrows, "This is a test ");
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   std::string hash_file = create_hash_vocab_file();
@@ -67,6 +68,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
   // auto vocab = nvtext::load_vocabulary_file(hash_file);
   for (auto _ : state) {
+    cuda_event_timer raii(state, true);
     auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
                                            *vocab,
                                            max_sequence_length,
@@ -76,6 +78,18 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
                                            MAX_ROWS_TENSOR);
   }
 }
-BENCHMARK(BM_cuda_tokenizer_cudf);
-BENCHMARK_MAIN();
+class Subword : public cudf::benchmark {
+};
+
+#define SUBWORD_BM_BENCHMARK_DEFINE(name)                                                          \
+  BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); }   \
+  BENCHMARK_REGISTER_F(Subword, name)                                                              \
+    ->RangeMultiplier(2)                                                                           \
+    ->Range(1 << 10, 1 << 17)                                                                      \
+    ->UseManualTime()                                                                              \
+    ->Unit(benchmark::kMillisecond);
+
+SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);
+
+// BENCHMARK_MAIN();
 