Add chars-tokenizer to nvtext tokenize_benchmark.cpp (#8125)
This PR adds a benchmark case to the existing `tokenize_benchmark.cpp` to measure the `nvtext::character_tokenize` API.

PR #8085 added code that uses the `nvtext::character_tokenize` function. The benchmark was also useful while investigating #8094. An unused variable in the `character_tokenize` logic was also found and removed.
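
For context, a minimal, hypothetical sketch of how the tokenize APIs exercised by this benchmark are called. The input values and the use of the `cudf::test::strings_column_wrapper` test utility are illustrative only and not part of this PR:

```cpp
// Illustrative only -- not part of this PR.
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <nvtext/tokenize.hpp>

#include <memory>

int main()
{
  // small device strings column built with a cuDF test utility
  cudf::test::strings_column_wrapper input({"hello", "world"});
  cudf::strings_column_view view(input);

  // whitespace tokenize: produces ["hello", "world"]
  std::unique_ptr<cudf::column> words = nvtext::tokenize(view);

  // character tokenize: every character becomes a string,
  // producing ["h","e","l","l","o","w","o","r","l","d"]
  std::unique_ptr<cudf::column> chars = nvtext::character_tokenize(view);

  return 0;
}
```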

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #8125
davidwendt authored May 6, 2021
1 parent e8b9ff7 commit 611cabd
Showing 2 changed files with 14 additions and 4 deletions.
17 changes: 14 additions & 3 deletions cpp/benchmarks/text/tokenize_benchmark.cpp
@@ -31,7 +31,7 @@
 class TextTokenize : public cudf::benchmark {
 };
 
-enum class tokenize_type { single, multi, count, count_multi, ngrams };
+enum class tokenize_type { single, multi, count, count_multi, ngrams, characters };
 
 static void BM_tokenize(benchmark::State& state, tokenize_type tt)
 {
@@ -48,18 +48,28 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt)
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
     switch (tt) {
-      case tokenize_type::single: nvtext::tokenize(input); break;
+      case tokenize_type::single:
+        // single whitespace delimiter
+        nvtext::tokenize(input);
+        break;
       case tokenize_type::multi:
         nvtext::tokenize(input, cudf::strings_column_view(delimiters));
         break;
-      case tokenize_type::count: nvtext::count_tokens(input); break;
+      case tokenize_type::count:
+        // single whitespace delimiter
+        nvtext::count_tokens(input);
+        break;
       case tokenize_type::count_multi:
         nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
         break;
       case tokenize_type::ngrams:
         // default is bigrams
         nvtext::ngrams_tokenize(input);
         break;
+      case tokenize_type::characters:
+        // every character becomes a string
+        nvtext::character_tokenize(input);
+        break;
     }
   }
 
@@ -90,3 +100,4 @@ NVTEXT_BENCHMARK_DEFINE(multi)
 NVTEXT_BENCHMARK_DEFINE(count)
 NVTEXT_BENCHMARK_DEFINE(count_multi)
 NVTEXT_BENCHMARK_DEFINE(ngrams)
+NVTEXT_BENCHMARK_DEFINE(characters)
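
The `NVTEXT_BENCHMARK_DEFINE` macro itself is defined earlier in the file and is not part of the hunks shown above; presumably it registers one Google Benchmark fixture case per `tokenize_type` value, roughly along these lines (a sketch under that assumption, not the actual macro body):

```cpp
// Hypothetical sketch -- the real macro in tokenize_benchmark.cpp may differ,
// e.g. in how it sets up the row-count/row-length argument ranges.
#define NVTEXT_BENCHMARK_DEFINE(name)                              \
  BENCHMARK_DEFINE_F(TextTokenize, name)(::benchmark::State & st)  \
  {                                                                \
    BM_tokenize(st, tokenize_type::name);                          \
  }                                                                \
  BENCHMARK_REGISTER_F(TextTokenize, name)                         \
    ->UseManualTime()                                              \
    ->Unit(benchmark::kMillisecond);
```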
1 change: 0 additions & 1 deletion cpp/src/text/tokenize.cu
@@ -181,7 +181,6 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  auto strings_view = cudf::column_device_view::create(strings_column.parent(), stream);
   cudf::size_type num_characters = thrust::count_if(
     rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) {
       return cudf::strings::detail::is_begin_utf8_char(byte);
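
For reference, the count above works because in UTF-8 only the first byte of each character avoids the `10xxxxxx` continuation-byte pattern, so counting begin bytes yields the character count. Below is a standalone host-side sketch of an equivalent check; the actual device predicate used in the hunk is `cudf::strings::detail::is_begin_utf8_char`, and this illustration is not cuDF code:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// A byte begins a UTF-8 character unless its top two bits are 10 (a continuation byte).
bool is_begin_utf8_char(uint8_t byte) { return (byte & 0xC0) != 0x80; }

int main()
{
  char const* s = "héllo";  // 'é' is 2 bytes in UTF-8: 6 bytes total, 5 characters
  int characters = 0;
  for (size_t i = 0; i < std::strlen(s); ++i) {
    if (is_begin_utf8_char(static_cast<uint8_t>(s[i]))) { ++characters; }
  }
  std::printf("bytes=%zu characters=%d\n", std::strlen(s), characters);  // bytes=6 characters=5
  return 0;
}
```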
