Add chars-tokenizer to nvtext tokenize_benchmark.cpp (#8125)
This PR adds a benchmark case to the existing `tokenize_benchmark.cpp` to measure the `nvtext::character_tokenize` API.

PR #8085 added code that uses the `nvtext::character_tokenize` function. The benchmark was also useful while investigating #8094. An unused variable in the `character_tokenize` logic was also found and removed.
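
For context, a minimal, hypothetical sketch of how the tokenize APIs exercised by this benchmark are called. The input values and the use of the `cudf::test::strings_column_wrapper` test utility are illustrative only and not part of this PR:

```cpp
// Illustrative only -- not part of this PR.
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <nvtext/tokenize.hpp>

#include <memory>

int main()
{
  // small device strings column built with a cuDF test utility
  cudf::test::strings_column_wrapper input({"hello", "world"});
  cudf::strings_column_view view(input);

  // whitespace tokenize: produces ["hello", "world"]
  std::unique_ptr<cudf::column> words = nvtext::tokenize(view);

  // character tokenize: every character becomes a string,
  // producing ["h","e","l","l","o","w","o","r","l","d"]
  std::unique_ptr<cudf::column> chars = nvtext::character_tokenize(view);

  return 0;
}
```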

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #8125
davidwendt authored May 6, 2021
1 parent e8b9ff7 commit 611cabd
Showing 2 changed files with 14 additions and 4 deletions.
17 changes: 14 additions & 3 deletions cpp/benchmarks/text/tokenize_benchmark.cpp
@@ -31,7 +31,7 @@
 class TextTokenize : public cudf::benchmark {
 };
 
-enum class tokenize_type { single, multi, count, count_multi, ngrams };
+enum class tokenize_type { single, multi, count, count_multi, ngrams, characters };
 
 static void BM_tokenize(benchmark::State& state, tokenize_type tt)
 {
@@ -48,18 +48,28 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt)
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
     switch (tt) {
-      case tokenize_type::single: nvtext::tokenize(input); break;
+      case tokenize_type::single:
+        // single whitespace delimiter
+        nvtext::tokenize(input);
+        break;
       case tokenize_type::multi:
         nvtext::tokenize(input, cudf::strings_column_view(delimiters));
         break;
-      case tokenize_type::count: nvtext::count_tokens(input); break;
+      case tokenize_type::count:
+        // single whitespace delimiter
+        nvtext::count_tokens(input);
+        break;
       case tokenize_type::count_multi:
         nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
         break;
       case tokenize_type::ngrams:
         // default is bigrams
         nvtext::ngrams_tokenize(input);
         break;
+      case tokenize_type::characters:
+        // every character becomes a string
+        nvtext::character_tokenize(input);
+        break;
     }
   }
 
@@ -90,3 +100,4 @@ NVTEXT_BENCHMARK_DEFINE(multi)
 NVTEXT_BENCHMARK_DEFINE(count)
 NVTEXT_BENCHMARK_DEFINE(count_multi)
 NVTEXT_BENCHMARK_DEFINE(ngrams)
+NVTEXT_BENCHMARK_DEFINE(characters)
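
The `NVTEXT_BENCHMARK_DEFINE` macro itself is defined earlier in the file and is not part of the hunks shown above; presumably it registers one Google Benchmark fixture case per `tokenize_type` value, roughly along these lines (a sketch under that assumption, not the actual macro body):

```cpp
// Hypothetical sketch -- the real macro in tokenize_benchmark.cpp may differ,
// e.g. in how it sets up the row-count/row-length argument ranges.
#define NVTEXT_BENCHMARK_DEFINE(name)                              \
  BENCHMARK_DEFINE_F(TextTokenize, name)(::benchmark::State & st)  \
  {                                                                \
    BM_tokenize(st, tokenize_type::name);                          \
  }                                                                \
  BENCHMARK_REGISTER_F(TextTokenize, name)                         \
    ->UseManualTime()                                              \
    ->Unit(benchmark::kMillisecond);
```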
1 change: 0 additions & 1 deletion cpp/src/text/tokenize.cu
@@ -181,7 +181,6 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  auto strings_view = cudf::column_device_view::create(strings_column.parent(), stream);
   cudf::size_type num_characters = thrust::count_if(
     rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) {
       return cudf::strings::detail::is_begin_utf8_char(byte);
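
For reference, the count above works because in UTF-8 only the first byte of each character avoids the `10xxxxxx` continuation-byte pattern, so counting begin bytes yields the character count. Below is a standalone host-side sketch of an equivalent check; the actual device predicate used in the hunk is `cudf::strings::detail::is_begin_utf8_char`, and this illustration is not cuDF code:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// A byte begins a UTF-8 character unless its top two bits are 10 (a continuation byte).
bool is_begin_utf8_char(uint8_t byte) { return (byte & 0xC0) != 0x80; }

int main()
{
  char const* s = "héllo";  // 'é' is 2 bytes in UTF-8: 6 bytes total, 5 characters
  int characters = 0;
  for (size_t i = 0; i < std::strlen(s); ++i) {
    if (is_begin_utf8_char(static_cast<uint8_t>(s[i]))) { ++characters; }
  }
  std::printf("bytes=%zu characters=%d\n", std::strlen(s), characters);  // bytes=6 characters=5
  return 0;
}
```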
