Add gbenchmark for nvtext ngrams functions (#7693)

Reference #5696. Creates a gbenchmark for the `nvtext::generate_ngrams()` and `nvtext::generate_character_ngrams()` functions. The benchmarks measure various string lengths and numbers of rows. `nvtext::generate_ngrams()` was refactored to use the more efficient `make_strings_children`, which improved its performance by about 50%. Authors: - David (@davidwendt) Approvers: - Nghia Truong (@ttnghia) - Mark Harris (@harrism) URL: #7693
rapidsai · Mar 29, 2021 · d9103c4 · d9103c4
1 parent fe7ec85
commit d9103c4
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 27 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -175,6 +175,7 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 ###################################################################################################
 # - nvtext benchmark -------------------------------------------------------------------
 ConfigureBench(TEXT_BENCH
+  text/ngrams_benchmark.cpp
   text/normalize_benchmark.cpp
   text/normalize_spaces_benchmark.cpp
   text/replace_benchmark.cpp

diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams_benchmark.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+
+#include <nvtext/generate_ngrams.hpp>
+
+class TextNGrams : public cudf::benchmark {  // fixture for the ngrams benchmarks; adds no members
+};
+
+enum class ngrams_type { tokens, characters };  // picks the nvtext function that BM_ngrams() times
+
+static void BM_ngrams(benchmark::State& state, ngrams_type nt)  // times the nvtext call chosen by nt
+{
+  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));  // benchmark arg 0
+  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));  // benchmark arg 1
+  data_profile table_profile;
+  table_profile.set_distribution_params(  // STRING data: NORMAL distribution, params 0 and max_str_length
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =  // random table of STRING data with n_rows rows; only column 0 is used below
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  // The input is built once, before the timed loop; every iteration reuses the same column.
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);  // scoped timer; args presumably (flush L2, stream) -- confirm
+    switch (nt) {  // only `input` is passed, so remaining parameters use defaults; result is discarded
+      case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
+      case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
+    }
+  }
+  // Throughput is derived from input.chars_size() (presumably bytes of character data) per iteration.
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)  // sets the argument ranges for b
+{
+  int const min_rows   = 1 << 12;  // 4,096
+  int const max_rows   = 1 << 24;  // 16,777,216
+  int const row_mult   = 8;        // presumably the step factor between row counts -- confirm in helper
+  int const min_rowlen = 5;
+  int const max_rowlen = 40;
+  int const len_mult   = 2;  // presumably the step factor between row lengths -- confirm in helper
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);  // not defined in this file
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name)                             \
+  BENCHMARK_DEFINE_F(TextNGrams, name)                            \
+  (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
+  BENCHMARK_REGISTER_F(TextNGrams, name)                          \
+    ->Apply(generate_bench_args)                                  \
+    ->UseManualTime()                                             \
+    ->Unit(benchmark::kMillisecond);
+// NVTEXT_BENCHMARK_DEFINE(x) defines and registers TextNGrams/x; x must name an ngrams_type enumerator.
+NVTEXT_BENCHMARK_DEFINE(tokens)      // runs BM_ngrams with ngrams_type::tokens
+NVTEXT_BENCHMARK_DEFINE(characters)  // runs BM_ngrams with ngrams_type::characters
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
@@ -50,7 +50,7 @@ struct ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::string_view const d_separator;
-  int32_t const* d_offsets{};
+  int32_t* d_offsets{};
   char* d_chars{};
 
   /**
@@ -62,7 +62,7 @@ struct ngram_generator_fn {
    * @param idx Index of the kernel thread.
    * @return Number of bytes required for the string for this thread.
    */
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
     char* out_ptr         = d_chars ? d_chars + d_offsets[idx] : nullptr;
     cudf::size_type bytes = 0;
@@ -74,7 +74,7 @@ struct ngram_generator_fn {
       bytes += d_separator.size_bytes();
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -109,11 +109,11 @@ std::unique_ptr<cudf::column> generate_ngrams(
                              if (d_strings.is_null(idx)) return false;
                              return !d_strings.element<cudf::string_view>(idx).empty();
                            },
-                           stream,
-                           mr)
+                           stream)
                            ->release();
     strings_count = table_offsets.front()->size() - 1;
-    return std::move(table_offsets.front());
+    auto result   = std::move(table_offsets.front());
+    return result;
   }();  // this allows freeing the temporary table_offsets
 
   CUDF_EXPECTS(strings_count >= ngrams, "Insufficient number of strings to generate ngrams");
@@ -131,30 +131,13 @@ std::unique_ptr<cudf::column> generate_ngrams(
   // compute the number of strings of ngrams
   auto const ngrams_count = strings_count - ngrams + 1;
 
-  // build output offsets by computing the output bytes for each generated ngram
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, ngram_generator_fn{d_strings, ngrams, d_separator});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + ngrams_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column
-  // generate the ngrams from the input strings and copy them into the chars data buffer
-  cudf::size_type const total_bytes = thrust::device_pointer_cast(d_offsets)[ngrams_count];
-  auto chars_column =
-    cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, stream, mr);
-  char* const d_chars = chars_column->mutable_view().data<char>();
-
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     ngrams_count,
-                     ngram_generator_fn{d_strings, ngrams, d_separator, d_offsets, d_chars});
-  chars_column->set_null_count(0);
+  auto children = cudf::strings::detail::make_strings_children(
+    ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, 0, stream, mr);
 
   // make the output strings column from the offsets and chars column
   return cudf::make_strings_column(ngrams_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    0,
                                    rmm::device_buffer{0, stream, mr},
                                    stream,