Skip to content

Commit

Permalink
Remove metadata singleton from nvtext normalizer (#10090)
Browse files Browse the repository at this point in the history
Closes #10043 

A metadata singleton was allocated from the current rmm memory resource inside the nvtext normalizer functions. If the memory resource is later changed, the metadata pointer may be invalidated. This PR removes the singleton pattern. 

The normalizer is used by the subword-tokenizer, which is passed a vocabulary structure that is built only once and is maintained by the caller. The metadata has been added to this structure so its lifetime can share the same scope.

The normalizer can also be called directly through the `nvtext::normalize_characters` API. Here the metadata table (about 1MB in size) is now created on each call. This showed a significant performance impact only on benchmarks testing a small number (<50K) of shorter strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #10090
  • Loading branch information
davidwendt authored Jan 26, 2022
1 parent eacaea0 commit 85109e6
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 126 deletions.
7 changes: 3 additions & 4 deletions cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,9 +19,6 @@
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cstdint>
#include <cstring>

namespace nvtext {

/**
Expand All @@ -43,6 +40,8 @@ struct hashed_vocabulary {
std::unique_ptr<cudf::column> table; // uint64
std::unique_ptr<cudf::column> bin_coefficients; // uint64
std::unique_ptr<cudf::column> bin_offsets; // uint16
std::unique_ptr<cudf::column> cp_metadata; // uint32
std::unique_ptr<cudf::column> aux_cp_table; // uint64
};

/**
Expand Down
17 changes: 10 additions & 7 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>
#include <text/utilities/tokenize_ops.cuh>

#include <nvtext/normalize.hpp>
Expand Down Expand Up @@ -199,12 +200,14 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

// create the normalizer and call it
data_normalizer normalizer(stream, do_lower_case);
auto result = [&strings, &normalizer, stream] {
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
auto result = [&] {
auto const cp_metadata = get_codepoint_metadata(stream);
auto const aux_table = get_aux_codepoint_data(stream);
auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
return normalizer.normalize(d_chars, d_offsets, strings.size(), stream);
}();

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -261,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,

} // namespace

data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case)
: do_lower_case(do_lower_case)
data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case)
: d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case}
{
d_cp_metadata = detail::get_codepoint_metadata(stream);
d_aux_table = detail::get_aux_codepoint_data(stream);
}

uvector_pair data_normalizer::normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
rmm::cuda_stream_view stream)
rmm::cuda_stream_view stream) const
{
if (num_strings == 0)
return std::make_pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/text/subword/detail/data_normalizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,14 +48,17 @@ namespace detail {
class data_normalizer {
public:
/**
* @brief Transfer to the GPU the metadata needed to normalize characters.
* @brief Create instance of the normalizer.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param cp_metadata The code point metadata table to use for normalization.
* @param aux_table The auxiliary code point table.
* @param do_lower_case If true, the normalizer will convert uppercase characters in the
* input stream to lower case and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
*/
data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true);
data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case = true);

/**
* @brief Normalize a vector of strings.
Expand Down Expand Up @@ -84,7 +87,7 @@ class data_normalizer {
uvector_pair normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
rmm::cuda_stream_view stream);
rmm::cuda_stream_view stream) const;

private:
bool const do_lower_case;
Expand Down
15 changes: 5 additions & 10 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <text/subword/detail/cp_data.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>

Expand Down Expand Up @@ -57,22 +58,16 @@ struct update_strings_lengths_fn {
/**
* @brief Retrieve the code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream);
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream);

/**
* @brief Retrieve the aux code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
* @brief Retrieve the auxiliary code point metadata table.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream);
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream);

} // namespace detail
} // namespace nvtext
4 changes: 1 addition & 3 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,7 +61,6 @@ class wordpiece_tokenizer {
* @param do_lower_case If true, the tokenizer will convert uppercase characters in the
* input stream to lowercase and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param max_word_length The length of the longest word that will be tokenized. Words
* longer than this will simply be replaced by the unknown token
* specified in the `vocab_file`.
Expand All @@ -72,7 +71,6 @@ class wordpiece_tokenizer {
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length = 200);

/**
Expand Down
146 changes: 60 additions & 86 deletions cpp/src/text/subword/load_hash_file.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
*/

#include <text/subword/detail/codepoint_metadata.ah>
#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>

#include <nvtext/detail/load_hash_file.hpp>
Expand All @@ -37,87 +36,32 @@
namespace nvtext {
namespace detail {

namespace {
struct get_codepoint_metadata_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<codepoint_metadata_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
};
};

struct get_aux_codepoint_data_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<aux_codepoint_data_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}
};
} // namespace

/**
* @brief Retrieve the code point metadata table.
*
* Build the code point metadata table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream)
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<codepoint_metadata_type>>
g_codepoint_metadata;

return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data();
auto table_vector = rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

/**
Expand All @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre
* Build the aux code point data table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream)
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<aux_codepoint_data_type>>
g_aux_codepoint_data;

return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data();
auto table_vector = rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

namespace {
Expand Down Expand Up @@ -293,10 +262,15 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
cudaMemcpyHostToDevice,
stream.value()));

// this just initializes some constant tables into device memory
// to help speed up the runtime
detail::get_codepoint_metadata(stream);
detail::get_aux_codepoint_data(stream);
auto cp_metadata = detail::get_codepoint_metadata(stream);
auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
result.cp_metadata = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release());

auto aux_cp_table = detail::get_aux_codepoint_data(stream);
auto const aux_cp_table_size = static_cast<cudf::size_type>(aux_cp_table.size());
result.aux_cp_table = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release());

return std::make_unique<hashed_vocabulary>(std::move(result));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream);
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
Expand Down
Loading

0 comments on commit 85109e6

Please sign in to comment.