Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Remove metadata singleton from nvtext normalizer #10090

Merged
merged 9 commits into from
Jan 26, 2022
4 changes: 3 additions & 1 deletion cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,6 +43,8 @@ struct hashed_vocabulary {
std::unique_ptr<cudf::column> table; // uint64
std::unique_ptr<cudf::column> bin_coefficients; // uint64
std::unique_ptr<cudf::column> bin_offsets; // uint16
std::unique_ptr<cudf::column> cp_metadata; // uint32
std::unique_ptr<cudf::column> aux_cp_table; // uint64
};

/**
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>
#include <text/utilities/tokenize_ops.cuh>

#include <nvtext/normalize.hpp>
Expand Down Expand Up @@ -199,8 +200,10 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

// create the normalizer and call it
data_normalizer normalizer(stream, do_lower_case);
auto result = [&strings, &normalizer, stream] {
auto result = [&] {
auto const cp_metadata = get_codepoint_metadata(stream);
auto const aux_table = get_aux_codepoint_data(stream);
data_normalizer normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -262,11 +262,11 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,

} // namespace

// Construct the normalizer from pre-built device tables.
//
// The code point metadata and auxiliary tables are now owned by the caller
// (e.g. the hashed_vocabulary) instead of a process-wide singleton, so this
// constructor only stores the non-owning pointers and the lower-case flag.
// The stale singleton lookups (get_codepoint_metadata/get_aux_codepoint_data
// on a stream parameter that no longer exists) have been removed.
data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
                                 aux_codepoint_data_type const* aux_table,
                                 bool do_lower_case)
  : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case}
{
}

uvector_pair data_normalizer::normalize(char const* d_strings,
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/text/subword/detail/data_normalizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,14 +48,17 @@ namespace detail {
class data_normalizer {
public:
/**
* @brief Transfer to the GPU the metadata needed to normalize characters.
* @brief Create instance of the normalizer.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param cp_metadata The code point metadata table to use for normalization.
* @param aux_table The auxiliary code point table.
* @param do_lower_case If true, the normalizer will convert uppercase characters in the
* input stream to lower case and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
*/
data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true);
data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case = true);

/**
* @brief Normalize a vector of strings.
Expand Down
15 changes: 5 additions & 10 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <text/subword/detail/cp_data.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <stdint.h>

Expand Down Expand Up @@ -57,22 +58,16 @@ struct update_strings_lengths_fn {
/**
* @brief Retrieve the code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream);
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream);

/**
* @brief Retrieve the aux code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
* @brief Retrieve the auxiliary code point metadata table.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream);
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream);

} // namespace detail
} // namespace nvtext
4 changes: 1 addition & 3 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,7 +61,6 @@ class wordpiece_tokenizer {
* @param do_lower_case If true, the tokenizer will convert uppercase characters in the
* input stream to lowercase and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param max_word_length The length of the longest word that will be tokenized. Words
* longer than this will simply be replaced by the unknown token
* specified in the `vocab_file`.
Expand All @@ -72,7 +71,6 @@ class wordpiece_tokenizer {
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length = 200);

/**
Expand Down
148 changes: 62 additions & 86 deletions cpp/src/text/subword/load_hash_file.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
*/

#include <text/subword/detail/codepoint_metadata.ah>
#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>

#include <nvtext/detail/load_hash_file.hpp>
Expand All @@ -37,87 +36,32 @@
namespace nvtext {
namespace detail {

namespace {
struct get_codepoint_metadata_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<codepoint_metadata_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
};
};

struct get_aux_codepoint_data_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<aux_codepoint_data_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}
};
} // namespace

/**
* @brief Retrieve the code point metadata table.
*
* Build the code point metadata table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream)
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<codepoint_metadata_type>>
g_codepoint_metadata;

return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data();
auto table_vector = rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

/**
Expand All @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre
* Build the aux code point data table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream)
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<aux_codepoint_data_type>>
g_aux_codepoint_data;

return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data();
auto table_vector = rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return table_vector;
}

namespace {
Expand Down Expand Up @@ -293,10 +262,17 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
cudaMemcpyHostToDevice,
stream.value()));

// this just initializes some constant tables into device memory
// to help speed up the runtime
detail::get_codepoint_metadata(stream);
detail::get_aux_codepoint_data(stream);
auto cp_metadata = detail::get_codepoint_metadata(stream);
result.cp_metadata =
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT32},
static_cast<cudf::size_type>(cp_metadata.size()),
cp_metadata.release());

auto aux_cp_table = detail::get_aux_codepoint_data(stream);
result.aux_cp_table =
std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT64},
static_cast<cudf::size_type>(aux_cp_table.size()),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
aux_cp_table.release());

return std::make_unique<hashed_vocabulary>(std::move(result));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream);
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length)
: vocab_table(vocab_table),
normalizer(stream, do_lower_case),
normalizer(vocab_table.cp_metadata->view().data<codepoint_metadata_type>(),
vocab_table.aux_cp_table->view().data<aux_codepoint_data_type>(),
do_lower_case),
max_sequence_length{max_sequence_length},
stride(stride),
do_truncate(do_truncate),
Expand Down