From b0e5aef46a0fc23a67bb2980d5e7707870bb15c4 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 24 Feb 2021 08:26:36 -0500
Subject: [PATCH] Change nvtext::load_vocabulary_file to return a unique ptr
 (#7424)

Reference #5868

This PR changes the `nvtext::load_vocabulary_file` to return a unique-pointer to make it easier to manage in Python/Cython class object. The original signature returned a flat structure that contained unique-pointers which would make it difficult to copy and manage.

The corresponding gtests and gbenchmarks were updated for this API change.

Authors:
  - David (@davidwendt)

Approvers:
  - Conor Hoekstra (@codereport)
  - Karthikeyan (@karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/7424
---
 cpp/benchmarks/text/subword_benchmark.cpp    |  4 ++--
 cpp/include/nvtext/detail/load_hash_file.hpp |  9 +++++----
 cpp/include/nvtext/subword_tokenize.hpp      |  4 ++--
 cpp/src/text/subword/load_hash_file.cu       | 13 +++++++------
 cpp/src/text/subword/subword_tokenize.cu     |  6 +++---
 cpp/tests/text/subword_tests.cpp             |  4 ++--
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword_benchmark.cpp
index d22f696c18d..3670fa7c9a7 100644
--- a/cpp/benchmarks/text/subword_benchmark.cpp
+++ b/cpp/benchmarks/text/subword_benchmark.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -65,7 +65,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
   auto vocab = nvtext::load_vocabulary_file(hash_file);
   for (auto _ : state) {
     auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                           vocab,
+                                           *vocab,
                                            max_sequence_length,
                                            stride,
                                            do_lower,
diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index a75ae3d6181..b105c5c280e 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,9 +40,10 @@ namespace detail {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 8f6cf7d7c14..8cc000ff095 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,7 +59,7 @@ struct hashed_vocabulary {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-hashed_vocabulary load_vocabulary_file(
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 2e98b88ccb4..f3f96933f19 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -183,9 +183,10 @@ uint64_t str_to_uint64(std::string const& str, uint64_t line_no)
  * @param filename_hashed_vocabulary Path to text file containing hashed vocabulary
  * @return object containing hash table elements for the wordpiece tokenizer
  */
-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   hashed_vocabulary result;
   std::ifstream hash_file(filename_hashed_vocabulary);
@@ -276,13 +277,13 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu
   detail::get_codepoint_metadata(stream);
   detail::get_aux_codepoint_data(stream);
 
-  return result;
+  return std::make_unique<hashed_vocabulary>(std::move(result));
 }
 
 }  // namespace detail
 
-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::mr::device_memory_resource* mr)
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::load_vocabulary_file(filename_hashed_vocabulary, rmm::cuda_stream_default, mr);
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 46a84575dc1..1639af0dbde 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -256,10 +256,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t max_rows_tensor,
                                   rmm::mr::device_memory_resource* mr)
 {
-  hashed_vocabulary vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
+  auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
   CUDF_FUNC_RANGE();
   return detail::subword_tokenize(strings,
-                                  vocab_table,
+                                  *vocab_table,
                                   max_sequence_length,
                                   stride,
                                   do_lower_case,
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 3cab612fccd..7324cf3ec6a 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -237,7 +237,7 @@ TEST(TextSubwordTest, TokenizeFromVocabStruct)
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   auto vocab  = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         vocab,
+                                         *vocab,
                                          8,
                                          6,
                                          true,  // do_lower_case
@@ -307,7 +307,7 @@ TEST(TextSubwordTest, TokenizeWithSpecialTokens)
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   auto vocab  = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         vocab,
+                                         *vocab,
                                          8,
                                          6,
                                          true,  // do_lower_case