From 21d2ce6a96e0077e2fe3fde56b33742dcee63479 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 11 Feb 2021 17:36:34 -0500
Subject: [PATCH] Add support for special tokens in nvtext::subword_tokenizer (#7254)

Closes #6937

This PR adds support for the following 7 special tokens in `subword_tokenize`: [BOS], [EOS], [UNK], [SEP], [PAD], [CLS], and [MASK].
Descriptions of these tokens can be found in the links and text in #6937.
These tokens can be placed anywhere in the text and may be upper- or lower-case. They are recognized regardless of whether they exist in the given vocabulary hash table.

Example using vocab-hash.txt and the code snippet from #6937:
```
>>> text = '[CLS]I ate dinner.[SEP][BOS]It was yummy.[EOS]'
>>> cudf_ser = cudf.Series([text])
>>> tokens, attention_masks, metadata = cudf_ser.str.subword_tokenize('vocab-hash.txt', do_lower=True, do_truncate=False)
>>> print(id2vocab[tokens[0:17].get()])
['[CLS]' 'i' 'ate' 'dinner' '.' '[SEP]' '[BOS]' 'it' 'was' 'yummy' '.' '[EOS]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]']
```

A new gtest was added for this feature.
This requires no API change to the C++ or Python interfaces.

Authors:
  - David (@davidwendt)

Approvers:
  - Devavret Makkar (@devavret)
  - Karthikeyan (@karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/7254
---
 cpp/src/text/subword/load_hash_file.cu      |  73 +++++++++--
 cpp/src/text/subword/wordpiece_tokenizer.cu | 132 +++++++++++++++++++-
 cpp/tests/text/subword_tests.cpp            |  81 +++++++++++-
 3 files changed, 270 insertions(+), 16 deletions(-)

diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index bb0af41e602..1e81e603ca8 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -114,6 +114,52 @@ const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stre
   });
 }
 
+namespace {
+/**
+ * @brief Convert string to uint32.
+ *
+ * This just wraps std::stoi but provides a nice error message
+ * in case the hash file format is incorrect.
+ */
+uint32_t str_to_uint32(std::string const& str, uint64_t line_no)
+{
+  try {
+    return std::stoi(str);  // there is no std::stoui
+  } catch (std::exception exc) {
+    std::string message("Line ");
+    message += std::to_string(line_no) + ": ";
+    message += "cannot convert integer from '";
+    message += str;
+    message += "': ";
+    message += exc.what();
+    std::cerr << message << std::endl;
+    throw exc;
+  }
+}
+
+/**
+ * @brief Convert string to uint64.
+ *
+ * This just wraps std::stoul but provides a nice error message
+ * in case the hash file format is incorrect.
+ */
+uint64_t str_to_uint64(std::string const& str, uint64_t line_no)
+{
+  try {
+    return std::stoul(str);
+  } catch (std::exception exc) {
+    std::string message("Line ");
+    message += std::to_string(line_no) + ": ";
+    message += "cannot convert integer from '";
+    message += str;
+    message += "': ";
+    message += exc.what();
+    std::cerr << message << std::endl;
+    throw exc;
+  }
+}
+}  // namespace
+
 /**
  * @brief Loads a text file representing the hashed vocabulary into hashed_vocabulary struct.
  *
@@ -145,15 +191,16 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu
   std::ifstream hash_file(filename_hashed_vocabulary);
   CUDF_EXPECTS(hash_file.good(), "Could not open " + filename_hashed_vocabulary);
 
+  uint64_t line_no = 1;
   std::string line;
   std::getline(hash_file, line);
-  result.outer_hash_a = std::stoi(line);
+  result.outer_hash_a = str_to_uint32(line, line_no++);
 
   std::getline(hash_file, line);
-  result.outer_hash_b = std::stoi(line);
+  result.outer_hash_b = str_to_uint32(line, line_no++);
 
   std::getline(hash_file, line);
-  result.num_bins = std::stoi(line);
+  result.num_bins = str_to_uint32(line, line_no++);
 
   std::vector<uint64_t> bin_coefficients(result.num_bins);
   std::vector bin_offsets(result.num_bins);
@@ -161,32 +208,34 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu
 
   for (int i = 0; i < result.num_bins; ++i) {
     std::getline(hash_file, line);
     size_t loc_of_space = line.find(" ");
+    CUDF_EXPECTS(loc_of_space != line.npos, "invalid hash file format");
     std::string first_num  = line.substr(0, loc_of_space);
     std::string second_num = line.substr(loc_of_space + 1, line.length());
-    bin_coefficients[i] = std::stoull(first_num);
-    bin_offsets[i]      = std::stoull(second_num);
+    bin_coefficients[i] = str_to_uint64(first_num, line_no);
+    bin_offsets[i]      = str_to_uint32(second_num, line_no);
+    ++line_no;
   }
 
   std::getline(hash_file, line);
-  uint64_t hash_table_length = std::stoull(line);
+  uint64_t hash_table_length = str_to_uint64(line, line_no++);
   std::vector<uint64_t> table(hash_table_length);
 
-  std::generate(table.begin(), table.end(), [&hash_file]() {
+  std::generate(table.begin(), table.end(), [&hash_file, &line_no]() {
     std::string line;
     std::getline(hash_file, line);
-    return std::stoull(line);
+    return str_to_uint64(line, line_no++);
  });
 
   std::getline(hash_file, line);
-  result.unknown_token_id = std::stoi(line);
+  result.unknown_token_id = str_to_uint32(line, line_no++);
 
   std::getline(hash_file, line);
-  result.first_token_id = std::stoi(line);
+  result.first_token_id = str_to_uint32(line, line_no++);
 
   std::getline(hash_file, line);
-  result.separator_token_id = std::stoi(line);
+  result.separator_token_id = str_to_uint32(line, line_no++);
 
   // Transfer hash table to columns
   result.table = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index b13c22670ee..4d048b3cf99 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -101,7 +102,7 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin
 /**
  * @brief Resolves the string boundaries for the start and end words.
  *
- * This kernel should be called after `mark_word_start_and_ends` with at
+ * This kernel should be called after `init_data_and_mark_word_start_and_ends` with at
  * least `num_strings` total threads.
  *
  * The start and end indices are updated to honor the string boundaries
@@ -141,6 +142,125 @@ __global__ void mark_string_start_and_ends(uint32_t const* code_points,
   }
 }
 
+/**
+ * @brief Currently supported special tokens.
+ *
+ * Code logic expects these to be 3-4 upper-case characters along
+ * with a single trailing space.
+ */
+__constant__ char special_tokens[35]{"BOS EOS UNK SEP PAD CLS MASK "};
+constexpr cudf::size_type MIN_ST_WIDTH = 4;  // Min token size in special_tokens
+constexpr cudf::size_type MAX_ST_WIDTH = 5;  // Max token size in special_tokens
+
+struct mark_special_tokens {
+  /**
+   * @brief Check the given code-point array against the list of known
+   * special tokens.
+   */
+  __device__ bool is_special_token(uint32_t const* token, cudf::size_type size) const
+  {
+    if (size < MIN_ST_WIDTH || size > MAX_ST_WIDTH) return false;
+    char str_token[MAX_ST_WIDTH];
+    // convert code-points to chars
+    thrust::transform(thrust::seq, token, token + size, str_token, [](uint32_t cp) {
+      // also upper-case them to match against the special_tokens array
+      return static_cast<char>(cp >= 'a' ? cp - 'a' + 'A' : cp);
+    });
+    // search the special tokens array for the str_token
+    cudf::string_view tokens(special_tokens, sizeof(special_tokens));
+    return tokens.find(str_token, size) >= 0;
+  }
+
+  /**
+   * @brief Check code-points for special tokens and adjust indices.
+   *
+   * Tokens will appear in the `code_points` array as:
+   * `_[_ttt_]_` where `_` are single space characters and
+   * `ttt` is the variable-length token name.
+   *
+   * The logic below uses the following variables to represent position
+   * values in the `code_points` array after locating a special token:
+   * ```
+   * _ [ _ t t t _ ] _
+   *   ^   ^   ^   ^
+   *   si  sp  ep  ei
+   * ```
+   * where `si` is `start_index`
+   *       `sp` is `start_pos`
+   *       `ep` is `end_pos`
+   *       `ei` is `end_index`
+   *
+   * When a special token is found, the `code_points` are adjusted
+   * to remove the spaces and capitalize the name.
+   * ```
+   * _ [ _ t t t _ ] _  is updated to
+   * _ [ T T T ] _ ] _
+   * ```
+   * This is required for the downstream word-piece tokenizer to
+   * match it to the vocabulary hash table.
+   *
+   * The `start_word_indices` and `end_word_indices` are updated to
+   * identify the token and to ignore the extra trailing `]` character.
+   */
+  __device__ void operator()(size_t idx) const
+  {
+    uint32_t const start_index = start_word_indices[idx];
+    if ((start_index == std::numeric_limits<uint32_t>::max()) ||
+        ((start_index + MIN_ST_WIDTH + 2) > num_code_points))
+      return;
+    if (code_points[start_index] != '[') return;
+
+    // check for matching end bracket
+    uint32_t const start_pos = start_index + 2;  // after the space delimiter
+    // search for next start-word and then check it is a ']'
+    uint32_t const end_index = [&] {
+      auto const begin = start_word_indices + start_pos;
+      auto const width =
+        std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
+      auto const end = begin + width;
+      // checking the next start-word is more reliable than arbitrarily searching for ']'
+      // in case the text is split across string rows
+      auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) {
+        return swi != std::numeric_limits<uint32_t>::max();
+      });
+      return iter == end ? start_index : static_cast<uint32_t>(iter - start_word_indices);
+    }();
+    if (code_points[end_index] != ']') return;
+
+    // check for special token
+    auto const size = static_cast<cudf::size_type>(end_index - start_pos);
+    if (!is_special_token(code_points + start_pos, size)) return;
+
+    // special token found
+    // adjust code-points
+    auto const end_pos = end_index - 2;
+    // change _[_ttt_]_ to _[TTT]_
+    for (auto left_idx = start_pos - 1; left_idx <= end_pos; ++left_idx) {
+      auto const cp = code_points[left_idx + 1];
+      code_points[left_idx] = cp >= 'a' ? cp - 'a' + 'A' : cp;
+    }
+    code_points[end_pos] = ']';
+
+    // erase the intermediate indices
+    thrust::fill(thrust::seq,
+                 start_word_indices + start_index + 1,  // keep the first one
+                 start_word_indices + end_index + 1,
+                 std::numeric_limits<uint32_t>::max());
+    thrust::fill(thrust::seq,
+                 end_word_indices + start_index,
+                 end_word_indices + end_index + 1,
+                 std::numeric_limits<uint32_t>::max());
+
+    // reset the new end-word index
+    end_word_indices[end_pos] = end_pos + 1;
+  }
+
+  uint32_t* const code_points;
+  uint32_t* const start_word_indices;
+  uint32_t* const end_word_indices;
+  size_t const num_code_points;
+};
+
 /**
  * @brief Converts words into token ids.
  *
@@ -345,6 +465,14 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                    num_strings);
   CHECK_CUDA(stream.value());
 
+  // check for special tokens and adjust indices
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    num_code_points,
+    mark_special_tokens{
+      device_code_points, device_start_word_indices, device_end_word_indices, num_code_points});
+
   // Now start_word_indices has the word starts scattered throughout the array. We need to select
   // all values not equal to the max uint32_t and place them at the start of the array. We leverage
   // the fact that the start_word_indices and the end_word indices are contiguous to only launch one
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index c87f6e9af8a..3cab612fccd 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ struct TextSubwordTest : public cudf::test::BaseFixture {
 // Create a fake hashed vocab text file for the tests in this source file.
 // The vocab only includes the following words:
 // 'this', 'is', 'a', 'test', 'tést'
-// The period '.' character is also supported.
+// The period '.' character also has a token id.
 void create_hashed_vocab(std::string const& hash_file)
 {
   std::vector<std::pair<int, int>> coefficients(23, {65559, 0});
@@ -262,3 +262,80 @@ TEST(TextSubwordTest, LoadVocabFileErrors)
   std::string hash_file = temp_env->get_temp_filepath("nothing.txt");
   EXPECT_THROW(nvtext::load_vocabulary_file(hash_file), cudf::logic_error);
 }
+
+// This includes the words above and 7 special tokens:
+// [BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]
+// The data here was generated by the utility:
+// cudf.utils.hash_vocab_utils.hash_vocab()
+void create_special_tokens_hashed_vocab(std::string const& hash_file)
+{
+  std::ofstream outfile(hash_file, std::ofstream::out);
+  outfile << "26899\n27424\n3\n";
+  outfile << "1416131940466419714 0\n";
+  outfile << "313740585393291779 2\n";
+  outfile << "17006415773850330120 5\n";
+  outfile << "13\n";
+  outfile << "5903884228619468800\n";
+  outfile << "6205475701751152650\n";
+  outfile << "16285378285009240068\n";
+  outfile << "5162333542489915397\n";
+  outfile << "6064762127302393859\n";
+  outfile << "6173800107753209857\n";
+  outfile << "5322083323972878342\n";
+  outfile << "6242701866907861003\n";
+  outfile << "451412623368\n";
+  outfile << "3014668\n";
+  outfile << "5214737420442796034\n";
+  outfile << "6206321707968233479\n";
+  outfile << "6357001\n";
+  outfile << "1\n2\n3\n\n";
+}
+
+TEST(TextSubwordTest, TokenizeWithSpecialTokens)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_special_tokens_hashed_vocab(hash_file);
+
+  // clang-format off
+  std::vector<const char*> h_strings{
+    "[BOS]This is a tést.[eos]",
+    "[CLS]A test[SEP]this is.",
+    "[PAD] [A][MASK]",
+    "test this [CL",
+    "S] is a ."};
+  // clang-format on
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
+  auto vocab  = nvtext::load_vocabulary_file(hash_file);
+  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
+                                         vocab,
+                                         8,
+                                         6,
+                                         true,  // do_lower_case
+                                         true,  // do_truncate
+                                         MAX_ROWS_TENSOR);
+
+  EXPECT_EQ(static_cast<uint32_t>(h_strings.size()), result.nrows_tensor);
+  // clang-format off
+  cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
+    { 5,  7,  8,  9, 10, 12,  6,  0,
+      2,  9, 10,  3,  7,  8, 12,  0,
+      0,  1,  9,  1,  4,  0,  0,  0,
+     10,  7,  1,  1,  0,  0,  0,  0,
+      1,  1,  8,  9, 12,  0,  0,  0});
+  cudf::test::fixed_width_column_wrapper<uint32_t> expected_attn(
+    {1, 1, 1, 1, 1, 1, 1, 0,
+     1, 1, 1, 1, 1, 1, 1, 0,
+     1, 1, 1, 1, 1, 0, 0, 0,
+     1, 1, 1, 1, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 0, 0, 0});
+  cudf::test::fixed_width_column_wrapper<uint32_t> expected_metadata(
+    {0, 0, 6,
+     1, 0, 6,
+     2, 0, 4,
+     3, 0, 3,
+     4, 0, 4});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
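
For readers tracing what `mark_special_tokens` does to the code-point buffer, below is a minimal host-side sketch of the same in-place rewrite. It is illustrative only and not part of the patch: `normalize_special_token` is a hypothetical helper name, a plain `std::vector<uint32_t>` stands in for the device arrays, and the word-index bookkeeping (blanking the residue entries and resetting the end-word index) is omitted.

```
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical host-side mirror of the device functor's rewrite step:
// given code points spelling " [ ttt ] " (spaces already inserted around the
// brackets by the earlier normalization), rewrite them in place so the token
// reads "[TTT]" and can be matched against the vocabulary hash table.
void normalize_special_token(std::vector<uint32_t>& code_points,
                             std::size_t start_index,  // si: position of '['
                             std::size_t end_index)    // ei: position of ']'
{
  auto const start_pos = start_index + 2;  // sp: first letter after the space delimiter
  auto const end_pos   = end_index - 2;    // ep: where the new ']' will be written
  // shift the token name left by one and upper-case it (same loop as the device code)
  for (auto left_idx = start_pos - 1; left_idx <= end_pos; ++left_idx) {
    auto const cp         = code_points[left_idx + 1];
    code_points[left_idx] = cp >= 'a' ? cp - 'a' + 'A' : cp;
  }
  code_points[end_pos] = ']';  // close the bracket immediately after the name
}

int main()
{
  std::string const text = " [ cls ] i ate dinner";  // already space-padded around brackets
  std::vector<uint32_t> cps(text.begin(), text.end());
  normalize_special_token(cps, 1, 7);  // '[' is at index 1, ']' at index 7
  for (auto cp : cps) std::cout << static_cast<char>(cp);
  std::cout << std::endl;  // prints " [CLS] ] i ate dinner"
}
```

Compiled as ordinary C++, the sketch prints ` [CLS] ] i ate dinner`; the leftover `]` is the same residue the device code subsequently hides by clearing the corresponding start-word and end-word indices.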