From 403c83fe7346bdeb4ffb1b15240bdfbda3c125d9 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 15 May 2023 12:59:59 -0400
Subject: [PATCH] Fix subword_tokenize error when input contains no tokens (#13320)

Fixes a bug where an exception is thrown when the entire input column
contains no tokens, for example when every row holds only whitespace.
This special case now returns token-ids and an attention mask that are
all zeros, each of length `input.size() * max_sequence_length`.

Closes #13300

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/13320
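For illustration only (not part of this patch), a minimal sketch of how the
fixed path is exercised; the wrapper function name and vocabulary file path
are placeholders, and the argument order follows the tests added in this PR:

    #include <nvtext/subword_tokenize.hpp>
    #include <cudf/strings/strings_column_view.hpp>

    // Tokenize rows that may contain no tokens at all (e.g. only whitespace).
    // After this fix the call returns all-zero tensors instead of throwing.
    nvtext::tokenizer_result tokenize_possibly_empty(cudf::strings_column_view const& input)
    {
      auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt");  // placeholder path
      uint32_t const max_sequence_length = 16;
      uint32_t const stride              = 16;
      // the returned tokenizer_result has nrows_tensor == input.size(); its
      // tensor_token_ids and tensor_attention_mask each hold
      // input.size() * max_sequence_length zeros when no row produces a token
      return nvtext::subword_tokenize(
        input, *vocab, max_sequence_length, stride, true /*do_lower*/, true /*do_truncate*/, 2 /*max_rows_tensor*/);
    }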
---
 cpp/src/text/subword/subword_tokenize.cu    | 35 +++++++++-
 cpp/src/text/subword/wordpiece_tokenizer.cu | 14 ++--
 cpp/tests/text/subword_tests.cpp            | 71 +++++++++++++++++++++
 3 files changed, 113 insertions(+), 7 deletions(-)

diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 844f2a625e0..e34aa4054da 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  * limitations under the License.
  */

+#include
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -31,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include

 namespace nvtext {
@@ -125,6 +129,28 @@ __global__ void kernel_compute_tensor_metadata(
   }
 }

+// this happens if there are no tokens in the input
+tokenizer_result build_empty_result(cudf::size_type size,
+                                    uint32_t max_sequence_length,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
+  auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+  auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+
+  auto metadata = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
+  thrust::tabulate(rmm::exec_policy(stream),
+                   metadata->mutable_view().begin<uint32_t>(),
+                   metadata->mutable_view().end<uint32_t>(),
+                   [] __device__(auto idx) { return ((idx % 3) == 0) ? idx / 3 : 0; });
+  metadata->set_null_count(0);
+
+  return tokenizer_result{
+    0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)};
+}
+
 }  // namespace

 tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
@@ -143,12 +169,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                  static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
               "max_sequence_length x max_rows_tensor is too large for cudf output column size");
   auto const strings_count = strings.size();
-  if (strings_count == 0 || strings.chars_size() == 0)
+  if (strings_count == strings.null_count()) {  // empty or all-null returns empty
     return tokenizer_result{0,
                             max_sequence_length,
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
+  }

   auto const offsets   = strings.offsets();
   auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -187,6 +214,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                  thrust::plus<uint32_t>());
   // last element is the total number of output rows
   uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
+  // if there are no tokens at all, build a specific empty result
+  if (nrows_tensor_token_ids == 0) {
+    return build_empty_result(strings_count, max_sequence_length, stream, mr);
+  }

   // compute global_row to tensor, and global_row to within_tensor_row correspondence
   rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream);
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index 601072d583c..028cf5ee6d6 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   // We need to change the end_word_indices pointer after the selection is complete
   device_end_word_indices = device_start_word_indices + num_words;

-  cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
-  detail::
-    kernel_wordpiece_tokenizer<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+  if (num_words > 0) {
+    cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
+    detail::kernel_wordpiece_tokenizer<<<grid.num_blocks,
+                                         grid.num_threads_per_block,
+                                         0,
+                                         stream.value()>>>(
       device_code_points,
       vocab_table.table->view().data<uint64_t>(),
       vocab_table.bin_coefficients->view().data<uint64_t>(),
@@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
       num_words,
       device_token_ids.data(),
       device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
+  }

   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
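As an aside (illustrative only, not part of the patch): the `num_words > 0` guard added
above matters because `cudf::detail::grid_1d` is built from `num_words` and a zero-sized
grid is not a valid CUDA launch configuration, so the launch must be skipped entirely
when the input produced no words. A standalone sketch of the failure mode the guard avoids:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void noop_kernel() {}

    int main()
    {
      noop_kernel<<<0, 256>>>();  // zero blocks: invalid launch configuration
      std::printf("%s\n", cudaGetErrorString(cudaGetLastError()));
      return 0;
    }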
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index d50c7e73543..0cf223f4a99 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -144,6 +144,49 @@ TEST(TextSubwordTest, TokenizeMultiRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }

+TEST(TextSubwordTest, TokenizeWithEmptyRow)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings{
+    "This is a test.", "", "This is a test. This is a tést."};
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq  = 8;
+  uint32_t const stride   = 6;
+  bool const lower        = true;
+  bool const truncate     = false;
+
+  auto result =
+    nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
+
+  EXPECT_EQ(uint32_t{4}, result.nrows_tensor);
+
+  // clang-format off
+  auto expected_tokens = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {2023, 2003, 1037, 3231, 1012,    0,    0,    0,
+        0,    0,    0,    0,    0,    0,    0,    0,
+     2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037,  // this one
+     2003, 1037, 3231, 1012,    0,    0,    0,    0});  // continues here
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  // clang-format off
+  auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {1, 1, 1, 1, 1, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 0, 0, 0, 0});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  // clang-format off
+  auto expected_metadata = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {0,0,4, 1,0,0, 2,0,6, 2,1,3});  // note that the 3rd element has 2 tensors
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
 {
   cudf::test::strings_column_wrapper strings({"This is a test."});
@@ -236,6 +279,34 @@ TEST(TextSubwordTest, AllNullStrings)
   EXPECT_EQ(0, result.tensor_metadata->size());
 }

+TEST(TextSubwordTest, NoTokens)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq  = 16;
+  uint32_t const stride   = 16;
+  bool const lower        = true;
+  bool const truncate     = true;
+
+  auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
+
+  std::vector<uint32_t> zeros(max_seq * input.size(), 0);
+
+  EXPECT_EQ(static_cast<uint32_t>(input.size()), result.nrows_tensor);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(zeros.begin(), zeros.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
+  auto expected_metadata =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({0, 0, 0, 1, 0, 0, 2, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeFromVocabStruct)
 {
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
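For reference (illustrative only, not part of the patch): each tensor_metadata row is a
{input-row-index, first-token-position, last-token-position} triplet, so the all-zero
result built by build_empty_result carries one {row, 0, 0} triplet per input row, which
is what the NoTokens test above checks. A host-side sketch of that layout, assuming the
helper name below:

    #include <cstdint>
    #include <vector>

    // Mirrors the tabulate call in build_empty_result: every third entry holds
    // the input row index; the token span stays [0, 0] since no tokens exist.
    std::vector<uint32_t> empty_tensor_metadata(uint32_t num_rows)
    {
      std::vector<uint32_t> metadata(num_rows * 3, 0);
      for (uint32_t row = 0; row < num_rows; ++row) {
        metadata[row * 3] = row;
      }
      return metadata;  // num_rows == 3 -> {0,0,0, 1,0,0, 2,0,0}
    }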