Fix subword_tokenize error when input contains no tokens (#13320)
Fixes a bug where an exception was thrown when there are no tokens in the entire input column, for example when every string contains only whitespace.
This special case now returns token-ids of all zeros along with an attention mask of all zeros, each sized `input.size() * max_sequence_length`.

Closes #13300
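
A minimal reproduction, condensed from the new `NoTokens` gtest added below (the vocabulary-file path is a placeholder, and `strings_column_wrapper` comes from the cudf test utilities):

```cpp
// Every row is whitespace-only, so wordpiece tokenization finds zero tokens.
// Before this fix the call below threw an exception; it now returns all-zero tensors.
auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt");  // placeholder path
cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
auto input = cudf::strings_column_view{strings};

auto result = nvtext::subword_tokenize(input,
                                       *vocab,
                                       16,    // max_sequence_length
                                       16,    // stride
                                       true,  // do_lower_case
                                       true,  // do_truncate
                                       2);    // max_rows_tensor
// result.tensor_token_ids and result.tensor_attention_mask each hold
// input.size() * max_sequence_length zeros.
```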

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13320
davidwendt authored May 15, 2023
1 parent d18fd4e commit 403c83f
Showing 3 changed files with 113 additions and 7 deletions.
35 changes: 33 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  * limitations under the License.
  */
 
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sequence.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -31,6 +34,7 @@
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/tabulate.h>
 #include <thrust/transform_scan.h>
 
 namespace nvtext {
@@ -125,6 +129,28 @@ __global__ void kernel_compute_tensor_metadata(
   }
 }
 
+// this happens if there are no tokens in the input
+tokenizer_result build_empty_result(cudf::size_type size,
+                                    uint32_t max_sequence_length,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
+  auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+  auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+
+  auto metadata = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
+  thrust::tabulate(rmm::exec_policy(stream),
+                   metadata->mutable_view().begin<uint32_t>(),
+                   metadata->mutable_view().end<uint32_t>(),
+                   [] __device__(auto idx) { return ((idx % 3) == 0) ? idx / 3 : 0; });
+  metadata->set_null_count(0);
+
+  return tokenizer_result{
+    0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)};
+}
+
 }  // namespace
 
 tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
@@ -143,12 +169,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
     static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
     "max_sequence_length x max_rows_tensor is too large for cudf output column size");
   auto const strings_count = strings.size();
-  if (strings_count == 0 || strings.chars_size() == 0)
+  if (strings_count == strings.null_count()) {  // empty or all-null returns empty
     return tokenizer_result{0,
                             max_sequence_length,
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
+  }
 
   auto const offsets   = strings.offsets();
   auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -187,6 +214,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                 thrust::plus<uint32_t>());
   // last element is the total number of output rows
   uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
+  // if there are no tokens at all, build a specific empty result
+  if (nrows_tensor_token_ids == 0) {
+    return build_empty_result(strings_count, max_sequence_length, stream, mr);
+  }
 
   // compute global_row to tensor, and global_row to within_tensor_row correspondence
   rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream);
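A note on the new `build_empty_result` helper above: `cudf::detail::sequence` with a zero initial value and zero step is simply a device-side way to materialize all-zero columns of `size * max_sequence_length` elements, and `thrust::tabulate` fills the `size * 3` metadata column with one `[row-index, first-token-position, last-token-position]` triple per input row, all `[i, 0, 0]` here. A host-side sketch of that metadata layout (illustrative only, not part of the change):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the tabulate lambda: for 3 rows this produces {0,0,0, 1,0,0, 2,0,0},
// matching the expected_metadata in the NoTokens test below.
std::vector<uint32_t> empty_metadata(uint32_t num_rows)
{
  std::vector<uint32_t> out(num_rows * 3);
  for (uint32_t idx = 0; idx < out.size(); ++idx) {
    out[idx] = (idx % 3 == 0) ? idx / 3 : 0;  // row index at each triple start
  }
  return out;
}
```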
14 changes: 9 additions & 5 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   // We need to change the end_word_indices pointer after the selection is complete
   device_end_word_indices = device_start_word_indices + num_words;
 
-  cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
-  detail::
-    kernel_wordpiece_tokenizer<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+  if (num_words > 0) {
+    cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
+    detail::kernel_wordpiece_tokenizer<<<grid.num_blocks,
+                                         grid.num_threads_per_block,
+                                         0,
+                                         stream.value()>>>(
       device_code_points,
       vocab_table.table->view().data<uint64_t>(),
       vocab_table.bin_coefficients->view().data<uint64_t>(),
@@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
       num_words,
      device_token_ids.data(),
       device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
+  }
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
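The `wordpiece_tokenizer.cu` change above is the likely source of the reported exception: with no tokens, `num_words` is zero, so the computed grid has zero blocks, and a `<<<0, ...>>>` launch fails with `cudaErrorInvalidConfiguration`, which `CUDF_CHECK_CUDA` surfaces as an exception. Guarding on `num_words > 0` skips the kernel when there is no work. A generic sketch of the pattern (hypothetical kernel, not from this diff):

```cpp
__global__ void increment(int* data, int n)
{
  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) { data[idx] += 1; }
}

void launch_increment(int* data, int n, cudaStream_t stream)
{
  // A zero-block launch returns cudaErrorInvalidConfiguration,
  // so skip the kernel entirely when there is nothing to do.
  if (n > 0) {
    int constexpr block_size = 256;
    int const num_blocks     = (n + block_size - 1) / block_size;
    increment<<<num_blocks, block_size, 0, stream>>>(data, n);
  }
}
```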
71 changes: 71 additions & 0 deletions cpp/tests/text/subword_tests.cpp
@@ -144,6 +144,49 @@ TEST(TextSubwordTest, TokenizeMultiRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
+TEST(TextSubwordTest, TokenizeWithEmptyRow)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings{
+    "This is a test.", "", "This is a test. This is a tést."};
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq = 8;
+  uint32_t const stride  = 6;
+  bool const lower       = true;
+  bool const truncate    = false;
+
+  auto result =
+    nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
+
+  EXPECT_EQ(uint32_t{4}, result.nrows_tensor);
+
+  // clang-format off
+  auto expected_tokens = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {2023, 2003, 1037, 3231, 1012,    0,    0,    0,
+        0,    0,    0,    0,    0,    0,    0,    0,
+     2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037,  // this one
+     2003, 1037, 3231, 1012,    0,    0,    0,    0}); // continues here
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  // clang-format off
+  auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {1, 1, 1, 1, 1, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 0, 0, 0, 0});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  // clang-format off
+  auto expected_metadata = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {0,0,4, 1,0,0, 2,0,6, 2,1,3});  // note that the 3rd element has 2 tensors
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
 {
   cudf::test::strings_column_wrapper strings({"This is a test."});
@@ -236,6 +279,34 @@ TEST(TextSubwordTest, AllNullStrings)
   EXPECT_EQ(0, result.tensor_metadata->size());
 }
 
+TEST(TextSubwordTest, NoTokens)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq = 16;
+  uint32_t const stride  = 16;
+  bool const lower       = true;
+  bool const truncate    = true;
+
+  auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
+
+  std::vector<uint32_t> zeros(max_seq * input.size(), 0);
+
+  EXPECT_EQ(static_cast<uint32_t>(input.size()), result.nrows_tensor);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(zeros.begin(), zeros.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
+  auto expected_metadata =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({0, 0, 0, 1, 0, 0, 2, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeFromVocabStruct)
 {
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
