Fix subword_tokenize error when input contains no tokens (#13320)
Fixes a bug where an exception was thrown when there are no tokens in the entire input column, for example when every string contains only whitespace.
This special case now returns token-ids of all zeros along with an attention mask of all zeros, each sized `input.size() * max_sequence_length`.

Closes #13300
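
A minimal reproduction, condensed from the new `NoTokens` gtest added below (the vocabulary-file path is a placeholder, and `strings_column_wrapper` comes from the cudf test utilities):

```cpp
// Every row is whitespace-only, so wordpiece tokenization finds zero tokens.
// Before this fix the call below threw an exception; it now returns all-zero tensors.
auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt");  // placeholder path
cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
auto input = cudf::strings_column_view{strings};

auto result = nvtext::subword_tokenize(input,
                                       *vocab,
                                       16,    // max_sequence_length
                                       16,    // stride
                                       true,  // do_lower_case
                                       true,  // do_truncate
                                       2);    // max_rows_tensor
// result.tensor_token_ids and result.tensor_attention_mask each hold
// input.size() * max_sequence_length zeros.
```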

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13320
davidwendt authored May 15, 2023
1 parent d18fd4e commit 403c83f
Showing 3 changed files with 113 additions and 7 deletions.
35 changes: 33 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  * limitations under the License.
  */
 
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sequence.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -31,6 +34,7 @@
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/tabulate.h>
 #include <thrust/transform_scan.h>
 
 namespace nvtext {
@@ -125,6 +129,28 @@ __global__ void kernel_compute_tensor_metadata(
   }
 }
 
+// this happens if there are no tokens in the input
+tokenizer_result build_empty_result(cudf::size_type size,
+                                    uint32_t max_sequence_length,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
+  auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+  auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+
+  auto metadata = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
+  thrust::tabulate(rmm::exec_policy(stream),
+                   metadata->mutable_view().begin<uint32_t>(),
+                   metadata->mutable_view().end<uint32_t>(),
+                   [] __device__(auto idx) { return ((idx % 3) == 0) ? idx / 3 : 0; });
+  metadata->set_null_count(0);
+
+  return tokenizer_result{
+    0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)};
+}
+
 }  // namespace
 
 tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
@@ -143,12 +169,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
     static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
     "max_sequence_length x max_rows_tensor is too large for cudf output column size");
   auto const strings_count = strings.size();
-  if (strings_count == 0 || strings.chars_size() == 0)
+  if (strings_count == strings.null_count()) {  // empty or all-null returns empty
     return tokenizer_result{0,
                             max_sequence_length,
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
+  }
 
   auto const offsets   = strings.offsets();
   auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -187,6 +214,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                 thrust::plus<uint32_t>());
   // last element is the total number of output rows
   uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
+  // if there are no tokens at all, build a specific empty result
+  if (nrows_tensor_token_ids == 0) {
+    return build_empty_result(strings_count, max_sequence_length, stream, mr);
+  }
 
   // compute global_row to tensor, and global_row to within_tensor_row correspondence
   rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream);
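A note on the new `build_empty_result` helper above: `cudf::detail::sequence` with a zero initial value and zero step is simply a device-side way to materialize all-zero columns of `size * max_sequence_length` elements, and `thrust::tabulate` fills the `size * 3` metadata column with one `[row-index, first-token-position, last-token-position]` triple per input row, all `[i, 0, 0]` here. A host-side sketch of that metadata layout (illustrative only, not part of the change):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the tabulate lambda: for 3 rows this produces {0,0,0, 1,0,0, 2,0,0},
// matching the expected_metadata in the NoTokens test below.
std::vector<uint32_t> empty_metadata(uint32_t num_rows)
{
  std::vector<uint32_t> out(num_rows * 3);
  for (uint32_t idx = 0; idx < out.size(); ++idx) {
    out[idx] = (idx % 3 == 0) ? idx / 3 : 0;  // row index at each triple start
  }
  return out;
}
```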
14 changes: 9 additions & 5 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   // We need to change the end_word_indices pointer after the selection is complete
   device_end_word_indices = device_start_word_indices + num_words;
 
-  cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
-  detail::
-    kernel_wordpiece_tokenizer<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+  if (num_words > 0) {
+    cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
+    detail::kernel_wordpiece_tokenizer<<<grid.num_blocks,
+                                         grid.num_threads_per_block,
+                                         0,
+                                         stream.value()>>>(
       device_code_points,
       vocab_table.table->view().data<uint64_t>(),
       vocab_table.bin_coefficients->view().data<uint64_t>(),
@@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
       num_words,
      device_token_ids.data(),
       device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
+  }
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
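The `wordpiece_tokenizer.cu` change above is the likely source of the reported exception: with no tokens, `num_words` is zero, so the computed grid has zero blocks, and a `<<<0, ...>>>` launch fails with `cudaErrorInvalidConfiguration`, which `CUDF_CHECK_CUDA` surfaces as an exception. Guarding on `num_words > 0` skips the kernel when there is no work. A generic sketch of the pattern (hypothetical kernel, not from this diff):

```cpp
__global__ void increment(int* data, int n)
{
  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) { data[idx] += 1; }
}

void launch_increment(int* data, int n, cudaStream_t stream)
{
  // A zero-block launch returns cudaErrorInvalidConfiguration,
  // so skip the kernel entirely when there is nothing to do.
  if (n > 0) {
    int constexpr block_size = 256;
    int const num_blocks     = (n + block_size - 1) / block_size;
    increment<<<num_blocks, block_size, 0, stream>>>(data, n);
  }
}
```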
71 changes: 71 additions & 0 deletions cpp/tests/text/subword_tests.cpp
@@ -144,6 +144,49 @@ TEST(TextSubwordTest, TokenizeMultiRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
+TEST(TextSubwordTest, TokenizeWithEmptyRow)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings{
+    "This is a test.", "", "This is a test. This is a tést."};
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq = 8;
+  uint32_t const stride  = 6;
+  bool const lower       = true;
+  bool const truncate    = false;
+
+  auto result =
+    nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
+
+  EXPECT_EQ(uint32_t{4}, result.nrows_tensor);
+
+  // clang-format off
+  auto expected_tokens = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {2023, 2003, 1037, 3231, 1012,    0,    0,    0,
+        0,    0,    0,    0,    0,    0,    0,    0,
+     2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037,  // this one
+     2003, 1037, 3231, 1012,    0,    0,    0,    0}); // continues here
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  // clang-format off
+  auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {1, 1, 1, 1, 1, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 0, 0, 0, 0});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  // clang-format off
+  auto expected_metadata = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {0,0,4, 1,0,0, 2,0,6, 2,1,3});  // note that the 3rd element has 2 tensors
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
 {
   cudf::test::strings_column_wrapper strings({"This is a test."});
@@ -236,6 +279,34 @@ TEST(TextSubwordTest, AllNullStrings)
   EXPECT_EQ(0, result.tensor_metadata->size());
 }
 
+TEST(TextSubwordTest, NoTokens)
+{
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
+  auto input = cudf::strings_column_view{strings};
+
+  uint32_t const max_seq = 16;
+  uint32_t const stride  = 16;
+  bool const lower       = true;
+  bool const truncate    = true;
+
+  auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
+
+  std::vector<uint32_t> zeros(max_seq * input.size(), 0);
+
+  EXPECT_EQ(static_cast<uint32_t>(input.size()), result.nrows_tensor);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(zeros.begin(), zeros.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
+  auto expected_metadata =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({0, 0, 0, 1, 0, 0, 2, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeFromVocabStruct)
 {
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
