diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 844f2a625e0..e34aa4054da 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,13 @@ * limitations under the License. */ +#include #include #include #include +#include #include +#include #include #include @@ -31,6 +34,7 @@ #include #include #include +#include #include namespace nvtext { @@ -125,6 +129,28 @@ __global__ void kernel_compute_tensor_metadata( } } +// this happens if there are no tokens in the input +tokenizer_result build_empty_result(cudf::size_type size, + uint32_t max_sequence_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto zero = cudf::numeric_scalar(0, true, stream); + auto ids = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr); + auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr); + + auto metadata = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr); + thrust::tabulate(rmm::exec_policy(stream), + metadata->mutable_view().begin(), + metadata->mutable_view().end(), + [] __device__(auto idx) { return ((idx % 3) == 0) ? 
idx / 3 : 0; }); + metadata->set_null_count(0); + + return tokenizer_result{ + 0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)}; +} + } // namespace tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, @@ -143,12 +169,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()), "max_sequence_length x max_rows_tensor is too large for cudf output column size"); auto const strings_count = strings.size(); - if (strings_count == 0 || strings.chars_size() == 0) + if (strings_count == strings.null_count()) { // empty or all-null returns empty return tokenizer_result{0, max_sequence_length, cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}), cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}), cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})}; + } auto const offsets = strings.offsets(); auto const d_offsets = offsets.data<uint32_t>() + strings.offset(); @@ -187,6 +214,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, thrust::plus<uint32_t>()); // last element is the total number of output rows uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element<uint32_t>(strings_count, stream); + // if there are no tokens at all, build a specific empty result + if (nrows_tensor_token_ids == 0) { + return build_empty_result(strings_count, max_sequence_length, stream, mr); + } // compute global_row to tensor, and global_row to within_tensor_row correspondence rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream); diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 601072d583c..028cf5ee6d6 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre // We need to change the end_word_indices pointer after the selection is complete device_end_word_indices = device_start_word_indices + num_words; - cudf::detail::grid_1d const grid{static_cast(num_words), THREADS_PER_BLOCK}; - detail:: - kernel_wordpiece_tokenizer<<>>( + if (num_words > 0) { + cudf::detail::grid_1d const grid{static_cast(num_words), THREADS_PER_BLOCK}; + detail::kernel_wordpiece_tokenizer<<>>( device_code_points, vocab_table.table->view().data(), vocab_table.bin_coefficients->view().data(), @@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre num_words, device_token_ids.data(), device_tokens_per_word.data()); - CUDF_CHECK_CUDA(stream.value()); + CUDF_CHECK_CUDA(stream.value()); + } // Repurpose the input array for the token ids. In the worst case, each code point ends up being a // token so this will always have enough memory to store the contiguous tokens. diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index d50c7e73543..0cf223f4a99 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -144,6 +144,49 @@ TEST(TextSubwordTest, TokenizeMultiRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata); } +TEST(TextSubwordTest, TokenizeWithEmptyRow) +{ + std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); + create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); + + cudf::test::strings_column_wrapper strings{ + "This is a test.", "", "This is a test. 
This is a tést."}; + auto input = cudf::strings_column_view{strings}; + + uint32_t const max_seq = 8; + uint32_t const stride = 6; + bool const lower = true; + bool const truncate = false; + + auto result = + nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR); + + EXPECT_EQ(uint32_t{4}, result.nrows_tensor); + + // clang-format off + auto expected_tokens = cudf::test::fixed_width_column_wrapper( + {2023, 2003, 1037, 3231, 1012, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037, // this one + 2003, 1037, 3231, 1012, 0, 0, 0, 0}); // continues here + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens); + // clang-format off + auto expected_attn = cudf::test::fixed_width_column_wrapper( + {1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn); + // clang-format off + auto expected_metadata = cudf::test::fixed_width_column_wrapper( + {0,0,4, 1,0,0, 2,0,6, 2,1,3}); // note that the 3rd element has 2 tensors + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata); +} + TEST(TextSubwordTest, TokenizeMaxEqualsTokens) { cudf::test::strings_column_wrapper strings({"This is a test."}); @@ -236,6 +279,34 @@ TEST(TextSubwordTest, AllNullStrings) EXPECT_EQ(0, result.tensor_metadata->size()); } +TEST(TextSubwordTest, NoTokens) +{ + std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); + create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file); + + cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"}); + auto input = cudf::strings_column_view{strings}; + + uint32_t const max_seq = 16; + uint32_t const stride = 16; + bool const lower = true; + bool const truncate = true; + + auto result = 
nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2); + + std::vector zeros(max_seq * input.size(), 0); + + EXPECT_EQ(static_cast(input.size()), result.nrows_tensor); + + auto expected = cudf::test::fixed_width_column_wrapper(zeros.begin(), zeros.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected); + auto expected_metadata = + cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 0, 0, 2, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata); +} + TEST(TextSubwordTest, TokenizeFromVocabStruct) { std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");