From 3539735f2dd07b77ac73005d5dbdeed75651df1a Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Tue, 9 May 2023 07:33:29 -0400
Subject: [PATCH 1/6] Fix subword_tokenize error when input contains no tokens

---
 cpp/src/text/subword/subword_tokenize.cu    | 42 ++++++++++++++-
 cpp/src/text/subword/wordpiece_tokenizer.cu | 14 +++--
 cpp/tests/text/subword_tests.cpp            | 59 +++++++++++++++++++++
 3 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 844f2a625e0..cd00f0a33d9 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  * limitations under the License.
  */
 
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sequence.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -31,6 +34,7 @@
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/tabulate.h>
 #include <thrust/transform_scan.h>
 
 namespace nvtext {
@@ -125,6 +129,37 @@ __global__ void kernel_compute_tensor_metadata(
   }
 }
 
+struct tabulator {
+  uint32_t __device__ operator()(cudf::size_type idx) { return ((idx % 3) == 0) ? idx : 0; }
+};
+
+// this happens if there are no tokens in the input
+tokenizer_result build_empty_result(cudf::size_type size,
+                                    uint32_t max_sequence_length,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  // the first column will be size*max_sequence_length of all zeroes
+  // the 2nd column will be size*max_sequence_length of also all zeroes
+  // the 3rd column will be size*3 with all zeros and `output[idx*3] = idx`
+
+  auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
+  auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+  auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
+
+  auto metadata = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
+  auto d_metadata = cudf::mutable_column_device_view::create(*metadata, stream);
+  thrust::tabulate(rmm::exec_policy(stream),
+                   d_metadata->begin<uint32_t>(),
+                   d_metadata->end<uint32_t>(),
+                   tabulator{});
+  metadata->set_null_count(0);
+
+  return tokenizer_result{
+    0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)};
+}
+
 }  // namespace
 
 tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
@@ -143,7 +178,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
     static_cast<uint32_t>(std::numeric_limits<cudf::size_type>::max()),
     "max_sequence_length x max_rows_tensor is too large for cudf output column size");
   auto const strings_count = strings.size();
-  if (strings_count == 0 || strings.chars_size() == 0)
+  if (strings_count == strings.null_count())
     return tokenizer_result{0,
                             max_sequence_length,
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
@@ -187,6 +222,9 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                          thrust::plus<uint32_t>());
   // last element is the total number of output rows
   uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
+  if (nrows_tensor_token_ids == 0) {
+    return build_empty_result(strings_count, max_sequence_length, stream, mr);
+  }
 
   // compute global_row to tensor, and global_row to within_tensor_row correspondence
   rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream);
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index 601072d583c..028cf5ee6d6 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   // We need to change the end_word_indices pointer after the selection is complete
   device_end_word_indices = device_start_word_indices + num_words;
 
-  cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
-  detail::
-    kernel_wordpiece_tokenizer<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+  if (num_words > 0) {
+    cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
+    detail::kernel_wordpiece_tokenizer<<<grid.num_blocks,
+                                         grid.num_threads_per_block,
+                                         0,
+                                         stream.value()>>>(
       device_code_points,
       vocab_table.table->view().data<uint64_t>(),
       vocab_table.bin_coefficients->view().data<uint64_t>(),
@@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
       num_words,
       device_token_ids.data(),
       device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
+  }
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index d50c7e73543..d4925b48590 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -144,6 +144,41 @@ TEST(TextSubwordTest, TokenizeMultiRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
+TEST(TextSubwordTest, TokenizeMultiRow2)
+{
+  std::vector<char const*> h_strings{"This is a test.", "", "This is a test. This is a tést."};
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  uint32_t max_sequence_length = 8;
+  uint32_t stride              = 6;
+
+  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
+                                         *vocab,
+                                         max_sequence_length,
+                                         stride,
+                                         true,   // do_lower_case
+                                         false,  // do_truncate
+                                         MAX_ROWS_TENSOR);
+
+  cudf::test::print(result.tensor_token_ids->view());
+  cudf::test::print(result.tensor_attention_mask->view());
+  cudf::test::print(result.tensor_metadata->view());
+
+  // EXPECT_EQ(uint32_t{3}, result.nrows_tensor);
+  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
+  //   {2023, 2003, 1037, 3231, 1012, 0, 0, 0, 2023, 2003, 1037, 3231,
+  //    1012, 2023, 2003, 1037, 2003, 1037, 3231, 1012, 0, 0, 0, 0});
+  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_attn(
+  //   {1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0});
+  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_metadata({0, 0, 4, 1, 0, 6, 1, 1,
+  // 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+}
+
 TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
 {
   cudf::test::strings_column_wrapper strings({"This is a test."});
@@ -236,6 +271,30 @@ TEST(TextSubwordTest, AllNullStrings)
   EXPECT_EQ(0, result.tensor_metadata->size());
 }
 
+TEST(TextSubwordTest, NoTokens)
+{
+  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
+  std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab  = nvtext::load_vocabulary_file(hash_file);
+  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
+                                         *vocab,
+                                         16,
+                                         16,
+                                         true,  // do_lower_case
+                                         true,  // do_truncate
+                                         2);
+
+  cudf::test::print(result.tensor_token_ids->view());
+  cudf::test::print(result.tensor_attention_mask->view());
+  cudf::test::print(result.tensor_metadata->view());
+
+  // EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
+  // EXPECT_EQ(0, result.tensor_token_ids->size());
+  // EXPECT_EQ(0, result.tensor_attention_mask->size());
+  // EXPECT_EQ(0, result.tensor_metadata->size());
+}
+
 TEST(TextSubwordTest, TokenizeFromVocabStruct)
 {
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
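[Archive note on PATCH 1: the #include lines in this copy lost their bracketed header names in transit; the names shown above are reconstructions inferred from the symbols the new code uses (cudf::detail::sequence, cudf::numeric_scalar, mutable_column_device_view, thrust::tabulate), as are restored template arguments such as <uint32_t>. The helper build_empty_result fills the metadata column with a thrust::tabulate pass in which every third element receives a value and the rest are zeroed. A self-contained host-side sketch of that fill rule, not part of the patch:

    #include <thrust/host_vector.h>
    #include <thrust/tabulate.h>
    #include <cstdint>

    int main()
    {
      // Same fill rule as the tabulator functor: positions 0, 3, 6, ...
      // receive a value, every other position receives zero.
      thrust::host_vector<uint32_t> metadata(3 * 3);  // size * 3 for size == 3
      thrust::tabulate(metadata.begin(), metadata.end(), [](int idx) -> uint32_t {
        return ((idx % 3) == 0) ? idx : 0;
      });
      // metadata now holds: 0 0 0 3 0 0 6 0 0
      return 0;
    }

As written the functor stores the flat offsets 3 and 6 in the second and third triples, whereas the expectation added in PATCH 2 ({0, 0, 0, 1, 0, 0, 2, 0, 0}) matches an idx / 3 variant that stores the row index — which suggests the row index is the intended value.]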
From c7b6a2609ce196d2d558b706319572808d2aeb37 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Tue, 9 May 2023 14:28:20 -0400
Subject: [PATCH 2/6] add gtests for no tokens

---
 cpp/tests/text/subword_tests.cpp | 96 ++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 42 deletions(-)

diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index d4925b48590..76c46803b9c 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -144,39 +144,47 @@ TEST(TextSubwordTest, TokenizeMultiRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
-TEST(TextSubwordTest, TokenizeMultiRow2)
+TEST(TextSubwordTest, TokenizeWithEmptyRow)
 {
-  std::vector<char const*> h_strings{"This is a test.", "", "This is a test. This is a tést."};
-  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
   auto vocab = nvtext::load_vocabulary_file(hash_file);
 
-  uint32_t max_sequence_length = 8;
-  uint32_t stride = 6;
+  cudf::test::strings_column_wrapper strings{
+    "This is a test.", "", "This is a test. This is a tést."};
+  auto input = cudf::strings_column_view{strings};
 
-  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         *vocab,
-                                         max_sequence_length,
-                                         stride,
-                                         true,   // do_lower_case
-                                         false,  // do_truncate
-                                         MAX_ROWS_TENSOR);
+  uint32_t const max_seq = 8;
+  uint32_t const stride  = 6;
+  bool const lower       = true;
+  bool const truncate    = false;
+
+  auto result =
+    nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
+
+  EXPECT_EQ(uint32_t{4}, result.nrows_tensor);
 
-  cudf::test::print(result.tensor_token_ids->view());
-  cudf::test::print(result.tensor_attention_mask->view());
-  cudf::test::print(result.tensor_metadata->view());
-
-  // EXPECT_EQ(uint32_t{3}, result.nrows_tensor);
-  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
-  //   {2023, 2003, 1037, 3231, 1012, 0, 0, 0, 2023, 2003, 1037, 3231,
-  //    1012, 2023, 2003, 1037, 2003, 1037, 3231, 1012, 0, 0, 0, 0});
-  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
-  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_attn(
-  //   {1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0});
-  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
-  // cudf::test::fixed_width_column_wrapper<uint32_t> expected_metadata({0, 0, 4, 1, 0, 6, 1, 1,
-  // 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
+  // clang-format off
+  auto expected_tokens = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {2023, 2003, 1037, 3231, 1012,    0,    0,    0,
+        0,    0,    0,    0,    0,    0,    0,    0,
+     2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037,  // this one
+     2003, 1037, 3231, 1012,    0,    0,    0,    0}); // continues here
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
+  // clang-format off
+  auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {1, 1, 1, 1, 1, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 0, 0, 0, 0});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
+  // clang-format off
+  auto expected_metadata = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    {0,0,4, 1,0,0, 2,0,6, 2,1,3});  // note that the 3rd element has 2 tensors
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
 TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
@@ -273,26 +281,30 @@ TEST(TextSubwordTest, AllNullStrings)
 
 TEST(TextSubwordTest, NoTokens)
 {
-  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
-  auto vocab = nvtext::load_vocabulary_file(hash_file);
-  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         *vocab,
-                                         16,
-                                         16,
-                                         true,  // do_lower_case
-                                         true,  // do_truncate
-                                         2);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
+  cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
+  auto input = cudf::strings_column_view{strings};
 
-  cudf::test::print(result.tensor_token_ids->view());
-  cudf::test::print(result.tensor_attention_mask->view());
-  cudf::test::print(result.tensor_metadata->view());
+  uint32_t const max_seq = 16;
+  uint32_t const stride  = 16;
+  bool const lower       = true;
+  bool const truncate    = true;
 
-  // EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
-  // EXPECT_EQ(0, result.tensor_token_ids->size());
-  // EXPECT_EQ(0, result.tensor_attention_mask->size());
-  // EXPECT_EQ(0, result.tensor_metadata->size());
+  auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
+
+  std::vector<uint32_t> zeros(max_seq * input.size(), 0);
+
+  EXPECT_EQ(static_cast<uint32_t>(input.size()), result.nrows_tensor);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(zeros.begin(), zeros.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
+  auto expected_metadata =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({0, 0, 0, 1, 0, 0, 2, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
 }
 
 TEST(TextSubwordTest, TokenizeFromVocabStruct)
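[Archive note on PATCH 2: the rewritten TokenizeWithEmptyRow test pins down the output layout — tensor_token_ids and tensor_attention_mask are flattened nrows_tensor x max_sequence_length matrices, and tensor_metadata carries one triple per tensor row (input row id, first token position, last non-overlapping token position). A minimal sketch of calling the API the same way the test does; "vocab.txt" and the input column are assumptions, and error handling is omitted:

    #include <nvtext/subword_tokenize.hpp>
    #include <cudf/strings/strings_column_view.hpp>

    // Tokenize a strings column with the same settings as TokenizeWithEmptyRow.
    nvtext::tokenizer_result tokenize_column(cudf::strings_column_view const& input)
    {
      // "vocab.txt" is a placeholder for a file produced by the hash-vocab utility
      auto vocab = nvtext::load_vocabulary_file("vocab.txt");
      return nvtext::subword_tokenize(input,
                                      *vocab,
                                      8,      // max_sequence_length
                                      6,      // stride
                                      true,   // do_lower_case
                                      false,  // do_truncate
                                      100);   // max_rows_tensor
    }

With do_truncate set to false, a string longer than max_sequence_length spills into additional tensor rows in stride-sized steps, which is why the third input string above yields two metadata triples, (2,0,6) and (2,1,3), and why nrows_tensor is 4 for 3 input strings.]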
From e92175976362344e335bed279a0484d79a4c7aa6 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Tue, 9 May 2023 14:28:56 -0400
Subject: [PATCH 3/6] add gtests for no tokens

---
 cpp/tests/text/subword_tests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 76c46803b9c..0cf223f4a99 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -174,7 +174,7 @@ TEST(TextSubwordTest, TokenizeWithEmptyRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
   // clang-format off
   auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {1, 1, 1, 1, 1, 0, 0, 0, 
+    {1, 1, 1, 1, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 0, 0, 0, 0});
From dc547fb39dd49089ee32d23b7e7a9ba2d9e1bc69 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Tue, 9 May 2023 16:44:50 -0400
Subject: [PATCH 4/6] removed unneeded comments

---
 cpp/src/text/subword/subword_tokenize.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index cd00f0a33d9..64ea4e24a76 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -139,10 +139,6 @@ tokenizer_result build_empty_result(cudf::size_type size,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  // the first column will be size*max_sequence_length of all zeroes
-  // the 2nd column will be size*max_sequence_length of also all zeroes
-  // the 3rd column will be size*3 with all zeros and `output[idx*3] = idx`
-
   auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
   auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
   auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
 
   auto metadata = cudf::make_numeric_column(
     cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
@@ -178,12 +174,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
     static_cast<uint32_t>(std::numeric_limits<cudf::size_type>::max()),
     "max_sequence_length x max_rows_tensor is too large for cudf output column size");
   auto const strings_count = strings.size();
-  if (strings_count == strings.null_count())
+  if (strings_count == strings.null_count()) {  // empty or all-null returns empty
     return tokenizer_result{0,
                             max_sequence_length,
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
                             cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
+  }
 
   auto const offsets   = strings.offsets();
   auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -222,6 +219,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                          thrust::plus<uint32_t>());
   // last element is the total number of output rows
   uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
+  // if there are no tokens at all, build a specific empty result
   if (nrows_tensor_token_ids == 0) {
     return build_empty_result(strings_count, max_sequence_length, stream, mr);
   }
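[Archive note on PATCH 4: with the braces and comments settled, subword_tokenize now has two distinct early exits, paraphrased here outside diff context:

    // 1. no rows to represent (empty column, or every row is null):
    //    return three empty UINT32 columns
    if (strings_count == strings.null_count()) { ... }

    // 2. rows exist but none produced a token (e.g. all-whitespace or empty
    //    strings): return zero-filled tensors of strings_count rows, each
    //    padded to max_sequence_length, via build_empty_result
    if (nrows_tensor_token_ids == 0) { return build_empty_result(...); }

The replaced check, strings_count == 0 || strings.chars_size() == 0, also fired for non-null empty strings, returning empty tensors when the caller should get one padded tensor row per input row — the bug this series fixes.]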
From 736103e657991f06893ca41745935f0444aeb3e0 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 11 May 2023 13:46:35 -0400
Subject: [PATCH 5/6] empty commit to retrigger CI
From e8175415aa49f418cda0757f471d7e9a3296674d Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Mon, 15 May 2023 10:01:38 -0400
Subject: [PATCH 6/6] move functor to lambda

---
 cpp/src/text/subword/subword_tokenize.cu | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 64ea4e24a76..e34aa4054da 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -129,10 +129,6 @@ __global__ void kernel_compute_tensor_metadata(
   }
 }
 
-struct tabulator {
-  uint32_t __device__ operator()(cudf::size_type idx) { return ((idx % 3) == 0) ? idx : 0; }
-};
-
 // this happens if there are no tokens in the input
 tokenizer_result build_empty_result(cudf::size_type size,
@@ -145,11 +141,10 @@ tokenizer_result build_empty_result(cudf::size_type size,
 
   auto metadata = cudf::make_numeric_column(
     cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
-  auto d_metadata = cudf::mutable_column_device_view::create(*metadata, stream);
   thrust::tabulate(rmm::exec_policy(stream),
-                   d_metadata->begin<uint32_t>(),
-                   d_metadata->end<uint32_t>(),
-                   tabulator{});
+                   metadata->mutable_view().begin<uint32_t>(),
+                   metadata->mutable_view().end<uint32_t>(),
+                   [] __device__(auto idx) { return ((idx % 3) == 0) ? idx : 0; });
   metadata->set_null_count(0);
 
   return tokenizer_result{
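[Archive note on PATCH 6: swapping the file-scope functor for a device lambda is behavior-preserving — both supply the same unary op to thrust::tabulate. A sketch of the equivalence, with begin and end standing in for the metadata iterators; an inline __device__ lambda like this compiles only when nvcc's extended-lambda support is enabled, which the libcudf build provides:

    // functor form (removed):
    struct tabulator {
      uint32_t __device__ operator()(cudf::size_type idx) { return ((idx % 3) == 0) ? idx : 0; }
    };
    thrust::tabulate(rmm::exec_policy(stream), begin, end, tabulator{});

    // lambda form (added):
    thrust::tabulate(rmm::exec_policy(stream), begin, end,
                     [] __device__(auto idx) { return ((idx % 3) == 0) ? idx : 0; });

Writing through metadata->mutable_view().begin<uint32_t>() also drops the intermediate mutable_column_device_view, which existed only to supply device iterators.]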