Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Remove unused max_rows_tensor parameter from subword tokenizer #13463

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions cpp/benchmarks/text/subword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
#include <iostream>
#include <vector>

#define MAX_ROWS_TENSOR 300

static std::string create_hash_vocab_file()
{
std::string dir_template{std::filesystem::temp_directory_path().string()};
Expand Down Expand Up @@ -74,8 +72,7 @@ static void BM_subword_tokenizer(benchmark::State& state)
max_sequence_length,
stride,
do_lower,
do_truncate,
MAX_ROWS_TENSOR);
do_truncate);
}
}

Expand Down
5 changes: 0 additions & 5 deletions cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,6 @@ struct tokenizer_result {
* @param do_truncate If true, the tokenizer will discard all the token-ids after
* `max_sequence_length` for each input string. If false, it will use a new row
* in the output token-ids to continue generating the output.
* @param max_rows_tensor Maximum number of rows for the output token-ids expected
* to be generated by the tokenizer.
* Used for allocating temporary working memory on the GPU device.
* If the output generates a larger number of rows, behavior is undefined.
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
Expand All @@ -159,7 +155,6 @@ tokenizer_result subword_tokenize(
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
6 changes: 1 addition & 5 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,9 +49,6 @@ class wordpiece_tokenizer {
* @brief Creates a full tokenizer that cleans the text and splits it into tokens.
*
* @param vocab_table The preprocessed hashed vocabulary data.
* @param max_rows_final_tensor Maximum number of rows in tensor_token-ids expected by tokenizer.
* Used to allocate temporary working memory on the GPU.
* If the output contains a larger number of rows, behavior is undefined.
* @param max_sequence_length Limit the number of token-ids per row in the output
* @param stride Each row in tensor-token-ids will replicate `max_sequence_length - stride`
* token-ids from the previous row, unless it is the first string.
Expand All @@ -66,7 +63,6 @@ class wordpiece_tokenizer {
* specified in the `vocab_file`.
*/
wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t max_rows_final_tensor,
uint32_t max_sequence_length,
uint32_t stride,
bool do_truncate,
Expand Down
15 changes: 6 additions & 9 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(stride <= max_sequence_length,
"stride must be less than or equal to max_sequence_length");
CUDF_EXPECTS(
max_sequence_length <=
(static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()) / max_rows_tensor),
"max_sequence_length times max_rows_tensor exceeds the column size limit",
std::overflow_error);
auto const strings_count = strings.size();
if (strings_count == strings.null_count()) { // empty or all-null returns empty
return tokenizer_result{0,
Expand All @@ -178,6 +172,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
}
CUDF_EXPECTS(
max_sequence_length <=
(static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()) / strings_count),
"max_sequence_length times number of input rows exceeds the column size limit",
std::overflow_error);

auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
Expand All @@ -186,7 +185,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
vocab_table, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
Expand Down Expand Up @@ -292,7 +291,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
Expand All @@ -302,7 +300,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
stride,
do_lower_case,
do_truncate,
max_rows_tensor,
cudf::get_default_stream(),
mr);
}
Expand Down
1 change: 0 additions & 1 deletion cpp/src/text/subword/wordpiece_tokenizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,6 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points,
} // namespace

wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t max_rows_final_tensor,
uint32_t max_sequence_length,
uint32_t stride,
bool do_truncate,
Expand Down
65 changes: 26 additions & 39 deletions cpp/tests/text/subword_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@
#include <iostream>
#include <vector>

#define MAX_ROWS_TENSOR 300

// Global environment for temporary files
auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
Expand Down Expand Up @@ -75,9 +73,8 @@ TEST(TextSubwordTest, Tokenize)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(nrows, result.nrows_tensor);

Expand Down Expand Up @@ -128,9 +125,8 @@ TEST(TextSubwordTest, TokenizeMultiRow)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(uint32_t{3}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
Expand Down Expand Up @@ -159,8 +155,7 @@ TEST(TextSubwordTest, TokenizeWithEmptyRow)
bool const lower = true;
bool const truncate = false;

auto result =
nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate);

EXPECT_EQ(uint32_t{4}, result.nrows_tensor);

Expand Down Expand Up @@ -201,9 +196,8 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(uint32_t{1}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens({2023, 2003, 1037, 3231, 1012});
Expand All @@ -216,28 +210,26 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens)

TEST(TextSubwordTest, ParameterErrors)
{
std::vector<const char*> h_strings{"This is a test.", "This is a test. This is a tést."};
std::vector<const char*> h_strings{"This is a test.", "This is a test. This is a tést.", "", ""};
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR),
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
true), // do_truncate
cudf::logic_error);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
858993459,
5,
5,
true, // do_lower_case
true, // do_truncate
858993459),
true, // do_lower_case
true), // do_truncate
std::overflow_error);
}

Expand All @@ -251,9 +243,8 @@ TEST(TextSubwordTest, EmptyStrings)
*vocab,
16,
16,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate
EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
EXPECT_EQ(0, result.tensor_token_ids->size());
EXPECT_EQ(0, result.tensor_attention_mask->size());
Expand All @@ -270,9 +261,8 @@ TEST(TextSubwordTest, AllNullStrings)
*vocab,
16,
16,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate
EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
EXPECT_EQ(0, result.tensor_token_ids->size());
EXPECT_EQ(0, result.tensor_attention_mask->size());
Expand All @@ -293,7 +283,7 @@ TEST(TextSubwordTest, NoTokens)
bool const lower = true;
bool const truncate = true;

auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate);

std::vector<uint32_t> zeros(max_seq * input.size(), 0);

Expand All @@ -319,9 +309,8 @@ TEST(TextSubwordTest, TokenizeFromVocabStruct)
*vocab,
8,
6,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

EXPECT_EQ(uint32_t{2}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
Expand Down Expand Up @@ -389,9 +378,8 @@ TEST(TextSubwordTest, TokenizeWithSpecialTokens)
*vocab,
8,
6,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

EXPECT_EQ(static_cast<uint32_t>(h_strings.size()), result.nrows_tensor);
// clang-format off
Expand Down Expand Up @@ -439,9 +427,8 @@ TEST(TextSubwordTest, ZeroHashBinCoefficient)
*vocab,
8,
8,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

// clang-format off
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens({7, 0, 0, 0, 0, 0, 0, 0});
Expand Down
8 changes: 3 additions & 5 deletions python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint16_t, uint32_t
from libcpp cimport bool
Expand Down Expand Up @@ -38,8 +38,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower,
bool do_truncate,
uint32_t max_rows_tensor
bool do_truncate
) except +

cdef tokenizer_result subword_tokenize(
Expand All @@ -48,8 +47,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower,
bool do_truncate,
uint32_t max_rows_tensor
bool do_truncate
) except +

cdef extern from "<utility>" namespace "std" nogil:
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t

Expand Down Expand Up @@ -37,7 +37,6 @@ def subword_tokenize_inmem_hash(
uint32_t stride=48,
bool do_lower=True,
bool do_truncate=False,
uint32_t max_rows_tensor=500
):
"""
Subword tokenizes text series by using the pre-loaded hashed vocabulary
Expand All @@ -53,7 +52,6 @@ def subword_tokenize_inmem_hash(
stride,
do_lower,
do_truncate,
max_rows_tensor
)
)
# return the 3 tensor components
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/subword_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -216,7 +216,6 @@ def __call__(
stride=stride,
do_lower=self.do_lower_case,
do_truncate=truncation,
max_rows_tensor=max_num_rows,
)

tokenizer_output = {
Expand Down