[REVIEW] Remove str.subword_tokenize #9968

Merged

Changes from 4 commits

23 changes: 1 addition & 22 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -130,9 +130,7 @@ struct tokenizer_result {
* larger than the max value for cudf::size_type
*
* @param strings The input strings to tokenize.
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
* @param vocabulary_table The vocabulary table pre-loaded into this object.
* @param max_sequence_length Limit of the number of token-ids per row in final tensor
* for each string.
* @param stride Each row in the output token-ids will replicate `max_sequence_length - stride`
@@ -150,25 +148,6 @@ struct tokenizer_result {
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
std::string const& filename_hashed_vocabulary,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc subword_tokenize()
*
* This function differs from the one above by only the hashed vocabulary parameter.
* The file can be pre-loaded using the @ref load_vocabulary_file API and then
* passed in place of the file name in a call to this API.
*
* @param vocabulary_table The vocabulary table pre-loaded into this object.
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
hashed_vocabulary const& vocabulary_table,
22 changes: 0 additions & 22 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -249,28 +249,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

} // namespace detail

tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
std::string const& filename_hashed_vocabulary,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr)
{
auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
CUDF_FUNC_RANGE();
return detail::subword_tokenize(strings,
*vocab_table,
max_sequence_length,
stride,
do_lower_case,
do_truncate,
max_rows_tensor,
rmm::cuda_stream_default,
mr);
}

tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
hashed_vocabulary const& vocabulary_table,
uint32_t max_sequence_length,
21 changes: 14 additions & 7 deletions cpp/tests/text/subword_tests.cpp
@@ -67,12 +67,13 @@ TEST(TextSubwordTest, Tokenize)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 16;
uint32_t stride = 16;

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -119,12 +120,13 @@ TEST(TextSubwordTest, TokenizeMultiRow)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 8;
uint32_t stride = 6;

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -148,12 +150,13 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
cudf::test::strings_column_wrapper strings({"This is a test."});
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

uint32_t max_sequence_length = 5; // five tokens in strings;
uint32_t stride = 5; // this should not affect the result

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
@@ -175,8 +178,10 @@ TEST(TextSubwordTest, ParameterErrors)
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
@@ -185,7 +190,7 @@
cudf::logic_error);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
5,
5,
true, // do_lower_case
@@ -199,8 +204,9 @@ TEST(TextSubwordTest, EmptyStrings)
cudf::test::strings_column_wrapper strings;
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
16,
16,
true, // do_lower_case
@@ -217,8 +223,9 @@ TEST(TextSubwordTest, AllNullStrings)
cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
*vocab,
16,
16,
true, // do_lower_case
35 changes: 0 additions & 35 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -58,38 +58,3 @@ def subword_tokenize_inmem_hash(
masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
return tokens, masks, metadata


def subword_tokenize_vocab_file(
Column strings,
object hash_file,
uint32_t max_sequence_length=64,
uint32_t stride=48,
bool do_lower=True,
bool do_truncate=False,
uint32_t max_rows_tensor=500
):
"""
Subword tokenizes text series by using the hashed vocabulary
stored on disk
"""
cdef column_view c_strings = strings.view()
cdef cpp_tokenizer_result c_result
cdef string c_hash_file = <string>str(hash_file).encode()
with nogil:
c_result = tr_move(
cpp_subword_tokenize(
c_strings,
c_hash_file,
max_sequence_length,
stride,
do_lower,
do_truncate,
max_rows_tensor
)
)
# return the 3 tensor components
tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
return tokens, masks, metadata
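
The removed `subword_tokenize_vocab_file` binding loaded the hashed vocabulary from disk on every call, while the surviving `subword_tokenize_inmem_hash` binding expects a vocabulary that is already resident in memory (which is what `SubwordTokenizer` provides). The one-time preprocessing step that produces the hashed file is unchanged; a minimal sketch, with illustrative file names:

```python
# One-time preprocessing: hash a raw BERT vocabulary so the GPU tokenizer
# can consume it. The file names here are illustrative.
from cudf.utils.hash_vocab_utils import hash_vocab

hash_vocab("bert-base-uncased-vocab.txt", "voc_hash.txt")
```
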
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
@@ -12,7 +12,6 @@
is_letter_multi,
porter_stemmer_measure,
)
from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file
from cudf._lib.nvtext.tokenize import (
_count_tokens_column,
_count_tokens_scalar,
113 changes: 0 additions & 113 deletions python/cudf/cudf/core/column/string.py
@@ -4711,119 +4711,6 @@ def filter_tokens(
),
)

def subword_tokenize(
self,
hash_file: str,
max_length: int = 64,
stride: int = 48,
do_lower: bool = True,
do_truncate: bool = False,
max_rows_tensor: int = 500,
) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]:
"""
Run CUDA BERT subword tokenizer on cuDF strings column.
Encodes words to token ids using vocabulary from a pretrained
tokenizer.

This function requires about 21x the number of character bytes
in the input strings column as working memory.

``Series.str.subword_tokenize`` is deprecated and will be removed.
Use ``cudf.core.subword_tokenizer.SubwordTokenizer`` instead.

Parameters
----------
hash_file : str
Path to hash file containing vocabulary of words with token-ids.
This can be created from the raw vocabulary
using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function
max_length : int, Default is 64
Limits the length of the sequence returned.
If tokenized string is shorter than max_length,
output will be padded with 0s.
If the tokenized string is longer than max_length and
do_truncate == False, there will be multiple returned
sequences containing the overflowing token-ids.
stride : int, Default is 48
If do_truncate == False and the tokenized string is larger
than max_length, the sequences containing the overflowing
token-ids can contain duplicated token-ids from the main
sequence. If max_length is equal to stride there are no
duplicated-id tokens. If stride is 80% of max_length,
20% of the first sequence will be repeated on the second
sequence and so on until the entire sentence is encoded.
do_lower : bool, Default is True
If set to true, original text will be lowercased before encoding.
do_truncate : bool, Default is False
If set to true, strings will be truncated and padded to
max_length. Each input string will result in exactly one output
sequence. If set to false, there may be multiple output
sequences when the max_length is smaller than generated tokens.
max_rows_tensor : int, Default is 500
Maximum number of rows for the output token-ids expected
to be generated by the tokenizer.
Used for allocating temporary working memory on the GPU device.
If the output generates a larger number of rows, behavior
is undefined.
This will vary based on stride, truncation, and max_length.
For example, for non-overlapping sequences output rows
will be the same as input rows.

Returns
-------
token-ids : cupy.ndarray
The token-ids for each string padded with 0s to max_length.
attention-mask : cupy.ndarray
The mask for token-ids result where corresponding positions
identify valid token-id values.
metadata : cupy.ndarray
Each row contains the index id of the original string and the
first and last index of the token-ids that are non-padded and
non-overlapping.

Examples
--------
>>> import cudf
>>> from cudf.utils.hash_vocab_utils import hash_vocab
>>> hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt')
>>> ser = cudf.Series(['this is the', 'best book'])
>>> stride, max_length = 8, 8
>>> max_rows_tensor = len(ser)
>>> tokens, masks, metadata = ser.str.subword_tokenize('voc_hash.txt',
... max_length=max_length, stride=stride,
... max_rows_tensor=max_rows_tensor)
>>> tokens.reshape(-1, max_length)
array([[2023, 2003, 1996, 0, 0, 0, 0, 0],
[2190, 2338, 0, 0, 0, 0, 0, 0]], dtype=uint32)
>>> masks.reshape(-1, max_length)
array([[1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0]], dtype=uint32)
>>> metadata.reshape(-1, 3)
array([[0, 0, 2],
[1, 0, 1]], dtype=uint32)
"""
warnings.warn(
"`Series.str.subword_tokenize` is deprecated and will be removed "
"in future versions of cudf. Use "
"`cudf.core.subword_tokenizer.SubwordTokenizer` instead.",
FutureWarning,
)

tokens, masks, metadata = libstrings.subword_tokenize_vocab_file(
self._column,
hash_file,
max_length,
stride,
do_lower,
do_truncate,
max_rows_tensor,
)
return (
cupy.asarray(tokens),
cupy.asarray(masks),
cupy.asarray(metadata),
)

def porter_stemmer_measure(self) -> SeriesOrIndex:
"""
Compute the Porter Stemmer measure for each string.
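
The deprecation notice in the removed docstring points users to `cudf.core.subword_tokenizer.SubwordTokenizer`. A minimal sketch of the replacement call, assuming the `SubwordTokenizer` API as documented around this release (keyword names such as `max_num_rows` and `return_tensors` come from that class, not from this diff):

```python
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

ser = cudf.Series(["this is the", "best book"])

# 'voc_hash.txt' is the pre-hashed vocabulary produced by hash_vocab().
tokenizer = SubwordTokenizer("voc_hash.txt", do_lower_case=True)

output = tokenizer(
    ser,
    max_length=8,
    max_num_rows=len(ser),
    padding="max_length",
    truncation=True,
    return_tensors="cp",  # CuPy arrays, matching the removed API's return type
)

# SubwordTokenizer returns a dict rather than a 3-tuple.
tokens = output["input_ids"]
masks = output["attention_mask"]
metadata = output["metadata"]
```

Note that `SubwordTokenizer` can also insert special tokens (`add_special_tokens`, default `True`), so the token layout may differ slightly from the removed `Series.str.subword_tokenize` output.
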
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/subword_tokenizer.py
@@ -21,7 +21,7 @@ def _cast_to_appropriate_type(ar, cast_type):
from torch.utils.dlpack import from_dlpack

elif cast_type == "tf":
from tf.experimental.dlpack import from_dlpack
from tensorflow.experimental.dlpack import from_dlpack

return from_dlpack(ar.astype("int32").toDlpack())

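
The final hunk fixes a latent import bug: `tf` is only a conventional alias, not an importable module, so requesting TensorFlow tensors through this helper would have raised an import error. A minimal sketch of the corrected conversion path used by `_cast_to_appropriate_type`, assuming TensorFlow 2.x and CuPy are installed (array values are illustrative):

```python
import cupy as cp
from tensorflow.experimental.dlpack import from_dlpack  # not "tf.experimental..."

# Mirror _cast_to_appropriate_type: cast the CuPy buffer to int32, then
# hand it to TensorFlow through DLPack without a host round trip.
ar = cp.asarray([[1, 1, 0], [1, 0, 0]], dtype="uint32")
tensor = from_dlpack(ar.astype("int32").toDlpack())
# 'tensor' is now a TensorFlow tensor backed by the same GPU memory.
```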