diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 8cc000ff095..2b09ec66203 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -130,9 +130,7 @@ struct tokenizer_result {
  * larger than the max value for cudf::size_type
  *
  * @param strings The input strings to tokenize.
- * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
- *        Note that this is the file AFTER python/perfect_hash.py has been used
- *        for preprocessing.
+ * @param vocabulary_table The vocabulary table pre-loaded into this object.
  * @param max_sequence_length Limit of the number of token-ids per row in final tensor
  *        for each string.
  * @param stride Each row in the output token-ids will replicate `max_sequence_length - stride`
@@ -150,25 +148,6 @@ struct tokenizer_result {
  * @param mr Memory resource to allocate any returned objects.
  * @return token-ids, attention-mask, and metadata
  */
-tokenizer_result subword_tokenize(
-  cudf::strings_column_view const& strings,
-  std::string const& filename_hashed_vocabulary,
-  uint32_t max_sequence_length,
-  uint32_t stride,
-  bool do_lower_case,
-  bool do_truncate,
-  uint32_t max_rows_tensor,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @copydoc subword_tokenize()
- *
- * This function differs from the one above by only the hashed vocabulary parameter.
- * The file can be pre-loaded using the @ref load_vocabulary_file API and then
- * passed in place of the file name in a call to this API.
- *
- * @param vocabulary_table The vocabulary table pre-loaded into this object.
- */
 tokenizer_result subword_tokenize(
   cudf::strings_column_view const& strings,
   hashed_vocabulary const& vocabulary_table,
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 6de1044b492..193cd80d9a6 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -249,28 +249,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
 
 }  // namespace detail
 
-tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
-                                  std::string const& filename_hashed_vocabulary,
-                                  uint32_t max_sequence_length,
-                                  uint32_t stride,
-                                  bool do_lower_case,
-                                  bool do_truncate,
-                                  uint32_t max_rows_tensor,
-                                  rmm::mr::device_memory_resource* mr)
-{
-  auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
-  CUDF_FUNC_RANGE();
-  return detail::subword_tokenize(strings,
-                                  *vocab_table,
-                                  max_sequence_length,
-                                  stride,
-                                  do_lower_case,
-                                  do_truncate,
-                                  max_rows_tensor,
-                                  rmm::cuda_stream_default,
-                                  mr);
-}
-
 tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   hashed_vocabulary const& vocabulary_table,
                                   uint32_t max_sequence_length,
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 65cc466fee7..521a082faa2 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -67,12 +67,13 @@ TEST(TextSubwordTest, Tokenize)
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
 
   uint32_t max_sequence_length = 16;
   uint32_t stride              = 16;
 
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         hash_file,
+                                         *vocab,
                                          max_sequence_length,
                                          stride,
                                          true,  // do_lower_case
@@ -119,12 +120,13 @@
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
 
   uint32_t max_sequence_length = 8;
   uint32_t stride              = 6;
 
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         hash_file,
+                                         *vocab,
                                          max_sequence_length,
                                          stride,
                                          true,  // do_lower_case
@@ -148,12 +150,13 @@
   cudf::test::strings_column_wrapper strings({"This is a test."});
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
 
   uint32_t max_sequence_length = 5;  // five tokens in strings;
   uint32_t stride              = 5;  // this should not effect the result
 
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         hash_file,
+                                         *vocab,
                                          max_sequence_length,
                                          stride,
                                          true,  // do_lower_case
@@ -175,8 +178,10 @@
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+
   EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                        hash_file,
+                                        *vocab,
                                         12,     // max_sequence_length
                                         13,     // stride <= max_sequence_length
                                         true,   // do_lower_case
@@ -185,7 +190,7 @@
                cudf::logic_error);
 
   EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                        hash_file,
+                                        *vocab,
                                         5,
                                         5,
                                         true,  // do_lower_case
@@ -199,8 +204,9 @@
   cudf::test::strings_column_wrapper strings;
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         hash_file,
+                                         *vocab,
                                          16,
                                          16,
                                          true,  // do_lower_case
@@ -217,8 +223,9 @@
   cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         hash_file,
+                                         *vocab,
                                          16,
                                          16,
                                          true,  // do_lower_case
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
index 49f24436b88..426744ee46c 100644
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -58,38 +58,3 @@ def subword_tokenize_inmem_hash(
     masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
     metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
     return tokens, masks, metadata
-
-
-def subword_tokenize_vocab_file(
-    Column strings,
-    object hash_file,
-    uint32_t max_sequence_length=64,
-    uint32_t stride=48,
-    bool do_lower=True,
-    bool do_truncate=False,
-    uint32_t max_rows_tensor=500
-):
-    """
-    Subword tokenizes text series by using the hashed vocabulary
-    stored on disk
-    """
-    cdef column_view c_strings = strings.view()
-    cdef cpp_tokenizer_result c_result
-    cdef string c_hash_file = str(hash_file).encode()
-    with nogil:
-        c_result = tr_move(
-            cpp_subword_tokenize(
-                c_strings,
-                c_hash_file,
-                max_sequence_length,
-                stride,
-                do_lower,
-                do_truncate,
-                max_rows_tensor
-            )
-        )
-    # return the 3 tensor components
-    tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
-    masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
-    metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
-    return tokens, masks, metadata
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index fbc1538cc74..7911d0eff2a 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -12,7 +12,6 @@
     is_letter_multi,
     porter_stemmer_measure,
 )
-from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file
 from cudf._lib.nvtext.tokenize import (
     _count_tokens_column,
     _count_tokens_scalar,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 1c9a013810a..a83110d273c 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4711,119 +4711,6 @@ def filter_tokens(
             ),
         )
 
-    def subword_tokenize(
-        self,
-        hash_file: str,
-        max_length: int = 64,
-        stride: int = 48,
-        do_lower: bool = True,
-        do_truncate: bool = False,
-        max_rows_tensor: int = 500,
-    ) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]:
-        """
-        Run CUDA BERT subword tokenizer on cuDF strings column.
-        Encodes words to token ids using vocabulary from a pretrained
-        tokenizer.
-
-        This function requires about 21x the number of character bytes
-        in the input strings column as working memory.
-
-        ``Series.str.subword_tokenize`` is deprecated and will be removed.
-        Use ``cudf.core.subword_tokenizer.SubwordTokenizer`` instead.
-
-        Parameters
-        ----------
-        hash_file : str
-            Path to hash file containing vocabulary of words with token-ids.
-            This can be created from the raw vocabulary
-            using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function
-        max_length : int, Default is 64
-            Limits the length of the sequence returned.
-            If tokenized string is shorter than max_length,
-            output will be padded with 0s.
-            If the tokenized string is longer than max_length and
-            do_truncate == False, there will be multiple returned
-            sequences containing the overflowing token-ids.
-        stride : int, Default is 48
-            If do_truncate == False and the tokenized string is larger
-            than max_length, the sequences containing the overflowing
-            token-ids can contain duplicated token-ids from the main
-            sequence. If max_length is equal to stride there are no
-            duplicated-id tokens. If stride is 80% of max_length,
-            20% of the first sequence will be repeated on the second
-            sequence and so on until the entire sentence is encoded.
-        do_lower : bool, Default is True
-            If set to true, original text will be lowercased before encoding.
-        do_truncate : bool, Default is False
-            If set to true, strings will be truncated and padded to
-            max_length. Each input string will result in exactly one output
-            sequence. If set to false, there may be multiple output
-            sequences when the max_length is smaller than generated tokens.
-        max_rows_tensor : int, Default is 500
-            Maximum number of rows for the output token-ids expected
-            to be generated by the tokenizer.
-            Used for allocating temporary working memory on the GPU device.
-            If the output generates a larger number of rows, behavior
-            is undefined.
-            This will vary based on stride, truncation, and max_length.
-            For example, for non-overlapping sequences output rows
-            will be the same as input rows.
-
-        Returns
-        -------
-        token-ids : cupy.ndarray
-            The token-ids for each string padded with 0s to max_length.
-        attention-mask : cupy.ndarray
-            The mask for token-ids result where corresponding positions
-            identify valid token-id values.
-        metadata : cupy.ndarray
-            Each row contains the index id of the original string and the
-            first and last index of the token-ids that are non-padded and
-            non-overlapping.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> from cudf.utils.hash_vocab_utils import hash_vocab
-        >>> hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt')
-        >>> ser = cudf.Series(['this is the', 'best book'])
-        >>> stride, max_length = 8, 8
-        >>> max_rows_tensor = len(ser)
-        >>> tokens, masks, metadata = ser.str.subword_tokenize('voc_hash.txt',
-        ... max_length=max_length, stride=stride,
-        ... max_rows_tensor=max_rows_tensor)
-        >>> tokens.reshape(-1, max_length)
-        array([[2023, 2003, 1996, 0, 0, 0, 0, 0],
-               [2190, 2338, 0, 0, 0, 0, 0, 0]], dtype=uint32)
-        >>> masks.reshape(-1, max_length)
-        array([[1, 1, 1, 0, 0, 0, 0, 0],
-               [1, 1, 0, 0, 0, 0, 0, 0]], dtype=uint32)
-        >>> metadata.reshape(-1, 3)
-        array([[0, 0, 2],
-               [1, 0, 1]], dtype=uint32)
-        """
-        warnings.warn(
-            "`Series.str.subword_tokenize` is deprecated and will be removed "
-            "in future versions of cudf. Use "
-            "`cudf.core.subword_tokenizer.SubwordTokenizer` instead.",
-            FutureWarning,
-        )
-
-        tokens, masks, metadata = libstrings.subword_tokenize_vocab_file(
-            self._column,
-            hash_file,
-            max_length,
-            stride,
-            do_lower,
-            do_truncate,
-            max_rows_tensor,
-        )
-        return (
-            cupy.asarray(tokens),
-            cupy.asarray(masks),
-            cupy.asarray(metadata),
-        )
-
     def porter_stemmer_measure(self) -> SeriesOrIndex:
         """
         Compute the Porter Stemmer measure for each string.
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index 3502fc9acae..782b74ef4a6 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -21,7 +21,7 @@ def _cast_to_appropriate_type(ar, cast_type):
         from torch.utils.dlpack import from_dlpack
 
     elif cast_type == "tf":
-        from tf.experimental.dlpack import from_dlpack
+        from tensorflow.experimental.dlpack import from_dlpack
 
     return from_dlpack(ar.astype("int32").toDlpack())
 
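
Usage note (not part of the diff): with the filename-based C++ overload, the `subword_tokenize_vocab_file` Cython binding, and `Series.str.subword_tokenize` removed, the pre-loaded-vocabulary path is the only remaining entry point from Python. Below is a minimal sketch of the replacement workflow; the file names are placeholders and the call pattern simply mirrors the new `test_text_subword_tokenize` test further down, so treat it as an illustration rather than a full description of the `SubwordTokenizer` API.

```python
# Sketch only: vocabulary file names are placeholders, and the positional
# arguments mirror the call used by the new test added in this diff.
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer
from cudf.utils.hash_vocab_utils import hash_vocab

# One-time preprocessing: hash the raw vocabulary into the format the
# GPU tokenizer loads (the step the removed docstring pointed at).
hash_vocab("bert-base-uncased-vocab.txt", "voc_hash.txt")

ser = cudf.Series(["this is the", "best book"])
tokenizer = SubwordTokenizer("voc_hash.txt", do_lower_case=True)

# The tokenizer returns a dict of cupy arrays instead of the old 3-tuple.
output = tokenizer(ser, 8, 8, add_special_tokens=False, truncation=True)
tokens = output["input_ids"]
masks = output["attention_mask"]
metadata = output["metadata"]
```
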
diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/test_subword_tokenizer.py
index 717b3de8479..ec6e0b30cb1 100644
--- a/python/cudf/cudf/tests/test_subword_tokenizer.py
+++ b/python/cudf/cudf/tests/test_subword_tokenizer.py
@@ -1,12 +1,14 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 import os
 
+import cupy
 import numpy as np
 import pytest
 from transformers import BertTokenizer
 
 import cudf
 from cudf.core.subword_tokenizer import SubwordTokenizer
+from cudf.testing._utils import assert_eq
 
 
 @pytest.fixture(scope="module")
@@ -26,30 +28,6 @@ def assert_equal_tokenization_outputs(hf_output, cudf_output):
     )
 
 
-def test_subword_tokenize_on_disk_vocab_str_api(datadir):
-    """
-    Tests the subword-tokenizer API where
-    the vocabulary is not pre-loaded
-    and is accessed via the string accessor
-    """
-    with open(
-        os.path.join(datadir, "test_sentences.txt"), encoding="utf-8"
-    ) as file:
-        input_sentence_ls = [line.strip() for line in file]
-
-    vocab_dir = os.path.join(datadir, "bert_base_cased_sampled")
-    vocab_hash_path = os.path.join(vocab_dir, "vocab-hash.txt")
-
-    ser = cudf.Series(input_sentence_ls)
-    tokens, masks, metadata = ser.str.subword_tokenize(
-        vocab_hash_path,
-        max_length=32,
-        stride=32,
-        do_lower=True,
-        max_rows_tensor=len(ser),
-    )
-
-
 @pytest.mark.parametrize("seq_len", [32, 64])
 @pytest.mark.parametrize("stride", [0, 15, 30])
 @pytest.mark.parametrize("add_special_tokens", [True, False])
@@ -115,3 +93,145 @@ def test_subword_tokenize_with_truncation(datadir):
         truncation=False,
         add_special_tokens=True,
     )
+
+
+def test_text_subword_tokenize(tmpdir):
+    sr = cudf.Series(
+        [
+            "This is a test",
+            "A test this is",
+            "Is test a this",
+            "Test test",
+            "this This",
+        ]
+    )
+    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
+    content = "1\n0\n23\n"
+    coefficients = [65559] * 23
+    for c in coefficients:
+        content = content + str(c) + " 0\n"
+    # based on values from the bert_hash_table.txt file for the
+    # test words used here: 'this' 'is' 'a' test'
+    table = [0] * 23
+    table[0] = 3015668
+    table[1] = 6205475701751155871
+    table[5] = 6358029
+    table[16] = 451412625363
+    table[20] = 6206321707968235495
+    content = content + "23\n"
+    for v in table:
+        content = content + str(v) + "\n"
+    content = content + "100\n101\n102\n\n"
+    hash_file.write(content)
+
+    cudf_tokenizer = SubwordTokenizer(hash_file)
+
+    token_d = cudf_tokenizer(
+        sr, 8, 8, add_special_tokens=False, truncation=True
+    )
+    tokens, masks, metadata = (
+        token_d["input_ids"],
+        token_d["attention_mask"],
+        token_d["metadata"],
+    )
+    expected_tokens = cupy.asarray(
+        [
+            2023,
+            2003,
+            1037,
+            3231,
+            0,
+            0,
+            0,
+            0,
+            1037,
+            3231,
+            2023,
+            2003,
+            0,
+            0,
+            0,
+            0,
+            2003,
+            3231,
+            1037,
+            2023,
+            0,
+            0,
+            0,
+            0,
+            3231,
+            3231,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            2023,
+            2023,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+        ],
+        dtype=np.uint32,
+    )
+    expected_tokens = expected_tokens.reshape(-1, 8)
+    assert_eq(expected_tokens, tokens)
+
+    expected_masks = cupy.asarray(
+        [
+            1,
+            1,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+        ],
+        dtype=np.uint32,
+    )
+    expected_masks = expected_masks.reshape(-1, 8)
+    assert_eq(expected_masks, masks)
+
+    expected_metadata = cupy.asarray(
+        [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32
+    )
+    expected_metadata = expected_metadata.reshape(-1, 3)
+    assert_eq(expected_metadata, metadata)
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index fcae0a21b6a..a447a60c709 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2019, NVIDIA CORPORATION.
 
-import cupy
 import numpy as np
 import pytest
 
@@ -655,136 +654,6 @@ def test_text_filter_tokens_error_cases():
         sr.str.filter_tokens(3, delimiter=["a", "b"])
 
 
-def test_text_subword_tokenize(tmpdir):
-    sr = cudf.Series(
-        [
-            "This is a test",
-            "A test this is",
-            "Is test a this",
-            "Test test",
-            "this This",
-        ]
-    )
-    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
-    content = "1\n0\n23\n"
-    coefficients = [65559] * 23
-    for c in coefficients:
-        content = content + str(c) + " 0\n"
-    # based on values from the bert_hash_table.txt file for the
-    # test words used here: 'this' 'is' 'a' test'
-    table = [0] * 23
-    table[0] = 3015668
-    table[1] = 6205475701751155871
-    table[5] = 6358029
-    table[16] = 451412625363
-    table[20] = 6206321707968235495
-    content = content + "23\n"
-    for v in table:
-        content = content + str(v) + "\n"
-    content = content + "100\n101\n102\n\n"
-    hash_file.write(content)
-
-    tokens, masks, metadata = sr.str.subword_tokenize(str(hash_file), 8, 8)
-    expected_tokens = cupy.asarray(
-        [
-            2023,
-            2003,
-            1037,
-            3231,
-            0,
-            0,
-            0,
-            0,
-            1037,
-            3231,
-            2023,
-            2003,
-            0,
-            0,
-            0,
-            0,
-            2003,
-            3231,
-            1037,
-            2023,
-            0,
-            0,
-            0,
-            0,
-            3231,
-            3231,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-            2023,
-            2023,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-        ],
-        dtype=np.uint32,
-    )
-    assert_eq(expected_tokens, tokens)
-
-    expected_masks = cupy.asarray(
-        [
-            1,
-            1,
-            1,
-            1,
-            0,
-            0,
-            0,
-            0,
-            1,
-            1,
-            1,
-            1,
-            0,
-            0,
-            0,
-            0,
-            1,
-            1,
-            1,
-            1,
-            0,
-            0,
-            0,
-            0,
-            1,
-            1,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-            1,
-            1,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-        ],
-        dtype=np.uint32,
-    )
-    assert_eq(expected_masks, masks)
-
-    expected_metadata = cupy.asarray(
-        [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32
-    )
-    assert_eq(expected_metadata, metadata)
-
-
 def test_edit_distance():
     sr = cudf.Series(["kitten", "saturday", "address", "book"])
     tg = cudf.Series(["sitting", "sunday", "addressee", "back"])
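
For reference, the `metadata` tensor checked by both the removed and the relocated test keeps the layout described in the deleted docstring: each row holds the input row index plus the first and last positions of the non-padded, non-overlapping token-ids. A small illustrative snippet follows, using hard-coded values from the expected results above rather than a live tokenizer run.

```python
# Illustrative only: the arrays are copied from the test expectations above,
# not produced by SubwordTokenizer here.
import cupy
import numpy as np

tokens = cupy.asarray(
    [2023, 2003, 1037, 3231, 0, 0, 0, 0], dtype=np.uint32
).reshape(-1, 8)
metadata = cupy.asarray([0, 0, 3], dtype=np.uint32).reshape(-1, 3)

# Row 0 of metadata says: output sequence 0 came from input row 0 and its
# valid (non-padded, non-overlapping) token-ids sit at positions 0..3.
row_idx, first, last = (int(v) for v in metadata[0])
valid_ids = tokens[0, first : last + 1]
print(row_idx, valid_ids)  # 0 [2023 2003 1037 3231]
```
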