diff --git a/setup.py b/setup.py
index 1d8115f6684532..4ac28772625e22 100644
--- a/setup.py
+++ b/setup.py
@@ -145,7 +145,7 @@
     "packaging>=20.0",
     "parameterized",
     "phonemizer",
-    "protobuf<=3.20.2",
+    "protobuf<=3.20.3",
     "psutil",
     "pyyaml>=5.1",
     "pydantic",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index f325447a109d8c..9012b21ff43a08 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -46,7 +46,7 @@
     "packaging": "packaging>=20.0",
     "parameterized": "parameterized",
     "phonemizer": "phonemizer",
-    "protobuf": "protobuf<=3.20.2",
+    "protobuf": "protobuf<=3.20.3",
     "psutil": "psutil",
     "pyyaml": "pyyaml>=5.1",
     "pydantic": "pydantic",
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 84f6886db66770..53231998ee41fa 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -23,6 +23,7 @@
 import shutil
 import sys
 import tempfile
+import traceback
 import unittest
 import unittest.mock as mock
 from collections import OrderedDict
@@ -64,6 +65,7 @@
     require_tf,
     require_tokenizers,
     require_torch,
+    run_test_in_subprocess,
     slow,
 )
 from transformers.tokenization_utils import AddedToken, Trie
@@ -131,6 +133,71 @@ def merge_model_tokenizer_mappings(
     return model_tokenizer_mapping
 
 
+def _test_subword_regularization_tokenizer(in_queue, out_queue, timeout):
+    error = None
+
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        tokenizer = inputs["tokenizer"]
+        sp_model_kwargs = inputs["sp_model_kwargs"]
+        test_sentencepiece_ignore_case = inputs["test_sentencepiece_ignore_case"]
+
+        unittest.TestCase().assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
+        unittest.TestCase().assertIsNotNone(tokenizer.sp_model_kwargs)
+        unittest.TestCase().assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
+        unittest.TestCase().assertDictEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
+        check_subword_sampling(tokenizer, test_sentencepiece_ignore_case=test_sentencepiece_ignore_case)
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+def check_subword_sampling(
+    tokenizer: PreTrainedTokenizer,
+    text: str = None,
+    test_sentencepiece_ignore_case: bool = True,
+) -> None:
+    """
+    Check if the tokenizer generates different results when subword regularization is enabled.
+
+    Subword regularization augments training data with subword sampling.
+    This has a random component.
+
+    Args:
+        tokenizer: The tokenizer to check.
+        text: The text to use for the checks.
+        test_sentencepiece_ignore_case: See `TokenizerTesterMixin.test_sentencepiece_ignore_case`.
+    """
+    text = "This is a test for subword regularization." if text is None else text
+    if test_sentencepiece_ignore_case:
+        text = text.lower()
+
+    tokens_list = []
+    for _ in range(5):
+        tokens_list.append(tokenizer.tokenize(text))
+
+    # the list of different pairs of tokens_list
+    combinations = itertools.combinations(tokens_list, 2)
+
+    # check of sampling is done
+    subword_sampling_found = False
+    for combination in combinations:
+        if combination[0] != combination[1]:
+            subword_sampling_found = True
+    unittest.TestCase().assertTrue(subword_sampling_found)
+
+    # check if converting back to original text works
+    for tokens in tokens_list:
+        if test_sentencepiece_ignore_case:
+            unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
+        else:
+            unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
+
+
 class TokenizerTesterMixin:
     tokenizer_class = None
     rust_tokenizer_class = None
@@ -420,11 +487,15 @@ def test_subword_regularization_tokenizer(self) -> None:
         sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
         tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
 
-        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
-        self.assertIsNotNone(tokenizer.sp_model_kwargs)
-        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
-        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
-        self.check_subword_sampling(tokenizer)
+        run_test_in_subprocess(
+            test_case=self,
+            target_func=_test_subword_regularization_tokenizer,
+            inputs={
+                "tokenizer": tokenizer,
+                "sp_model_kwargs": sp_model_kwargs,
+                "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
+            },
+        )
 
     def test_pickle_subword_regularization_tokenizer(self) -> None:
         if not self.test_sentencepiece:
@@ -438,11 +509,15 @@ def test_pickle_subword_regularization_tokenizer(self) -> None:
         del tokenizer
         tokenizer_new = pickle.loads(tokenizer_bin)
 
-        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
-        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
-        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
-        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
-        self.check_subword_sampling(tokenizer_new)
+        run_test_in_subprocess(
+            test_case=self,
+            target_func=_test_subword_regularization_tokenizer,
+            inputs={
+                "tokenizer": tokenizer_new,
+                "sp_model_kwargs": sp_model_kwargs,
+                "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
+            },
+        )
 
     def test_save_sentencepiece_tokenizer(self) -> None:
         if not self.test_sentencepiece or not self.test_slow_tokenizer:
@@ -2317,46 +2392,6 @@ def _check_no_pad_token_padding(self, tokenizer, sequences):
             # add pad_token_id to pass subsequent tests
             tokenizer.add_special_tokens({"pad_token": "<PAD>"})
 
-    def check_subword_sampling(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        text: str = None,
-    ) -> None:
-        """
-        Check if the tokenizer generates different results when subword regularization is enabled.
-
-        Subword regularization augments training data with subword sampling.
-        This has a random component.
-
-        Args:
-            tokenizer: The tokenizer to check.
-            text: The text to use for the checks.
-        """
-        text = "This is a test for subword regularization." if text is None else text
-        if self.test_sentencepiece_ignore_case:
-            text = text.lower()
-
-        tokens_list = []
-        for _ in range(5):
-            tokens_list.append(tokenizer.tokenize(text))
-
-        # the list of different pairs of tokens_list
-        combinations = itertools.combinations(tokens_list, 2)
-
-        # check of sampling is done
-        subword_sampling_found = False
-        for combination in combinations:
-            if combination[0] != combination[1]:
-                subword_sampling_found = True
-        self.assertTrue(subword_sampling_found)
-
-        # check if converting back to original text works
-        for tokens in tokens_list:
-            if self.test_sentencepiece_ignore_case:
-                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
-            else:
-                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
-
     @require_torch
     @slow
     def test_torch_encode_plus_sent_to_model(self):
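The subprocess wiring above follows a simple queue protocol: _test_subword_regularization_tokenizer reads its arguments from in_queue, runs the assertions, and reports either None or a formatted traceback through out_queue. The diff imports run_test_in_subprocess from the testing utilities but does not show its body; the snippet below is a minimal sketch of such a runner, assuming a multiprocessing-based implementation with the (test_case, target_func, inputs) call signature used at the call sites, and is not the actual transformers.testing_utils code.

import multiprocessing


def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=600):
    # Hypothetical runner sketched from the call sites in the diff, not the real helper.
    ctx = multiprocessing.get_context("spawn")

    in_queue = ctx.Queue(1)
    out_queue = ctx.JoinableQueue(1)

    # The target function takes its inputs from the queue instead of as arguments.
    in_queue.put(inputs, timeout=timeout)

    process = ctx.Process(target=target_func, args=(in_queue, out_queue, timeout))
    process.start()

    # The target reports {"error": None} on success or a formatted traceback string on failure.
    results = out_queue.get(timeout=timeout)
    out_queue.task_done()  # unblocks the target's out_queue.join()

    process.join(timeout=timeout)

    if results["error"] is not None:
        test_case.fail(f"{results['error']}")

Running the sampling check in a child process like this isolates any crash or hang in the SentencePiece sampling path from the rest of the test session, which is presumably why the two tests above were converted to this pattern.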
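The sp_model_kwargs pinned in test_subword_regularization_tokenizer ({"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}) turn on SentencePiece subword sampling, which is the randomness check_subword_sampling asserts on: five tokenizations of the same sentence must not all be identical, and each must convert back to the input text. The snippet below illustrates the same behavior with the sentencepiece package directly; the model path is a placeholder, not a file from this diff.

import itertools

import sentencepiece as spm

# Placeholder path; any trained SentencePiece model file will do.
sp = spm.SentencePieceProcessor(model_file="spiece.model")

text = "this is a test for subword regularization."

# Without sampling, every call returns the same segmentation.
print(sp.encode(text, out_type=str))

# With sampling, nbest_size=-1 draws from all candidate segmentations and alpha
# controls the smoothing, so repeated calls are expected to differ at least once.
samples = [
    sp.encode(text, out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
    for _ in range(5)
]
assert any(a != b for a, b in itertools.combinations(samples, 2))

# Each sampled segmentation still decodes back to the original text, mirroring the
# convert_tokens_to_string round-trip check in the diff.
assert all(sp.decode(pieces) == text for pieces in samples)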