Skip to content

Commit

Permalink
chore: allow protobuf 3.20.3 requirement (huggingface#22759)
Browse files Browse the repository at this point in the history
* chore: allow protobuf 3.20.3

Allow latest bugfix release for protobuf (3.20.3)

* chore: update auto-generated dependency table

update auto-generated dependency table

* run in subprocess

* Apply suggestions from code review

Co-authored-by: amyeroberts <[email protected]>

* Apply suggestions

---------

Co-authored-by: ydshieh <[email protected]>
Co-authored-by: Yih-Dar <[email protected]>
Co-authored-by: amyeroberts <[email protected]>
  • Loading branch information
4 people authored and novice03 committed Jun 23, 2023
1 parent 5f29a98 commit f17018e
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 52 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@
"packaging>=20.0",
"parameterized",
"phonemizer",
"protobuf<=3.20.2",
"protobuf<=3.20.3",
"psutil",
"pyyaml>=5.1",
"pydantic",
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
"packaging": "packaging>=20.0",
"parameterized": "parameterized",
"phonemizer": "phonemizer",
"protobuf": "protobuf<=3.20.2",
"protobuf": "protobuf<=3.20.3",
"psutil": "psutil",
"pyyaml": "pyyaml>=5.1",
"pydantic": "pydantic",
Expand Down
135 changes: 85 additions & 50 deletions tests/test_tokenization_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import shutil
import sys
import tempfile
import traceback
import unittest
import unittest.mock as mock
from collections import OrderedDict
Expand Down Expand Up @@ -64,6 +65,7 @@
require_tf,
require_tokenizers,
require_torch,
run_test_in_subprocess,
slow,
)
from transformers.tokenization_utils import AddedToken, Trie
Expand Down Expand Up @@ -131,6 +133,71 @@ def merge_model_tokenizer_mappings(
return model_tokenizer_mapping


def _test_subword_regularization_tokenizer(in_queue, out_queue, timeout):
    """
    Target function for checking a subword-regularization tokenizer through a pair of queues.

    One dict is read from `in_queue`; the tokenizer's `sp_model_kwargs` attribute is validated and
    `check_subword_sampling` is run on it. Exceptions are not propagated: a formatted traceback is
    sent back through `out_queue` instead.

    Args:
        in_queue: Queue providing the inputs dict. The keys read are `"tokenizer"`,
            `"sp_model_kwargs"` and `"test_sentencepiece_ignore_case"`.
        out_queue: Queue that receives `{"error": ...}`; the value is `None` when every check
            passed, otherwise the traceback text of the failure.
        timeout: Timeout forwarded to `in_queue.get` and `out_queue.put`.
    """
    failure = None

    try:
        checker = unittest.TestCase()

        payload = in_queue.get(timeout=timeout)
        tokenizer = payload["tokenizer"]
        expected_kwargs = payload["sp_model_kwargs"]
        ignore_case = payload["test_sentencepiece_ignore_case"]

        # The tokenizer must expose the sampling options it was created with, unchanged.
        checker.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
        checker.assertIsNotNone(tokenizer.sp_model_kwargs)
        checker.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
        checker.assertDictEqual(tokenizer.sp_model_kwargs, expected_kwargs)
        check_subword_sampling(tokenizer, test_sentencepiece_ignore_case=ignore_case)

    except Exception:
        # Report the failure as text so the caller can surface it.
        failure = traceback.format_exc()

    out_queue.put({"error": failure}, timeout=timeout)
    # NOTE(review): presumably a joinable queue, so this waits until the consumer has
    # acknowledged the result - confirm against `run_test_in_subprocess`.
    out_queue.join()


def check_subword_sampling(
    tokenizer: PreTrainedTokenizer,
    text: str = None,
    test_sentencepiece_ignore_case: bool = True,
) -> None:
    """
    Assert that tokenizing the same text repeatedly yields varying results that still decode back.

    Subword regularization augments training data with subword sampling, which has a random
    component. The text is tokenized five times; at least two of the tokenizations must differ,
    and each one must convert back to the text that was tokenized.

    Args:
        tokenizer: The tokenizer to check.
        text: The text to use for the checks. A default sentence is used when `None`.
        test_sentencepiece_ignore_case: See `TokenizerTesterMixin.test_sentencepiece_ignore_case`.
            When true, the text is lowercased before tokenizing and the decoded strings are
            lowercased before comparison.

    Raises:
        AssertionError: If all five tokenizations are identical or a round trip does not match.
    """
    checker = unittest.TestCase()

    if text is None:
        text = "This is a test for subword regularization."
    if test_sentencepiece_ignore_case:
        text = text.lower()

    samples = [tokenizer.tokenize(text) for _ in range(5)]

    # Sampling is considered active as soon as any two of the tokenizations differ.
    sampling_observed = any(first != second for first, second in itertools.combinations(samples, 2))
    checker.assertTrue(sampling_observed)

    # Every sampled tokenization has to decode back to the (possibly lowercased) input.
    for sample in samples:
        decoded = tokenizer.convert_tokens_to_string(sample)
        if test_sentencepiece_ignore_case:
            decoded = decoded.lower()
        checker.assertEqual(text, decoded)


class TokenizerTesterMixin:
tokenizer_class = None
rust_tokenizer_class = None
Expand Down Expand Up @@ -420,11 +487,15 @@ def test_subword_regularization_tokenizer(self) -> None:
sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
self.assertIsNotNone(tokenizer.sp_model_kwargs)
self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
self.check_subword_sampling(tokenizer)
run_test_in_subprocess(
test_case=self,
target_func=_test_subword_regularization_tokenizer,
inputs={
"tokenizer": tokenizer,
"sp_model_kwargs": sp_model_kwargs,
"test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
},
)

def test_pickle_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
Expand All @@ -438,11 +509,15 @@ def test_pickle_subword_regularization_tokenizer(self) -> None:
del tokenizer
tokenizer_new = pickle.loads(tokenizer_bin)

self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
self.check_subword_sampling(tokenizer_new)
run_test_in_subprocess(
test_case=self,
target_func=_test_subword_regularization_tokenizer,
inputs={
"tokenizer": tokenizer_new,
"sp_model_kwargs": sp_model_kwargs,
"test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
},
)

def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
Expand Down Expand Up @@ -2317,46 +2392,6 @@ def _check_no_pad_token_padding(self, tokenizer, sequences):
# add pad_token_id to pass subsequent tests
tokenizer.add_special_tokens({"pad_token": "<PAD>"})

def check_subword_sampling(
    self,
    tokenizer: PreTrainedTokenizer,
    text: str = None,
) -> None:
    """
    Verify that a tokenizer with subword regularization enabled produces varying tokenizations.

    Subword regularization augments training data with subword sampling, which has a random
    component. The text is tokenized five times; at least one pair of results has to differ and
    each result has to convert back to the tokenized text. When
    `self.test_sentencepiece_ignore_case` is set, the comparison is done on lowercased strings.

    Args:
        tokenizer: The tokenizer to check.
        text: The text to use for the checks. A default sentence is used when `None`.
    """
    ignore_case = self.test_sentencepiece_ignore_case

    if text is None:
        text = "This is a test for subword regularization."
    if ignore_case:
        text = text.lower()

    tokenizations = [tokenizer.tokenize(text) for _ in range(5)]

    # Look through all distinct pairs of tokenizations for one that differs.
    found_difference = False
    for left, right in itertools.combinations(tokenizations, 2):
        if left != right:
            found_difference = True
            break
    self.assertTrue(found_difference)

    # Converting the tokens back has to reproduce the text that was tokenized.
    for tokens in tokenizations:
        restored = tokenizer.convert_tokens_to_string(tokens)
        self.assertEqual(text, restored.lower() if ignore_case else restored)

@require_torch
@slow
def test_torch_encode_plus_sent_to_model(self):
Expand Down

0 comments on commit f17018e

Please sign in to comment.