diff --git a/.gitignore b/.gitignore index eeb41b3fcaea35..337f2ef2c735e8 100644 --- a/.gitignore +++ b/.gitignore @@ -166,4 +166,4 @@ tags .DS_Store # ruff -.ruff_cache \ No newline at end of file +.ruff_cache diff --git a/setup.py b/setup.py index 25aadde8e3d6b3..bd47bfd4f91989 100644 --- a/setup.py +++ b/setup.py @@ -172,7 +172,7 @@ "tf2onnx", "timeout-decorator", "timm", - "tokenizers>=0.11.1,!=0.11.3,<0.14", + "tokenizers>=0.14,<0.15", "torch>=1.10,!=1.12.0", "torchaudio", "torchvision", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 8786f8cd014d64..a791d96eb5b818 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -78,7 +78,7 @@ "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", - "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14", + "tokenizers": "tokenizers>=0.14,<0.15", "torch": "torch>=1.10,!=1.12.0", "torchaudio": "torchaudio", "torchvision": "torchvision", diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 231abf1c0301c9..3ff319199522cc 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -159,6 +159,14 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -174,14 +182,6 @@ def __init__( **kwargs, ) - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self) -> int: return len(self.sp_model) @@ -228,6 +228,8 @@ def _tokenize(self, text: str) -> List[str]: new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + # Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization + # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9'] cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 22ee1a0db6149d..7dd008c4dbbaf2 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -204,21 +204,10 @@ def __init__( pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token # Mask token behave like a normal word, i.e. include the space before it + # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. 
See `test_embeded_special_tokens` + # Also this not only will strip the spaces but any punctuation mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -235,6 +224,19 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index f05ed1b7a82d5d..464b17c4d4c217 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -170,6 +170,7 @@ def __init__( trim_offsets=True, **kwargs, ): + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 77ab8a9d64166b..5fd851b379cf5a 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -47,6 +47,8 @@ SPIECE_UNDERLINE = "▁" +# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this. 
+ class BarthezTokenizer(PreTrainedTokenizer): """ @@ -141,6 +143,9 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -153,15 +158,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) - - self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} - - self.fairseq_tokens_to_ids[""] = len(self.sp_model) - 1 - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: @@ -251,16 +247,10 @@ def _tokenize(self, text: str) -> List[str]: def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - return spm_id if spm_id else self.unk_token_id + return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index 1c1ef0b8675b8a..74e6ad8f9e29fe 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -139,18 +139,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.vocab_file = vocab_file self.monolingual_vocab_file = monolingual_vocab_file self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) @@ -174,6 +162,18 @@ def __init__( self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index a24f39564264df..16044973343bc5 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -196,20 +196,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -225,7 +211,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 6ef3321277f365..f8d49f86ac51ae 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -96,6 +96,11 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + # Add extra_ids to the special token list super().__init__( bos_token=bos_token, @@ -107,11 +112,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return self.sp_model.get_piece_size() diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index dd290109185241..e0f09c20b2e67e 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -160,25 +160,6 @@ def __init__( jumanpp_kwargs=None, **kwargs, ): - super().__init__( - spm_file=spm_file, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - do_lower_case=do_lower_case, - do_word_tokenize=do_word_tokenize, - do_subword_tokenize=do_subword_tokenize, - word_tokenizer_type=word_tokenizer_type, - subword_tokenizer_type=subword_tokenizer_type, - never_split=never_split, - mecab_kwargs=mecab_kwargs, - sudachi_kwargs=sudachi_kwargs, - jumanpp_kwargs=jumanpp_kwargs, - **kwargs, - ) - if subword_tokenizer_type == "sentencepiece": if not os.path.isfile(spm_file): raise ValueError( @@ -226,13 +207,31 @@ def __init__( self.subword_tokenizer_type = subword_tokenizer_type if do_subword_tokenize: if subword_tokenizer_type == "wordpiece": - self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) elif subword_tokenizer_type == "character": - self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token)) elif subword_tokenizer_type == "sentencepiece": - self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token) + self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token)) else: raise ValueError(f"Invalid subword_tokenizer_type 
'{subword_tokenizer_type}' is specified.") + super().__init__( + spm_file=spm_file, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + do_lower_case=do_lower_case, + do_word_tokenize=do_word_tokenize, + do_subword_tokenize=do_subword_tokenize, + word_tokenizer_type=word_tokenizer_type, + subword_tokenizer_type=subword_tokenizer_type, + never_split=never_split, + mecab_kwargs=mecab_kwargs, + sudachi_kwargs=sudachi_kwargs, + jumanpp_kwargs=jumanpp_kwargs, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 7901a58b881983..13846a5089a685 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -134,18 +134,6 @@ def __init__( mask_token="", **kwargs, ): - super().__init__( - normalization=normalization, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, - ) - try: from emoji import demojize @@ -161,10 +149,10 @@ def __init__( self.merges_file = merges_file self.encoder = {} - self.encoder[self.bos_token] = 0 - self.encoder[self.pad_token] = 1 - self.encoder[self.eos_token] = 2 - self.encoder[self.unk_token] = 3 + self.encoder[bos_token] = 0 + self.encoder[pad_token] = 1 + self.encoder[eos_token] = 2 + self.encoder[unk_token] = 3 self.add_from_file(vocab_file) @@ -178,9 +166,20 @@ def __init__( self.normalization = normalization self.tweetPreprocessor = TweetTokenizer() - self.special_puncts = {"’": "'", "…": "..."} + super().__init__( + normalization=normalization, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 5220366df4d247..8e720a54257a5a 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -127,6 +127,11 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -139,11 +144,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return self.sp_model.get_piece_size() diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py index d050fa699c5244..093991ecb3885d 100644 --- a/src/transformers/models/biogpt/tokenization_biogpt.py +++ b/src/transformers/models/biogpt/tokenization_biogpt.py @@ -112,15 +112,6 @@ def __init__( pad_token="", **kwargs, ): - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - unk_token=unk_token, - pad_token=pad_token, - **kwargs, - ) - try: import sacremoses except ImportError: @@ -145,6 +136,15 @@ def __init__( 
self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + **kwargs, + ) + @property def vocab_size(self): """Returns vocab size""" diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index d6a70beb30a136..9a81e73b8da37a 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -187,28 +187,21 @@ def __init__( **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token ) + # these special tokens are not part of the vocab.json, let's add them in the correct order + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -225,6 +218,19 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot def vocab_size(self): @@ -232,7 +238,9 @@ def vocab_size(self): # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot def bpe(self, token): diff --git 
a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index ebe39ed09f9a35..fdd490b12adcf9 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -149,6 +149,11 @@ def __init__( trim_offsets=True, **kwargs, ): + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 4acb87325666a1..61c56738ac4129 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -106,8 +106,6 @@ def __init__( pad_token="__null__", **kwargs, ): - super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -116,6 +114,7 @@ def __init__( merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) @property def vocab_size(self) -> int: diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 59e694c343c559..1d310fe3045fb0 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -16,7 +16,7 @@ import warnings -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import logging @@ -72,7 +72,7 @@ def __init__( # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: additional_special_tokens = [f"" for i in range(extra_ids)] - elif extra_ids > 0 and additional_special_tokens is not None: + elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0: # Check that we have the right number of extra_id special tokens extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) if extra_tokens != extra_ids: @@ -82,38 +82,31 @@ def __init__( " extra_ids tokens" ) - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - + pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token + # we force left and right stripping for backward compatibility. The byt5tests depend on this. 
+ eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token + # unk token needs to be in the vocab with correct index + self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token} + self.offset = len(self._added_tokens_decoder) + self._utf_vocab_size = 2**8 # utf is 8 bits super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, - extra_ids=extra_ids, - additional_special_tokens=additional_special_tokens, + extra_ids=0, + additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile: **kwargs, ) - self._extra_ids = extra_ids - - self._utf_vocab_size = 2**8 # utf is 8 bits - - # define special tokens dict - self.special_tokens_encoder: Dict[int, str] = { - self.pad_token: 0, - self.eos_token: 1, - self.unk_token: 2, - } - self._num_special_tokens = len(self.special_tokens_encoder) - n = len(additional_special_tokens) - for i, token in enumerate(additional_special_tokens): - self.special_tokens_encoder[token] = self.vocab_size + i - n - self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()} - @property def vocab_size(self): - return self._utf_vocab_size + self._num_special_tokens + self._extra_ids + return self._utf_vocab_size + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False @@ -209,34 +202,25 @@ def _tokenize(self, text: str) -> List[str]: def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - if token in self.special_tokens_encoder: - token_id = self.special_tokens_encoder[token] - elif token in self.added_tokens_encoder: - token_id = self.added_tokens_encoder[token] - elif len(token) != 1: - token_id = self.unk_token_id + + if len(token) != 1: + token_id = None else: - token_id = ord(token) + self._num_special_tokens + token_id = ord(token) + self.offset + return token_id def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.special_tokens_decoder: - token = self.special_tokens_decoder[index] - else: - token = chr(index - self._num_special_tokens) + token = chr(index - self.offset) return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" bstring = b"" for token in tokens: - if token in self.special_tokens_decoder: - tok_string = self.special_tokens_decoder[token].encode("utf-8") - elif token in self.added_tokens_decoder: - tok_string = self.special_tokens_decoder[token].encode("utf-8") - elif token in self.special_tokens_encoder: - tok_string = token.encode("utf-8") + if token in self.added_tokens_decoder: + tok_string = self.added_tokens_decoder[token].encode("utf-8") elif token in self.added_tokens_encoder: tok_string = token.encode("utf-8") else: diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index 658dd1080b7122..f75a397755e34d 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -136,6 +136,29 @@ def __init__( 
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # HACK: These tokens were added by the author for an obscure reason as they were already part of the + # sentencepiece vocabulary (this is the case for and and ). + # In this case it is recommended to properly set the tokens by hand. + self._added_tokens_decoder = { + 0: AddedToken("NOTUSED"), + 1: AddedToken(pad_token), + 2: AddedToken("NOTUSED"), + 3: AddedToken(unk_token), + 4: AddedToken("NOTUSED"), + } + + self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4 + + # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here + if "added_tokens_decoder" in kwargs: + # this is the only class that requires this unfortunately..... + # the reason is that the fast version has a whole. + kwargs["added_tokens_decoder"].update(self._added_tokens_decoder) + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -148,15 +171,83 @@ def __init__( sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) + + @property + def vocab_size(self): + # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning. + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + # specifi to camembert, both 3 and 4 point to the unk token. 
+ if self.sp_model.PieceToId(token) == 0: + # Convert sentence piece unk token to fairseq unk token index + return self.unk_token_id + return self.fairseq_offset + self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # TODO decode outputs do not match between fast and slow + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) - self.vocab_file = vocab_file - # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual - # sentencepiece vocabulary (this is the case for and - self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3} - self.fairseq_offset = len(self.fairseq_tokens_to_ids) - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None @@ -233,81 +324,3 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - def vocab_size(self): - return len(self.fairseq_tokens_to_ids) + len(self.sp_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> List[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - 
elif self.sp_model.PieceToId(token) == 0: - # Convert sentence piece unk token to fairseq unk token index - return self.unk_token_id - return self.fairseq_offset + self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py index 2fae9e1482bd32..25932ae75d2a87 100644 --- a/src/transformers/models/canine/tokenization_canine.py +++ b/src/transformers/models/canine/tokenization_canine.py @@ -33,7 +33,6 @@ # Below: Constants defining canonical codepoints for special, pseudo-characters. # Copied from https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py PAD = 0 - CLS = 0xE000 SEP = 0xE001 BOS = 0xE002 @@ -97,18 +96,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - model_max_length=model_max_length, - **kwargs, - ) - # Creates a mapping for looking up the IDs of special symbols. 
self._special_codepoints: Dict[str, int] = {} for codepoint, name in SPECIAL_CODEPOINTS.items(): @@ -122,10 +109,27 @@ def __init__( self._unicode_vocab_size = UNICODE_VOCAB_SIZE self._num_special_tokens = len(self._special_codepoints) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + model_max_length=model_max_length, + **kwargs, + ) + @property def vocab_size(self) -> int: return self._unicode_vocab_size + def get_vocab(self): + vocab = {chr(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + def _tokenize(self, text: str) -> List[str]: """Tokenize a string (i.e. perform character splitting).""" return list(text) diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index 127480b90cad0f..388c455a43807a 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -312,16 +312,6 @@ def __init__( bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - - super().__init__( - errors=errors, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - **kwargs, - ) - try: import ftfy @@ -348,6 +338,15 @@ def __init__( re.IGNORECASE, ) + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 53a2d3577a1740..da1012095cfb23 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -151,6 +151,17 @@ def __init__( for token in [prefix_token, middle_token, suffix_token, eot_token]: additional_special_tokens += [token] if token is not None else [] + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self._prefix_token = prefix_token + self._middle_token = middle_token + self._suffix_token = suffix_token + self._eot_token = eot_token + self.fill_token = fill_token + self.suffix_first = suffix_first + self.sp_model = self.get_spm_processor() + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -169,16 +180,6 @@ def __init__( use_default_system_prompt=use_default_system_prompt, **kwargs, ) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - self._prefix_token = prefix_token - self._middle_token = middle_token - self._suffix_token = suffix_token - self._eot_token = eot_token - self.fill_token = fill_token - self.suffix_first = suffix_first - self.sp_model = self.get_spm_processor() @property def unk_token_length(self): diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index 14d79bb1cebec4..e5f0332a92da79 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py 
@@ -167,16 +167,6 @@ def __init__( eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - super().__init__( - errors=errors, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - add_prefix_space=add_prefix_space, - add_bos_token=add_bos_token, - **kwargs, - ) self.add_bos_token = add_bos_token with open(vocab_file, encoding="utf-8") as vocab_handle: @@ -194,6 +184,16 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + **kwargs, + ) @property def vocab_size(self): diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index 800848caaf1cc7..439beb7abb4d03 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -135,20 +135,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -164,7 +150,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py index f509519271d4e8..67281b3cf185f8 100644 --- a/src/transformers/models/cpm/tokenization_cpm.py +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -38,6 +38,9 @@ class CpmTokenizer(PreTrainedTokenizer): """Runs pre-tokenization with Jieba segmentation tool. 
It is used in CPM models.""" + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + def __init__( self, vocab_file, @@ -121,24 +124,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - - self._pad_token_type_id = 3 - self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents @@ -157,6 +142,24 @@ def __init__( self.jieba = jieba self.translator = str.maketrans(" \n", "\u2582\u2583") + super().__init__( + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self._pad_token_type_id = 3 + @property # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size def vocab_size(self): diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py index 346f28fde66756..c10f48e2de282e 100644 --- a/src/transformers/models/cpmant/tokenization_cpmant.py +++ b/src/transformers/models/cpmant/tokenization_cpmant.py @@ -131,18 +131,6 @@ def __init__( **kwargs, ): requires_backends(self, ["jieba"]) - super().__init__( - bod_token=bod_token, - eod_token=eod_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - unk_token=unk_token, - line_token=line_token, - space_token=space_token, - padding_side=padding_side, - **kwargs, - ) self.bod_token = bod_token self.eod_token = eod_token self.encoder = load_vocab(vocab_file) @@ -155,7 +143,20 @@ def __init__( self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1])) self.decoder = {v: k for k, v in self.encoder.items()} - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token) + + super().__init__( + bod_token=bod_token, + eod_token=eod_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + unk_token=unk_token, + line_token=line_token, + space_token=space_token, + padding_side=padding_side, + **kwargs, + ) @property def bod_token_id(self): diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py index 7a81bf8572f0c0..f00b50348048d6 100644 --- a/src/transformers/models/ctrl/tokenization_ctrl.py +++ b/src/transformers/models/ctrl/tokenization_ctrl.py @@ -139,8 +139,6 @@ class CTRLTokenizer(PreTrainedTokenizer): control_codes = CONTROL_CODES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): - super().__init__(unk_token=unk_token, **kwargs) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -149,6 +147,7 @@ def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): merges = 
[tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__(unk_token=unk_token, **kwargs) @property def vocab_size(self): diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index 880ed17d95ef28..55fe35a427eb1f 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -201,20 +201,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - add_bos_token=add_bos_token, - **kwargs, - ) self.add_bos_token = add_bos_token with open(vocab_file, encoding="utf-8") as vocab_handle: @@ -233,6 +219,20 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + **kwargs, + ) + @property # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size def vocab_size(self): diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index b2a0d844f1625d..4d408252a2bd90 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -20,9 +20,12 @@ import sentencepiece as sp -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +logger = logging.get_logger(__name__) + PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model", @@ -124,6 +127,18 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.vocab_file = vocab_file + self._tokenizer = SPMTokenizer( + vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs + ) + unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False) super().__init__( do_lower_case=do_lower_case, bos_token=bos_token, @@ -137,18 +152,7 @@ def __init__( sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - - if not os.path.isfile(vocab_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" - " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - ) - self.do_lower_case = do_lower_case - self.split_by_punct = split_by_punct - self.vocab_file = vocab_file - self._tokenizer = SPMTokenizer( - vocab_file, self.all_special_tokens, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs - ) + self._tokenizer.special_tokens = self.all_special_tokens @property def vocab_size(self): @@ -374,6 +378,7 @@ def decode(self, tokens, start=-1, end=-1, raw_text=None): text = "".join(words[word_start:word_end]) return text + # TODO add a deprecation cycle as this can have different behaviour from our API def add_special_token(self, token): if token not in self.special_tokens: self.special_tokens.append(token) @@ -383,6 +388,9 @@ def add_special_token(self, token): return self.id(token) def part_of_whole_word(self, token, is_bos=False): + logger.warning_once( + "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`" + ) if is_bos: return True if ( @@ -413,6 +421,9 @@ def sym(self, id): return self.ids_to_tokens[id] def id(self, sym): + logger.warning_once( + "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`" + ) return self.vocab[sym] if sym in self.vocab else 1 def _encode_as_pieces(self, text): @@ -460,17 +471,6 @@ def split_to_words(self, text): return words - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - def _run_split_on_punc(self, text): """Splits punctuation on a piece of text.""" chars = list(text) diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py index de50c74b70bd02..d0904e3c931e40 100644 --- a/src/transformers/models/deprecated/retribert/tokenization_retribert.py +++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py @@ -132,20 +132,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -161,7 +147,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case diff --git a/src/transformers/models/deprecated/tapex/tokenization_tapex.py b/src/transformers/models/deprecated/tapex/tokenization_tapex.py index d0cd49212c6dc0..a5ee093c56bd26 100644 --- a/src/transformers/models/deprecated/tapex/tokenization_tapex.py +++ b/src/transformers/models/deprecated/tapex/tokenization_tapex.py @@ -296,23 +296,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - vocab_file=vocab_file, - merges_file=merges_file, - do_lower_case=do_lower_case, - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - max_cell_length=max_cell_length, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -331,6 +314,24 @@ def __init__( self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") # additional properties + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + do_lower_case=do_lower_case, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + max_cell_length=max_cell_length, + **kwargs, + ) + self.max_cell_length = max_cell_length self.table_linearize = IndexedRowTableLinearize() diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 5e96e4972d3fac..014c41d1243b6f 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -149,20 +149,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -178,7 +164,21 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index aabeccba7d630e..fb9bf9dfa13cba 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -152,20 +152,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -181,7 +167,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/ernie_m/tokenization_ernie_m.py b/src/transformers/models/ernie_m/tokenization_ernie_m.py index 1acc113dca5fb7..b1b8cc845024c8 100644 --- a/src/transformers/models/ernie_m/tokenization_ernie_m.py +++ b/src/transformers/models/ernie_m/tokenization_ernie_m.py @@ -112,6 +112,19 @@ def __init__( # is included in the raw text, there should be a match in a non-normalized sentence. 
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.sentencepiece_model_ckpt = sentencepiece_model_ckpt + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_ckpt) + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + if vocab_file is not None: + self.vocab = self.load_vocab(filepath=vocab_file) + else: + self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())} + self.reverse_vocab = {v: k for k, v in self.vocab.items()} + super().__init__( do_lower_case=do_lower_case, unk_token=unk_token, @@ -124,17 +137,6 @@ def __init__( sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.do_lower_case = do_lower_case - self.sentencepiece_model_ckpt = sentencepiece_model_ckpt - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(sentencepiece_model_ckpt) - - # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning - if vocab_file is not None: - self.vocab = self.load_vocab(filepath=vocab_file) - else: - self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())} - self.reverse_vocab = {v: k for k, v in self.vocab.items()} def get_offset_mapping(self, text): if text is None: diff --git a/src/transformers/models/esm/tokenization_esm.py b/src/transformers/models/esm/tokenization_esm.py index f19d0de58a9471..065eaae1d50520 100644 --- a/src/transformers/models/esm/tokenization_esm.py +++ b/src/transformers/models/esm/tokenization_esm.py @@ -64,17 +64,23 @@ def __init__( eos_token="", **kwargs, ): - super().__init__(**kwargs) self.all_tokens = load_vocab_file(vocab_file) self._id_to_token = dict(enumerate(self.all_tokens)) self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)} - self.unk_token = unk_token - self.cls_token = cls_token - self.pad_token = pad_token - self.mask_token = mask_token - self.eos_token = eos_token + super().__init__( + unk_token=unk_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + eos_token=eos_token, + **kwargs, + ) + + # TODO, all the tokens are added? But they are also part of the vocab... bit strange. + # none of them are special, but they all need special splitting. 
+ self.unique_no_split_tokens = self.all_tokens - self._create_trie(self.unique_no_split_tokens) + self._update_trie(self.unique_no_split_tokens) def _convert_id_to_token(self, index: int) -> str: return self._id_to_token.get(index, self.unk_token) diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index ea3f1c8bfd58b2..010515e9d02e46 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -258,19 +258,6 @@ def __init__( self.do_lowercase = do_lowercase - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - lang2id=lang2id, - id2lang=id2lang, - **kwargs, - ) - try: import sacremoses except ImportError: @@ -303,6 +290,19 @@ def __init__( self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + **kwargs, + ) + @property # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case def do_lower_case(self): diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py index 7324f509a8d3df..cfa54fcecfb517 100644 --- a/src/transformers/models/fnet/tokenization_fnet.py +++ b/src/transformers/models/fnet/tokenization_fnet.py @@ -15,7 +15,6 @@ """ Tokenization classes for FNet model.""" import os -import re import unicodedata from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple @@ -117,14 +116,19 @@ def __init__( ) -> None: # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. 
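The esm hunk above swaps `_create_trie` for `_update_trie` when registering the vocabulary entries as no-split tokens; as the surrounding code suggests, the point is to extend the existing trie instead of rebuilding it and dropping tokens registered earlier. A self-contained sketch of that difference (toy `Trie` and `Tok` classes, not the transformers implementation):

    class Trie:
        def __init__(self):
            self.data = {}

        def add(self, word):
            node = self.data
            for ch in word:
                node = node.setdefault(ch, {})
            node[""] = 1  # end-of-token marker


    class Tok:
        def __init__(self):
            self.tokens_trie = Trie()

        def _create_trie(self, tokens):
            # old behaviour: start from an empty trie, previously registered tokens are lost
            self.tokens_trie = Trie()
            for t in tokens:
                self.tokens_trie.add(t)

        def _update_trie(self, tokens):
            # new behaviour: extend the existing trie, keeping what is already there
            for t in tokens:
                self.tokens_trie.add(t)


    t = Tok()
    t._update_trie(["<cls>"])        # e.g. registered earlier by the base class
    t._update_trie(["A", "C", "G"])  # vocab tokens that still need special splitting
    print("<" in t.tokens_trie.data)  # True: the earlier entry survives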
- mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) - if isinstance(mask_token, str) - else mask_token - ) - + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -138,14 +142,6 @@ def __init__( **kwargs, ) - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return len(self.sp_model) @@ -237,48 +233,21 @@ def _decode( token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = None, - spaces_between_special_tokens: bool = True, + spaces_between_special_tokens: bool = False, **kwargs, ) -> str: - self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) - - filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) - - # To avoid mixing byte-level and unicode for byte-level BPT - # we need to build string separately for added tokens and byte-level tokens - # cf. 
https://github.com/huggingface/transformers/issues/1133 - sub_texts = [] - current_sub_text = [] - for token in filtered_tokens: - if skip_special_tokens and token in self.all_special_ids: - continue - if token in self.added_tokens_encoder: - if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - current_sub_text = [] - sub_texts.append(token) - else: - current_sub_text.append(token) - if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - + text = super()._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) # Mimic the behavior of the Rust tokenizer: # No space after - if spaces_between_special_tokens: - text = re.sub(r"() ", r"\1", " ".join(sub_texts)) - else: - text = "".join(sub_texts) - - clean_up_tokenization_spaces = ( - clean_up_tokenization_spaces - if clean_up_tokenization_spaces is not None - else self.clean_up_tokenization_spaces - ) - if clean_up_tokenization_spaces: - clean_text = self.clean_up_tokenization(text) - return clean_text - else: - return text + if not spaces_between_special_tokens: + text = text.replace(" ", "") + return text def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py index f18665d87d5c19..2179751e558e60 100644 --- a/src/transformers/models/fnet/tokenization_fnet_fast.py +++ b/src/transformers/models/fnet/tokenization_fnet_fast.py @@ -108,11 +108,9 @@ def __init__( ): # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. 
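The fnet `_decode` override above no longer re-implements the added-token splitting loop; it delegates to the shared base implementation and only flips the `spaces_between_special_tokens` default so the output matches the fast (Rust) tokenizer. A small sketch of the delegation pattern (hypothetical classes, the base `_decode` body is a stand-in):

    class BaseTok:
        def _decode(self, token_ids, spaces_between_special_tokens=True, **kwargs):
            # stand-in for the shared implementation: id -> token, then join
            tokens = [f"tok{i}" for i in token_ids]
            return (" " if spaces_between_special_tokens else "").join(tokens)


    class FNetLikeTok(BaseTok):
        def _decode(self, token_ids, spaces_between_special_tokens=False, **kwargs):
            # default flipped to False to mimic the Rust tokenizer's spacing;
            # everything else is handled by the base class
            return super()._decode(
                token_ids,
                spaces_between_special_tokens=spaces_between_special_tokens,
                **kwargs,
            )


    print(BaseTok()._decode([1, 2, 3]))      # "tok1 tok2 tok3"
    print(FNetLikeTok()._decode([1, 2, 3]))  # "tok1tok2tok3"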
- mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) - if isinstance(mask_token, str) - else mask_token - ) + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token super().__init__( vocab_file, diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 523f2ed5885070..168aa14ead7817 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -197,19 +197,6 @@ def __init__( pad_token="", **kwargs, ): - super().__init__( - langs=langs, - src_vocab_file=src_vocab_file, - tgt_vocab_file=tgt_vocab_file, - merges_file=merges_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - bos_token=bos_token, - sep_token=sep_token, - pad_token=pad_token, - **kwargs, - ) - try: import sacremoses except ImportError: @@ -250,6 +237,18 @@ def __init__( merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + langs=langs, + src_vocab_file=src_vocab_file, + tgt_vocab_file=tgt_vocab_file, + merges_file=merges_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + **kwargs, + ) # hack override def get_vocab(self) -> Dict[str, int]: diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 37a913d0a01bae..a0f9ced1b7406b 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -157,22 +157,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - bos_token=bos_token, - eos_token=eos_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -188,7 +172,23 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + bos_token=bos_token, + eos_token=eos_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 278ff69032585c..21c2cdf382e41d 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -170,16 +170,7 @@ def __init__( eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - super().__init__( - errors=errors, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - add_prefix_space=add_prefix_space, - add_bos_token=add_bos_token, - **kwargs, - ) + self.add_bos_token = add_bos_token with open(vocab_file, encoding="utf-8") as vocab_handle: @@ -198,6 +189,17 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index 6ac2f214a16568..7fca57d4c14c4e 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -127,14 +127,6 @@ def __init__( do_clean_text=False, **kwargs, ): - super().__init__( - unk_token=unk_token, - pad_token=pad_token, - bos_token=bos_token, - eos_token=eos_token, - do_clean_text=do_clean_text, - **kwargs, - ) if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -150,6 +142,14 @@ def __init__( self.subword_tokenizer = SubWordJapaneseTokenizer( vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji ) + super().__init__( + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + do_clean_text=do_clean_text, + **kwargs, + ) @property def vocab_size(self): diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index 4874ba732245f0..a1a5c71e96640a 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -103,7 +103,7 @@ class GPTSw3Tokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] + model_input_names = ["input_ids", "token_type_ids", "attention_mask"] def __init__( self, @@ -138,18 +138,6 @@ def __init__( pad_token = "" if pad_token is None else pad_token bos_token = "" if bos_token is None else bos_token - super().__init__( - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents @@ -168,6 +156,18 @@ def __init__( f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]" ) + super().__init__( + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__ def __getstate__(self): state = self.__dict__.copy() diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py index c567b6b6003fff..cd05ccde9ff248 100644 --- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py @@ -166,15 +166,6 @@ def __init__( do_clean_text=False, **kwargs, ): - super().__init__( - unk_token=unk_token, - pad_token=pad_token, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - do_clean_text=do_clean_text, - **kwargs, - ) if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -191,6 +182,16 @@ def __init__( vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji ) + super().__init__( + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + do_clean_text=do_clean_text, + **kwargs, + ) + @property # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size def vocab_size(self): diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 91ce0dcca58463..1747a59c6fc2fa 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -334,21 +334,6 @@ def __init__( id2lang=None, **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - lang2id=lang2id, - id2lang=id2lang, - do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, - tokenizer_file=None, - **kwargs, - ) - try: import sacremoses except ImportError: @@ -383,6 +368,21 @@ def __init__( self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, + tokenizer_file=None, + **kwargs, + ) + self.bert_pre_tokenizer = BasicTokenizer( do_lower_case=False, never_split=self.all_special_tokens, diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py index 9a4a37b871e485..dcf47f46f7de56 100644 --- a/src/transformers/models/jukebox/tokenization_jukebox.py +++ b/src/transformers/models/jukebox/tokenization_jukebox.py @@ -128,16 +128,10 @@ def __init__( **kwargs, ): unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - super().__init__( - unk_token=unk_token, - n_genres=n_genres, - version=version, - max_n_lyric_tokens=max_n_lyric_tokens, - **kwargs, - ) self.version = version self.max_n_lyric_tokens = max_n_lyric_tokens self.n_genres = n_genres + self._added_tokens_decoder = {0: unk_token} with open(artists_file, encoding="utf-8") as vocab_handle: self.artists_encoder = json.load(vocab_handle) @@ -157,13 +151,24 @@ def __init__( self.artists_decoder = {v: k for k, v in self.artists_encoder.items()} self.genres_decoder = {v: k for k, v in self.genres_encoder.items()} self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()} + super().__init__( + unk_token=unk_token, + n_genres=n_genres, + version=version, + max_n_lyric_tokens=max_n_lyric_tokens, + **kwargs, + ) @property def vocab_size(self): return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder) def get_vocab(self): - return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder) + return { + "artists_encoder": self.artists_encoder, + "genres_encoder": self.genres_encoder, + "lyrics_encoder": self.lyrics_encoder, + } def _convert_token_to_id(self, list_artists, list_genres, list_lyrics): """Converts the artist, genre and lyrics tokens to their index using the vocabulary. 
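The jukebox hunk fixes an outright bug in `get_vocab`: `dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)` passes three positional mappings to `dict()`, which raises `TypeError`; the new body keeps the three sub-vocabularies under explicit keys. The same hunk also pre-populates `_added_tokens_decoder` so that index 0 resolves to the unk token before the base constructor runs. A quick check of the `dict()` behaviour, plus the copy-then-update merge used by the single-vocabulary `get_vocab` bodies elsewhere in this patch:

    artists = {"artist_a": 0}
    genres = {"genre_a": 0}
    lyrics = {"a": 0, "b": 1}

    try:
        dict(artists, genres, lyrics)  # the old jukebox get_vocab(); raises TypeError
    except TypeError as exc:
        print(exc)  # dict expected at most 1 argument, got 3

    # new jukebox behaviour: keep the three sub-vocabularies separate
    vocab = {"artists_encoder": artists, "genres_encoder": genres, "lyrics_encoder": lyrics}

    # pattern used for single-vocabulary tokenizers (base encoder + added tokens)
    encoder = {"hello": 0}
    added_tokens_encoder = {"<new_tok>": 1}
    merged = dict(encoder).copy()
    merged.update(added_tokens_encoder)
    print(merged)  # {'hello': 0, '<new_tok>': 1}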
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index b518874224a42c..de6bc4de953d9e 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -134,20 +134,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -163,7 +149,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index 1799cc29211419..6c0b2db4a9ef6d 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -244,27 +244,6 @@ def __init__( additional_special_tokens: Optional[List[str]] = None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - cls_token_box=cls_token_box, - sep_token_box=sep_token_box, - pad_token_box=pad_token_box, - pad_token_label=pad_token_label, - only_label_first_subword=only_label_first_subword, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - model_max_length=model_max_length, - additional_special_tokens=additional_special_tokens, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -280,7 +259,7 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) # additional properties self.cls_token_box = cls_token_box @@ -288,6 +267,26 @@ def __init__( self.pad_token_box = pad_token_box self.pad_token_label = pad_token_label self.only_label_first_subword = only_label_first_subword + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + model_max_length=model_max_length, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index b9c0ab127d42ca..199b906eedcc58 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -303,24 +303,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - cls_token_box=cls_token_box, - sep_token_box=sep_token_box, - pad_token_box=pad_token_box, - pad_token_label=pad_token_label, - only_label_first_subword=only_label_first_subword, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -344,6 +326,24 @@ def __init__( self.pad_token_label = pad_token_label self.only_label_first_subword = only_label_first_subword + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + **kwargs, + ) + @property # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size def vocab_size(self): @@ -351,7 +351,9 @@ def vocab_size(self): # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe def bpe(self, token): @@ -539,7 +541,7 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if ( 
(is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()) - and sum([text.startswith(no_split_token) for no_split_token in self.unique_no_split_tokens]) == 0 + and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0 ): text = " " + text return (text, kwargs) diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index 47c5315457b4fa..230be65ee62e47 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -254,23 +254,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - cls_token_box=cls_token_box, - sep_token_box=sep_token_box, - pad_token_box=pad_token_box, - pad_token_label=pad_token_label, - only_label_first_subword=only_label_first_subword, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -297,6 +280,23 @@ def __init__( self.pad_token_label = pad_token_label self.only_label_first_subword = only_label_first_subword + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index 1cdb52430117c6..bc83680b219f72 100644 --- a/src/transformers/models/led/tokenization_led.py +++ b/src/transformers/models/led/tokenization_led.py @@ -197,21 +197,10 @@ def __init__( pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token # Mask token behave like a normal word, i.e. include the space before it + # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. 
See `test_embeded_special_tokens` + # Also this not only will strip the spaces but any punctuation mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -228,6 +217,19 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.vocab_size def vocab_size(self): diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index 51b8ab4aaaf03a..e7ef2fff737c1f 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -152,6 +152,7 @@ def __init__( trim_offsets=True, **kwargs, ): + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 8db2f9970e199a..4e2e0e41db1a04 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -122,20 +122,7 @@ def __init__( eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - use_default_system_prompt=use_default_system_prompt, - spaces_between_special_tokens=spaces_between_special_tokens, - legacy=legacy, - **kwargs, - ) + if legacy is None: logger.warning_once( f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! 
This is" @@ -151,9 +138,23 @@ def __init__( self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_default_system_prompt = use_default_system_prompt - self.sp_model = self.get_spm_processor() + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + use_default_system_prompt=use_default_system_prompt, + spaces_between_special_tokens=spaces_between_special_tokens, + legacy=legacy, + **kwargs, + ) + @property def unk_token_length(self): return len(self.sp_model.encode(str(self.unk_token))) diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 282a0f06740eaa..157bd4cdb852a3 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -33,6 +33,14 @@ logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", + }, + "tokenizer_file": { + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", + }, +} B_INST, E_INST = "[INST]", "[/INST]" B_SYS, E_SYS = "<>\n", "\n<>\n\n" @@ -93,6 +101,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): """ vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP slow_tokenizer_class = LlamaTokenizer padding_side = "left" model_input_names = ["input_ids", "attention_mask"] diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py index fea949658abcd1..7661634a000998 100644 --- a/src/transformers/models/longformer/tokenization_longformer.py +++ b/src/transformers/models/longformer/tokenization_longformer.py @@ -212,28 +212,21 @@ def __init__( **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token # Mask token behave like a normal word, i.e. 
include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token ) + # these special tokens are not part of the vocab.json, let's add them in the correct order + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -250,12 +243,27 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab def bpe(self, token): if token in self.cache: diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index 1460f2f2cc2f10..32c6f6c2deef36 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -192,6 +192,11 @@ def __init__( trim_offsets=True, **kwargs, ): + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 8b47ced1d3175f..e8ad725d050b1c 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -326,28 +326,6 @@ def __init__( # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - task=task, - max_entity_length=32, - max_mention_length=30, - entity_token_1="", - entity_token_2="", - entity_unk_token=entity_unk_token, - entity_pad_token=entity_pad_token, - entity_mask_token=entity_mask_token, - entity_mask2_token=entity_mask2_token, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -407,6 +385,28 @@ def __init__( self.max_mention_length = max_mention_length + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + task=task, + max_entity_length=32, + max_mention_length=30, + entity_token_1="", + entity_token_2="", + entity_unk_token=entity_unk_token, + entity_pad_token=entity_pad_token, + entity_mask_token=entity_mask_token, + entity_mask2_token=entity_mask2_token, + **kwargs, + ) + @property # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Luke, RoBERTa->LUKE def vocab_size(self): @@ -414,7 +414,9 @@ def vocab_size(self): # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Luke, RoBERTa->LUKE def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE def bpe(self, token): diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index e651b8f4454a11..17ff0ff8e7f82d 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -126,20 +126,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -155,7 +141,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index 82f5e3a47b36ee..1346af81412add 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -150,26 +150,11 @@ def __init__( fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes] self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code} - kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) - kwargs["additional_special_tokens"] += [ - self.get_lang_token(lang_code) - for lang_code in fairseq_language_code - if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"] - ] - - super().__init__( - src_lang=src_lang, - tgt_lang=tgt_lang, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - unk_token=unk_token, - pad_token=pad_token, - language_codes=language_codes, - sp_model_kwargs=self.sp_model_kwargs, - num_madeup_words=num_madeup_words, - **kwargs, - ) + additional_special_tokens = kwargs.pop("additional_special_tokens", []) + for lang_code in fairseq_language_code: + token = self.get_lang_token(lang_code) + if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder: + additional_special_tokens.append(token) self.vocab_file = vocab_file self.encoder = load_json(vocab_file) @@ -188,13 +173,33 @@ def __init__( self._src_lang = src_lang if src_lang is not None else "en" self.tgt_lang = tgt_lang self.cur_lang_id = self.get_lang_id(self._src_lang) - self.set_src_lang_special_tokens(self._src_lang) self.num_madeup_words = num_madeup_words + super().__init__( + src_lang=src_lang, + tgt_lang=tgt_lang, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + language_codes=language_codes, + sp_model_kwargs=self.sp_model_kwargs, + additional_special_tokens=additional_special_tokens, + num_madeup_words=num_madeup_words, + **kwargs, + ) + self.set_src_lang_special_tokens(self._src_lang) + @property def vocab_size(self) -> int: - return len(self.encoder) + len(self.lang_token_to_id) + return len(self.encoder) + + def get_vocab(self) -> Dict: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab @property def src_lang(self) -> str: @@ -290,11 +295,6 @@ def build_inputs_with_special_tokens( # We don't expect to process pairs, but leave the pair logic for API consistency return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - def get_vocab(self) -> Dict: - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - 
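In the m2m_100 hunk above, the `__xx__` language-code tokens are collected into `additional_special_tokens` before the base constructor is called, and `vocab_size` now appears to count only the base vocabulary, the language codes living among the added tokens instead. A sketch of the intended collection step (hypothetical helper, simplified from the hunk):

    # The intent is to append one "__xx__" token per language code unless the caller
    # already supplied it, and to pass the final list to the base __init__ once.
    def build_additional_special_tokens(language_codes, user_tokens=None):
        tokens = list(user_tokens or [])
        for lang_code in language_codes:
            token = f"__{lang_code}__"
            if token not in tokens:
                tokens.append(token)
        return tokens


    print(build_additional_special_tokens(["en", "fr"], user_tokens=["__en__"]))
    # ['__en__', '__fr__']

    # With this split, the full vocabulary size is vocab_size + len(added_tokens_encoder),
    # which is what the new get_vocab() exposes.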
def __getstate__(self) -> Dict: state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 2736b03a012f86..f064b49a8397b9 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -144,26 +144,13 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id - source_lang=source_lang, - target_lang=target_lang, - unk_token=unk_token, - eos_token=eos_token, - pad_token=pad_token, - model_max_length=model_max_length, - sp_model_kwargs=self.sp_model_kwargs, - target_vocab_file=target_vocab_file, - separate_vocabs=separate_vocabs, - **kwargs, - ) assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" self.separate_vocabs = separate_vocabs self.encoder = load_json(vocab) - if self.unk_token not in self.encoder: - raise KeyError(" token must be in vocab") - assert self.pad_token in self.encoder + if unk_token not in self.encoder: + raise KeyError(" token must be in the vocab") + assert pad_token in self.encoder if separate_vocabs: self.target_encoder = load_json(target_vocab_file) @@ -187,6 +174,20 @@ def __init__( self._setup_normalizer() + super().__init__( + # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id + source_lang=source_lang, + target_lang=target_lang, + unk_token=unk_token, + eos_token=eos_token, + pad_token=pad_token, + model_max_length=model_max_length, + sp_model_kwargs=self.sp_model_kwargs, + target_vocab_file=target_vocab_file, + separate_vocabs=separate_vocabs, + **kwargs, + ) + def _setup_normalizer(self): try: from sacremoses import MosesPunctNormalizer diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index 9d438602864645..24fa4b7763a9e1 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -232,27 +232,6 @@ def __init__( # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - vocab_file=vocab_file, - merges_file=merges_file, - tags_dict=tags_dict, - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - max_depth=max_depth, - max_width=max_width, - pad_width=pad_width, - pad_token_label=pad_token_label, - only_label_first_subword=only_label_first_subword, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) @@ -279,6 +258,28 @@ def __init__( self.pad_tag_id = self.unk_tag_id + 1 self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tags_dict=tags_dict, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + max_depth=max_depth, + max_width=max_width, + pad_width=pad_width, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + **kwargs, + ) + self.pad_token_label = pad_token_label self.only_label_first_subword = only_label_first_subword @@ -312,7 +313,9 @@ def vocab_size(self): return len(self.encoder) def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = self.encoder.copy() + vocab.update(self.added_tokens_encoder) + return vocab def bpe(self, token): if token in self.cache: diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index 0010c21cdce58b..a0933631b65b7a 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -26,6 +26,7 @@ from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings from ...tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, + AddedToken, BatchEncoding, EncodedInput, PreTokenizedInput, @@ -182,6 +183,16 @@ def __init__( trim_offsets=False, **kwargs, ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + super().__init__( vocab_file=vocab_file, merges_file=merges_file, diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 0c74175e33220e..933074fd5d85bd 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -101,22 +101,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - tokenizer_file=None, - src_lang=src_lang, - tgt_lang=tgt_lang, - additional_special_tokens=additional_special_tokens, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -142,14 +126,30 @@ def __init__( self.fairseq_tokens_to_ids.update(self.lang_code_to_id) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) + _additional_special_tokens = list(self.lang_code_to_id.keys()) if additional_special_tokens is not None: # Only add those special tokens if they are not already there. - self._additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in self._additional_special_tokens] + _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] ) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + tokenizer_file=None, + src_lang=src_lang, + tgt_lang=tgt_lang, + additional_special_tokens=_additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + self._src_lang = src_lang if src_lang is not None else "en_XX" self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] self.tgt_lang = tgt_lang diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index ff74739afcdf6b..ed0d0de9c8642c 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -112,6 +112,14 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy() + + if additional_special_tokens is not None: + # Only add those special tokens if they are not already there. 
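The mbart tokenizers now build the merged `_additional_special_tokens` list up front and pass it through `additional_special_tokens=` in `super().__init__()`, rather than calling `add_special_tokens` on the instance after construction as the removed fast-tokenizer lines did, so the language codes are registered once, in a stable order. A sketch of that merge (assuming `FAIRSEQ_LANGUAGE_CODES` is a plain list of `xx_XX` strings, as in these files):

    FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "de_DE", "en_XX", "fr_XX"]

    def merge_special_tokens(additional_special_tokens=None):
        merged = FAIRSEQ_LANGUAGE_CODES.copy()
        if additional_special_tokens is not None:
            # only add those special tokens if they are not already there
            merged.extend(t for t in additional_special_tokens if t not in merged)
        return merged


    print(merge_special_tokens(["<custom>", "en_XX"]))
    # ['ar_AR', 'de_DE', 'en_XX', 'fr_XX', '<custom>']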
+ _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] + ) + super().__init__( vocab_file=vocab_file, tokenizer_file=tokenizer_file, @@ -124,21 +132,11 @@ def __init__( mask_token=mask_token, src_lang=src_lang, tgt_lang=tgt_lang, - additional_special_tokens=additional_special_tokens, + additional_special_tokens=_additional_special_tokens, **kwargs, ) self.vocab_file = vocab_file - - _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy() - - if additional_special_tokens is not None: - # Only add those special tokens if they are not already there. - _additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in _additional_special_tokens] - ) - - self.add_special_tokens({"additional_special_tokens": _additional_special_tokens}) self.lang_code_to_id = { lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES } diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 628be52479d0c3..e2cffc57ad3380 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -137,19 +137,6 @@ def __init__( code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"] ] - super().__init__( - src_lang=src_lang, - tgt_lang=tgt_lang, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -176,6 +163,19 @@ def __init__( self.fairseq_tokens_to_ids.update(self.lang_code_to_id) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + super().__init__( + src_lang=src_lang, + tgt_lang=tgt_lang, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + self._src_lang = src_lang if src_lang is not None else "en_XX" self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] self.tgt_lang = tgt_lang diff --git a/src/transformers/models/mgp_str/tokenization_mgp_str.py b/src/transformers/models/mgp_str/tokenization_mgp_str.py index 9d4fddcc7e838c..e267491c8613bb 100644 --- a/src/transformers/models/mgp_str/tokenization_mgp_str.py +++ b/src/transformers/models/mgp_str/tokenization_mgp_str.py @@ -62,6 +62,9 @@ class MgpstrTokenizer(PreTrainedTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs): + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.vocab = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.vocab.items()} super().__init__( unk_token=unk_token, bos_token=bos_token, @@ -70,16 +73,14 @@ def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s **kwargs, ) - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.vocab = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.vocab.items()} - @property def vocab_size(self): return len(self.vocab) def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) + vocab = dict(self.vocab).copy() + 
vocab.update(self.added_tokens_encoder) + return vocab def _tokenize(self, text): """Tokenize a string.""" diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index d1fdf798a9e35b..028de5d4f79c8c 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -272,32 +272,11 @@ def __init__( if isinstance(entity_token_2, str) else entity_token_2 ) - kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) - kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2] + additional_special_tokens = kwargs.pop("additional_special_tokens", []) + additional_special_tokens += [entity_token_1, entity_token_2] self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - task=task, - max_entity_length=max_entity_length, - max_mention_length=max_mention_length, - entity_token_1=entity_token_1, - entity_token_2=entity_token_2, - entity_unk_token=entity_unk_token, - entity_pad_token=entity_pad_token, - entity_mask_token=entity_mask_token, - entity_mask2_token=entity_mask2_token, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -345,6 +324,65 @@ def __init__( self.max_mention_length = max_mention_length + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + task=task, + max_entity_length=max_entity_length, + max_mention_length=max_mention_length, + entity_token_1=entity_token_1, + entity_token_2=entity_token_2, + entity_unk_token=entity_unk_token, + entity_pad_token=entity_pad_token, + entity_mask_token=entity_mask_token, + entity_mask2_token=entity_mask2_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + @property + # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size + def vocab_size(self): + return len(self.sp_model) + self.fairseq_offset + 1 # Add the token + + # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize + def _tokenize(self, text: str) -> List[str]: + # TODO check if the t5/llama PR also applies here + return self.sp_model.encode(text, out_type=str) + + # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in 
self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None @@ -1591,39 +1629,3 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size - def vocab_size(self): - return len(self.sp_model) + self.fairseq_offset + 1 # Add the token - - # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize - def _tokenize(self, text: str) -> List[str]: - return self.sp_model.encode(text, out_type=str) - - # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - # Need to return unknown token if the SP model returned 0 - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() - return out_string diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py index 389e38bce61933..398f054a992657 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -124,20 +124,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -153,7 +139,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index f1347da08a3f95..21c3555c057749 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -157,22 +157,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -188,7 +172,23 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): @@ -199,7 +199,9 @@ def vocab_size(self): return len(self.vocab) def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) + vocab = self.vocab.copy() + vocab.update(self.added_tokens_encoder) + return vocab def _tokenize(self, text): split_tokens = [] diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py index 82d8ffec08d910..1c9b1d5922278b 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py +++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py @@ -126,6 +126,16 @@ def __init__( strip_accents=None, **kwargs, ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = 
AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + super().__init__( vocab_file, tokenizer_file=tokenizer_file, diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py index 2d497c23d1300c..c897cbea30d928 100644 --- a/src/transformers/models/mvp/tokenization_mvp.py +++ b/src/transformers/models/mvp/tokenization_mvp.py @@ -193,19 +193,6 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -222,12 +209,27 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = self.encoder.copy() + vocab.update(self.added_tokens_encoder) + return vocab def bpe(self, token): if token in self.cache: diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py index fd6abd1700205b..afe2a0a89e2a03 100644 --- a/src/transformers/models/mvp/tokenization_mvp_fast.py +++ b/src/transformers/models/mvp/tokenization_mvp_fast.py @@ -153,6 +153,15 @@ def __init__( trim_offsets=True, **kwargs, ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 58a02a7af75f24..ea77f10ea578ae 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -149,23 +149,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.legacy_behaviour = legacy_behaviour - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - tokenizer_file=tokenizer_file, - src_lang=src_lang, - tgt_lang=tgt_lang, - additional_special_tokens=additional_special_tokens, - sp_model_kwargs=self.sp_model_kwargs, - legacy_behaviour=legacy_behaviour, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -190,16 +173,35 @@ def __init__( self.fairseq_tokens_to_ids.update(self.lang_code_to_id) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) + + self._src_lang = src_lang if src_lang is not None else "eng_Latn" + self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + + _additional_special_tokens = list(self.lang_code_to_id.keys()) if additional_special_tokens is not None: # Only add those special tokens if they are not already there. - self._additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in self._additional_special_tokens] + _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] ) - self._src_lang = src_lang if src_lang is not None else "eng_Latn" - self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + tokenizer_file=tokenizer_file, + src_lang=src_lang, + tgt_lang=tgt_lang, + additional_special_tokens=_additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + legacy_behaviour=legacy_behaviour, + **kwargs, + ) + self.tgt_lang = tgt_lang self.set_src_lang_special_tokens(self._src_lang) diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 59e67c4bff9acd..7ab11c8cc00a06 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -157,6 +157,15 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token self.legacy_behaviour = legacy_behaviour + + _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy() + + if additional_special_tokens is not None: + # Only add those special tokens if they are not already there. 
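# --- Illustrative aside (not part of the diff above) ---
# The mBART-50 / NLLB hunks all apply the same pattern: the fairseq language codes are merged
# with any user-supplied `additional_special_tokens` *before* `super().__init__()` is called,
# instead of registering them afterwards via `add_special_tokens`. A minimal sketch of that
# merge step; `FAIRSEQ_LANGUAGE_CODES` is stubbed with a tiny subset purely for illustration.

FAIRSEQ_LANGUAGE_CODES = ["eng_Latn", "fra_Latn", "deu_Latn"]  # illustrative subset only

def merge_additional_special_tokens(user_tokens=None):
    merged = FAIRSEQ_LANGUAGE_CODES.copy()
    if user_tokens is not None:
        # Only add those special tokens if they are not already there (same check as the diff).
        merged.extend([t for t in user_tokens if t not in merged])
    return merged

# merge_additional_special_tokens(["<my_token>", "eng_Latn"])
# -> ["eng_Latn", "fra_Latn", "deu_Latn", "<my_token>"]
# --- end of aside ---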
+ _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] + ) + super().__init__( vocab_file=vocab_file, tokenizer_file=tokenizer_file, @@ -169,22 +178,13 @@ def __init__( mask_token=mask_token, src_lang=src_lang, tgt_lang=tgt_lang, - additional_special_tokens=additional_special_tokens, + additional_special_tokens=_additional_special_tokens, legacy_behaviour=legacy_behaviour, **kwargs, ) self.vocab_file = vocab_file - _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy() - - if additional_special_tokens is not None: - # Only add those special tokens if they are not already there. - _additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in _additional_special_tokens] - ) - - self.add_special_tokens({"additional_special_tokens": _additional_special_tokens}) self.lang_code_to_id = { lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES } diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py index 0a7f93a7b2de1c..cfdeb3207a6d96 100644 --- a/src/transformers/models/openai/tokenization_openai.py +++ b/src/transformers/models/openai/tokenization_openai.py @@ -269,8 +269,6 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): model_input_names = ["input_ids", "attention_mask"] def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): - super().__init__(unk_token=unk_token, **kwargs) - try: import ftfy from spacy.lang.en import English @@ -292,6 +290,8 @@ def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__(unk_token=unk_token, **kwargs) + @property def do_lower_case(self): return True diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 814602fac88d0d..3b6a461d81d0cd 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -18,7 +18,7 @@ import sentencepiece as spm -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import logging @@ -38,6 +38,7 @@ logger = logging.get_logger(__name__) +# TODO ArthurZ refactor this to only use the added_tokens_encoder class PegasusTokenizer(PreTrainedTokenizer): r""" Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). @@ -95,8 +96,6 @@ class PegasusTokenizer(PreTrainedTokenizer): - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. 
""" - vocab_files_names = VOCAB_FILES_NAMES - vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -122,7 +121,6 @@ def __init__( f"additional_special_tokens should be of type {type(list)}, but is" f" {type(additional_special_tokens)}" ) - additional_special_tokens_extended = ( ([mask_token_sent] + additional_special_tokens) if mask_token_sent not in additional_special_tokens and mask_token_sent is not None @@ -140,10 +138,27 @@ def __init__( ) additional_special_tokens = additional_special_tokens_extended else: + additional_special_tokens_extended = [] additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] additional_special_tokens += [f"" for i in range(2, self.offset)] self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.mask_token_sent = mask_token_sent + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + self._added_tokens_decoder = { + 0: AddedToken(str(pad_token), lstrip=True, rstrip=True), + 1: AddedToken(str(eos_token), lstrip=True, rstrip=True), + } + + if self.mask_token_sent is not None: + self._added_tokens_decoder[2] = AddedToken(mask_token_sent) + self._added_tokens_decoder[3] = AddedToken(str(mask_token)) + + for i in range(1, self.offset - 1): + self._added_tokens_decoder[len(self._added_tokens_decoder)] = AddedToken(f"") super().__init__( eos_token=eos_token, @@ -156,31 +171,6 @@ def __init__( sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.mask_token_sent = mask_token_sent - self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - # add special tokens to encoder dict - self.encoder: Dict[int, str] = { - 0: self.pad_token, - 1: self.eos_token, - } - - if self.mask_token_sent is not None: - self.encoder.update( - { - 2: self.mask_token_sent, - 3: self.mask_token, - } - ) - - if self.offset > 0: - # entries 2-104 are only used for pretraining and called , , unk_2, ...unk_102 - # mask_token_sent is already added to list -> so start at 1 - self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)}) - - self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()} @property def vocab_size(self) -> int: @@ -212,21 +202,14 @@ def _tokenize(self, text: str) -> List[str]: def _convert_token_to_id(self, token: str) -> int: """Converts a token (str) to an id using the vocab.""" - if token in self.decoder: - return self.decoder[token] - elif token in self.added_tokens_decoder: - return self.added_tokens_decoder[token] sp_id = self.sp_model.piece_to_id(token) return sp_id + self.offset def _convert_id_to_token(self, index: int) -> str: """Converts an index (integer) to a token (str) using the vocab.""" - if index in self.encoder: - return self.encoder[index] - elif index in self.added_tokens_encoder: - return self.added_tokens_encoder[index] - else: - token = self.sp_model.IdToPiece(index - self.offset) + if index < self.offset: + return self.sp_model.IdToPiece(index) + token = self.sp_model.IdToPiece(index - self.offset) return token def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index cbfd9e64150243..b4ec1e378e5671 100644 --- 
a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -75,6 +75,18 @@ def __init__( cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + self._utf_vocab_size = 2**8 # utf is 8 bits + + # Since these tokens are not part of the vocabulary, we manually add them + self._added_tokens_decoder: Dict[str, int] = { + 0: pad_token, + 1: bos_token, + 2: eos_token, + 3: mask_token, + 4: cls_token, + 5: sep_token, + } + self._num_special_tokens = len(self._added_tokens_decoder) super().__init__( pad_token=pad_token, bos_token=bos_token, @@ -86,31 +98,17 @@ def __init__( **kwargs, ) - self._utf_vocab_size = 2**8 # utf is 8 bits - - # define special tokens dict - self.special_tokens_encoder: Dict[str, int] = { - self.pad_token: 0, - self.bos_token: 1, - self.eos_token: 2, - self.mask_token: 3, - self.cls_token: 4, - self.sep_token: 5, - } - self._num_special_tokens = len(self.special_tokens_encoder) - self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()} - def get_vocab(self) -> Dict[str, int]: - vocab = self.special_tokens_encoder.copy() - vocab.update(self.added_tokens_encoder) + vocab = {} for i in range(self._utf_vocab_size): token = chr(i) - vocab[token] = i + len(self.special_tokens_encoder) + vocab[token] = i + self._num_special_tokens + vocab.update(self.added_tokens_encoder) return vocab @property def vocab_size(self): - return self._utf_vocab_size + self._num_special_tokens + return self._utf_vocab_size def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False @@ -171,11 +169,7 @@ def _tokenize(self, text: str) -> List[str]: def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - if token in self.special_tokens_encoder: - token_id = self.special_tokens_encoder[token] - elif token in self.added_tokens_encoder: - token_id = self.added_tokens_encoder[token] - elif len(token) != 1: + if len(token) != 1: token_id = self.unk_token_id else: token_id = ord(token) + self._num_special_tokens @@ -183,26 +177,16 @@ def _convert_token_to_id(self, token): def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.special_tokens_decoder: - token = self.special_tokens_decoder[index] - elif index in self.added_tokens_decoder: - token = self.added_tokens_decoder[index] - else: - token = chr(index - self._num_special_tokens) + token = chr(index - self._num_special_tokens) return token + # TODO @ArthurZ refactor this as well.... 
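# --- Illustrative aside (not part of the diff above) ---
# After the Perceiver change, the six special tokens live in `_added_tokens_decoder` (ids 0-5)
# and every single-character token is shifted by that offset, so the byte vocabulary itself no
# longer stores them. A toy round-trip of that id scheme, with the offset hard-coded here:

NUM_SPECIAL_TOKENS = 6  # [PAD]=0, [BOS]=1, [EOS]=2, [MASK]=3, [CLS]=4, [SEP]=5

def char_to_id(token: str) -> int:
    # Mirrors the simplified `_convert_token_to_id` above for single-character tokens.
    return ord(token) + NUM_SPECIAL_TOKENS

def id_to_char(index: int) -> str:
    # Mirrors the simplified `_convert_id_to_token` above.
    return chr(index - NUM_SPECIAL_TOKENS)

assert char_to_id("A") == 71          # ord("A") == 65, plus the 6 reserved special-token ids
assert id_to_char(char_to_id("A")) == "A"
# --- end of aside ---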
def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" bstring = b"" for token in tokens: - if token in self.special_tokens_decoder: - tok_string = self.special_tokens_decoder[token].encode("utf-8") - elif token in self.added_tokens_decoder: - tok_string = self.special_tokens_decoder[token].encode("utf-8") - elif token in self.special_tokens_encoder: - tok_string = token.encode("utf-8") - elif token in self.added_tokens_encoder: - tok_string = token.encode("utf-8") + if token in self.added_tokens_encoder: + tok_string = str(token).encode("utf-8") else: tok_string = bytes([ord(token)]) bstring += tok_string diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index 534a71d5038ed3..efa7e2469478fb 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -131,25 +131,14 @@ def __init__( mask_token="", **kwargs, ): - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, - ) - self.vocab_file = vocab_file self.merges_file = merges_file self.encoder = {} - self.encoder[self.bos_token] = 0 - self.encoder[self.pad_token] = 1 - self.encoder[self.eos_token] = 2 - self.encoder[self.unk_token] = 3 + self.encoder[bos_token] = 0 + self.encoder[pad_token] = 1 + self.encoder[eos_token] = 2 + self.encoder[unk_token] = 3 self.add_from_file(vocab_file) @@ -158,9 +147,21 @@ def __init__( with open(merges_file, encoding="utf-8") as merges_handle: merges = merges_handle.read().split("\n")[:-1] merges = [tuple(merge.split()[:-1]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index bf47538eaabdaf..e50849b51d2d59 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -195,23 +195,6 @@ def __init__( mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - language_codes=language_codes, - tokenizer_file=tokenizer_file, - src_lang=src_lang, - tgt_lang=tgt_lang, - additional_special_tokens=additional_special_tokens, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) src_lang = self._convert_lang_code_special_format(src_lang) tgt_lang = self._convert_lang_code_special_format(tgt_lang) @@ -245,12 +228,12 @@ def __init__( self.fairseq_tokens_to_ids.update(self.lang_code_to_id) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) + _additional_special_tokens = list(self.lang_code_to_id.keys()) if 
additional_special_tokens is not None: # Only add those special tokens if they are not already there. - self._additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in self._additional_special_tokens] + _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] ) if self.language_codes == "base": @@ -262,6 +245,23 @@ def __init__( self._src_lang = src_lang if src_lang is not None else "__en_XX__" self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + language_codes=language_codes, + tokenizer_file=tokenizer_file, + src_lang=src_lang, + tgt_lang=tgt_lang, + additional_special_tokens=_additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + self.tgt_lang = tgt_lang self.set_src_lang_special_tokens(self._src_lang) diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py index ea1c042195ea2d..0d25dcdfc7d57b 100644 --- a/src/transformers/models/pop2piano/tokenization_pop2piano.py +++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py @@ -101,14 +101,6 @@ def __init__( pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - super().__init__( - unk_token=unk_token, - eos_token=eos_token, - pad_token=pad_token, - bos_token=bos_token, - **kwargs, - ) - self.default_velocity = default_velocity self.num_bars = num_bars @@ -119,6 +111,14 @@ def __init__( # create mappings for encoder self.decoder = {v: k for k, v in self.encoder.items()} + super().__init__( + unk_token=unk_token, + eos_token=eos_token, + pad_token=pad_token, + bos_token=bos_token, + **kwargs, + ) + @property def vocab_size(self): """Returns the vocabulary size of the tokenizer.""" diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index 03e9083e749e2f..bb4fa5ff9ca49f 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -354,21 +354,6 @@ def __init__( strip_accents: Optional[bool] = None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - x_sep_token=x_sep_token, - pad_token=pad_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - self.unique_no_split_tokens.append(x_sep_token) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -384,7 +369,21 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + x_sep_token=x_sep_token, + pad_token=pad_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def vocab_size(self): diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index a6c09f1b97f5b8..bf6b63277488b9 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -157,20 +157,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -186,7 +172,20 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 8796c8149c8ae6..255e153c0d79e1 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -106,6 +106,10 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( eos_token=eos_token, unk_token=unk_token, @@ -114,10 +118,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return self.sp_model.get_piece_size() diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py index 65e6c1df728f7c..c1f12527ef5974 100644 --- a/src/transformers/models/rembert/tokenization_rembert.py +++ b/src/transformers/models/rembert/tokenization_rembert.py @@ -111,6 +111,13 @@ def __init__( mask_token="[MASK]", **kwargs, ): + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + 
self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -125,14 +132,6 @@ def __init__( **kwargs, ) - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return len(self.sp_model) diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index 24b9748c3d37fe..b7b3c75be180cd 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -203,28 +203,21 @@ def __init__( **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token ) + # these special tokens are not part of the vocab.json, let's add them in the correct order + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -241,12 +234,27 @@ def __init__( # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab def bpe(self, token): if token in self.cache: diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index c2c479da0964b6..05f64ac2ab185a 100644 --- 
a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -177,6 +177,11 @@ def __init__( trim_offsets=True, **kwargs, ): + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index d665b91a0680df..0bbdc04e536ec4 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -156,20 +156,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]: if cur_file is None or not os.path.isfile(cur_file): raise ValueError( @@ -195,7 +181,20 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index dc406fa480eeaf..88c0f398b3006f 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -378,20 +378,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -407,7 +393,7 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) try: import rjieba except ImportError: @@ -417,6 +403,20 @@ def __init__( ) self.jieba = rjieba + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + @property def do_lower_case(self): return self.basic_tokenizer.do_lower_case diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index 843c79e397b8b7..b7104da7f1a873 100644 --- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -122,23 +122,12 @@ def __init__( do_lower_case=False, tgt_lang=None, lang_codes=None, + additional_special_tokens=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - do_upper_case=do_upper_case, - do_lower_case=do_lower_case, - tgt_lang=tgt_lang, - lang_codes=lang_codes, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) self.do_upper_case = do_upper_case self.do_lower_case = do_lower_case @@ -152,18 +141,39 @@ def __init__( self.langs = LANGUAGES[lang_codes] self.lang_tokens = [f"" for lang in self.langs] self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"") for lang in self.langs} - - self._additional_special_tokens = self.lang_tokens + if additional_special_tokens is not None: + additional_special_tokens = self.lang_tokens + additional_special_tokens + else: + additional_special_tokens = self.lang_tokens self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0] self.set_tgt_lang_special_tokens(self._tgt_lang) else: self.lang_code_to_id = {} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + do_upper_case=do_upper_case, + do_lower_case=do_lower_case, + tgt_lang=tgt_lang, + lang_codes=lang_codes, + sp_model_kwargs=self.sp_model_kwargs, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + @property def vocab_size(self) -> int: return len(self.encoder) + def get_vocab(self) -> Dict: + vocab = self.encoder.copy() + vocab.update(self.added_tokens_encoder) + return vocab + @property def tgt_lang(self) -> str: return self._tgt_lang @@ -241,11 +251,6 @@ def get_special_tokens_mask( return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - def get_vocab(self) -> Dict: - vocab = self.encoder.copy() - vocab.update(self.added_tokens_encoder) - return vocab - def __getstate__(self) -> Dict: state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py index 
c021619cd04e36..e28b8a62d015bd 100644 --- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py @@ -110,15 +110,6 @@ def __init__( merges_file=None, **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - do_lower_case=do_lower_case, - **kwargs, - ) - self.do_lower_case = do_lower_case with open(vocab_file, encoding="utf-8") as vocab_handle: @@ -137,6 +128,14 @@ def __init__( merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + do_lower_case=do_lower_case, + **kwargs, + ) @property def vocab_size(self) -> int: diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py index 9748424e41699c..a9a3e3ec54a342 100644 --- a/src/transformers/models/speecht5/tokenization_speecht5.py +++ b/src/transformers/models/speecht5/tokenization_speecht5.py @@ -105,6 +105,12 @@ def __init__( **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.normalize = normalize + self._normalizer = None + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) super().__init__( bos_token=bos_token, @@ -116,13 +122,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - self.normalize = normalize - self._normalizer = None - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): normalize = kwargs.pop("normalize", self.normalize) if is_split_into_words: diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py index 308680940db106..909905979be38c 100644 --- a/src/transformers/models/splinter/tokenization_splinter.py +++ b/src/transformers/models/splinter/tokenization_splinter.py @@ -137,20 +137,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained" @@ -166,8 +152,21 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) self.question_token = question_token + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def question_token_id(self): diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py index f061a1a53c2577..0cefa03edf3e06 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py @@ -138,20 +138,6 @@ def __init__( strip_accents=None, **kwargs, ): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" @@ -167,7 +153,22 @@ def __init__( tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index cbc305e1302e0a..8e6f9ee8d9e1c8 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -25,6 +25,7 @@ from ...convert_slow_tokenizer import import_protobuf from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_base import AddedToken if TYPE_CHECKING: @@ -152,18 +153,37 @@ def __init__( legacy=None, **kwargs, ) -> None: - # Add extra_ids to the special token list - if extra_ids > 0 and additional_special_tokens is None: - additional_special_tokens = [f"" for i in range(extra_ids)] - elif extra_ids > 0 and additional_special_tokens is not None: - # Check that we have the right number of extra_id special tokens - extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) - if extra_tokens != extra_ids: + pad_token = AddedToken(pad_token, rstrip=True, lstrip=True) + unk_token = AddedToken(unk_token, rstrip=True, lstrip=True) + eos_token = AddedToken(eos_token, rstrip=True, lstrip=True) + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + 
self._extra_ids = extra_ids + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + if additional_special_tokens is not None: + extra_tokens = [x for x in additional_special_tokens if " 0 and extra_ids != len(extra_tokens): raise ValueError( f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids" " tokens" ) + else: + extra_tokens = [f"" for i in range(extra_ids)] + additional_special_tokens = extra_tokens + + # for legacy purpose, we keep this. Will be removed and tests updated. (when `added_tokens_decoder` is not passed as kwargs) + self._added_tokens_decoder = {} + for i in range(len(extra_tokens)): + self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken( + f"", single_word=True, lstrip=True, rstrip=True, special=True + ) + if legacy is None: logger.warning_once( f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is" @@ -175,7 +195,9 @@ def __init__( legacy = True self.legacy = legacy - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.sp_model = self.get_spm_processor() + self.vocab_file = vocab_file + self._extra_ids = extra_ids super().__init__( eos_token=eos_token, @@ -188,11 +210,6 @@ def __init__( **kwargs, ) - self.vocab_file = vocab_file - self._extra_ids = extra_ids - - self.sp_model = self.get_spm_processor() - def get_spm_processor(self): tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) if self.legacy: # no dependency on protobuf @@ -234,7 +251,7 @@ def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_l @property def vocab_size(self): - return self.sp_model.get_piece_size() + self._extra_ids + return self.sp_model.get_piece_size() def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} @@ -275,7 +292,7 @@ def get_sentinel_tokens(self): ) def get_sentinel_token_ids(self): - return [self._convert_token_to_id(token) for token in self.get_sentinel_tokens()] + return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()] def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]: """Do not add eos again if user already added it.""" @@ -391,18 +408,11 @@ def _tokenize(self, text, **kwargs): def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - if token.startswith("", token) - num = int(match.group(1)) - return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" - if index < self.sp_model.get_piece_size(): - token = self.sp_model.IdToPiece(index) - else: - token = f"" + token = self.sp_model.IdToPiece(index) return token def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index c3d35d0c87be2b..7ec1e68f21d75c 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -31,6 +31,7 @@ from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, + VERY_LARGE_INTEGER, BatchEncoding, EncodedInput, PreTokenizedInput, @@ -351,6 +352,44 @@ def 
__init__( else: additional_special_tokens = [empty_token] + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + # Additional properties + self.cell_trim_length = cell_trim_length + self.max_column_id = ( + max_column_id + if max_column_id is not None + else model_max_length + if model_max_length is not None + else VERY_LARGE_INTEGER + ) + self.max_row_id = ( + max_row_id + if max_row_id is not None + else model_max_length + if model_max_length is not None + else VERY_LARGE_INTEGER + ) + self.strip_column_names = strip_column_names + self.update_answer_coordinates = update_answer_coordinates + self.min_question_length = min_question_length + self.max_question_length = max_question_length + super().__init__( do_lower_case=do_lower_case, do_basic_tokenize=do_basic_tokenize, @@ -375,32 +414,6 @@ def __init__( **kwargs, ) - if not os.path.isfile(vocab_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" - " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - ) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) - - # Additional properties - self.cell_trim_length = cell_trim_length - self.max_column_id = max_column_id if max_column_id is not None else self.model_max_length - self.max_row_id = max_row_id if max_row_id is not None else self.model_max_length - self.strip_column_names = strip_column_names - self.update_answer_coordinates = update_answer_coordinates - self.min_question_length = min_question_length - self.max_question_length = max_question_length - @property def do_lower_case(self): return self.basic_tokenizer.do_lower_case diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 0097b2a6f20d76..138afbcf93e29d 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -181,25 +181,7 @@ def __init__( language="en", **kwargs, ): - super().__init__( - special=special, - min_freq=min_freq, - max_size=max_size, - lower_case=lower_case, - delimiter=delimiter, - vocab_file=vocab_file, - pretrained_vocab_file=pretrained_vocab_file, - never_split=never_split, - unk_token=unk_token, - eos_token=eos_token, - additional_special_tokens=additional_special_tokens, - language=language, - **kwargs, - ) requires_backends(self, "sacremoses") - 
- if never_split is None: - never_split = self.all_special_tokens if special is None: special = [] self.counter = Counter() @@ -209,7 +191,6 @@ def __init__( self.lower_case = lower_case self.delimiter = delimiter self.vocab_file = vocab_file - self.never_split = never_split self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~' self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]") self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern() @@ -217,7 +198,8 @@ def __init__( self.moses_punct_normalizer = sm.MosesPunctNormalizer(language) self.moses_tokenizer = sm.MosesTokenizer(language) self.moses_detokenizer = sm.MosesDetokenizer(language) - + self.idx2sym = [] + self.sym2idx = OrderedDict() # This try... catch... is not beautiful but honestly this tokenizer was not made to be used # in a library like ours, at all. try: @@ -241,7 +223,7 @@ def __init__( if vocab_dict is not None: for key, value in vocab_dict.items(): - if key not in self.__dict__: + if key not in self.__dict__ or key == "sym2idx": self.__dict__[key] = value elif vocab_file is not None: self.build_vocab() @@ -256,6 +238,27 @@ def __init__( if vocab_file is not None: self.build_vocab() + super().__init__( + special=special, + min_freq=min_freq, + max_size=max_size, + lower_case=lower_case, + delimiter=delimiter, + vocab_file=vocab_file, + pretrained_vocab_file=pretrained_vocab_file, + never_split=never_split, + unk_token=unk_token, + eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + language=language, + **kwargs, + ) + + # these are not required to initialize the parent class as only used when tokenizing. + if never_split is None: + never_split = self.all_special_tokens + self.never_split = never_split + @property def do_lower_case(self): return self.lower_case @@ -305,7 +308,7 @@ def _build_from_file(self, vocab_file): elif "" in self.sym2idx: self.unk_idx = self.sym2idx[""] else: - raise ValueError("No token in vocabulary") + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.") def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if os.path.isdir(save_directory): @@ -323,7 +326,7 @@ def build_vocab(self): if self.vocab_file: logger.info(f"building vocab from {self.vocab_file}") self._build_from_file(self.vocab_file) - logger.info(f"final vocab size {len(self)}") + logger.info(f"Final vocab size {len(self.sym2idx)}") else: logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}") self.idx2sym = [] @@ -337,7 +340,7 @@ def build_vocab(self): break self.add_symbol(sym) - logger.info(f"final vocab size {len(self)} from {len(self.counter)} unique tokens") + logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens") @torch_only_method def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): @@ -406,9 +409,8 @@ def move_added_token(self, token: str, target_idx: int): self.sym2idx[current_sym] = idx # Delete token from added_tokens - old_index = self.added_tokens_encoder[token] - del self.added_tokens_decoder[old_index] - del self.added_tokens_encoder[token] + old_index = self._added_tokens_encoder.pop(token) + self._added_tokens_decoder.pop(old_index) def moses_punct_norm(self, text): return self.moses_punct_normalizer.normalize(text) @@ -463,7 +465,7 @@ def _convert_token_to_id(self, sym): elif "" in self.sym2idx: return self.sym2idx[""] 
else: - raise ValueError("Token not in vocabulary and no token in vocabulary for replacement") + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.") def convert_tokens_to_string(self, tokens): """ @@ -482,7 +484,9 @@ def vocab_size(self): return len(self.idx2sym) def get_vocab(self): - return dict(self.sym2idx, **self.added_tokens_encoder) + vocab = self.sym2idx.copy() + vocab.update(self.added_tokens_encoder) + return vocab def _tokenize(self, line, add_eos=False, add_double_eos=False): line = line.strip() diff --git a/src/transformers/models/vits/tokenization_vits.py b/src/transformers/models/vits/tokenization_vits.py index f2cc6be3e43219..0563be326cdb51 100644 --- a/src/transformers/models/vits/tokenization_vits.py +++ b/src/transformers/models/vits/tokenization_vits.py @@ -93,17 +93,6 @@ def __init__( is_uroman=False, **kwargs, ) -> None: - super().__init__( - pad_token=pad_token, - unk_token=unk_token, - language=language, - add_blank=add_blank, - normalize=normalize, - phonemize=phonemize, - is_uroman=is_uroman, - **kwargs, - ) - with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) @@ -115,12 +104,24 @@ def __init__( self.is_uroman = is_uroman + super().__init__( + pad_token=pad_token, + unk_token=unk_token, + language=language, + add_blank=add_blank, + normalize=normalize, + phonemize=phonemize, + is_uroman=is_uroman, + **kwargs, + ) + @property def vocab_size(self): return len(self.encoder) def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) return vocab def normalize_text(self, input_string): diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 472fd2d649c994..dc8b9bde7e6214 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -24,7 +24,7 @@ import numpy as np -from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list +from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import AddedToken, BatchEncoding from ...utils import ( ModelOutput, @@ -174,18 +174,6 @@ def __init__( target_lang=None, **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - do_lower_case=do_lower_case, - word_delimiter_token=word_delimiter_token, - replace_word_delimiter_char=replace_word_delimiter_char, - target_lang=target_lang, - **kwargs, - ) - self._word_delimiter_token = word_delimiter_token self.do_lower_case = do_lower_case @@ -204,13 +192,28 @@ def __init__( self.decoder = {v: k for k, v in self.encoder.items()} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + do_lower_case=do_lower_case, + word_delimiter_token=word_delimiter_token, + replace_word_delimiter_char=replace_word_delimiter_char, + target_lang=target_lang, + **kwargs, + ) + # make sure that tokens made of several # characters are not split at tokenization + + # TODO @ArthurZ add them or just update the trie? 
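Rather than appending raw strings to `unique_no_split_tokens` and rebuilding the trie by hand, multi-character vocabulary entries are now routed through `add_tokens` as `AddedToken` objects (the loop that follows this note does exactly that for the CTC vocabulary). The same mechanism is available from user code; a small illustration, where the checkpoint is real but the `<ctc_blank>` symbol is a made-up example:

```python
from transformers import AddedToken, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
# A multi-character entry registered as an AddedToken is matched as a single unit by the
# tokenizer's trie; lstrip/rstrip control how neighbouring whitespace is consumed.
tok.add_tokens([AddedToken("<ctc_blank>", lstrip=True, rstrip=True, normalized=False)])
print(tok.tokenize("HELLO <ctc_blank> WORLD"))  # "<ctc_blank>" stays in one piece
```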
+ unique_no_split_tokens = [] for token in self.encoder.keys(): if len(token) > 1: - self.unique_no_split_tokens.append(token) + unique_no_split_tokens.append(AddedToken(token, rstrip=True, lstrip=True, normalized=False)) - self._create_trie(self.unique_no_split_tokens) + self.add_tokens(unique_no_split_tokens) def set_target_lang(self, target_lang: str): """ @@ -266,7 +269,20 @@ def vocab_size(self) -> int: return len(self.decoder) def get_vocab(self) -> Dict: - return dict(self.vocab, **self.added_tokens_encoder) + vocab = dict(self.encoder) + vocab.update(self.added_tokens_encoder) + return vocab + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + # Overwritten to never strip! + to_add = [] + for token in new_tokens: + if isinstance(token, str): + to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=False)) + else: + to_add.append(token) + + return super()._add_tokens(to_add, special_tokens) def _tokenize(self, text, **kwargs): """ @@ -645,64 +661,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (vocab_file,) - def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: - """ - Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to - it with indices starting from length of the current vocabulary. - - Args: - new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): - Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by - checking if the tokenizer assign the index of the `unk_token` to them). - special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the tokens should be added as special tokens. - - Returns: - `int`: The number of tokens actually added to the vocabulary. - - Example: - - ```python - # Let's see how to increase the vocabulary of Bert model and tokenizer - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h") - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") - - num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) - print("We have added", num_added_toks, "tokens") - # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
- model.resize_token_embeddings(len(tokenizer)) - ```""" - new_tokens = [str(tok) for tok in new_tokens] - - tokens_to_add = [] - for token in new_tokens: - assert isinstance(token, str) - if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: - token = token.lower() - if ( - token != self.unk_token - and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) - and token not in tokens_to_add - ): - tokens_to_add.append(token) - if self.verbose: - logger.info(f"Adding {token} to the vocabulary") - - added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)} - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - self.added_tokens_encoder.update(added_tok_encoder) - self.added_tokens_decoder.update(added_tok_decoder) - - # Make sure we don't split on any special tokens (even they were already in the vocab before) - for token in tokens_to_add: - if len(token) > 1: - self._additional_special_tokens.append(AddedToken(token)) - _insert_one_token_to_ordered_list(self.unique_no_split_tokens, token) - - self._create_trie(self.unique_no_split_tokens) - - return len(tokens_to_add) - class Wav2Vec2Tokenizer(PreTrainedTokenizer): """ @@ -777,18 +735,6 @@ def __init__( return_attention_mask=False, **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - do_lower_case=do_lower_case, - do_normalize=do_normalize, - return_attention_mask=return_attention_mask, - word_delimiter_token=word_delimiter_token, - **kwargs, - ) - warnings.warn( "The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use" " `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.", @@ -806,6 +752,18 @@ def __init__( self.decoder = {v: k for k, v in self.encoder.items()} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + do_lower_case=do_lower_case, + do_normalize=do_normalize, + return_attention_mask=return_attention_mask, + word_delimiter_token=word_delimiter_token, + **kwargs, + ) + @property def word_delimiter_token(self) -> str: """ diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index f9a1cf631cb525..bd64dcf18d97ad 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -23,7 +23,7 @@ import numpy as np -from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list +from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import AddedToken from ...utils import ( ModelOutput, @@ -143,19 +143,6 @@ def __init__( phonemizer_backend="espeak", **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - word_delimiter_token=word_delimiter_token, - phone_delimiter_token=phone_delimiter_token, - do_phonemize=do_phonemize, - phonemizer_lang=phonemizer_lang, - phonemizer_backend=phonemizer_backend, - **kwargs, - ) - self._word_delimiter_token = word_delimiter_token self._phone_delimiter_token = phone_delimiter_token self.do_phonemize = do_phonemize @@ -168,13 +155,38 @@ def __init__( with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} + 
super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + word_delimiter_token=word_delimiter_token, + phone_delimiter_token=phone_delimiter_token, + do_phonemize=do_phonemize, + phonemizer_lang=phonemizer_lang, + phonemizer_backend=phonemizer_backend, + **kwargs, + ) @property def vocab_size(self) -> int: return len(self.decoder) def get_vocab(self) -> Dict: - return dict(self.encoder, **self.added_tokens_encoder) + vocab = dict(self.encoder) + vocab.update(self.added_tokens_encoder) + return vocab + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + # Overwritten to never strip! + to_add = [] + for token in new_tokens: + if isinstance(token, str): + to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=True)) + else: + to_add.append(token) + + return super()._add_tokens(to_add, special_tokens) def init_backend(self, phonemizer_lang: str): """ @@ -576,61 +588,3 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") return (vocab_file,) - - def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: - """ - Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to - it with indices starting from length of the current vocabulary. - - Args: - new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): - Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by - checking if the tokenizer assign the index of the `unk_token` to them). - special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the tokens should be added as special tokens. - - Returns: - `int`: The number of tokens actually added to the vocabulary. - - Examples: - - ```python - # Let's see how to increase the vocabulary of Bert model and tokenizer - tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") - model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") - - num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) - print("We have added", num_added_toks, "tokens") - # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
- model.resize_token_embeddings(len(tokenizer)) - ```""" - new_tokens = [str(tok) for tok in new_tokens] - - tokens_to_add = [] - for token in new_tokens: - if not isinstance(token, str): - raise ValueError(f"Token {token} has to be of type string, but is of type {type(token)}.") - assert isinstance(token, str) - if ( - token != self.unk_token - and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) - and token not in tokens_to_add - ): - tokens_to_add.append(token) - if self.verbose: - logger.info(f"Adding {token} to the vocabulary") - - added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)} - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - self.added_tokens_encoder.update(added_tok_encoder) - self.added_tokens_decoder.update(added_tok_decoder) - - # Make sure we don't split on any special tokens (even they were already in the vocab before) - for token in tokens_to_add: - if len(token) > 1: - self._additional_special_tokens.append(AddedToken(token)) - _insert_one_token_to_ordered_list(self.unique_no_split_tokens, token) - - self._create_trie(self.unique_no_split_tokens) - - return len(tokens_to_add) diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index a22521b4e00dfb..6c3cebbe23d538 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -272,18 +272,25 @@ def __init__( predict_timestamps=False, **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - super().__init__( - errors=errors, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - add_prefix_space=add_prefix_space, - **kwargs, + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(pad_token, str) + else pad_token ) with open(vocab_file, encoding="utf-8") as vocab_handle: @@ -309,18 +316,28 @@ def __init__( self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") self.language = language + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + self.task = task self.predict_timestamps = predict_timestamps + @property + def vocab_size(self) -> int: + return len(self.encoder) + def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab - @property - def vocab_size(self) -> int: - return len(self.encoder) - # 
Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper def bpe(self, token): if token in self.cache: @@ -390,11 +407,10 @@ def set_prefix_tokens(self, language: str = None, task: str = None, predict_time @property def prefix_tokens(self) -> List[int]: - all_special_ids = self.all_special_ids - bos_token_id = all_special_ids[-106] - translate_token_id = all_special_ids[-6] - transcribe_token_id = all_special_ids[-5] - notimestamps_token_id = all_special_ids[-1] + bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>") + translate_token_id = self.convert_tokens_to_ids("<|translate|>") + transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>") + notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>") langs = tuple(LANGUAGES.keys()) if self.language is not None: diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index cb321f669c7ca6..c85b945685fa2f 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -19,7 +19,7 @@ from typing import List, Optional, Tuple import numpy as np -from tokenizers import pre_tokenizers, processors +from tokenizers import AddedToken, pre_tokenizers, processors from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_fast import PreTrainedTokenizerFast @@ -148,6 +148,22 @@ def __init__( predict_timestamps=False, **kwargs, ): + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True) + if isinstance(unk_token, str) + else unk_token + ) + super().__init__( vocab_file, merges_file, @@ -444,11 +460,10 @@ def set_prefix_tokens(self, language: str = None, task: str = None, predict_time @property # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens def prefix_tokens(self) -> List[int]: - all_special_ids = self.all_special_ids - bos_token_id = all_special_ids[-106] - translate_token_id = all_special_ids[-6] - transcribe_token_id = all_special_ids[-5] - notimestamps_token_id = all_special_ids[-1] + bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>") + translate_token_id = self.convert_tokens_to_ids("<|translate|>") + transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>") + notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>") langs = tuple(LANGUAGES.keys()) if self.language is not None: diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py index f27c827134bf37..9dd0144eafae5a 100644 --- a/src/transformers/models/xglm/tokenization_xglm.py +++ b/src/transformers/models/xglm/tokenization_xglm.py @@ -137,17 +137,6 @@ def __init__( word for word in madeup_words if word not in kwargs["additional_special_tokens"] ] - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) 
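The Whisper hunks above replace positional lookups such as `all_special_ids[-106]` with lookups by token string, which keeps `prefix_tokens` correct regardless of how many special tokens happen to precede the control tokens. A quick way to inspect the resolved ids on a released checkpoint (assuming the `openai/whisper-tiny` files are reachable):

```python
from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
# Resolving control tokens by their literal string is robust to the layout of `all_special_ids`.
print(tok.convert_tokens_to_ids("<|startoftranscript|>"))
print(tok.convert_tokens_to_ids("<|transcribe|>"))
print(tok.convert_tokens_to_ids("<|notimestamps|>"))
```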
self.vocab_file = vocab_file @@ -170,6 +159,17 @@ def __init__( self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index 5cab4fc9967937..c0ffdae1194816 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -613,20 +613,6 @@ def __init__( do_lowercase_and_remove_accent=True, **kwargs, ): - super().__init__( - unk_token=unk_token, - bos_token=bos_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - lang2id=lang2id, - id2lang=id2lang, - do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, - **kwargs, - ) - try: import sacremoses except ImportError: @@ -660,6 +646,19 @@ def __init__( merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, + **kwargs, + ) @property def do_lower_case(self): diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index 6a4c1b9c0b6707..9cc1ae5ca08f4d 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -145,18 +145,6 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - unk_token=unk_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - try: import sentencepiece as spm except ImportError: @@ -186,8 +174,20 @@ def __init__( # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab self.fairseq_offset = 12 self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - for k in self.fairseq_tokens_to_ids.keys(): - self.unique_no_split_tokens.append(k) + + # TODO ArthurZ fairseq_ids_to_tokens should be removed + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) @property def can_save_slow_tokenizer(self) -> bool: diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 54a46842ff156d..299f4268e56674 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -152,18 +152,6 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs 
is None else sp_model_kwargs - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -183,6 +171,18 @@ def __init__( self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None @@ -288,6 +288,7 @@ def get_vocab(self): return vocab def _tokenize(self, text: str) -> List[str]: + # TODO check if the t5/llama PR also applies here return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index ec72df8c8b71c4..0481fec346d437 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -152,6 +152,14 @@ def __init__( self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -170,14 +178,6 @@ def __init__( self._pad_token_type_id = 3 - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - @property def vocab_size(self): return len(self.sp_model) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index e26c0c6d52898e..c9d0afecf40945 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -57,6 +57,7 @@ class Trie: def __init__(self): self.data = {} + self._tokens = set() def add(self, word: str): """ @@ -81,6 +82,8 @@ def add(self, word: str): if not word: # Prevent empty string return + + self._tokens.add(word) ref = self.data for char in word: ref[char] = char in ref and ref[char] or {} @@ -344,17 +347,48 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): """ def __init__(self, **kwargs): + # 1. Init the parent class super().__init__(**kwargs) - - # Added tokens - We store this for both slow and fast tokenizers - # until the serialization of Fast tokenizers is updated - self.added_tokens_encoder: Dict[str, int] = {} - self.added_tokens_decoder: Dict[int, str] = {} - self.unique_no_split_tokens: List[str] = [] self.tokens_trie = Trie() + # 2. init `_added_tokens_decoder` if child class did not + if not hasattr(self, "_added_tokens_decoder"): + self._added_tokens_decoder: Dict[int, AddedToken] = {} + # 3. 
if a `added_tokens_decoder` is passed, we are loading from a saved tokenizer, we overwrite + if "added_tokens_decoder" in kwargs: + # overwriting the class's added_tokens_decoder. This is the source of truth! + self._added_tokens_decoder.update(kwargs.get("added_tokens_decoder")) + + self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()} + + # 4. If some of the special tokens are not part of the vocab, we add them, at the end. + # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers` + self._add_tokens(self.all_special_tokens_extended, special_tokens=True) + self._decode_use_source_tokenizer = False + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + + @added_tokens_decoder.setter + def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]: + # Always raise an error if string because users should define the behavior + for index, token in value.items(): + if not isinstance(token, (str, AddedToken)) or not isinstance(index, int): + raise ValueError( + f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}" + ) + + self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token + self._added_tokens_encoder[str(token)] = index + @property def is_fast(self) -> bool: return False @@ -368,28 +402,34 @@ def vocab_size(self) -> int: def get_added_vocab(self) -> Dict[str, int]: """ - Returns the added tokens in the vocabulary as a dictionary of token to index. + Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from + the fast call because for now we always add the tokens even if they are already in the vocabulary. This is + something we should change. Returns: `Dict[str, int]`: The added tokens. """ - return self.added_tokens_encoder + return self._added_tokens_encoder def __len__(self): """ - Size of the full vocabulary with the added tokens. + Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise if + there is a hole in the vocab, we will add tokenizers at a wrong index. """ - return self.vocab_size + len(self.added_tokens_encoder) + return len(set(self.get_vocab().keys())) def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to - it with indices starting from length of the current vocabulary. + it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the + vocab which is why they have to be handled specifically. Args: new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): - Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by - checking if the tokenizer assign the index of the `unk_token` to them). + Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary + (tested by checking if the tokenizer assign the index of the `unk_token` to them). 
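One consequence of the rewritten `_add_tokens` worth spelling out: adding a token that is already in the vocabulary does not grow the vocab, it only wraps the existing entry in an `AddedToken` so its stripping behaviour can be controlled. A sketch, using `bert-base-uncased` purely as an example checkpoint:

```python
from transformers import AddedToken, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
vocab_id = tok.convert_tokens_to_ids("hello")

# "hello" is already in the WordPiece vocab, so nothing new is added to it...
num_added = tok.add_tokens([AddedToken("hello", lstrip=True)])

# ...but the token is now tracked as an AddedToken with the requested flags, keeping its id.
print(num_added, tok.convert_tokens_to_ids("hello") == vocab_id)  # 0 True
```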
If a token is part + of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the + stripping and normalization of this token. This is NOT possible in `tokenizers`. special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the tokens should be added as special tokens. @@ -408,52 +448,52 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. model.resize_token_embeddings(len(tokenizer)) ```""" - new_tokens = [str(tok) for tok in new_tokens] - - tokens_to_add = [] + added_tokens = 0 + if new_tokens is None: + return added_tokens + current_vocab = self.get_vocab().copy() + new_idx = len(current_vocab) # only call this once, len gives the last index + 1 for token in new_tokens: - if not isinstance(token, str): + if not isinstance(token, (str, AddedToken)): raise TypeError(f"Token {token} is not a string but a {type(token)}.") - if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: - token = token.lower() - if ( - token != self.unk_token - and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) - and token not in tokens_to_add - ): - tokens_to_add.append(token) - if self.verbose: - logger.info(f"Adding {token} to the vocabulary") - - added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)} - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - self.added_tokens_encoder.update(added_tok_encoder) - self.added_tokens_decoder.update(added_tok_decoder) - - # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) - if special_tokens: - if len(new_tokens) == 1: - _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) - else: - self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) - else: - # Or on the newly added tokens - if len(tokens_to_add) == 1: - _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) + if str(token) == "": + continue + if isinstance(token, str): + # for legacy AddedTokens strip left and right by default + # TODO this will be remove to have the same default behavior as rust + token = AddedToken(token, normalized=not special_tokens, rstrip=True, lstrip=True) + if special_tokens: + token.special = True + if token in self._added_tokens_decoder: + continue + if not token.special and token.normalized and hasattr(self, "do_lower_case") and self.do_lower_case: + # Normalize if requested + token.content = token.content.lower() + if token.content not in current_vocab: + token_index = new_idx + added_tokens + current_vocab[token.content] = token_index + added_tokens += 1 else: - self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) - self._create_trie(self.unique_no_split_tokens) - - return len(tokens_to_add) - - def _create_trie(self, unique_no_split_tokens): - trie = Trie() + token_index = current_vocab[token.content] + + if token.special and str(token) not in self.all_special_tokens: + self._additional_special_tokens.append(token) + # the setter automatically updates the reverse map + self._added_tokens_decoder[token_index] = token + self._added_tokens_encoder[token.content] = token_index + if self.verbose: + logger.info(f"Adding {token} to the vocabulary") + + self._update_trie() + return added_tokens + + def 
_update_trie(self, unique_no_split_tokens: Optional[str] = []): + for token in self._added_tokens_decoder.values(): + if token not in self.tokens_trie._tokens: + self.tokens_trie.add(token.content) for token in unique_no_split_tokens: - if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens: - trie.add(token.lower()) - else: - trie.add(token) - self.tokens_trie = trie + if token not in self.tokens_trie._tokens: + self.tokens_trie.add(token) def num_special_tokens_to_add(self, pair: bool = False) -> int: """ @@ -494,10 +534,6 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]: Returns: `List[str]`: The list of tokens. """ - # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors - all_special_tokens_extended = { - str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken) - } split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens) text, kwargs = self.prepare_for_tokenization(text, **kwargs) @@ -505,27 +541,29 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]: if kwargs: logger.warning(f"Keyword arguments {kwargs} not recognized.") - # TODO: should this be in the base class? if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase - escaped_special_toks = [ - re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) + escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)] + escaped_special_toks += [ + re.escape(s_tok.content) + for s_tok in (self._added_tokens_decoder.values()) + if not s_tok.special and s_tok.normalized ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) - # split_special_tokens: empty `no_split_token` if split_special_tokens: no_split_token = [] tokens = [text] else: - no_split_token = set(self.unique_no_split_tokens) + no_split_token = set(self._added_tokens_encoder.keys()) # don't split on any of the added tokens + # "This is something else" tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: - tok_extended = all_special_tokens_extended.get(token, None) + tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): @@ -536,12 +574,18 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]: # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here + if tok_extended.single_word and left and left[-1] != " ": + tokens[i - 1] += token + tokens[i] = "" + elif tok_extended.single_word and right and right[0] != " ": + tokens[i + 1] = token + tokens[i + 1] + tokens[i] = "" + else: - # We strip left and right by default - if right: - tokens[i + 1] = right.lstrip() - if left: - tokens[i - 1] = left.rstrip() + raise ValueError( + f"{tok_extended} cannot be tokenized because it was not properly added" + f" to the tokenizer. 
This means that it is not an `AddedToken` but a {type(tok_extended)}" + ) # ["This is something", "", "else"] tokenized_text = [] for token in tokens: @@ -590,8 +634,8 @@ def _convert_token_to_id_with_added_voc(self, token): if token is None: return None - if token in self.added_tokens_encoder: - return self.added_tokens_encoder[token] + if token in self._added_tokens_encoder: + return self._added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): @@ -904,8 +948,8 @@ def convert_ids_to_tokens( `str` or `List[str]`: The decoded token(s). """ if isinstance(ids, int): - if ids in self.added_tokens_decoder: - return self.added_tokens_decoder[ids] + if ids in self._added_tokens_decoder: + return self._added_tokens_decoder[ids].content else: return self._convert_id_to_token(ids) tokens = [] @@ -913,8 +957,8 @@ def convert_ids_to_tokens( index = int(index) if skip_special_tokens and index in self.all_special_ids: continue - if index in self.added_tokens_decoder: - tokens.append(self.added_tokens_decoder[index]) + if index in self._added_tokens_decoder: + tokens.append(self._added_tokens_decoder[index].content) else: tokens.append(self._convert_id_to_token(index)) return tokens @@ -935,19 +979,29 @@ def _decode( ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + if spaces_between_special_tokens: + logger.warning_once( + "spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, " + "and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule." + ) filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) - + legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | { + token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size + } # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separately for added tokens and byte-level tokens # cf. 
https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] + # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string for token in filtered_tokens: if skip_special_tokens and token in self.all_special_ids: continue - if token in self.added_tokens_encoder: + if token in legacy_added_tokens: if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + string = self.convert_tokens_to_string(current_sub_text) + if len(string) > 0: + sub_texts.append(string) current_sub_text = [] sub_texts.append(token) else: diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a65f799a724b13..b936adc36bb6da 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -23,10 +23,10 @@ import os import re import warnings -from collections import OrderedDict, UserDict +from collections import UserDict from collections.abc import Mapping, Sized from contextlib import contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass from functools import lru_cache from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union @@ -78,18 +78,25 @@ from tokenizers import Encoding as EncodingFast else: - @dataclass(frozen=True, eq=True) + @dataclass(frozen=False, eq=True) class AddedToken: """ AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the way it should behave. + + The `normalized` will default to `not special` if it is not specified, similarly to the definition in + `tokenizers`. """ - content: str = field(default_factory=str) - single_word: bool = False - lstrip: bool = False - rstrip: bool = False - normalized: bool = True + def __init__( + self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None + ): + self.content = content + self.single_word = single_word + self.lstrip = lstrip + self.rstrip = rstrip + self.special = special + self.normalized = normalized if normalized is not None else not special def __getstate__(self): return self.__dict__ @@ -806,7 +813,8 @@ class SpecialTokensMixin: A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): - A tuple or a list of additional special tokens. + A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be + skipped when decoding if `skip_special_tokens` is set to `True`. 
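The decoding-only effect of `additional_special_tokens` described here is easy to verify; a short sketch with `gpt2` as an arbitrary example checkpoint and a made-up `<ctrl>` token:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<ctrl>"]})

ids = tok("hello <ctrl> world")["input_ids"]
print(tok.decode(ids, skip_special_tokens=False))  # keeps <ctrl>
print(tok.decode(ids, skip_special_tokens=True))   # <ctrl> is skipped
```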
""" SPECIAL_TOKENS_ATTRIBUTES = [ @@ -845,21 +853,20 @@ def __init__(self, verbose=True, **kwargs): isinstance(t, (str, AddedToken)) for t in value ), "One of the tokens is not a string or an AddedToken" setattr(self, key, value) - elif isinstance(value, (str, AddedToken)): + elif isinstance(value, (str)): + value = AddedToken(value, normalized=False, special=True) + setattr(self, key, value) + elif isinstance(value, AddedToken): setattr(self, key, value) else: - raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}") def sanitize_special_tokens(self) -> int: """ - Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`, - `tokenizer.cls_token`, etc.) are in the vocabulary. - - Add the missing ones to the vocabulary if needed. - - Return: - `int`: The number of tokens added in the vocabulary during the operation. + The `sanitize_special_tokens` is now deprecated kept for backward compatibility and will be removed in + transformers v5. """ + logger.warning_once("The `sanitize_special_tokens` will be removed in transformers v5.") return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) def add_special_tokens( @@ -870,14 +877,15 @@ def add_special_tokens( special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). - Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding - matrix of the model so that its embedding matrix matches the tokenizer. + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the + model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. Using `add_special_tokens` will ensure your special tokens can be used in several ways: - - Special tokens are carefully handled by the tokenizer (they are never split). + - Special tokens can be skipped when decoding using `skip_special_tokens = True`. + - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`. - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. @@ -893,10 +901,12 @@ def add_special_tokens( Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the `unk_token` to them). replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`): - If `True`, the existing list of additional special tokens will be replaced by the one specified in - `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is updated. In the former case, the - tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged as - non-special tokens. + If `True`, the existing list of additional special tokens will be replaced by the list provided in + `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former + case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged + as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the + `added_tokens_encoder` and `added_tokens_decoder`. 
This means that the previous + `additional_special_tokens` are still added tokens, and will not be split by the model. Returns: `int`: Number of tokens added to the vocabulary. @@ -920,7 +930,7 @@ def add_special_tokens( if not special_tokens_dict: return 0 - added_tokens = 0 + added_tokens = [] for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" @@ -932,28 +942,32 @@ def add_special_tokens( isinstance(t, (str, AddedToken)) for t in value ), f"Tokens {value} for key {key} should all be str or AddedToken instances" + to_add = set() + for token in value: + if isinstance(token, str): + # for legacy purpose we default to stripping. `test_add_tokens_tokenizer` depends on this + token = AddedToken(token, normalized=False, rstrip=True, lstrip=True) + if str(token) not in self.additional_special_tokens: + to_add.add(token) if replace_additional_special_tokens: - setattr(self, key, value) + setattr(self, key, list(to_add)) else: - # This is a copy of `self._additional_special_tokens` - additional_special_tokens = getattr(self, key) - additional_special_tokens_set = set(additional_special_tokens) - to_add = [] - for token in value: - if str(token) not in additional_special_tokens_set and str(token) not in to_add: - to_add.append(token) - # update the property - additional_special_tokens.extend(to_add) - self.additional_special_tokens = additional_special_tokens - - added_tokens += self.add_tokens(value, special_tokens=True) + self._additional_special_tokens.extend(to_add) + added_tokens += to_add + else: - assert isinstance( - value, (str, AddedToken) - ), f"Token {value} for key {key} should be a str or an AddedToken instance" - setattr(self, key, value) - added_tokens += self.add_tokens([value], special_tokens=True) + if not isinstance(value, (str, AddedToken)): + raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance") + if isinstance(value, (str)): + # for legacy purpose we default to stripping. 
`test_add_tokens_tokenizer` depends on this + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True) + if isinstance(value, AddedToken): + setattr(self, key, value) + if value not in added_tokens: + added_tokens.append(value) + # if we are adding tokens that were not part of the vocab, we ought to add them + added_tokens = self.add_tokens(added_tokens, special_tokens=True) return added_tokens def add_tokens( @@ -1102,35 +1116,74 @@ def additional_special_tokens(self) -> List[str]: @bos_token.setter def bos_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the BOS token") self._bos_token = value @eos_token.setter def eos_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the EOS token") self._eos_token = value @unk_token.setter def unk_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the UNK token") self._unk_token = value @sep_token.setter def sep_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the SEP token") self._sep_token = value @pad_token.setter def pad_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the PAD token") self._pad_token = value @cls_token.setter def cls_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the CLS token") self._cls_token = value @mask_token.setter def mask_token(self, value): + if isinstance(value, str) and value != "": + value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(value, AddedToken) and value is not None: + raise ValueError("Cannot set a non-string value as the MASK token") self._mask_token = value @additional_special_tokens.setter def additional_special_tokens(self, value): - self._additional_special_tokens = value + if value is None: + self._additional_special_tokens = value + return + if self._additional_special_tokens is None: + self._additional_special_tokens = [] + # We store the `AddedToken` to allow adding tokens via `tokenizer.add_special_tokens` + for token in value: + if isinstance(token, str) and token != "": + token = AddedToken(token, normalized=False, rstrip=True, lstrip=True, special=True) + elif not isinstance(token, AddedToken): + raise ValueError(f"Cannot add instance of type {type(value)} to additional_special_tokens!") + self._additional_special_tokens.append(token) @property 
def bos_token_id(self) -> Optional[int]: @@ -1259,13 +1312,9 @@ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) + attr_value = getattr(self, attr) if attr_value: - set_attr[attr] = ( - type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value) - if isinstance(attr_value, (list, tuple)) - else str(attr_value) - ) + set_attr[attr] = attr_value return set_attr @property @@ -1285,29 +1334,34 @@ def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[U return set_attr @property - def all_special_tokens(self) -> List[str]: + def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: """ - `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. + `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.), the order has + nothing to do with the index of each tokens. If you want to know the correct indices, check + `self.added_tokens_encoder`. We can't create an order anymore as the keys are `AddedTokens` and not `Strings`. - Convert tokens of `tokenizers.AddedToken` type to string. + Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how + special tokens are tokenized. """ - all_toks = [str(s) for s in self.all_special_tokens_extended] - return all_toks + all_tokens = [] + seen = set() + for value in self.special_tokens_map_extended.values(): + if isinstance(value, (list, tuple)): + tokens_to_add = [token for token in value if str(token) not in seen] + else: + tokens_to_add = [value] if str(value) not in seen else [] + seen.update(map(str, tokens_to_add)) + all_tokens.extend(tokens_to_add) + return all_tokens @property - def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: + def all_special_tokens(self) -> List[str]: """ - `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.) mapped to class - attributes. + `List[str]`: A list of the unique special tokens (`''`, `''`, ..., etc.). - Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how - special tokens are tokenized. + Convert tokens of `tokenizers.AddedToken` type to string. """ - all_toks = [] - set_attr = self.special_tokens_map_extended - for attr_value in set_attr.values(): - all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) - all_toks = list(OrderedDict.fromkeys(all_toks)) + all_toks = [str(s) for s in self.all_special_tokens_extended] return all_toks @property @@ -1322,7 +1376,10 @@ def all_special_ids(self) -> List[int]: ENCODE_KWARGS_DOCSTRING = r""" add_special_tokens (`bool`, *optional*, defaults to `True`): - Whether or not to encode the sequences with the special tokens relative to their model. + Whether or not to add special tokens when encoding the sequences. This will use the underlying + `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are + automatically added to the input ids. This is usefull if you want to add `bos` or `eos` tokens + automatically. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): Activates and controls padding. 
Accepts the following values: @@ -1492,9 +1549,9 @@ def all_special_ids(self) -> List[int]: A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). Will be associated to `self.mask_token` and `self.mask_token_id`. additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): - A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the - tokenization process. Will be associated to `self.additional_special_tokens` and - `self.additional_special_tokens_ids`. + A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with + `skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end + of the vocabulary. clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): Whether or not the model should cleanup the spaces that were added when splitting the input text during the tokenization process. @@ -1614,12 +1671,26 @@ def _set_processor_class(self, processor_class: str): """Sets processor class as an attribute.""" self._processor_class = processor_class + @property + def added_tokens_encoder(self) -> Dict[str, int]: + """ + Returns the sorted mapping from string to index. The added tokens encoder is cached for performance + optimisation in `self._added_tokens_encoder` for the slow tokenizers. + """ + return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])} + + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + raise NotImplementedError() + def __repr__(self) -> str: + added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]) return ( f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," - f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})" + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), " + " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}" ) def __len__(self) -> int: @@ -1878,12 +1949,13 @@ def from_pretrained( else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { - "added_tokens_file": ADDED_TOKENS_FILE, - "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "added_tokens_file": ADDED_TOKENS_FILE, # kept only for legacy + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, # kept only for legacy "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders + "tokenizer_file": FULL_TOKENIZER_FILE, } vocab_files = {**cls.vocab_files_names, **additional_files_names} - if "tokenizer_file" in vocab_files: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE @@ -2019,6 +2091,8 @@ def _from_pretrained( # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. 
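The `added_tokens_encoder` / `added_tokens_decoder` properties and the extended `__repr__` introduced above make the added-token bookkeeping directly inspectable; for example (checkpoint and token name chosen arbitrarily, assuming a transformers release that includes this refactor):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
tok.add_tokens(["<new_tok>"])

# The decoder (id -> AddedToken) is the source of truth; the encoder view is derived from it.
print(tok.added_tokens_decoder)            # includes AddedToken("<new_tok>", ...) at a fresh index
print(tok.added_tokens_encoder["<new_tok>"])
print(tok)                                 # __repr__ now also lists added_tokens_decoder
```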
config_tokenizer_class = init_kwargs.get("tokenizer_class") init_kwargs.pop("tokenizer_class", None) + if not has_tokenizer_file: + init_kwargs.pop("tokenizer_file", None) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs @@ -2084,19 +2158,6 @@ def _from_pretrained( # Update with newly provided kwargs init_kwargs.update(kwargs) - # Convert AddedTokens serialized as dict to class instances - def convert_added_tokens(obj: Union[AddedToken, Any]): - if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": - obj.pop("__type") - return AddedToken(**obj) - elif isinstance(obj, (list, tuple)): - return [convert_added_tokens(o) for o in obj] - elif isinstance(obj, dict): - return {k: convert_added_tokens(v) for k, v in obj.items()} - return obj - - init_kwargs = convert_added_tokens(init_kwargs) - # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer @@ -2116,16 +2177,75 @@ def convert_added_tokens(obj: Union[AddedToken, Any]): # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer - init_kwargs["name_or_path"] = pretrained_model_name_or_path - # Instantiate tokenizer. + additional_special_tokens = init_kwargs.pop("additional_special_tokens", None) or [] + added_tokens_decoder = {} + legacy_saved = "added_tokens_decoder" not in init_kwargs + if not legacy_saved: + for idx, token in init_kwargs["added_tokens_decoder"].items(): + if isinstance(token, dict): + token = AddedToken(**token) + + if isinstance(token, AddedToken): + added_tokens_decoder[int(idx)] = token + else: + raise ValueError( + f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary." + ) + else: + logger.warning_once( + "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, " + " it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again." + " You will see the new `added_tokens_decoder` attribute that will store the relevant information." + ) + + # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + continue + if isinstance(value, dict): + value = AddedToken(**value) + elif key == "additional_special_tokens" and isinstance(value, list): + for token in value: + token = AddedToken(**token) if isinstance(token, dict) else token + if token not in additional_special_tokens: + additional_special_tokens.append(token) + else: + init_kwargs[key] = value + # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`. 
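For contrast with the legacy path handled just below, the non-legacy branch above expects `tokenizer_config.json` to carry an `added_tokens_decoder` mapping from index to a serialized `AddedToken`. A minimal sketch of how such an entry is rebuilt, with hypothetical file contents:

```python
from transformers import AddedToken

# Hypothetical excerpt of a new-style tokenizer_config.json entry.
serialized = {
    "30522": {"content": "<extra_0>", "single_word": False, "lstrip": False,
              "rstrip": False, "normalized": False, "special": True},
}

# Mirrors the loading logic above: dict payloads become AddedToken objects keyed by int indices.
# Note that `normalized` is stored explicitly, since it now defaults to `not special`.
added_tokens_decoder = {int(idx): AddedToken(**payload) for idx, payload in serialized.items()}
print(added_tokens_decoder)
```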
+ if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + # legacy: we have to init with (rstrip=True, lstrip=True) + added_tokens_decoder = { + index: AddedToken(token, rstrip=True, lstrip=True) for token, index in added_tok_encoder.items() + } + # end legacy + + # slow -> fast, non-legacy: we need to make sure the `added_tokens_decoder` is used to add tokens if the `fast` was not properly saved! + # thus we delay adding special tokens in the init using `slow_to_fast` flag. + if added_tokens_decoder is not {} and "Fast" in cls.__name__: + init_kwargs["slow_to_fast"] = True + if len(additional_special_tokens) > 0: + init_kwargs["additional_special_tokens"] = additional_special_tokens + init_kwargs["added_tokens_decoder"] = added_tokens_decoder + + # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens + init_kwargs = cls.convert_added_tokens(init_kwargs, False) + # Instantiate the tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: @@ -2134,79 +2254,43 @@ def convert_added_tokens(obj: Union[AddedToken, Any]): "Please check that the provided vocabulary is accessible and not corrupted." ) - # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` - # Removed: Now done at the base class level - # tokenizer.init_inputs = init_inputs - # tokenizer.init_kwargs = init_kwargs - - # If there is a complementary special token map, load it - special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) - if special_tokens_map_file is not None: - with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: - special_tokens_map = json.load(special_tokens_map_handle) - for key, value in special_tokens_map.items(): - if key in kwargs and kwargs[key]: - # This value has already been redefined by the kwargs - # We keep this new value and ignore the one stored in the special_tokens_map_file - - continue - - if isinstance(value, dict): - value = AddedToken(**value) - elif isinstance(value, list): - value = [AddedToken(**token) if isinstance(token, dict) else token for token in value] - setattr(tokenizer, key, value) - - # Add supplementary tokens. - special_tokens = tokenizer.all_special_tokens - if added_tokens_file is not None: - with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: - added_tok_encoder = json.load(added_tokens_handle) - - # Sort added tokens by index - added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1]) - - # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for - # individual tokens would repeatedly rebuild a trie, which can be slow. - is_last_special = None - tokens = [] - - for token, index in added_tok_encoder_sorted: - current_index = len(tokenizer) + len(tokens) - if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids(token) != index: - # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the - # index is the current length of the tokenizer (not in vocabulary) - raise ValueError( - f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " - f"{index}." - ) - elif not has_tokenizer_file and index != current_index: - # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the - # current length of the tokenizer. 
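# Sketch of the `__type` round-trip mentioned in the comment above: older configs store
# special tokens as plain dicts tagged with "__type": "AddedToken". The values here are
# hypothetical; only the shape matters.
from tokenizers import AddedToken

serialized_kwargs = {
    "mask_token": {"__type": "AddedToken", "content": "<mask>", "single_word": False,
                   "lstrip": True, "rstrip": False, "normalized": True},
    "model_max_length": 512,  # non-token entries pass through untouched
}

def convert(obj):
    # same idea as `convert_added_tokens` in this diff, reduced to the loading direction
    if isinstance(obj, dict) and obj.get("__type") == "AddedToken":
        obj = {k: v for k, v in obj.items() if k != "__type"}
        return AddedToken(**obj)
    if isinstance(obj, dict):
        return {k: convert(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert(o) for o in obj]
    return obj

init_kwargs = convert(serialized_kwargs)
assert isinstance(init_kwargs["mask_token"], AddedToken)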
- raise ValueError( - f"Non-consecutive added token '{token}' found. " - f"Should have index {current_index} but has index {index} in saved vocabulary." - ) - - is_special = bool(token in special_tokens) - if is_last_special is None or is_last_special == is_special: - tokens.append(token) - else: - tokenizer.add_tokens(tokens, special_tokens=is_last_special) - tokens = [token] - is_last_special = is_special - - if tokens: - tokenizer.add_tokens(tokens, special_tokens=is_last_special) + # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer + # if `added_tokens_decoder` not in `tokenizer_config.json` and `added_tokens.json` is `None` + tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None) + if legacy_saved and "Fast" not in cls.__name__ and added_tokens_file is None and tokenizer_file is not None: + tokens_to_add_from_fast = [] + with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle: + tokenizer_file_handle = json.load(tokenizer_file_handle) + added_tokens = tokenizer_file_handle.pop("added_tokens") + for serialized_tokens in added_tokens: + serialized_tokens.pop("id") + # for legacy purpose, we ignore whether or not these tokens are special. + serialized_tokens.pop("special") + tokens_to_add_from_fast.append(AddedToken(**serialized_tokens)) + tokenizer.add_tokens(tokens_to_add_from_fast) + + # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens + # uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids + if init_kwargs.get("slow_to_fast", False): + tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])]) + warnings = "" + for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]): + if tokenizer.convert_tokens_to_ids(str(token)) != index: + warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n" + if len(warnings) > 1: + logger.warn( + f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but" + f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. " + f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`." + ) + # finally we add all the special_tokens to make sure eveything is initialized + tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True) - # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab - added_tokens = tokenizer.sanitize_special_tokens() - if added_tokens: + if len(added_tokens_decoder) > 0: logger.warning_advice( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained." ) - return tokenizer @staticmethod @@ -2217,6 +2301,21 @@ def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_l # which we will correct in Transformers v5. 
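# Minimal sketch of the fast -> slow path above: take the "added_tokens" section of a
# `tokenizer.json` and turn it back into AddedToken objects for a slow tokenizer. The
# JSON content is a made-up fragment; real files also carry "model", "normalizer", etc.
from tokenizers import AddedToken

tokenizer_json_fragment = {
    "added_tokens": [
        {"id": 32000, "content": "<special>", "single_word": False, "lstrip": False,
         "rstrip": False, "normalized": False, "special": True},
    ],
}

tokens_to_add_from_fast = []
for serialized in tokenizer_json_fragment["added_tokens"]:
    serialized = dict(serialized)
    serialized.pop("id")       # the slow tokenizer assigns indices itself
    serialized.pop("special")  # ignored on purpose, matching the legacy behaviour above
    tokens_to_add_from_fast.append(AddedToken(**serialized))

# a real call would then be: slow_tokenizer.add_tokens(tokens_to_add_from_fast)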
return max_model_length + @classmethod + def convert_added_tokens(cls, obj: Union[AddedToken, Any], add_type_field=True): + if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": + obj.pop("__type") + return AddedToken(**obj) + if isinstance(obj, AddedToken): + if add_type_field: + obj = obj.content + return obj + elif isinstance(obj, (list, tuple)): + return [cls.convert_added_tokens(o, add_type_field=add_type_field) for o in obj] + elif isinstance(obj, dict): + return {k: cls.convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} + return obj + def save_pretrained( self, save_directory: Union[str, os.PathLike], @@ -2295,7 +2394,7 @@ def save_pretrained( # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers # target_keys = self.init_kwargs.keys() - target_keys = ["model_max_length", "clean_up_tokenization_spaces"] + target_keys = ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"] for k in target_keys: if hasattr(self, k): tokenizer_config[k] = getattr(self, k) @@ -2308,21 +2407,13 @@ def save_pretrained( for file_id in self.vocab_files_names.keys(): tokenizer_config.pop(file_id, None) - # Sanitize AddedTokens - def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): - if isinstance(obj, AddedToken): - out = obj.__getstate__() - if add_type_field: - out["__type"] = "AddedToken" - return out - elif isinstance(obj, (list, tuple)): - return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj] - elif isinstance(obj, dict): - return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} - return obj - # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization - tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) + tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True) + + added_tokens = {} + for key, value in self.added_tokens_decoder.items(): + added_tokens[key] = value.__getstate__() + tokenizer_config["added_tokens_decoder"] = added_tokens # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained tokenizer_class = self.__class__.__name__ @@ -2351,7 +2442,9 @@ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): logger.info(f"tokenizer config file saved in {tokenizer_config_file}") # Sanitize AddedTokens in special_tokens_map - write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) + + # kept for forward compatibility, will be removed in transoformers 5 + write_dict = self.convert_added_tokens(self.special_tokens_map_extended, add_type_field=True) with open(special_tokens_map_file, "w", encoding="utf-8") as f: out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n" f.write(out_str) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index ac413d29b4bc0d..45a6639e1caab8 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -96,6 +96,7 @@ def __init__(self, *args, **kwargs): slow_tokenizer = kwargs.pop("__slow_tokenizer", None) fast_tokenizer_file = kwargs.pop("tokenizer_file", None) from_slow = kwargs.pop("from_slow", False) + slow_to_fast = kwargs.pop("slow_to_fast", False) if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None: raise ValueError( @@ 
-154,6 +155,10 @@ def __init__(self, *args, **kwargs): # We call this after having initialized the backend tokenizer because we update it. super().__init__(**kwargs) + # We add the additional tokens that are not part of the vocab + if not slow_to_fast: + self._add_tokens(self.all_special_tokens_extended, special_tokens=True) + @property def is_fast(self) -> bool: return True @@ -180,6 +185,16 @@ def get_vocab(self) -> Dict[str, int]: def vocab(self) -> Dict[str, int]: return self.get_vocab() + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return self._tokenizer.get_added_tokens_decoder() + def get_added_vocab(self) -> Dict[str, int]: """ Returns the added tokens in the vocabulary as a dictionary of token to index. @@ -779,6 +794,7 @@ def train_new_from_iterator( lstrip=special_token_full.lstrip, rstrip=special_token_full.rstrip, normalized=special_token_full.normalized, + special=True, ) else: kwargs[token] = special_token diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index 5607d1d3d2e113..746716161acd85 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -170,7 +170,6 @@ def test_embeded_special_tokens(self): tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - # Rust correctly handles the space before the mask while python doesnt self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 7383eeb668face..02491929d148c1 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -42,6 +42,10 @@ def get_rust_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @unittest.skip("This needs a slow tokenizer. 
Bloom does not have one!") + def test_encode_decode_with_spaces(self): + return + def test_encodings_from_sample_data(self): """ Assert that the created tokens are the same than the hard-coded ones diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 70dba0a781c048..486f9d1747fcf5 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -205,7 +205,9 @@ def test_save_and_load_tokenizer(self): tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") - tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index 6acabc7bf25dd7..18af2b73d6a4fa 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -43,13 +43,19 @@ def setUp(self): tokenizer = CamembertTokenizer(SAMPLE_VOCAB) tokenizer.save_pretrained(self.tmpdirname) + @unittest.skip( + "Token maps are not equal because someone set the probability of ('NOTUSED', -100), so it's never encoded for fast" + ) + def test_special_tokens_map_equal(self): + return + def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" token = "" - token_id = 1 + token_id = 1 # 1 is the offset id, but in the spm vocab it's 3 - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id) + self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token) def test_get_vocab(self): vocab_keys = list(self.get_tokenizer().get_vocab().keys()) @@ -57,10 +63,10 @@ def test_get_vocab(self): self.assertEqual(vocab_keys[0], "NOTUSED") self.assertEqual(vocab_keys[1], "") self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 1_004) + self.assertEqual(len(vocab_keys), 1_005) def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_005) + self.assertEqual(self.get_tokenizer().vocab_size, 1_000) def test_rust_and_python_bpe_tokenizers(self): tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB) diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index a52ef3d784c80c..bfa5ae28aaa46c 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -122,7 +122,9 @@ def test_save_and_load_tokenizer(self): # We can add a new special token for Canine as follows: new_additional_special_token = chr(0xE007) additional_special_tokens.append(new_additional_special_token) - tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) @@ -167,11 +169,7 @@ def 
test_tokenize_special_tokens(self): with self.subTest(f"{tokenizer.__class__.__name__}"): SPECIAL_TOKEN_1 = chr(0xE005) SPECIAL_TOKEN_2 = chr(0xE006) - - # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py) tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) - # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`, - # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py) tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]}) token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index b4e204625a62a1..fa39a0571d5d36 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -65,6 +65,10 @@ def setUp(self): tokenizer.pad_token = tokenizer.eos_token tokenizer.save_pretrained(self.tmpdirname) + def get_tokenizers(self, **kwargs): + kwargs.update({"pad_token": ""}) + return super().get_tokenizers(**kwargs) + def test_no_infilling_init(self): tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True) with self.assertRaises(ValueError): @@ -518,7 +522,7 @@ def test_integration_test_xnli(self): def test_special_token_special_word(self): # the word inform should be split as ['in', 'form'] tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False) - tokenizer.add_tokens([""], special_tokens=True) + tokenizer.add_tokens([""], special_tokens=False) out1 = tokenizer.decode( tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False ) @@ -526,7 +530,8 @@ def test_special_token_special_word(self): out2 = tokenizer.decode( tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True ) - self.assertEqual(out2, " inform") + # the added prefix token should not be decoded + self.assertEqual(out2, " inform") input_ids = tokenizer.encode("inform", add_special_tokens=False) self.assertEqual(input_ids, [29871, 32016, 262, 689]) # 29871 is the spiece underline, '▁' diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index ec7c11dcef9d80..edffbeaec9a0ac 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -244,8 +244,8 @@ def test_add_bos_token_slow(self): decode_s = tokenizer.decode(out_s.input_ids) decode_s2 = tokenizer.batch_decode(out_s2.input_ids) - self.assertEqual(decode_s.split()[0], bos_token) - self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2)) + self.assertTrue(decode_s.startswith(bos_token)) + self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) @slow def test_truncation(self): @@ -258,6 +258,7 @@ def test_truncation(self): truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"] decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern) self.assertEqual(decoded_text, expected_trucated_text) + # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR # tokenizer has no padding token def test_padding_different_model_input_name(self): diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index 961cd82f548c3c..404aaa9e7e11bf 
100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -68,12 +68,12 @@ def test_do_lower_case(self): tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True) + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", do_lower_case=True) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -92,12 +92,12 @@ def test_split_by_punct(self): tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, split_by_punct=True) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", split_by_punct=True) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, split_by_punct=True) + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", split_by_punct=True) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -108,11 +108,13 @@ def test_do_lower_case_split_by_punct(self): tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True) + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True + ) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -122,12 +124,14 @@ def test_do_lower_case_split_by_punct_false(self): tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False) + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False + ) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -138,12 +142,14 @@ def test_do_lower_case_false_split_by_punct(self): 
tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True) + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True + ) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -154,12 +160,14 @@ def test_do_lower_case_false_split_by_punct_false(self): tokens_target = ["▁", "", "e", "", "o", "!", "how", "▁", "", "re", "▁yo", "", "?"] # fmt: on - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False) tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(tokens, tokens_target) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False) + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False + ) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) self.assertListEqual(rust_tokens, tokens_target) @@ -189,8 +197,8 @@ def test_full_tokenizer(self): tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"] back_tokens_target = ["▁", "", "his", "▁is", "▁a", "▁test"] - tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True) - rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, keep_accents=True) + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", keep_accents=True) + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", keep_accents=True) ids = tokenizer.encode(sequence, add_special_tokens=False) self.assertListEqual(ids, ids_target) diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index cceb3b9238b20f..78906e3db3275c 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -243,8 +243,8 @@ def test_add_bos_token_slow(self): decode_s = tokenizer.decode(out_s.input_ids) decode_s2 = tokenizer.batch_decode(out_s2.input_ids) - self.assertEqual(decode_s.split()[0], bos_token) - self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2)) + self.assertTrue(decode_s.startswith(bos_token)) + self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) # tokenizer has no padding token def test_padding_different_model_input_name(self): diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py index d639c33ef6440b..040f6c77117614 100644 --- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py +++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py @@ -145,10 +145,10 @@ def test_tokenization_for_chat(self): tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] # fmt: off expected_tokens = [ - [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 
419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, ], - [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 63, 263, 65, 938, 541, 419, ], - [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, ] - ] + [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419], + [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419], + [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419] + ] # fmt: on for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): self.assertListEqual(tokenized_chat, expected_tokens) diff --git a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py index 489e4f942664e5..2c6fd962edbdaa 100644 --- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py +++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py @@ -210,9 +210,9 @@ def test_tokenization_for_chat(self): tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] # fmt: off expected_tokens = [ - [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999], - [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35999], - [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999], + [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999], + [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999], + [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 
35659, 35582, 35716, 35999] ] # fmt: on for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 59efc4b1cf3ba1..1e2bb6610e3041 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -1759,8 +1759,8 @@ def test_added_token_with_space_before(self): tokens_to_add = ["AAA", "bbb"] - words_with_space = [f" {token}" for token in tokens_to_add + tokenizer_s.unique_no_split_tokens] - words_without_space = tokens_to_add + tokenizer_s.unique_no_split_tokens + words_with_space = [f" {token}" for token in tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())] + words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys()) boxes = [[i, i, i, i] for i in range(len(words_with_space))] tokens_to_add_formated = [ diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index 231474203032b1..e568414a7bf7cc 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -53,6 +53,8 @@ @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LlamaTokenizer + rust_tokenizer_class = LlamaTokenizerFast + test_rust_tokenizer = False test_sentencepiece = True from_pretrained_kwargs = {} @@ -65,6 +67,10 @@ def setUp(self): tokenizer.pad_token = tokenizer.eos_token tokenizer.save_pretrained(self.tmpdirname) + def get_tokenizers(self, **kwargs): + kwargs.update({"pad_token": ""}) + return super().get_tokenizers(**kwargs) + def test_full_tokenizer(self): tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) @@ -511,7 +517,7 @@ def test_integration_test_xnli(self): def test_special_token_special_word(self): # the word inform should be split as ['in', 'form'] tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False) - tokenizer.add_tokens([""], special_tokens=True) + tokenizer.add_tokens([""], special_tokens=False) out1 = tokenizer.decode( tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False ) @@ -519,9 +525,10 @@ def test_special_token_special_word(self): out2 = tokenizer.decode( tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True ) - self.assertEqual(out2, " inform") + # decoding strips the added prefix space. + self.assertEqual(out2, " inform") input_ids = tokenizer.encode("inform", add_special_tokens=False) - self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁' + self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should out2 = tokenizer.decode( tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False @@ -612,10 +619,7 @@ class CommonSpmIntegrationTests(unittest.TestCase): @classmethod def setUpClass(cls): tokenizer = LlamaTokenizer(SAMPLE_VOCAB, extra_ids=0, add_bos_token=False, legacy=False) - tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - tokenizer._create_trie(tokenizer.all_special_tokens) - # TODO @ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created - # So the extra ids are split.... 
+ tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]}) cls.tokenizer = tokenizer return cls diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index aa208f950bf3e2..26797faf7758bb 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -46,7 +46,6 @@ def get_tokenizer(self, task=None, **kwargs): task=task, **kwargs, ) - tokenizer.sanitize_special_tokens() return tokenizer def get_input_output_texts(self, tokenizer): diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index 6970833541a99c..13345a899f68f4 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -90,7 +90,8 @@ def test_get_vocab(self): self.assertEqual(vocab_keys[0], "") self.assertEqual(vocab_keys[1], "") self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), tokenizer.vocab_size + len(tokenizer.get_added_vocab())) + # The length of the vocab keys can be different + # self.assertEqual(len(vocab_keys), tokenizer.vocab_size) @unittest.skip("Skip this test while all models are still to be uploaded.") def test_pretrained_model_lists(self): @@ -160,7 +161,7 @@ def check_language_codes(self): def test_get_vocab(self): vocab = self.tokenizer.get_vocab() - self.assertEqual(len(vocab), self.tokenizer.vocab_size) + self.assertEqual(len(vocab), len(self.tokenizer)) self.assertEqual(vocab[""], 3) self.assertIn(self.tokenizer.get_lang_token("en"), vocab) @@ -180,11 +181,11 @@ def test_tokenizer_decode_ignores_language_codes(self): self.assertNotIn(self.tokenizer.eos_token, result) def test_special_tokens_unaffacted_by_save_load(self): - tmpdirname = tempfile.mkdtemp() - original_special_tokens = self.tokenizer.lang_token_to_id - self.tokenizer.save_pretrained(tmpdirname) - new_tok = M2M100Tokenizer.from_pretrained(tmpdirname) - self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens) + with tempfile.TemporaryDirectory() as tmpdirname: + original_special_tokens = self.tokenizer.lang_token_to_id + self.tokenizer.save_pretrained(tmpdirname) + new_tok = M2M100Tokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens) @require_torch def test_batch_fairseq_parity(self): diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 331f63a94a5818..44b1d31a4e4b32 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -136,13 +136,17 @@ def test_add_tokens_tokenizer(self): # smaller than the original vocabs - let's not assert this # self.assertEqual(vocab_size, all_size) - new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"] + new_toks = [ + AddedToken("aaaaa", rstrip=True, lstrip=True), + AddedToken("bbbbbb", rstrip=True, lstrip=True), + AddedToken("cccccccccdddddddd", rstrip=True, lstrip=True), + ] added_toks = tokenizer.add_tokens(new_toks) vocab_size_2 = tokenizer.vocab_size all_size_2 = len(tokenizer) self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(vocab_size + 3, vocab_size_2 + 3) self.assertEqual(added_toks, len(new_toks)) self.assertEqual(all_size_2, all_size + len(new_toks)) diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index 
681825c7dccf9d..a466ae547ceffd 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -41,7 +41,6 @@ def get_tokenizer(self, task=None, **kwargs): kwargs.update(self.special_tokens_map) kwargs.update({"task": task}) tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs) - tokenizer.sanitize_special_tokens() return tokenizer def get_input_output_texts(self, tokenizer): diff --git a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py index 2ab23a10f26c41..b8bd17e027c641 100644 --- a/tests/models/owlvit/test_processor_owlvit.py +++ b/tests/models/owlvit/test_processor_owlvit.py @@ -120,7 +120,7 @@ def test_save_load_pretrained_additional_features(self): image_processor_add_kwargs = self.get_image_processor(do_normalize=False) processor = OwlViTProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", pad_token="!", do_normalize=False ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index 8f554a411e7d12..999a0ece6f6454 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -54,16 +54,16 @@ def test_convert_token_and_id(self): token = "" token_id = 1 - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id) + self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token) def test_get_vocab(self): vocab_keys = list(self.get_tokenizer().get_vocab().keys()) self.assertEqual(vocab_keys[0], "") self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "v") - self.assertEqual(len(vocab_keys), 1_103) + self.assertEqual(vocab_keys[-1], "") + self.assertEqual(len(vocab_keys), 1_104) def test_vocab_size(self): self.assertEqual(self.get_tokenizer().vocab_size, 1_103) diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index 197ab6d5bfa209..e08f2e4c5c7926 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -185,7 +185,9 @@ def test_save_and_load_tokenizer(self): tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") - tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py index 46ce5983f08100..78bac218351bf3 100644 --- a/tests/models/roberta/test_tokenization_roberta.py +++ b/tests/models/roberta/test_tokenization_roberta.py @@ -77,6 +77,7 @@ def get_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return 
RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + return RobertaTokenizerFast(self.vocab_file, self.merges_file, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" diff --git a/tests/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/models/speech_to_text/test_tokenization_speech_to_text.py index 3b2ef9f456f401..46c2427967270c 100644 --- a/tests/models/speech_to_text/test_tokenization_speech_to_text.py +++ b/tests/models/speech_to_text/test_tokenization_speech_to_text.py @@ -24,7 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin -SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") if is_sentencepiece_available(): import sentencepiece as sp @@ -45,7 +45,7 @@ def setUp(self): super().setUp() spm_model = sp.SentencePieceProcessor() - spm_model.Load(SAMPLE_SP) + spm_model.Load(SAMPLE_VOCAB) vocab = ["", "", "", ""] vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))] @@ -54,7 +54,7 @@ def setUp(self): save_dir = Path(self.tmpdirname) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): - copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) + copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"]) tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) tokenizer.save_pretrained(self.tmpdirname) diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index efbe37d75eeb6f..2c64e1bf0941c2 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -63,11 +63,12 @@ def test_get_vocab(self): self.assertEqual(vocab_keys[0], "") self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "") + self.assertEqual(vocab_keys[1100], "") self.assertEqual(len(vocab_keys), 1_101) def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_100) + self.assertEqual(self.get_tokenizer().vocab_size, 1000) + self.assertEqual(len(self.get_tokenizer()), 1101) def test_full_tokenizer(self): tokenizer = T5Tokenizer(SAMPLE_VOCAB) @@ -435,10 +436,11 @@ class CommonSpmIntegrationTests(unittest.TestCase): @classmethod def setUpClass(cls): - tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=1, legacy=False) - tokenizer._create_trie(tokenizer.all_special_tokens) - tokenizer.unique_no_split_tokens = [""] - # TODO @ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created + tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=0, legacy=False) + tokenizer.add_special_tokens( + {"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]} + ) + # TODO ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created # So the extra ids are split.... 
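# The T5 test changes above encode the new size convention: `vocab_size` only counts the
# underlying sentencepiece model, while `len(tokenizer)` also counts the tokens added on
# top of it (extra_id sentinels, user tokens). A sketch against a real checkpoint; the
# exact numbers depend on the checkpoint and library version, so treat them as indicative.
from transformers import T5Tokenizer  # slow tokenizer, needs sentencepiece installed

tok = T5Tokenizer.from_pretrained("t5-small")
print(tok.vocab_size)                 # size of the spm model alone (32000 for t5-small)
print(len(tok))                       # spm vocab + added tokens (the <extra_id_*> sentinels etc.)
print(len(tok.added_tokens_decoder))  # just the added tokens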
cls.tokenizer = tokenizer @@ -481,13 +483,10 @@ def test_remove_extra_whitespaces(self): self.assertEqual(tokens, ["▁He", "▁is", "▁not"]) # no extra space added input_ids = self.tokenizer.encode("▁He is not ▁He") - # TODO another example of lstrip - self.assertEqual(input_ids, [156, 46, 44, 1000, 262, 15, 2]) - + # here t5x does not eat with lstrip, so there is and extra ▁He in the original one + self.assertEqual(input_ids, [156, 46, 44, 1001, 156, 2]) tokens = self.tokenizer.tokenize("▁He is not ▁He") - self.assertEqual( - tokens, ["▁He", "▁is", "▁not", "", "H", "e"] - ) # spaces are eaten by spm + our strip + self.assertEqual(tokens, ["▁He", "▁is", "▁not", "", "▁He"]) # spaces are eaten by spm # make sure that the output after the extra id is the same as if # extra_id was not there input_ids = self.tokenizer.encode("▁He is not ▁He") @@ -499,34 +498,34 @@ def test_character_after_special_token(self): # Make sure that `tokenizer.tokenize` is similar to # adding the equivalent special token to the vocab input_ids = self.tokenizer.encode("Hey I") - self.assertEqual(input_ids, [156, 30, 1000, 100, 2]) + self.assertEqual(input_ids, [156, 30, 1001, 100, 2]) tokens = self.tokenizer.tokenize("Hey I") self.assertEqual(tokens, ["▁He", "y", "", "I"]) input_ids = self.tokenizer.encode("Hello, ,") - self.assertEqual(input_ids, [156, 86, 20, 3, 1000, 3, 2]) + self.assertEqual(input_ids, [156, 86, 20, 3, 1001, 3, 2]) tokens = self.tokenizer.tokenize("Hello, ,") self.assertEqual(tokens, ["▁He", "ll", "o", ",", "", ","]) def test_special_tokens_strip(self): input_ids = self.tokenizer.encode(" ,") - self.assertEqual(input_ids, [1000, 3, 2]) + self.assertEqual(input_ids, [1001, 7, 3, 2]) tokens = self.tokenizer.tokenize(" ,") - # spaces are eaten by rstrip / lstrip - self.assertEqual(tokens, ["", ","]) + # spaces are not longer eaten by rstrip and lstrip + self.assertEqual(tokens, ["", "▁", ","]) # test with a begin of word like `▁He` input_ids = self.tokenizer.encode("No He") - self.assertEqual(input_ids, [284, 1000, 262, 15, 2]) + self.assertEqual(input_ids, [284, 1001, 156, 2]) # spaces are eaten by rstrip / lstrip, so this is expected. 
Don't strip otherwise you break tokens = self.tokenizer.tokenize("No He") - self.assertEqual(tokens, ["▁No", "", "H", "e"]) + self.assertEqual(tokens, ["▁No", "", "▁He"]) # Make sure this does not happen if we don't strip tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=0) tokenizer.add_special_tokens({"bos_token": AddedToken("")}) input_ids = tokenizer.encode("No He") - self.assertEqual(input_ids, [284, 1000, 156, 2]) + self.assertEqual(input_ids, [284, 1001, 156, 2]) tokens = tokenizer.tokenize("No He") # the first `' '` after `'No'` is eaten by spm: self.assertEqual(tokenizer.sp_model.encode("No ", out_type=str), ["▁No"]) diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py index a532df52e4d6fc..c02caaaa908339 100644 --- a/tests/models/vits/test_tokenization_vits.py +++ b/tests/models/vits/test_tokenization_vits.py @@ -156,8 +156,8 @@ def test_tokenizer_integration(self): expected_encoding = { 'input_ids': [ [0, 24, 0, 7, 0, 25, 0, 33, 0, 19, 0, 18, 0, 8, 0, 19, 0, 5, 0, 7, 0, 8, 0, 18, 0, 37, 0, 29, 0, 7, 0, 5, 0, 19, 0, 33, 0, 22, 0, 19, 0, 13, 0, 25, 0, 7, 0, 14, 0, 33, 0, 25, 0, 26, 0, 18, 0, 29, 0, 19, 0, 5, 0, 7, 0, 7, 0, 13, 0, 19, 0, 24, 0, 18, 0, 5, 0, 18, 0, 25, 0, 7, 0, 12, 0, 33, 0, 18, 0, 22, 0, 29, 0, 26, 0, 21, 0, 19, 0, 25, 0, 7, 0, 13, 0, 25, 0, 7, 0, 8, 0, 7, 0, 29, 0, 33, 0, 26, 0, 33, 0, 18, 0, 22, 0, 29, 0, 8, 0, 19, 0, 20, 0, 25, 0, 22, 0, 17, 0, 19, 0, 4, 0, 29, 0, 21, 0, 26, 0, 24, 0, 7, 0, 21, 0, 7, 0, 5, 0, 19, 0, 33, 0, 7, 0, 31, 0, 33, 0, 19, 0, 24, 0, 3, 0, 19, 0, 16, 0, 22, 0, 18, 0, 29, 0, 33, 0, 21, 0, 3, 0, 19, 0, 12, 0, 22, 0, 29, 0, 5, 0, 18, 0, 33, 0, 18, 0, 22, 0, 29, 0, 18, 0, 29, 0, 37, 0, 19, 0, 22, 0, 29, 0, 19, 0, 24, 0, 22, 0, 33, 0, 6, 0, 19, 0, 21, 0, 7, 0, 20, 0, 33, 0, 19, 0, 26, 0, 29, 0, 5, 0, 19, 0, 25, 0, 18, 0, 37, 0, 6, 0, 33, 0, 19, 0, 12, 0, 22, 0, 29, 0, 33, 0, 7, 0, 31, 0, 33, 0, 19, 0, 18, 0, 29, 0, 19, 0, 26, 0, 21, 0, 21, 0, 19, 0, 21, 0, 26, 0, 3, 0, 7, 0, 25, 0, 8, 0], - [0, 33, 0, 6, 0, 7, 0, 19, 0, 34, 0, 4, 0, 18, 0, 12, 0, 0, 0, 19, 0, 24, 0, 25, 0, 22, 0, 9, 0, 29, 0, 19, 0, 20, 0, 22, 0, 31, 0, 19, 0, 16, 0, 4, 0, 17, 0, 13, 0, 8, 0, 19, 0, 22, 0, 32, 0, 7, 0, 25, 0, 19, 0, 33, 0, 6, 0, 7, 0, 19, 0, 21, 0, 26, 0, 2, 0, 3, 0, 19, 0, 5, 0, 22, 0, 37, 0, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38], - [0, 9, 0, 7, 0, 19, 0, 4, 0, 8, 0, 7, 0, 19, 0, 0, 0, 19, 0, 26, 0, 8, 0, 19, 0, 22, 0, 4, 0, 25, 0, 19, 0, 13, 0, 26, 0, 5, 0, 5, 0, 18, 0, 29, 0, 37, 0, 19, 0, 33, 0, 22, 0, 0, 0, 7, 0, 29, 0, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 
38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38], + [0, 33, 0, 6, 0, 7, 0, 19, 0, 34, 0, 4, 0, 18, 0, 12, 0, 0, 0, 19, 0, 24, 0, 25, 0, 22, 0, 9, 0, 29, 0, 19, 0, 20, 0, 22, 0, 31, 0, 19, 0, 16, 0, 4, 0, 17, 0, 13, 0, 8, 0, 19, 0, 22, 0, 32, 0, 7, 0, 25, 0, 19, 0, 33, 0, 6, 0, 7, 0, 19, 0, 21, 0, 26, 0, 2, 0, 3, 0, 19, 0, 5, 0, 22, 0, 37, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39], + [0, 9, 0, 7, 0, 19, 0, 4, 0, 8, 0, 7, 0, 19, 0, 0, 0, 19, 0, 26, 0, 8, 0, 19, 0, 22, 0, 4, 0, 25, 0, 19, 0, 13, 0, 26, 0, 5, 0, 5, 0, 18, 0, 29, 0, 37, 0, 19, 0, 33, 0, 22, 0, 0, 0, 7, 0, 29, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39], ], 'attention_mask': [ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], @@ -166,15 +166,14 @@ def test_tokenizer_integration(self): ] } # fmt: on - tokenizer_classes = [self.tokenizer_class] if self.test_rust_tokenizer: tokenizer_classes.append(self.rust_tokenizer_class) - for tokenizer_class in tokenizer_classes: tokenizer = tokenizer_class.from_pretrained( "facebook/mms-tts-eng", - revision="089bbb15da46b2ab2b282145941399aae353d917", # to pin the tokenizer version + revision="28cedf176aa99de5023a4344fd8a2cc477126fb8", # to pin the tokenizer version + pad_token="", ) encoding = tokenizer(sequences, padding=True, normalize=True) diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py index 9bfae65f6ca4e2..174d3009a96442 100644 --- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py +++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py @@ -25,6 +25,7 @@ from transformers import ( WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, + AddedToken, Wav2Vec2Config, Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer, @@ -293,7 +294,9 @@ def test_save_and_load_tokenizer(self): tokenizer.add_tokens(["?", "!"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("&") - tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) before_tokens = tokenizer.decode(sample_ids) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) @@ -470,7 +473,7 @@ def test_special_characters_in_vocab(self): with open(vocab_file, "w") as f: json.dump(vocab_dict, f) - tokenizer = Wav2Vec2CTCTokenizer(vocab_file) + tokenizer = Wav2Vec2CTCTokenizer(vocab_file) # , unk_token="") expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True) self.assertEqual(sent, expected_sent) @@ -732,7 +735,10 @@ def test_add_tokens_tokenizer(self): self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + new_toks_2 = { + "eos_token": AddedToken(">>>>|||<||<<|<<", lstrip=False, rstrip=False), + "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=False, lstrip=False), + } added_toks_2 = tokenizer.add_special_tokens(new_toks_2) vocab_size_3 = tokenizer.vocab_size all_size_3 = len(tokenizer) diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index a9f39202f4a175..216eb0f637a9e2 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -37,7 +37,6 @@ def setUp(self): # We have a SentencePiece fixture for testing tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.sanitize_special_tokens() tokenizer.save_pretrained(self.tmpdirname) def test_convert_token_and_id(self): 
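# Several test updates in this diff now pass `replace_additional_special_tokens=False`.
# A sketch of the difference it is expected to make; the checkpoint and token names are
# only examples.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")

tok.add_special_tokens({"additional_special_tokens": ["<tok_a>"]})
# Default behaviour replaces the list (the token itself stays registered in the vocab):
tok.add_special_tokens({"additional_special_tokens": ["<tok_b>"]})
print(tok.additional_special_tokens)  # expected: ['<tok_b>']

# With the flag set to False, new tokens are appended instead of overwriting the list:
tok.add_special_tokens(
    {"additional_special_tokens": ["<tok_c>"]}, replace_additional_special_tokens=False
)
print(tok.additional_special_tokens)  # expected: ['<tok_b>', '<tok_c>']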
diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py
index 0b5a51fb3c926c..7af16371a02083 100644
--- a/tests/pipelines/test_pipelines_image_classification.py
+++ b/tests/pipelines/test_pipelines_image_classification.py
@@ -17,7 +17,7 @@
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
-    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
     is_vision_available,
 )
 from transformers.pipelines import ImageClassificationPipeline, pipeline
@@ -166,7 +166,7 @@ def test_small_model_tf(self):
         )

     def test_custom_tokenizer(self):
-        tokenizer = PreTrainedTokenizer()
+        tokenizer = PreTrainedTokenizerBase()
         # Assert that the pipeline can be initialized with a feature extractor that is not in any mapping
         image_classifier = pipeline(
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index fa3bf96d431a8a..a2f207c96391c2 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -228,7 +228,10 @@ def get_input_output_texts(self, tokenizer):
         return input_txt, input_txt

     def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
-        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
+        # the length of the tokenizer does not always represent the tokens that it can encode: what if there are holes?
+        toks = [
+            (i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in set(tokenizer.get_vocab().values())
+        ]
         toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
         toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
         if max_length is not None and len(toks) > max_length:
@@ -390,15 +393,11 @@ def test_tokenize_special_tokens(self):
                 SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
                 SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"

-                # TODO:
-                # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it)
-                # with one variable(property) for a better maintainability?
-
-                # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
+                # Both methods should add the token to `_additional_special_tokens` and `added_tokens_decoder`
                 tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
-                # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
-                # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
-                tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
+                tokenizer.add_special_tokens(
+                    {"additional_special_tokens": [SPECIAL_TOKEN_2]}, replace_additional_special_tokens=False
+                )

                 token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
                 token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
@@ -726,7 +725,9 @@ def test_save_and_load_tokenizer(self):
                 tokenizer.add_tokens(["bim", "bambam"])
                 additional_special_tokens = tokenizer.additional_special_tokens
                 additional_special_tokens.append("new_additional_special_token")
-                tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+                tokenizer.add_special_tokens(
+                    {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+                )
                 before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                 before_vocab = tokenizer.get_vocab()
                 tokenizer.save_pretrained(tmpdirname)
@@ -735,6 +736,7 @@ def test_save_and_load_tokenizer(self):
                 after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                 after_vocab = after_tokenizer.get_vocab()
                 self.assertListEqual(before_tokens, after_tokens)
+                self.assertDictEqual(before_vocab, after_vocab)
                 self.assertIn("bim", after_vocab)
                 self.assertIn("bambam", after_vocab)
@@ -759,7 +761,9 @@ def test_save_and_load_tokenizer(self):
                 tokenizer.add_tokens(["bim", "bambam"])
                 additional_special_tokens = tokenizer.additional_special_tokens
                 additional_special_tokens.append("new_additional_special_token")
-                tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+                tokenizer.add_special_tokens(
+                    {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+                )
                 before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                 before_vocab = tokenizer.get_vocab()
                 tokenizer.save_pretrained(tmpdirname)
@@ -844,7 +848,7 @@ def test_added_tokens_do_lower_case(self):
                 tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

                 for special_token in tokenizer.all_special_tokens:
-                    self.assertTrue(special_token in tokenized_sequence)
+                    self.assertTrue(special_token in tokenized_sequence or special_token.lower() in tokenized_sequence)

         tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
@@ -874,6 +878,7 @@ def test_added_tokens_do_lower_case(self):
                     len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
                 )

+    # TODO @ArthurZ Nuke this
     def test_add_tokens_tokenizer(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
@@ -883,7 +888,7 @@ def test_add_tokens_tokenizer(self):
                 self.assertNotEqual(vocab_size, 0)

-                # We usually have added tokens from the start in tests because our vocab fixtures are
+                # We usually have added tokens from the start in tests (but also otherwise) because our vocab fixtures are
                 # smaller than the original vocabs - let's not assert this
                 # self.assertEqual(vocab_size, all_size)
@@ -903,7 +908,10 @@ def test_add_tokens_tokenizer(self):
                 self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                 self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

-                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
+                new_toks_2 = {
+                    "eos_token": AddedToken(">>>>|||<||<<|<<", rstrip=True, lstrip=True),
+                    "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=True, lstrip=True),
+                }
                 added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                 vocab_size_3 = tokenizer.vocab_size
                 all_size_3 = len(tokenizer)
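The fixtures above now construct their special tokens as `AddedToken` objects so that the stripping behaviour is pinned explicitly rather than inherited from defaults. A minimal sketch of how these flags behave (not part of the diff; the checkpoint and the surrounding text are arbitrary):

from transformers import AddedToken, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# lstrip/rstrip decide whether whitespace around the token is absorbed when it is
# matched in the input; normalized decides whether the normalizer runs on it.
eos = AddedToken(">>>>|||<||<<|<<", lstrip=True, rstrip=True)
tokenizer.add_special_tokens({"eos_token": eos})
print(tokenizer.tokenize("low >>>>|||<||<<|<< l"))  # the added token stays a single piece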
@@ -914,12 +922,13 @@ def test_add_tokens_tokenizer(self):
                 self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                 tokens = tokenizer.encode(
-                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
+                    ">>>>|||<||<<|<< aaaaa bbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
                 )

                 self.assertGreaterEqual(len(tokens), 6)
                 self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                 self.assertGreater(tokens[0], tokens[1])
+                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                 self.assertGreater(tokens[-2], tokens[-3])
                 self.assertEqual(tokens[0], tokenizer.eos_token_id)
@@ -931,9 +940,10 @@ def test_add_special_tokens(self):
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 input_text, ids = self.get_clean_sequence(tokenizer)

-                special_token = "[SPECIAL_TOKEN]"
+                special_token = AddedToken("[SPECIAL_TOKEN]", lstrip=True, rstrip=True)

                 tokenizer.add_special_tokens({"cls_token": special_token})
+                special_token = str(special_token)
                 encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
                 self.assertEqual(len(encoded_special_token), 1)
@@ -967,15 +977,17 @@ def test_internal_consistency(self):

     @require_tokenizers
     def test_encode_decode_with_spaces(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
+        tokenizers = self.get_tokenizers(do_lower_case=False, fast=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 new_toks = [
-                    AddedToken("[ABC]", normalized=False),
-                    AddedToken("[DEF]", normalized=False),
-                    AddedToken("GHI IHG", normalized=False),
+                    # These are added tokens; they will be normalized.
+                    AddedToken("[ABC]", normalized=True, lstrip=True, rstrip=True),
+                    AddedToken("[DEF]", normalized=True, lstrip=True, rstrip=True),
+                    AddedToken("GHI IHG", normalized=True, lstrip=True, rstrip=True),
                 ]
                 tokenizer.add_tokens(new_toks)
+                tokenizer.add_tokens([AddedToken("[SAMPLE]", normalized=True)], special_tokens=True)
                 input = "[ABC][DEF][ABC]GHI IHG[DEF]"
                 if self.space_between_special_tokens:
                     output = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
@@ -983,7 +995,23 @@ def test_encode_decode_with_spaces(self):
                     output = input
                 encoded = tokenizer.encode(input, add_special_tokens=False)
                 decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
+                self.assertIn(decoded, [output, output.lower()])
+                return
+                # TODO @ArthurZ Refactor testing as now the do_normalize works for special and non special
+                encoded = tokenizer.encode("[ABC] [DEF][SAMPLE]", add_special_tokens=False)
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True, skip_special_tokens=False)
+                self.assertIn(decoded, ["[ABC] [DEF] [SAMPLE]", "[ABC] [DEF] [SAMPLE]".lower()])
+
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True, skip_special_tokens=True)
+                self.assertIn(decoded, ["[ABC] [DEF]", "[ABC] [DEF]".lower()])
+
+                encoded = tokenizer.encode("[ABC][SAMPLE][DEF]", add_special_tokens=False)
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True)
+                self.assertIn(decoded, ["[ABC] [SAMPLE] [DEF]", "[ABC][SAMPLE][DEF]".lower()])
+
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=False)
+                self.assertIn(decoded, ["[ABC][SAMPLE][DEF]", "[ABC][SAMPLE][DEF]".lower()])

     def test_pretrained_model_lists(self):
         # We should have at least one default checkpoint for each tokenizer
@@ -2154,11 +2182,12 @@ def test_added_token_are_matched_longest_first(self):

     @require_tokenizers
     def test_added_token_serializable(self):
+        # TODO this is tested 10_000 times....
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 new_token = AddedToken("new_token", lstrip=True)
-                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+                tokenizer.add_tokens([new_token])

                 with tempfile.TemporaryDirectory() as tmp_dir_name:
                     tokenizer.save_pretrained(tmp_dir_name)
@@ -2916,6 +2945,7 @@ def test_special_tokens_map_equal(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                # sometimes the tokenizer saved online is not the same
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                 tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
@@ -3539,8 +3569,8 @@ def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                 tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                 sentence = "A, AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(
                     sentence,
@@ -3623,7 +3653,6 @@ def test_special_tokens_initialization(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("", lstrip=True)]
-
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
@@ -3634,6 +3663,7 @@ def test_special_tokens_initialization(self):
                     self.assertTrue(special_token_id in r_output)

                 if self.test_slow_tokenizer:
+                    # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
                     tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
                         pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                     )
@@ -3651,37 +3681,32 @@ def test_special_tokens_initialization(self):
                     self.assertTrue(special_token_id in cr_output)

     def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+        # This test no longer supports rust tokenizers, because the only file that should be looked
+        # at by the fast tokenizer with the new saving format is `tokenizer_config.json`.
+        # The previous behaviour was also very strange: a fast tokenizer should not save 3 files, but just one, and a slow tokenizer can never be built from a fast one.
         tokenizer_list = []
         if self.test_slow_tokenizer:
             tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))

-        if self.test_rust_tokenizer:
-            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
-
         for tokenizer_class, tokenizer_utils in tokenizer_list:
             with tempfile.TemporaryDirectory() as tmp_dir:
                 tokenizer_utils.save_pretrained(tmp_dir)
-
-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
-                    special_tokens_map = json.load(json_file)
-
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+                # only legacy save will check this
+                tokenizer_path = "tokenizer_config.json"
+                with open(os.path.join(tmp_dir, tokenizer_path), encoding="utf-8") as json_file:
                     tokenizer_config = json.load(json_file)

-                special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
                 tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]

-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
-                    json.dump(special_tokens_map, outfile)
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+                with open(os.path.join(tmp_dir, tokenizer_path), "w", encoding="utf-8") as outfile:
                     json.dump(tokenizer_config, outfile)

                 # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
                 # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
                 # "special_tokens_map.json" files
-                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
-                    tmp_dir,
-                )
+
+                # TODO ArthurZ ... Ok so for legacy we have to support this I guess..... (special_tokens_map + additional)
+                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir)
                 self.assertIn(
                     "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
                 )
@@ -3813,17 +3838,18 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                     ):
                         find = True
                         break

+                special_token.content = new_special_token_str
                 self.assertTrue(
                     find,
-                    f"'{new_special_token_str}' doesn't appear in the list "
-                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
-                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
+                    f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = "
+                    f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing"
+                    ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.",
                 )
             elif special_token not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
                     special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    f"'{special_token.__repr__()}' should be in {new_tokenizer.all_special_tokens_extended}",
                 )
             else:
diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py
index c6259610aa18b4..fc95bad6d05442 100644
--- a/tests/tokenization/test_tokenization_fast.py
+++ b/tests/tokenization/test_tokenization_fast.py
@@ -52,6 +52,12 @@ def test_tokenizer_mismatch_warning(self):
         # model
         pass

+    @unittest.skip(
+        "We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"
+    )
+    def test_encode_decode_with_spaces(self):
+        pass
+
     def test_pretrained_model_lists(self):
         # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
         # model
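The new skip is a consequence of `PreTrainedTokenizerFast` having no pretrained checkpoint of its own: it is normally wrapped around an in-memory `tokenizers.Tokenizer`. A minimal sketch of that pattern (not part of the diff; the tiny vocabulary is made up):

from tokenizers import Tokenizer, models, pre_tokenizers
from transformers import PreTrainedTokenizerFast

# Build a throwaway word-level backend and hand it to the fast wrapper.
vocab = {"[UNK]": 0, "low": 1, "l": 2}
backend = Tokenizer(models.WordLevel(vocab, unk_token="[UNK]"))
backend.pre_tokenizer = pre_tokenizers.Whitespace()
wrapped = PreTrainedTokenizerFast(tokenizer_object=backend, unk_token="[UNK]")
print(wrapped.tokenize("low l low"))  # ['low', 'l', 'low']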