diff --git a/.gitignore b/.gitignore
index eeb41b3fcaea35..337f2ef2c735e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,4 +166,4 @@ tags
.DS_Store
# ruff
-.ruff_cache
\ No newline at end of file
+.ruff_cache
diff --git a/setup.py b/setup.py
index 25aadde8e3d6b3..bd47bfd4f91989 100644
--- a/setup.py
+++ b/setup.py
@@ -172,7 +172,7 @@
"tf2onnx",
"timeout-decorator",
"timm",
- "tokenizers>=0.11.1,!=0.11.3,<0.14",
+ "tokenizers>=0.14,<0.15",
"torch>=1.10,!=1.12.0",
"torchaudio",
"torchvision",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 8786f8cd014d64..a791d96eb5b818 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -78,7 +78,7 @@
"tf2onnx": "tf2onnx",
"timeout-decorator": "timeout-decorator",
"timm": "timm",
- "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14",
+ "tokenizers": "tokenizers>=0.14,<0.15",
"torch": "torch>=1.10,!=1.12.0",
"torchaudio": "torchaudio",
"torchvision": "torchvision",
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index 231abf1c0301c9..3ff319199522cc 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -159,6 +159,14 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -174,14 +182,6 @@ def __init__(
**kwargs,
)
- self.do_lower_case = do_lower_case
- self.remove_space = remove_space
- self.keep_accents = keep_accents
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self) -> int:
return len(self.sp_model)
@@ -228,6 +228,8 @@ def _tokenize(self, text: str) -> List[str]:
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+ # Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization
+ # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9']
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py
index 22ee1a0db6149d..7dd008c4dbbaf2 100644
--- a/src/transformers/models/bart/tokenization_bart.py
+++ b/src/transformers/models/bart/tokenization_bart.py
@@ -204,21 +204,10 @@ def __init__(
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
+ # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
+ # Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -235,6 +224,19 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py
index f05ed1b7a82d5d..464b17c4d4c217 100644
--- a/src/transformers/models/bart/tokenization_bart_fast.py
+++ b/src/transformers/models/bart/tokenization_bart_fast.py
@@ -170,6 +170,7 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py
index 77ab8a9d64166b..5fd851b379cf5a 100644
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -47,6 +47,8 @@
SPIECE_UNDERLINE = "▁"
+# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this.
+
class BarthezTokenizer(PreTrainedTokenizer):
"""
@@ -141,6 +143,9 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(str(vocab_file))
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -153,15 +158,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(str(vocab_file))
-
- self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
-
- self.fairseq_tokens_to_ids[""] = len(self.sp_model) - 1
- self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
@@ -251,16 +247,10 @@ def _tokenize(self, text: str) -> List[str]:
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
- if token in self.fairseq_tokens_to_ids:
- return self.fairseq_tokens_to_ids[token]
- spm_id = self.sp_model.PieceToId(token)
-
- return spm_id if spm_id else self.unk_token_id
+ return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
- if index in self.fairseq_ids_to_tokens:
- return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py
index 1c1ef0b8675b8a..74e6ad8f9e29fe 100644
--- a/src/transformers/models/bartpho/tokenization_bartpho.py
+++ b/src/transformers/models/bartpho/tokenization_bartpho.py
@@ -139,18 +139,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.vocab_file = vocab_file
self.monolingual_vocab_file = monolingual_vocab_file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
@@ -174,6 +162,18 @@ def __init__(
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index a24f39564264df..16044973343bc5 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -196,20 +196,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -225,7 +211,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py
index 6ef3321277f365..f8d49f86ac51ae 100644
--- a/src/transformers/models/bert_generation/tokenization_bert_generation.py
+++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -96,6 +96,11 @@ def __init__(
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
# Add extra_ids to the special token list
super().__init__(
bos_token=bos_token,
@@ -107,11 +112,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index dd290109185241..e0f09c20b2e67e 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -160,25 +160,6 @@ def __init__(
jumanpp_kwargs=None,
**kwargs,
):
- super().__init__(
- spm_file=spm_file,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- do_lower_case=do_lower_case,
- do_word_tokenize=do_word_tokenize,
- do_subword_tokenize=do_subword_tokenize,
- word_tokenizer_type=word_tokenizer_type,
- subword_tokenizer_type=subword_tokenizer_type,
- never_split=never_split,
- mecab_kwargs=mecab_kwargs,
- sudachi_kwargs=sudachi_kwargs,
- jumanpp_kwargs=jumanpp_kwargs,
- **kwargs,
- )
-
if subword_tokenizer_type == "sentencepiece":
if not os.path.isfile(spm_file):
raise ValueError(
@@ -226,13 +207,31 @@ def __init__(
self.subword_tokenizer_type = subword_tokenizer_type
if do_subword_tokenize:
if subword_tokenizer_type == "wordpiece":
- self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
elif subword_tokenizer_type == "character":
- self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token))
elif subword_tokenizer_type == "sentencepiece":
- self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+ self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token))
else:
raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
+ super().__init__(
+ spm_file=spm_file,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ do_lower_case=do_lower_case,
+ do_word_tokenize=do_word_tokenize,
+ do_subword_tokenize=do_subword_tokenize,
+ word_tokenizer_type=word_tokenizer_type,
+ subword_tokenizer_type=subword_tokenizer_type,
+ never_split=never_split,
+ mecab_kwargs=mecab_kwargs,
+ sudachi_kwargs=sudachi_kwargs,
+ jumanpp_kwargs=jumanpp_kwargs,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index 7901a58b881983..13846a5089a685 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -134,18 +134,6 @@ def __init__(
mask_token="",
**kwargs,
):
- super().__init__(
- normalization=normalization,
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- cls_token=cls_token,
- unk_token=unk_token,
- pad_token=pad_token,
- mask_token=mask_token,
- **kwargs,
- )
-
try:
from emoji import demojize
@@ -161,10 +149,10 @@ def __init__(
self.merges_file = merges_file
self.encoder = {}
- self.encoder[self.bos_token] = 0
- self.encoder[self.pad_token] = 1
- self.encoder[self.eos_token] = 2
- self.encoder[self.unk_token] = 3
+ self.encoder[bos_token] = 0
+ self.encoder[pad_token] = 1
+ self.encoder[eos_token] = 2
+ self.encoder[unk_token] = 3
self.add_from_file(vocab_file)
@@ -178,9 +166,20 @@ def __init__(
self.normalization = normalization
self.tweetPreprocessor = TweetTokenizer()
-
self.special_puncts = {"’": "'", "…": "..."}
+ super().__init__(
+ normalization=normalization,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ **kwargs,
+ )
+
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py
index 5220366df4d247..8e720a54257a5a 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@@ -127,6 +127,11 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -139,11 +144,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py
index d050fa699c5244..093991ecb3885d 100644
--- a/src/transformers/models/biogpt/tokenization_biogpt.py
+++ b/src/transformers/models/biogpt/tokenization_biogpt.py
@@ -112,15 +112,6 @@ def __init__(
pad_token="",
**kwargs,
):
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- unk_token=unk_token,
- pad_token=pad_token,
- **kwargs,
- )
-
try:
import sacremoses
except ImportError:
@@ -145,6 +136,15 @@ def __init__(
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
"""Returns vocab size"""
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py
index d6a70beb30a136..9a81e73b8da37a 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -187,28 +187,21 @@ def __init__(
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
- mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
)
+ # these special tokens are not part of the vocab.json, let's add them in the correct order
+
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -225,6 +218,19 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot
def vocab_size(self):
@@ -232,7 +238,9 @@ def vocab_size(self):
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot
def bpe(self, token):
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
index ebe39ed09f9a35..fdd490b12adcf9 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -149,6 +149,11 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
+ )
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
index 4acb87325666a1..61c56738ac4129 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -106,8 +106,6 @@ def __init__(
pad_token="__null__",
**kwargs,
):
- super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -116,6 +114,7 @@ def __init__(
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
@property
def vocab_size(self) -> int:
diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py
index 59e694c343c559..1d310fe3045fb0 100644
--- a/src/transformers/models/byt5/tokenization_byt5.py
+++ b/src/transformers/models/byt5/tokenization_byt5.py
@@ -16,7 +16,7 @@
import warnings
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
@@ -72,7 +72,7 @@ def __init__(
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"" for i in range(extra_ids)]
- elif extra_ids > 0 and additional_special_tokens is not None:
+ elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
# Check that we have the right number of extra_id special tokens
extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
@@ -82,38 +82,31 @@ def __init__(
" extra_ids tokens"
)
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
- eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-
+ pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
+ # we force left and right stripping for backward compatibility. The byt5tests depend on this.
+ eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
+ # unk token needs to be in the vocab with correct index
+ self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
+ self.offset = len(self._added_tokens_decoder)
+ self._utf_vocab_size = 2**8 # utf is 8 bits
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
- extra_ids=extra_ids,
- additional_special_tokens=additional_special_tokens,
+ extra_ids=0,
+ additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile:
**kwargs,
)
- self._extra_ids = extra_ids
-
- self._utf_vocab_size = 2**8 # utf is 8 bits
-
- # define special tokens dict
- self.special_tokens_encoder: Dict[int, str] = {
- self.pad_token: 0,
- self.eos_token: 1,
- self.unk_token: 2,
- }
- self._num_special_tokens = len(self.special_tokens_encoder)
- n = len(additional_special_tokens)
- for i, token in enumerate(additional_special_tokens):
- self.special_tokens_encoder[token] = self.vocab_size + i - n
- self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()}
-
@property
def vocab_size(self):
- return self._utf_vocab_size + self._num_special_tokens + self._extra_ids
+ return self._utf_vocab_size
+
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
@@ -209,34 +202,25 @@ def _tokenize(self, text: str) -> List[str]:
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
- if token in self.special_tokens_encoder:
- token_id = self.special_tokens_encoder[token]
- elif token in self.added_tokens_encoder:
- token_id = self.added_tokens_encoder[token]
- elif len(token) != 1:
- token_id = self.unk_token_id
+
+ if len(token) != 1:
+ token_id = None
else:
- token_id = ord(token) + self._num_special_tokens
+ token_id = ord(token) + self.offset
+
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
- if index in self.special_tokens_decoder:
- token = self.special_tokens_decoder[index]
- else:
- token = chr(index - self._num_special_tokens)
+ token = chr(index - self.offset)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
bstring = b""
for token in tokens:
- if token in self.special_tokens_decoder:
- tok_string = self.special_tokens_decoder[token].encode("utf-8")
- elif token in self.added_tokens_decoder:
- tok_string = self.special_tokens_decoder[token].encode("utf-8")
- elif token in self.special_tokens_encoder:
- tok_string = token.encode("utf-8")
+ if token in self.added_tokens_decoder:
+ tok_string = self.added_tokens_decoder[token].encode("utf-8")
elif token in self.added_tokens_encoder:
tok_string = token.encode("utf-8")
else:
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index 658dd1080b7122..f75a397755e34d 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -136,6 +136,29 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(str(vocab_file))
+ self.vocab_file = vocab_file
+
+ # HACK: These tokens were added by the author for an obscure reason as they were already part of the
+ # sentencepiece vocabulary (this is the case for and and ).
+ # In this case it is recommended to properly set the tokens by hand.
+ self._added_tokens_decoder = {
+ 0: AddedToken("NOTUSED"),
+ 1: AddedToken(pad_token),
+ 2: AddedToken("NOTUSED"),
+ 3: AddedToken(unk_token),
+ 4: AddedToken("NOTUSED"),
+ }
+
+ self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4
+
+ # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here
+ if "added_tokens_decoder" in kwargs:
+ # this is the only class that requires this unfortunately.....
+ # the reason is that the fast version has a whole.
+ kwargs["added_tokens_decoder"].update(self._added_tokens_decoder)
+
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -148,15 +171,83 @@ def __init__(
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
+
+ @property
+ def vocab_size(self):
+ # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning.
+ return len(self.sp_model)
+
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text: str) -> List[str]:
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ # specifi to camembert, both 3 and 4 point to the unk token.
+ if self.sp_model.PieceToId(token) == 0:
+ # Convert sentence piece unk token to fairseq unk token index
+ return self.unk_token_id
+ return self.fairseq_offset + self.sp_model.PieceToId(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ # TODO decode outputs do not match between fast and slow
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string.strip()
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(str(vocab_file))
- self.vocab_file = vocab_file
- # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
- # sentencepiece vocabulary (this is the case for and
- self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3}
- self.fairseq_offset = len(self.fairseq_tokens_to_ids)
- self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
- self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+ self.sp_model.Load(self.vocab_file)
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
@@ -233,81 +324,3 @@ def create_token_type_ids_from_sequences(
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
- @property
- def vocab_size(self):
- return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
-
- def get_vocab(self):
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
- vocab.update(self.added_tokens_encoder)
- return vocab
-
- def _tokenize(self, text: str) -> List[str]:
- return self.sp_model.encode(text, out_type=str)
-
- def _convert_token_to_id(self, token):
- """Converts a token (str) in an id using the vocab."""
- if token in self.fairseq_tokens_to_ids:
- return self.fairseq_tokens_to_ids[token]
- elif self.sp_model.PieceToId(token) == 0:
- # Convert sentence piece unk token to fairseq unk token index
- return self.unk_token_id
- return self.fairseq_offset + self.sp_model.PieceToId(token)
-
- def _convert_id_to_token(self, index):
- """Converts an index (integer) in a token (str) using the vocab."""
- if index in self.fairseq_ids_to_tokens:
- return self.fairseq_ids_to_tokens[index]
- return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
- def convert_tokens_to_string(self, tokens):
- """Converts a sequence of tokens (string) in a single string."""
- current_sub_tokens = []
- out_string = ""
- prev_is_special = False
- for token in tokens:
- # make sure that special tokens are not decoded using sentencepiece model
- if token in self.all_special_tokens:
- if not prev_is_special:
- out_string += " "
- out_string += self.sp_model.decode(current_sub_tokens) + token
- prev_is_special = True
- current_sub_tokens = []
- else:
- current_sub_tokens.append(token)
- prev_is_special = False
- out_string += self.sp_model.decode(current_sub_tokens)
- return out_string.strip()
-
- def __getstate__(self):
- state = self.__dict__.copy()
- state["sp_model"] = None
- return state
-
- def __setstate__(self, d):
- self.__dict__ = d
-
- # for backward compatibility
- if not hasattr(self, "sp_model_kwargs"):
- self.sp_model_kwargs = {}
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(self.vocab_file)
-
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
- if not os.path.isdir(save_directory):
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
- return
- out_vocab_file = os.path.join(
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
- )
-
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
- copyfile(self.vocab_file, out_vocab_file)
- elif not os.path.isfile(self.vocab_file):
- with open(out_vocab_file, "wb") as fi:
- content_spiece_model = self.sp_model.serialized_model_proto()
- fi.write(content_spiece_model)
-
- return (out_vocab_file,)
diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py
index 2fae9e1482bd32..25932ae75d2a87 100644
--- a/src/transformers/models/canine/tokenization_canine.py
+++ b/src/transformers/models/canine/tokenization_canine.py
@@ -33,7 +33,6 @@
# Below: Constants defining canonical codepoints for special, pseudo-characters.
# Copied from https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py
PAD = 0
-
CLS = 0xE000
SEP = 0xE001
BOS = 0xE002
@@ -97,18 +96,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- model_max_length=model_max_length,
- **kwargs,
- )
-
# Creates a mapping for looking up the IDs of special symbols.
self._special_codepoints: Dict[str, int] = {}
for codepoint, name in SPECIAL_CODEPOINTS.items():
@@ -122,10 +109,27 @@ def __init__(
self._unicode_vocab_size = UNICODE_VOCAB_SIZE
self._num_special_tokens = len(self._special_codepoints)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ model_max_length=model_max_length,
+ **kwargs,
+ )
+
@property
def vocab_size(self) -> int:
return self._unicode_vocab_size
+ def get_vocab(self):
+ vocab = {chr(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string (i.e. perform character splitting)."""
return list(text)
diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py
index 127480b90cad0f..388c455a43807a 100644
--- a/src/transformers/models/clip/tokenization_clip.py
+++ b/src/transformers/models/clip/tokenization_clip.py
@@ -312,16 +312,6 @@ def __init__(
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-
- super().__init__(
- errors=errors,
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- **kwargs,
- )
-
try:
import ftfy
@@ -348,6 +338,15 @@ def __init__(
re.IGNORECASE,
)
+ super().__init__(
+ errors=errors,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py
index 53a2d3577a1740..da1012095cfb23 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama.py
@@ -151,6 +151,17 @@ def __init__(
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self._prefix_token = prefix_token
+ self._middle_token = middle_token
+ self._suffix_token = suffix_token
+ self._eot_token = eot_token
+ self.fill_token = fill_token
+ self.suffix_first = suffix_first
+ self.sp_model = self.get_spm_processor()
+
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
@@ -169,16 +180,6 @@ def __init__(
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
- self.vocab_file = vocab_file
- self.add_bos_token = add_bos_token
- self.add_eos_token = add_eos_token
- self._prefix_token = prefix_token
- self._middle_token = middle_token
- self._suffix_token = suffix_token
- self._eot_token = eot_token
- self.fill_token = fill_token
- self.suffix_first = suffix_first
- self.sp_model = self.get_spm_processor()
@property
def unk_token_length(self):
diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py
index 14d79bb1cebec4..e5f0332a92da79 100644
--- a/src/transformers/models/codegen/tokenization_codegen.py
+++ b/src/transformers/models/codegen/tokenization_codegen.py
@@ -167,16 +167,6 @@ def __init__(
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
- super().__init__(
- errors=errors,
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- add_prefix_space=add_prefix_space,
- add_bos_token=add_bos_token,
- **kwargs,
- )
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -194,6 +184,16 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ add_bos_token=add_bos_token,
+ **kwargs,
+ )
@property
def vocab_size(self):
diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py
index 800848caaf1cc7..439beb7abb4d03 100644
--- a/src/transformers/models/convbert/tokenization_convbert.py
+++ b/src/transformers/models/convbert/tokenization_convbert.py
@@ -135,20 +135,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -164,7 +150,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py
index f509519271d4e8..67281b3cf185f8 100644
--- a/src/transformers/models/cpm/tokenization_cpm.py
+++ b/src/transformers/models/cpm/tokenization_cpm.py
@@ -38,6 +38,9 @@
class CpmTokenizer(PreTrainedTokenizer):
"""Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+
def __init__(
self,
vocab_file,
@@ -121,24 +124,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- do_lower_case=do_lower_case,
- remove_space=remove_space,
- keep_accents=keep_accents,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- additional_special_tokens=additional_special_tokens,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
- self._pad_token_type_id = 3
-
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
@@ -157,6 +142,24 @@ def __init__(
self.jieba = jieba
self.translator = str.maketrans(" \n", "\u2582\u2583")
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ keep_accents=keep_accents,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
+ self._pad_token_type_id = 3
+
@property
# Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
def vocab_size(self):
diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py
index 346f28fde66756..c10f48e2de282e 100644
--- a/src/transformers/models/cpmant/tokenization_cpmant.py
+++ b/src/transformers/models/cpmant/tokenization_cpmant.py
@@ -131,18 +131,6 @@ def __init__(
**kwargs,
):
requires_backends(self, ["jieba"])
- super().__init__(
- bod_token=bod_token,
- eod_token=eod_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- unk_token=unk_token,
- line_token=line_token,
- space_token=space_token,
- padding_side=padding_side,
- **kwargs,
- )
self.bod_token = bod_token
self.eod_token = eod_token
self.encoder = load_vocab(vocab_file)
@@ -155,7 +143,20 @@ def __init__(
self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
self.decoder = {v: k for k, v in self.encoder.items()}
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)
+
+ super().__init__(
+ bod_token=bod_token,
+ eod_token=eod_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ unk_token=unk_token,
+ line_token=line_token,
+ space_token=space_token,
+ padding_side=padding_side,
+ **kwargs,
+ )
@property
def bod_token_id(self):
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index 7a81bf8572f0c0..f00b50348048d6 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -139,8 +139,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
control_codes = CONTROL_CODES
def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
- super().__init__(unk_token=unk_token, **kwargs)
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -149,6 +147,7 @@ def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(unk_token=unk_token, **kwargs)
@property
def vocab_size(self):
diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py
index 880ed17d95ef28..55fe35a427eb1f 100644
--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -201,20 +201,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- add_bos_token=add_bos_token,
- **kwargs,
- )
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -233,6 +219,20 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ add_bos_token=add_bos_token,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
def vocab_size(self):
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
index b2a0d844f1625d..4d408252a2bd90 100644
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -20,9 +20,12 @@
import sentencepiece as sp
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+logger = logging.get_logger(__name__)
+
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model",
@@ -124,6 +127,18 @@ def __init__(
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.do_lower_case = do_lower_case
+ self.split_by_punct = split_by_punct
+ self.vocab_file = vocab_file
+ self._tokenizer = SPMTokenizer(
+ vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
+ )
+ unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False)
super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,
@@ -137,18 +152,7 @@ def __init__(
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
-
- if not os.path.isfile(vocab_file):
- raise ValueError(
- f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
- " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
- )
- self.do_lower_case = do_lower_case
- self.split_by_punct = split_by_punct
- self.vocab_file = vocab_file
- self._tokenizer = SPMTokenizer(
- vocab_file, self.all_special_tokens, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
- )
+ self._tokenizer.special_tokens = self.all_special_tokens
@property
def vocab_size(self):
@@ -374,6 +378,7 @@ def decode(self, tokens, start=-1, end=-1, raw_text=None):
text = "".join(words[word_start:word_end])
return text
+ # TODO add a deprecation cycle as this can have different behaviour from our API
def add_special_token(self, token):
if token not in self.special_tokens:
self.special_tokens.append(token)
@@ -383,6 +388,9 @@ def add_special_token(self, token):
return self.id(token)
def part_of_whole_word(self, token, is_bos=False):
+ logger.warning_once(
+ "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
+ )
if is_bos:
return True
if (
@@ -413,6 +421,9 @@ def sym(self, id):
return self.ids_to_tokens[id]
def id(self, sym):
+ logger.warning_once(
+ "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
+ )
return self.vocab[sym] if sym in self.vocab else 1
def _encode_as_pieces(self, text):
@@ -460,17 +471,6 @@ def split_to_words(self, text):
return words
- def _run_strip_accents(self, text):
- """Strips accents from a piece of text."""
- text = unicodedata.normalize("NFD", text)
- output = []
- for char in text:
- cat = unicodedata.category(char)
- if cat == "Mn":
- continue
- output.append(char)
- return "".join(output)
-
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
index de50c74b70bd02..d0904e3c931e40 100644
--- a/src/transformers/models/deprecated/retribert/tokenization_retribert.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py
@@ -132,20 +132,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -161,7 +147,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
diff --git a/src/transformers/models/deprecated/tapex/tokenization_tapex.py b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
index d0cd49212c6dc0..a5ee093c56bd26 100644
--- a/src/transformers/models/deprecated/tapex/tokenization_tapex.py
+++ b/src/transformers/models/deprecated/tapex/tokenization_tapex.py
@@ -296,23 +296,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- vocab_file=vocab_file,
- merges_file=merges_file,
- do_lower_case=do_lower_case,
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- max_cell_length=max_cell_length,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -331,6 +314,24 @@ def __init__(
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# additional properties
+
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ do_lower_case=do_lower_case,
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ max_cell_length=max_cell_length,
+ **kwargs,
+ )
+
self.max_cell_length = max_cell_length
self.table_linearize = IndexedRowTableLinearize()
diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py
index 5e96e4972d3fac..014c41d1243b6f 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -149,20 +149,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -178,7 +164,21 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py
index aabeccba7d630e..fb9bf9dfa13cba 100644
--- a/src/transformers/models/electra/tokenization_electra.py
+++ b/src/transformers/models/electra/tokenization_electra.py
@@ -152,20 +152,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -181,7 +167,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/ernie_m/tokenization_ernie_m.py b/src/transformers/models/ernie_m/tokenization_ernie_m.py
index 1acc113dca5fb7..b1b8cc845024c8 100644
--- a/src/transformers/models/ernie_m/tokenization_ernie_m.py
+++ b/src/transformers/models/ernie_m/tokenization_ernie_m.py
@@ -112,6 +112,19 @@ def __init__(
# is included in the raw text, there should be a match in a non-normalized sentence.
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ self.do_lower_case = do_lower_case
+ self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(sentencepiece_model_ckpt)
+
+ # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+ if vocab_file is not None:
+ self.vocab = self.load_vocab(filepath=vocab_file)
+ else:
+ self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
+ self.reverse_vocab = {v: k for k, v in self.vocab.items()}
+
super().__init__(
do_lower_case=do_lower_case,
unk_token=unk_token,
@@ -124,17 +137,6 @@ def __init__(
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
- self.do_lower_case = do_lower_case
- self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(sentencepiece_model_ckpt)
-
- # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
- if vocab_file is not None:
- self.vocab = self.load_vocab(filepath=vocab_file)
- else:
- self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
- self.reverse_vocab = {v: k for k, v in self.vocab.items()}
def get_offset_mapping(self, text):
if text is None:
diff --git a/src/transformers/models/esm/tokenization_esm.py b/src/transformers/models/esm/tokenization_esm.py
index f19d0de58a9471..065eaae1d50520 100644
--- a/src/transformers/models/esm/tokenization_esm.py
+++ b/src/transformers/models/esm/tokenization_esm.py
@@ -64,17 +64,23 @@ def __init__(
eos_token="",
**kwargs,
):
- super().__init__(**kwargs)
self.all_tokens = load_vocab_file(vocab_file)
self._id_to_token = dict(enumerate(self.all_tokens))
self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
- self.unk_token = unk_token
- self.cls_token = cls_token
- self.pad_token = pad_token
- self.mask_token = mask_token
- self.eos_token = eos_token
+ super().__init__(
+ unk_token=unk_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ eos_token=eos_token,
+ **kwargs,
+ )
+
+ # TODO, all the tokens are added? But they are also part of the vocab... bit strange.
+ # none of them are special, but they all need special splitting.
+
self.unique_no_split_tokens = self.all_tokens
- self._create_trie(self.unique_no_split_tokens)
+ self._update_trie(self.unique_no_split_tokens)
def _convert_id_to_token(self, index: int) -> str:
return self._id_to_token.get(index, self.unk_token)
diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py
index ea3f1c8bfd58b2..010515e9d02e46 100644
--- a/src/transformers/models/flaubert/tokenization_flaubert.py
+++ b/src/transformers/models/flaubert/tokenization_flaubert.py
@@ -258,19 +258,6 @@ def __init__(
self.do_lowercase = do_lowercase
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- additional_special_tokens=additional_special_tokens,
- lang2id=lang2id,
- id2lang=id2lang,
- **kwargs,
- )
-
try:
import sacremoses
except ImportError:
@@ -303,6 +290,19 @@ def __init__(
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ lang2id=lang2id,
+ id2lang=id2lang,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
def do_lower_case(self):
diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py
index 7324f509a8d3df..cfa54fcecfb517 100644
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -15,7 +15,6 @@
""" Tokenization classes for FNet model."""
import os
-import re
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
@@ -117,14 +116,19 @@ def __init__(
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
- mask_token = (
- AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
- if isinstance(mask_token, str)
- else mask_token
- )
-
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -138,14 +142,6 @@ def __init__(
**kwargs,
)
- self.do_lower_case = do_lower_case
- self.remove_space = remove_space
- self.keep_accents = keep_accents
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return len(self.sp_model)
@@ -237,48 +233,21 @@ def _decode(
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
- spaces_between_special_tokens: bool = True,
+ spaces_between_special_tokens: bool = False,
**kwargs,
) -> str:
- self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-
- filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
- # To avoid mixing byte-level and unicode for byte-level BPT
- # we need to build string separately for added tokens and byte-level tokens
- # cf. https://github.com/huggingface/transformers/issues/1133
- sub_texts = []
- current_sub_text = []
- for token in filtered_tokens:
- if skip_special_tokens and token in self.all_special_ids:
- continue
- if token in self.added_tokens_encoder:
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
- current_sub_text = []
- sub_texts.append(token)
- else:
- current_sub_text.append(token)
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
+ text = super()._decode(
+ token_ids=token_ids,
+ skip_special_tokens=skip_special_tokens,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ spaces_between_special_tokens=spaces_between_special_tokens,
+ **kwargs,
+ )
# Mimic the behavior of the Rust tokenizer:
# No space after
- if spaces_between_special_tokens:
- text = re.sub(r"() ", r"\1", " ".join(sub_texts))
- else:
- text = "".join(sub_texts)
-
- clean_up_tokenization_spaces = (
- clean_up_tokenization_spaces
- if clean_up_tokenization_spaces is not None
- else self.clean_up_tokenization_spaces
- )
- if clean_up_tokenization_spaces:
- clean_text = self.clean_up_tokenization(text)
- return clean_text
- else:
- return text
+ if not spaces_between_special_tokens:
+ text = text.replace(" ", "")
+ return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py
index f18665d87d5c19..2179751e558e60 100644
--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ b/src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -108,11 +108,9 @@ def __init__(
):
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
- mask_token = (
- AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
- if isinstance(mask_token, str)
- else mask_token
- )
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
super().__init__(
vocab_file,
diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py
index 523f2ed5885070..168aa14ead7817 100644
--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -197,19 +197,6 @@ def __init__(
pad_token="",
**kwargs,
):
- super().__init__(
- langs=langs,
- src_vocab_file=src_vocab_file,
- tgt_vocab_file=tgt_vocab_file,
- merges_file=merges_file,
- do_lower_case=do_lower_case,
- unk_token=unk_token,
- bos_token=bos_token,
- sep_token=sep_token,
- pad_token=pad_token,
- **kwargs,
- )
-
try:
import sacremoses
except ImportError:
@@ -250,6 +237,18 @@ def __init__(
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ langs=langs,
+ src_vocab_file=src_vocab_file,
+ tgt_vocab_file=tgt_vocab_file,
+ merges_file=merges_file,
+ do_lower_case=do_lower_case,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ **kwargs,
+ )
# hack override
def get_vocab(self) -> Dict[str, int]:
diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py
index 37a913d0a01bae..a0f9ced1b7406b 100644
--- a/src/transformers/models/funnel/tokenization_funnel.py
+++ b/src/transformers/models/funnel/tokenization_funnel.py
@@ -157,22 +157,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- bos_token=bos_token,
- eos_token=eos_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -188,7 +172,23 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index 278ff69032585c..21c2cdf382e41d 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -170,16 +170,7 @@ def __init__(
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
- super().__init__(
- errors=errors,
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- add_prefix_space=add_prefix_space,
- add_bos_token=add_bos_token,
- **kwargs,
- )
+
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -198,6 +189,17 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ add_bos_token=add_bos_token,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index 6ac2f214a16568..7fca57d4c14c4e 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -127,14 +127,6 @@ def __init__(
do_clean_text=False,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- pad_token=pad_token,
- bos_token=bos_token,
- eos_token=eos_token,
- do_clean_text=do_clean_text,
- **kwargs,
- )
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -150,6 +142,14 @@ def __init__(
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
+ super().__init__(
+ unk_token=unk_token,
+ pad_token=pad_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ do_clean_text=do_clean_text,
+ **kwargs,
+ )
@property
def vocab_size(self):
diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
index 4874ba732245f0..a1a5c71e96640a 100644
--- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
+++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -103,7 +103,7 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
- model_input_names = ["input_ids", "attention_mask"]
+ model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
def __init__(
self,
@@ -138,18 +138,6 @@ def __init__(
pad_token = "" if pad_token is None else pad_token
bos_token = "" if bos_token is None else bos_token
- super().__init__(
- do_lower_case=do_lower_case,
- remove_space=remove_space,
- keep_accents=keep_accents,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- pad_token=pad_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
@@ -168,6 +156,18 @@ def __init__(
f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
)
+ super().__init__(
+ do_lower_case=do_lower_case,
+ remove_space=remove_space,
+ keep_accents=keep_accents,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
def __getstate__(self):
state = self.__dict__.copy()
diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
index c567b6b6003fff..cd05ccde9ff248 100644
--- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
+++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -166,15 +166,6 @@ def __init__(
do_clean_text=False,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- pad_token=pad_token,
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- do_clean_text=do_clean_text,
- **kwargs,
- )
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -191,6 +182,16 @@ def __init__(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
+ super().__init__(
+ unk_token=unk_token,
+ pad_token=pad_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ do_clean_text=do_clean_text,
+ **kwargs,
+ )
+
@property
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
def vocab_size(self):
diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py
index 91ce0dcca58463..1747a59c6fc2fa 100644
--- a/src/transformers/models/herbert/tokenization_herbert.py
+++ b/src/transformers/models/herbert/tokenization_herbert.py
@@ -334,21 +334,6 @@ def __init__(
id2lang=None,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- additional_special_tokens=additional_special_tokens,
- lang2id=lang2id,
- id2lang=id2lang,
- do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
- tokenizer_file=None,
- **kwargs,
- )
-
try:
import sacremoses
except ImportError:
@@ -383,6 +368,21 @@ def __init__(
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ lang2id=lang2id,
+ id2lang=id2lang,
+ do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
+ tokenizer_file=None,
+ **kwargs,
+ )
+
self.bert_pre_tokenizer = BasicTokenizer(
do_lower_case=False,
never_split=self.all_special_tokens,
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 9a4a37b871e485..dcf47f46f7de56 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -128,16 +128,10 @@ def __init__(
**kwargs,
):
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- super().__init__(
- unk_token=unk_token,
- n_genres=n_genres,
- version=version,
- max_n_lyric_tokens=max_n_lyric_tokens,
- **kwargs,
- )
self.version = version
self.max_n_lyric_tokens = max_n_lyric_tokens
self.n_genres = n_genres
+ self._added_tokens_decoder = {0: unk_token}
with open(artists_file, encoding="utf-8") as vocab_handle:
self.artists_encoder = json.load(vocab_handle)
@@ -157,13 +151,24 @@ def __init__(
self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ n_genres=n_genres,
+ version=version,
+ max_n_lyric_tokens=max_n_lyric_tokens,
+ **kwargs,
+ )
@property
def vocab_size(self):
return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
def get_vocab(self):
- return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
+ return {
+ "artists_encoder": self.artists_encoder,
+ "genres_encoder": self.genres_encoder,
+ "lyrics_encoder": self.lyrics_encoder,
+ }
def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
"""Converts the artist, genre and lyrics tokens to their index using the vocabulary.
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py
index b518874224a42c..de6bc4de953d9e 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -134,20 +134,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -163,7 +149,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index 1799cc29211419..6c0b2db4a9ef6d 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -244,27 +244,6 @@ def __init__(
additional_special_tokens: Optional[List[str]] = None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- cls_token_box=cls_token_box,
- sep_token_box=sep_token_box,
- pad_token_box=pad_token_box,
- pad_token_label=pad_token_label,
- only_label_first_subword=only_label_first_subword,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- model_max_length=model_max_length,
- additional_special_tokens=additional_special_tokens,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -280,7 +259,7 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# additional properties
self.cls_token_box = cls_token_box
@@ -288,6 +267,26 @@ def __init__(
self.pad_token_box = pad_token_box
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ cls_token_box=cls_token_box,
+ sep_token_box=sep_token_box,
+ pad_token_box=pad_token_box,
+ pad_token_label=pad_token_label,
+ only_label_first_subword=only_label_first_subword,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ model_max_length=model_max_length,
+ additional_special_tokens=additional_special_tokens,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
index b9c0ab127d42ca..199b906eedcc58 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -303,24 +303,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- cls_token_box=cls_token_box,
- sep_token_box=sep_token_box,
- pad_token_box=pad_token_box,
- pad_token_label=pad_token_label,
- only_label_first_subword=only_label_first_subword,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -344,6 +326,24 @@ def __init__(
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ cls_token_box=cls_token_box,
+ sep_token_box=sep_token_box,
+ pad_token_box=pad_token_box,
+ pad_token_label=pad_token_label,
+ only_label_first_subword=only_label_first_subword,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
def vocab_size(self):
@@ -351,7 +351,9 @@ def vocab_size(self):
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe
def bpe(self, token):
@@ -539,7 +541,7 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
if (
(is_split_into_words or add_prefix_space)
and (len(text) > 0 and not text[0].isspace())
- and sum([text.startswith(no_split_token) for no_split_token in self.unique_no_split_tokens]) == 0
+ and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0
):
text = " " + text
return (text, kwargs)
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 47c5315457b4fa..230be65ee62e47 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -254,23 +254,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- cls_token_box=cls_token_box,
- sep_token_box=sep_token_box,
- pad_token_box=pad_token_box,
- pad_token_label=pad_token_label,
- only_label_first_subword=only_label_first_subword,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -297,6 +280,23 @@ def __init__(
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ cls_token_box=cls_token_box,
+ sep_token_box=sep_token_box,
+ pad_token_box=pad_token_box,
+ pad_token_label=pad_token_label,
+ only_label_first_subword=only_label_first_subword,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index 1cdb52430117c6..bc83680b219f72 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -197,21 +197,10 @@ def __init__(
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
+ # TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
+ # Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -228,6 +217,19 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.bart.tokenization_bart.BartTokenizer.vocab_size
def vocab_size(self):
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index 51b8ab4aaaf03a..e7ef2fff737c1f 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -152,6 +152,7 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index 8db2f9970e199a..4e2e0e41db1a04 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -122,20 +122,7 @@ def __init__(
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- pad_token=pad_token,
- add_bos_token=add_bos_token,
- add_eos_token=add_eos_token,
- sp_model_kwargs=self.sp_model_kwargs,
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
- use_default_system_prompt=use_default_system_prompt,
- spaces_between_special_tokens=spaces_between_special_tokens,
- legacy=legacy,
- **kwargs,
- )
+
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
@@ -151,9 +138,23 @@ def __init__(
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.use_default_system_prompt = use_default_system_prompt
-
self.sp_model = self.get_spm_processor()
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ use_default_system_prompt=use_default_system_prompt,
+ spaces_between_special_tokens=spaces_between_special_tokens,
+ legacy=legacy,
+ **kwargs,
+ )
+
@property
def unk_token_length(self):
return len(self.sp_model.encode(str(self.unk_token)))
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index 282a0f06740eaa..157bd4cdb852a3 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -33,6 +33,14 @@
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
+ },
+ "tokenizer_file": {
+ "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
+ },
+}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<>\n", "\n<>\n\n"
@@ -93,6 +101,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
"""
vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = LlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py
index fea949658abcd1..7661634a000998 100644
--- a/src/transformers/models/longformer/tokenization_longformer.py
+++ b/src/transformers/models/longformer/tokenization_longformer.py
@@ -212,28 +212,21 @@ def __init__(
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
- mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
)
+ # these special tokens are not part of the vocab.json, let's add them in the correct order
+
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -250,12 +243,27 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def bpe(self, token):
if token in self.cache:
diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py
index 1460f2f2cc2f10..32c6f6c2deef36 100644
--- a/src/transformers/models/longformer/tokenization_longformer_fast.py
+++ b/src/transformers/models/longformer/tokenization_longformer_fast.py
@@ -192,6 +192,11 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
+ )
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 8b47ced1d3175f..e8ad725d050b1c 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -326,28 +326,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- task=task,
- max_entity_length=32,
- max_mention_length=30,
- entity_token_1="",
- entity_token_2="",
- entity_unk_token=entity_unk_token,
- entity_pad_token=entity_pad_token,
- entity_mask_token=entity_mask_token,
- entity_mask2_token=entity_mask2_token,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -407,6 +385,28 @@ def __init__(
self.max_mention_length = max_mention_length
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ task=task,
+ max_entity_length=32,
+ max_mention_length=30,
+ entity_token_1="",
+ entity_token_2="",
+ entity_unk_token=entity_unk_token,
+ entity_pad_token=entity_pad_token,
+ entity_mask_token=entity_mask_token,
+ entity_mask2_token=entity_mask2_token,
+ **kwargs,
+ )
+
@property
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Luke, RoBERTa->LUKE
def vocab_size(self):
@@ -414,7 +414,9 @@ def vocab_size(self):
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Luke, RoBERTa->LUKE
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE
def bpe(self, token):
diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py
index e651b8f4454a11..17ff0ff8e7f82d 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert.py
@@ -126,20 +126,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -155,7 +141,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index 82f5e3a47b36ee..1346af81412add 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -150,26 +150,11 @@ def __init__(
fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}
- kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
- kwargs["additional_special_tokens"] += [
- self.get_lang_token(lang_code)
- for lang_code in fairseq_language_code
- if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
- ]
-
- super().__init__(
- src_lang=src_lang,
- tgt_lang=tgt_lang,
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- unk_token=unk_token,
- pad_token=pad_token,
- language_codes=language_codes,
- sp_model_kwargs=self.sp_model_kwargs,
- num_madeup_words=num_madeup_words,
- **kwargs,
- )
+ additional_special_tokens = kwargs.pop("additional_special_tokens", [])
+ for lang_code in fairseq_language_code:
+ token = self.get_lang_token(lang_code)
+ if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
+ additional_special_tokens.append(token)
self.vocab_file = vocab_file
self.encoder = load_json(vocab_file)
@@ -188,13 +173,33 @@ def __init__(
self._src_lang = src_lang if src_lang is not None else "en"
self.tgt_lang = tgt_lang
self.cur_lang_id = self.get_lang_id(self._src_lang)
- self.set_src_lang_special_tokens(self._src_lang)
self.num_madeup_words = num_madeup_words
+ super().__init__(
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ language_codes=language_codes,
+ sp_model_kwargs=self.sp_model_kwargs,
+ additional_special_tokens=additional_special_tokens,
+ num_madeup_words=num_madeup_words,
+ **kwargs,
+ )
+ self.set_src_lang_special_tokens(self._src_lang)
+
@property
def vocab_size(self) -> int:
- return len(self.encoder) + len(self.lang_token_to_id)
+ return len(self.encoder)
+
+ def get_vocab(self) -> Dict:
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
@property
def src_lang(self) -> str:
@@ -290,11 +295,6 @@ def build_inputs_with_special_tokens(
# We don't expect to process pairs, but leave the pair logic for API consistency
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
- def get_vocab(self) -> Dict:
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
- vocab.update(self.added_tokens_encoder)
- return vocab
-
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 2736b03a012f86..f064b49a8397b9 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -144,26 +144,13 @@ def __init__(
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id
- source_lang=source_lang,
- target_lang=target_lang,
- unk_token=unk_token,
- eos_token=eos_token,
- pad_token=pad_token,
- model_max_length=model_max_length,
- sp_model_kwargs=self.sp_model_kwargs,
- target_vocab_file=target_vocab_file,
- separate_vocabs=separate_vocabs,
- **kwargs,
- )
assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"
self.separate_vocabs = separate_vocabs
self.encoder = load_json(vocab)
- if self.unk_token not in self.encoder:
- raise KeyError(" token must be in vocab")
- assert self.pad_token in self.encoder
+ if unk_token not in self.encoder:
+ raise KeyError(" token must be in the vocab")
+ assert pad_token in self.encoder
if separate_vocabs:
self.target_encoder = load_json(target_vocab_file)
@@ -187,6 +174,20 @@ def __init__(
self._setup_normalizer()
+ super().__init__(
+ # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id
+ source_lang=source_lang,
+ target_lang=target_lang,
+ unk_token=unk_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ model_max_length=model_max_length,
+ sp_model_kwargs=self.sp_model_kwargs,
+ target_vocab_file=target_vocab_file,
+ separate_vocabs=separate_vocabs,
+ **kwargs,
+ )
+
def _setup_normalizer(self):
try:
from sacremoses import MosesPunctNormalizer
diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py
index 9d438602864645..24fa4b7763a9e1 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm.py
@@ -232,27 +232,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- vocab_file=vocab_file,
- merges_file=merges_file,
- tags_dict=tags_dict,
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- max_depth=max_depth,
- max_width=max_width,
- pad_width=pad_width,
- pad_token_label=pad_token_label,
- only_label_first_subword=only_label_first_subword,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
@@ -279,6 +258,28 @@ def __init__(
self.pad_tag_id = self.unk_tag_id + 1
self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth
self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth
+
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ tags_dict=tags_dict,
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ max_depth=max_depth,
+ max_width=max_width,
+ pad_width=pad_width,
+ pad_token_label=pad_token_label,
+ only_label_first_subword=only_label_first_subword,
+ **kwargs,
+ )
+
self.pad_token_label = pad_token_label
self.only_label_first_subword = only_label_first_subword
@@ -312,7 +313,9 @@ def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = self.encoder.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def bpe(self, token):
if token in self.cache:
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
index 0010c21cdce58b..a0933631b65b7a 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -26,6 +26,7 @@
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
+ AddedToken,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
@@ -182,6 +183,16 @@ def __init__(
trim_offsets=False,
**kwargs,
):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index 0c74175e33220e..933074fd5d85bd 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -101,22 +101,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- tokenizer_file=None,
- src_lang=src_lang,
- tgt_lang=tgt_lang,
- additional_special_tokens=additional_special_tokens,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -142,14 +126,30 @@ def __init__(
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
- self._additional_special_tokens = list(self.lang_code_to_id.keys())
+ _additional_special_tokens = list(self.lang_code_to_id.keys())
if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
- self._additional_special_tokens.extend(
- [t for t in additional_special_tokens if t not in self._additional_special_tokens]
+ _additional_special_tokens.extend(
+ [t for t in additional_special_tokens if t not in _additional_special_tokens]
)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ tokenizer_file=None,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ additional_special_tokens=_additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.tgt_lang = tgt_lang
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index ff74739afcdf6b..ed0d0de9c8642c 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -112,6 +112,14 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+ _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
+
+ if additional_special_tokens is not None:
+ # Only add those special tokens if they are not already there.
+ _additional_special_tokens.extend(
+ [t for t in additional_special_tokens if t not in _additional_special_tokens]
+ )
+
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
@@ -124,21 +132,11 @@ def __init__(
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
- additional_special_tokens=additional_special_tokens,
+ additional_special_tokens=_additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
-
- _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
-
- if additional_special_tokens is not None:
- # Only add those special tokens if they are not already there.
- _additional_special_tokens.extend(
- [t for t in additional_special_tokens if t not in _additional_special_tokens]
- )
-
- self.add_special_tokens({"additional_special_tokens": _additional_special_tokens})
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index 628be52479d0c3..e2cffc57ad3380 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -137,19 +137,6 @@ def __init__(
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
]
- super().__init__(
- src_lang=src_lang,
- tgt_lang=tgt_lang,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -176,6 +163,19 @@ def __init__(
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+ super().__init__(
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
self._src_lang = src_lang if src_lang is not None else "en_XX"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
self.tgt_lang = tgt_lang
diff --git a/src/transformers/models/mgp_str/tokenization_mgp_str.py b/src/transformers/models/mgp_str/tokenization_mgp_str.py
index 9d4fddcc7e838c..e267491c8613bb 100644
--- a/src/transformers/models/mgp_str/tokenization_mgp_str.py
+++ b/src/transformers/models/mgp_str/tokenization_mgp_str.py
@@ -62,6 +62,9 @@ class MgpstrTokenizer(PreTrainedTokenizer):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs):
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.vocab = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.vocab.items()}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
@@ -70,16 +73,14 @@ def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s
**kwargs,
)
- with open(vocab_file, encoding="utf-8") as vocab_handle:
- self.vocab = json.load(vocab_handle)
- self.decoder = {v: k for k, v in self.vocab.items()}
-
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
- return dict(self.vocab, **self.added_tokens_encoder)
+ vocab = dict(self.vocab).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def _tokenize(self, text):
"""Tokenize a string."""
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index d1fdf798a9e35b..028de5d4f79c8c 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -272,32 +272,11 @@ def __init__(
if isinstance(entity_token_2, str)
else entity_token_2
)
- kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
- kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
+ additional_special_tokens = kwargs.pop("additional_special_tokens", [])
+ additional_special_tokens += [entity_token_1, entity_token_2]
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- sp_model_kwargs=self.sp_model_kwargs,
- task=task,
- max_entity_length=max_entity_length,
- max_mention_length=max_mention_length,
- entity_token_1=entity_token_1,
- entity_token_2=entity_token_2,
- entity_unk_token=entity_unk_token,
- entity_pad_token=entity_pad_token,
- entity_mask_token=entity_mask_token,
- entity_mask2_token=entity_mask2_token,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -345,6 +324,65 @@ def __init__(
self.max_mention_length = max_mention_length
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ task=task,
+ max_entity_length=max_entity_length,
+ max_mention_length=max_mention_length,
+ entity_token_1=entity_token_1,
+ entity_token_2=entity_token_2,
+ entity_unk_token=entity_unk_token,
+ entity_pad_token=entity_pad_token,
+ entity_mask_token=entity_mask_token,
+ entity_mask2_token=entity_mask2_token,
+ additional_special_tokens=additional_special_tokens,
+ **kwargs,
+ )
+
+ @property
+ # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size
+ def vocab_size(self):
+ return len(self.sp_model) + self.fairseq_offset + 1 # Add the token
+
+ # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize
+ def _tokenize(self, text: str) -> List[str]:
+ # TODO check if the t5/llama PR also applies here
+ return self.sp_model.encode(text, out_type=str)
+
+ # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ if token in self.fairseq_tokens_to_ids:
+ return self.fairseq_tokens_to_ids[token]
+ spm_id = self.sp_model.PieceToId(token)
+
+ # Need to return unknown token if the SP model returned 0
+ return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ if index in self.fairseq_ids_to_tokens:
+ return self.fairseq_ids_to_tokens[index]
+ return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
@@ -1591,39 +1629,3 @@ def create_token_type_ids_from_sequences(
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
- @property
- # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size
- def vocab_size(self):
- return len(self.sp_model) + self.fairseq_offset + 1 # Add the token
-
- # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab
- def get_vocab(self):
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
- vocab.update(self.added_tokens_encoder)
- return vocab
-
- # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize
- def _tokenize(self, text: str) -> List[str]:
- return self.sp_model.encode(text, out_type=str)
-
- # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id
- def _convert_token_to_id(self, token):
- """Converts a token (str) in an id using the vocab."""
- if token in self.fairseq_tokens_to_ids:
- return self.fairseq_tokens_to_ids[token]
- spm_id = self.sp_model.PieceToId(token)
-
- # Need to return unknown token if the SP model returned 0
- return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
- def _convert_id_to_token(self, index):
- """Converts an index (integer) in a token (str) using the vocab."""
- if index in self.fairseq_ids_to_tokens:
- return self.fairseq_ids_to_tokens[index]
- return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
- def convert_tokens_to_string(self, tokens):
- """Converts a sequence of tokens (strings for sub-words) in a single string."""
- out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
- return out_string
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py
index 389e38bce61933..398f054a992657 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -124,20 +124,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -153,7 +139,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index f1347da08a3f95..21c3555c057749 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -157,22 +157,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -188,7 +172,23 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
@@ -199,7 +199,9 @@ def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
- return dict(self.vocab, **self.added_tokens_encoder)
+ vocab = self.vocab.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def _tokenize(self, text):
split_tokens = []
diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
index 82d8ffec08d910..1c9b1d5922278b 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
@@ -126,6 +126,16 @@ def __init__(
strip_accents=None,
**kwargs,
):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py
index 2d497c23d1300c..c897cbea30d928 100644
--- a/src/transformers/models/mvp/tokenization_mvp.py
+++ b/src/transformers/models/mvp/tokenization_mvp.py
@@ -193,19 +193,6 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
- )
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -222,12 +209,27 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = self.encoder.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def bpe(self, token):
if token in self.cache:
diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py
index fd6abd1700205b..afe2a0a89e2a03 100644
--- a/src/transformers/models/mvp/tokenization_mvp_fast.py
+++ b/src/transformers/models/mvp/tokenization_mvp_fast.py
@@ -153,6 +153,15 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py
index 58a02a7af75f24..ea77f10ea578ae 100644
--- a/src/transformers/models/nllb/tokenization_nllb.py
+++ b/src/transformers/models/nllb/tokenization_nllb.py
@@ -149,23 +149,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.legacy_behaviour = legacy_behaviour
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- tokenizer_file=tokenizer_file,
- src_lang=src_lang,
- tgt_lang=tgt_lang,
- additional_special_tokens=additional_special_tokens,
- sp_model_kwargs=self.sp_model_kwargs,
- legacy_behaviour=legacy_behaviour,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -190,16 +173,35 @@ def __init__(
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
- self._additional_special_tokens = list(self.lang_code_to_id.keys())
+
+ self._src_lang = src_lang if src_lang is not None else "eng_Latn"
+ self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+
+ _additional_special_tokens = list(self.lang_code_to_id.keys())
if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
- self._additional_special_tokens.extend(
- [t for t in additional_special_tokens if t not in self._additional_special_tokens]
+ _additional_special_tokens.extend(
+ [t for t in additional_special_tokens if t not in _additional_special_tokens]
)
- self._src_lang = src_lang if src_lang is not None else "eng_Latn"
- self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ tokenizer_file=tokenizer_file,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ additional_special_tokens=_additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ legacy_behaviour=legacy_behaviour,
+ **kwargs,
+ )
+
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py
index 59e67c4bff9acd..7ab11c8cc00a06 100644
--- a/src/transformers/models/nllb/tokenization_nllb_fast.py
+++ b/src/transformers/models/nllb/tokenization_nllb_fast.py
@@ -157,6 +157,15 @@ def __init__(
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.legacy_behaviour = legacy_behaviour
+
+ _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
+
+ if additional_special_tokens is not None:
+ # Only add those special tokens if they are not already there.
+ _additional_special_tokens.extend(
+ [t for t in additional_special_tokens if t not in _additional_special_tokens]
+ )
+
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
@@ -169,22 +178,13 @@ def __init__(
mask_token=mask_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
- additional_special_tokens=additional_special_tokens,
+ additional_special_tokens=_additional_special_tokens,
legacy_behaviour=legacy_behaviour,
**kwargs,
)
self.vocab_file = vocab_file
- _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
-
- if additional_special_tokens is not None:
- # Only add those special tokens if they are not already there.
- _additional_special_tokens.extend(
- [t for t in additional_special_tokens if t not in _additional_special_tokens]
- )
-
- self.add_special_tokens({"additional_special_tokens": _additional_special_tokens})
self.lang_code_to_id = {
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
}
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index 0a7f93a7b2de1c..cfdeb3207a6d96 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -269,8 +269,6 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask"]
def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
- super().__init__(unk_token=unk_token, **kwargs)
-
try:
import ftfy
from spacy.lang.en import English
@@ -292,6 +290,8 @@ def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(unk_token=unk_token, **kwargs)
+
@property
def do_lower_case(self):
return True
diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py
index 814602fac88d0d..3b6a461d81d0cd 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -18,7 +18,7 @@
import sentencepiece as spm
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
@@ -38,6 +38,7 @@
logger = logging.get_logger(__name__)
+# TODO ArthurZ refactor this to only use the added_tokens_encoder
class PegasusTokenizer(PreTrainedTokenizer):
r"""
Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
@@ -95,8 +96,6 @@ class PegasusTokenizer(PreTrainedTokenizer):
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
- vocab_files_names = VOCAB_FILES_NAMES
-
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -122,7 +121,6 @@ def __init__(
f"additional_special_tokens should be of type {type(list)}, but is"
f" {type(additional_special_tokens)}"
)
-
additional_special_tokens_extended = (
([mask_token_sent] + additional_special_tokens)
if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
@@ -140,10 +138,27 @@ def __init__(
)
additional_special_tokens = additional_special_tokens_extended
else:
+ additional_special_tokens_extended = []
additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
additional_special_tokens += [f"" for i in range(2, self.offset)]
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.mask_token_sent = mask_token_sent
+ self.vocab_file = vocab_file
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
+ self._added_tokens_decoder = {
+ 0: AddedToken(str(pad_token), lstrip=True, rstrip=True),
+ 1: AddedToken(str(eos_token), lstrip=True, rstrip=True),
+ }
+
+ if self.mask_token_sent is not None:
+ self._added_tokens_decoder[2] = AddedToken(mask_token_sent)
+ self._added_tokens_decoder[3] = AddedToken(str(mask_token))
+
+ for i in range(1, self.offset - 1):
+ self._added_tokens_decoder[len(self._added_tokens_decoder)] = AddedToken(f"")
super().__init__(
eos_token=eos_token,
@@ -156,31 +171,6 @@ def __init__(
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
- self.mask_token_sent = mask_token_sent
- self.vocab_file = vocab_file
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
- # add special tokens to encoder dict
- self.encoder: Dict[int, str] = {
- 0: self.pad_token,
- 1: self.eos_token,
- }
-
- if self.mask_token_sent is not None:
- self.encoder.update(
- {
- 2: self.mask_token_sent,
- 3: self.mask_token,
- }
- )
-
- if self.offset > 0:
- # entries 2-104 are only used for pretraining and called , , unk_2, ...unk_102
- # mask_token_sent is already added to list -> so start at 1
- self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
-
- self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
@property
def vocab_size(self) -> int:
@@ -212,21 +202,14 @@ def _tokenize(self, text: str) -> List[str]:
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) to an id using the vocab."""
- if token in self.decoder:
- return self.decoder[token]
- elif token in self.added_tokens_decoder:
- return self.added_tokens_decoder[token]
sp_id = self.sp_model.piece_to_id(token)
return sp_id + self.offset
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) to a token (str) using the vocab."""
- if index in self.encoder:
- return self.encoder[index]
- elif index in self.added_tokens_encoder:
- return self.added_tokens_encoder[index]
- else:
- token = self.sp_model.IdToPiece(index - self.offset)
+ if index < self.offset:
+ return self.sp_model.IdToPiece(index)
+ token = self.sp_model.IdToPiece(index - self.offset)
return token
def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py
index cbfd9e64150243..b4ec1e378e5671 100644
--- a/src/transformers/models/perceiver/tokenization_perceiver.py
+++ b/src/transformers/models/perceiver/tokenization_perceiver.py
@@ -75,6 +75,18 @@ def __init__(
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ self._utf_vocab_size = 2**8 # utf is 8 bits
+
+ # Since these tokens are not part of the vocabulary, we manually add them
+ self._added_tokens_decoder: Dict[str, int] = {
+ 0: pad_token,
+ 1: bos_token,
+ 2: eos_token,
+ 3: mask_token,
+ 4: cls_token,
+ 5: sep_token,
+ }
+ self._num_special_tokens = len(self._added_tokens_decoder)
super().__init__(
pad_token=pad_token,
bos_token=bos_token,
@@ -86,31 +98,17 @@ def __init__(
**kwargs,
)
- self._utf_vocab_size = 2**8 # utf is 8 bits
-
- # define special tokens dict
- self.special_tokens_encoder: Dict[str, int] = {
- self.pad_token: 0,
- self.bos_token: 1,
- self.eos_token: 2,
- self.mask_token: 3,
- self.cls_token: 4,
- self.sep_token: 5,
- }
- self._num_special_tokens = len(self.special_tokens_encoder)
- self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}
-
def get_vocab(self) -> Dict[str, int]:
- vocab = self.special_tokens_encoder.copy()
- vocab.update(self.added_tokens_encoder)
+ vocab = {}
for i in range(self._utf_vocab_size):
token = chr(i)
- vocab[token] = i + len(self.special_tokens_encoder)
+ vocab[token] = i + self._num_special_tokens
+ vocab.update(self.added_tokens_encoder)
return vocab
@property
def vocab_size(self):
- return self._utf_vocab_size + self._num_special_tokens
+ return self._utf_vocab_size
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
@@ -171,11 +169,7 @@ def _tokenize(self, text: str) -> List[str]:
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
- if token in self.special_tokens_encoder:
- token_id = self.special_tokens_encoder[token]
- elif token in self.added_tokens_encoder:
- token_id = self.added_tokens_encoder[token]
- elif len(token) != 1:
+ if len(token) != 1:
token_id = self.unk_token_id
else:
token_id = ord(token) + self._num_special_tokens
@@ -183,26 +177,16 @@ def _convert_token_to_id(self, token):
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
- if index in self.special_tokens_decoder:
- token = self.special_tokens_decoder[index]
- elif index in self.added_tokens_decoder:
- token = self.added_tokens_decoder[index]
- else:
- token = chr(index - self._num_special_tokens)
+ token = chr(index - self._num_special_tokens)
return token
+ # TODO @ArthurZ refactor this as well....
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
bstring = b""
for token in tokens:
- if token in self.special_tokens_decoder:
- tok_string = self.special_tokens_decoder[token].encode("utf-8")
- elif token in self.added_tokens_decoder:
- tok_string = self.special_tokens_decoder[token].encode("utf-8")
- elif token in self.special_tokens_encoder:
- tok_string = token.encode("utf-8")
- elif token in self.added_tokens_encoder:
- tok_string = token.encode("utf-8")
+ if token in self.added_tokens_encoder:
+ tok_string = str(token).encode("utf-8")
else:
tok_string = bytes([ord(token)])
bstring += tok_string
diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py
index 534a71d5038ed3..efa7e2469478fb 100644
--- a/src/transformers/models/phobert/tokenization_phobert.py
+++ b/src/transformers/models/phobert/tokenization_phobert.py
@@ -131,25 +131,14 @@ def __init__(
mask_token="",
**kwargs,
):
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- **kwargs,
- )
-
self.vocab_file = vocab_file
self.merges_file = merges_file
self.encoder = {}
- self.encoder[self.bos_token] = 0
- self.encoder[self.pad_token] = 1
- self.encoder[self.eos_token] = 2
- self.encoder[self.unk_token] = 3
+ self.encoder[bos_token] = 0
+ self.encoder[pad_token] = 1
+ self.encoder[eos_token] = 2
+ self.encoder[unk_token] = 3
self.add_from_file(vocab_file)
@@ -158,9 +147,21 @@ def __init__(
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:-1]) for merge in merges]
+
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ **kwargs,
+ )
+
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py
index bf47538eaabdaf..e50849b51d2d59 100644
--- a/src/transformers/models/plbart/tokenization_plbart.py
+++ b/src/transformers/models/plbart/tokenization_plbart.py
@@ -195,23 +195,6 @@ def __init__(
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- language_codes=language_codes,
- tokenizer_file=tokenizer_file,
- src_lang=src_lang,
- tgt_lang=tgt_lang,
- additional_special_tokens=additional_special_tokens,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
src_lang = self._convert_lang_code_special_format(src_lang)
tgt_lang = self._convert_lang_code_special_format(tgt_lang)
@@ -245,12 +228,12 @@ def __init__(
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
- self._additional_special_tokens = list(self.lang_code_to_id.keys())
+ _additional_special_tokens = list(self.lang_code_to_id.keys())
if additional_special_tokens is not None:
# Only add those special tokens if they are not already there.
- self._additional_special_tokens.extend(
- [t for t in additional_special_tokens if t not in self._additional_special_tokens]
+ _additional_special_tokens.extend(
+ [t for t in additional_special_tokens if t not in _additional_special_tokens]
)
if self.language_codes == "base":
@@ -262,6 +245,23 @@ def __init__(
self._src_lang = src_lang if src_lang is not None else "__en_XX__"
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ language_codes=language_codes,
+ tokenizer_file=tokenizer_file,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ additional_special_tokens=_additional_special_tokens,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
self.tgt_lang = tgt_lang
self.set_src_lang_special_tokens(self._src_lang)
diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py
index ea1c042195ea2d..0d25dcdfc7d57b 100644
--- a/src/transformers/models/pop2piano/tokenization_pop2piano.py
+++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py
@@ -101,14 +101,6 @@ def __init__(
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
- super().__init__(
- unk_token=unk_token,
- eos_token=eos_token,
- pad_token=pad_token,
- bos_token=bos_token,
- **kwargs,
- )
-
self.default_velocity = default_velocity
self.num_bars = num_bars
@@ -119,6 +111,14 @@ def __init__(
# create mappings for encoder
self.decoder = {v: k for k, v in self.encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ bos_token=bos_token,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
"""Returns the vocabulary size of the tokenizer."""
diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py
index 03e9083e749e2f..bb4fa5ff9ca49f 100644
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -354,21 +354,6 @@ def __init__(
strip_accents: Optional[bool] = None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- x_sep_token=x_sep_token,
- pad_token=pad_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
- self.unique_no_split_tokens.append(x_sep_token)
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -384,7 +369,21 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ x_sep_token=x_sep_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def vocab_size(self):
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index a6c09f1b97f5b8..bf6b63277488b9 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -157,20 +157,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -186,7 +172,20 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py
index 8796c8149c8ae6..255e153c0d79e1 100644
--- a/src/transformers/models/reformer/tokenization_reformer.py
+++ b/src/transformers/models/reformer/tokenization_reformer.py
@@ -106,6 +106,10 @@ def __init__(
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
@@ -114,10 +118,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return self.sp_model.get_piece_size()
diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py
index 65e6c1df728f7c..c1f12527ef5974 100644
--- a/src/transformers/models/rembert/tokenization_rembert.py
+++ b/src/transformers/models/rembert/tokenization_rembert.py
@@ -111,6 +111,13 @@ def __init__(
mask_token="[MASK]",
**kwargs,
):
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor()
+ self.sp_model.Load(vocab_file)
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -125,14 +132,6 @@ def __init__(
**kwargs,
)
- self.do_lower_case = do_lower_case
- self.remove_space = remove_space
- self.keep_accents = keep_accents
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor()
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return len(self.sp_model)
diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py
index 24b9748c3d37fe..b7b3c75be180cd 100644
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -203,28 +203,21 @@ def __init__(
**kwargs,
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
- mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
- super().__init__(
- errors=errors,
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
)
+ # these special tokens are not part of the vocab.json, let's add them in the correct order
+
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -241,12 +234,27 @@ def __init__(
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+ super().__init__(
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder).copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def bpe(self, token):
if token in self.cache:
diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py
index c2c479da0964b6..05f64ac2ab185a 100644
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -177,6 +177,11 @@ def __init__(
trim_offsets=True,
**kwargs,
):
+ mask_token = (
+ AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+ if isinstance(mask_token, str)
+ else mask_token
+ )
super().__init__(
vocab_file,
merges_file,
diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py
index d665b91a0680df..0bbdc04e536ec4 100644
--- a/src/transformers/models/roc_bert/tokenization_roc_bert.py
+++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py
@@ -156,20 +156,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
for cur_file in [vocab_file, word_shape_file, word_pronunciation_file]:
if cur_file is None or not os.path.isfile(cur_file):
raise ValueError(
@@ -195,7 +181,20 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = RoCBertWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index dc406fa480eeaf..88c0f398b3006f 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -378,20 +378,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -407,7 +393,7 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
try:
import rjieba
except ImportError:
@@ -417,6 +403,20 @@ def __init__(
)
self.jieba = rjieba
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
+
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
index 843c79e397b8b7..b7104da7f1a873 100644
--- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
@@ -122,23 +122,12 @@ def __init__(
do_lower_case=False,
tgt_lang=None,
lang_codes=None,
+ additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- pad_token=pad_token,
- do_upper_case=do_upper_case,
- do_lower_case=do_lower_case,
- tgt_lang=tgt_lang,
- lang_codes=lang_codes,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
self.do_upper_case = do_upper_case
self.do_lower_case = do_lower_case
@@ -152,18 +141,39 @@ def __init__(
self.langs = LANGUAGES[lang_codes]
self.lang_tokens = [f"" for lang in self.langs]
self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"") for lang in self.langs}
-
- self._additional_special_tokens = self.lang_tokens
+ if additional_special_tokens is not None:
+ additional_special_tokens = self.lang_tokens + additional_special_tokens
+ else:
+ additional_special_tokens = self.lang_tokens
self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
self.set_tgt_lang_special_tokens(self._tgt_lang)
else:
self.lang_code_to_id = {}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ do_upper_case=do_upper_case,
+ do_lower_case=do_lower_case,
+ tgt_lang=tgt_lang,
+ lang_codes=lang_codes,
+ sp_model_kwargs=self.sp_model_kwargs,
+ additional_special_tokens=additional_special_tokens,
+ **kwargs,
+ )
+
@property
def vocab_size(self) -> int:
return len(self.encoder)
+ def get_vocab(self) -> Dict:
+ vocab = self.encoder.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
@property
def tgt_lang(self) -> str:
return self._tgt_lang
@@ -241,11 +251,6 @@ def get_special_tokens_mask(
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
- def get_vocab(self) -> Dict:
- vocab = self.encoder.copy()
- vocab.update(self.added_tokens_encoder)
- return vocab
-
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
index c021619cd04e36..e28b8a62d015bd 100644
--- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -110,15 +110,6 @@ def __init__(
merges_file=None,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- do_lower_case=do_lower_case,
- **kwargs,
- )
-
self.do_lower_case = do_lower_case
with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -137,6 +128,14 @@ def __init__(
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ do_lower_case=do_lower_case,
+ **kwargs,
+ )
@property
def vocab_size(self) -> int:
diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py
index 9748424e41699c..a9a3e3ec54a342 100644
--- a/src/transformers/models/speecht5/tokenization_speecht5.py
+++ b/src/transformers/models/speecht5/tokenization_speecht5.py
@@ -105,6 +105,12 @@ def __init__(
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.normalize = normalize
+ self._normalizer = None
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
super().__init__(
bos_token=bos_token,
@@ -116,13 +122,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
- self.normalize = normalize
- self._normalizer = None
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
normalize = kwargs.pop("normalize", self.normalize)
if is_split_into_words:
diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py
index 308680940db106..909905979be38c 100644
--- a/src/transformers/models/splinter/tokenization_splinter.py
+++ b/src/transformers/models/splinter/tokenization_splinter.py
@@ -137,20 +137,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -166,8 +152,21 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
self.question_token = question_token
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def question_token_id(self):
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py
index f061a1a53c2577..0cefa03edf3e06 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py
@@ -138,20 +138,6 @@ def __init__(
strip_accents=None,
**kwargs,
):
- super().__init__(
- do_lower_case=do_lower_case,
- do_basic_tokenize=do_basic_tokenize,
- never_split=never_split,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- **kwargs,
- )
-
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
@@ -167,7 +153,22 @@ def __init__(
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index cbc305e1302e0a..8e6f9ee8d9e1c8 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -25,6 +25,7 @@
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_base import AddedToken
if TYPE_CHECKING:
@@ -152,18 +153,37 @@ def __init__(
legacy=None,
**kwargs,
) -> None:
- # Add extra_ids to the special token list
- if extra_ids > 0 and additional_special_tokens is None:
- additional_special_tokens = [f"" for i in range(extra_ids)]
- elif extra_ids > 0 and additional_special_tokens is not None:
- # Check that we have the right number of extra_id special tokens
- extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
- if extra_tokens != extra_ids:
+ pad_token = AddedToken(pad_token, rstrip=True, lstrip=True)
+ unk_token = AddedToken(unk_token, rstrip=True, lstrip=True)
+ eos_token = AddedToken(eos_token, rstrip=True, lstrip=True)
+
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ self.vocab_file = vocab_file
+ self._extra_ids = extra_ids
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
+ if additional_special_tokens is not None:
+ extra_tokens = [x for x in additional_special_tokens if " 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
+ else:
+ extra_tokens = [f"" for i in range(extra_ids)]
+ additional_special_tokens = extra_tokens
+
+ # for legacy purpose, we keep this. Will be removed and tests updated. (when `added_tokens_decoder` is not passed as kwargs)
+ self._added_tokens_decoder = {}
+ for i in range(len(extra_tokens)):
+ self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
+ f"", single_word=True, lstrip=True, rstrip=True, special=True
+ )
+
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
@@ -175,7 +195,9 @@ def __init__(
legacy = True
self.legacy = legacy
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.sp_model = self.get_spm_processor()
+ self.vocab_file = vocab_file
+ self._extra_ids = extra_ids
super().__init__(
eos_token=eos_token,
@@ -188,11 +210,6 @@ def __init__(
**kwargs,
)
- self.vocab_file = vocab_file
- self._extra_ids = extra_ids
-
- self.sp_model = self.get_spm_processor()
-
def get_spm_processor(self):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy: # no dependency on protobuf
@@ -234,7 +251,7 @@ def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_l
@property
def vocab_size(self):
- return self.sp_model.get_piece_size() + self._extra_ids
+ return self.sp_model.get_piece_size()
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
@@ -275,7 +292,7 @@ def get_sentinel_tokens(self):
)
def get_sentinel_token_ids(self):
- return [self._convert_token_to_id(token) for token in self.get_sentinel_tokens()]
+ return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
"""Do not add eos again if user already added it."""
@@ -391,18 +408,11 @@ def _tokenize(self, text, **kwargs):
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
- if token.startswith("", token)
- num = int(match.group(1))
- return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
- if index < self.sp_model.get_piece_size():
- token = self.sp_model.IdToPiece(index)
- else:
- token = f""
+ token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index c3d35d0c87be2b..7ec1e68f21d75c 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -31,6 +31,7 @@
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
+ VERY_LARGE_INTEGER,
BatchEncoding,
EncodedInput,
PreTokenizedInput,
@@ -351,6 +352,44 @@ def __init__(
else:
additional_special_tokens = [empty_token]
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.vocab = load_vocab(vocab_file)
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+ self.do_basic_tokenize = do_basic_tokenize
+ if do_basic_tokenize:
+ self.basic_tokenizer = BasicTokenizer(
+ do_lower_case=do_lower_case,
+ never_split=never_split,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ )
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ # Additional properties
+ self.cell_trim_length = cell_trim_length
+ self.max_column_id = (
+ max_column_id
+ if max_column_id is not None
+ else model_max_length
+ if model_max_length is not None
+ else VERY_LARGE_INTEGER
+ )
+ self.max_row_id = (
+ max_row_id
+ if max_row_id is not None
+ else model_max_length
+ if model_max_length is not None
+ else VERY_LARGE_INTEGER
+ )
+ self.strip_column_names = strip_column_names
+ self.update_answer_coordinates = update_answer_coordinates
+ self.min_question_length = min_question_length
+ self.max_question_length = max_question_length
+
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
@@ -375,32 +414,6 @@ def __init__(
**kwargs,
)
- if not os.path.isfile(vocab_file):
- raise ValueError(
- f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
- " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
- )
- self.vocab = load_vocab(vocab_file)
- self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
- self.do_basic_tokenize = do_basic_tokenize
- if do_basic_tokenize:
- self.basic_tokenizer = BasicTokenizer(
- do_lower_case=do_lower_case,
- never_split=never_split,
- tokenize_chinese_chars=tokenize_chinese_chars,
- strip_accents=strip_accents,
- )
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
-
- # Additional properties
- self.cell_trim_length = cell_trim_length
- self.max_column_id = max_column_id if max_column_id is not None else self.model_max_length
- self.max_row_id = max_row_id if max_row_id is not None else self.model_max_length
- self.strip_column_names = strip_column_names
- self.update_answer_coordinates = update_answer_coordinates
- self.min_question_length = min_question_length
- self.max_question_length = max_question_length
-
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
index 0097b2a6f20d76..138afbcf93e29d 100644
--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
@@ -181,25 +181,7 @@ def __init__(
language="en",
**kwargs,
):
- super().__init__(
- special=special,
- min_freq=min_freq,
- max_size=max_size,
- lower_case=lower_case,
- delimiter=delimiter,
- vocab_file=vocab_file,
- pretrained_vocab_file=pretrained_vocab_file,
- never_split=never_split,
- unk_token=unk_token,
- eos_token=eos_token,
- additional_special_tokens=additional_special_tokens,
- language=language,
- **kwargs,
- )
requires_backends(self, "sacremoses")
-
- if never_split is None:
- never_split = self.all_special_tokens
if special is None:
special = []
self.counter = Counter()
@@ -209,7 +191,6 @@ def __init__(
self.lower_case = lower_case
self.delimiter = delimiter
self.vocab_file = vocab_file
- self.never_split = never_split
self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]")
self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
@@ -217,7 +198,8 @@ def __init__(
self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
self.moses_tokenizer = sm.MosesTokenizer(language)
self.moses_detokenizer = sm.MosesDetokenizer(language)
-
+ self.idx2sym = []
+ self.sym2idx = OrderedDict()
# This try... catch... is not beautiful but honestly this tokenizer was not made to be used
# in a library like ours, at all.
try:
@@ -241,7 +223,7 @@ def __init__(
if vocab_dict is not None:
for key, value in vocab_dict.items():
- if key not in self.__dict__:
+ if key not in self.__dict__ or key == "sym2idx":
self.__dict__[key] = value
elif vocab_file is not None:
self.build_vocab()
@@ -256,6 +238,27 @@ def __init__(
if vocab_file is not None:
self.build_vocab()
+ super().__init__(
+ special=special,
+ min_freq=min_freq,
+ max_size=max_size,
+ lower_case=lower_case,
+ delimiter=delimiter,
+ vocab_file=vocab_file,
+ pretrained_vocab_file=pretrained_vocab_file,
+ never_split=never_split,
+ unk_token=unk_token,
+ eos_token=eos_token,
+ additional_special_tokens=additional_special_tokens,
+ language=language,
+ **kwargs,
+ )
+
+ # these are not required to initialize the parent class as only used when tokenizing.
+ if never_split is None:
+ never_split = self.all_special_tokens
+ self.never_split = never_split
+
@property
def do_lower_case(self):
return self.lower_case
@@ -305,7 +308,7 @@ def _build_from_file(self, vocab_file):
elif "" in self.sym2idx:
self.unk_idx = self.sym2idx[""]
else:
- raise ValueError("No token in vocabulary")
+ raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.")
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
@@ -323,7 +326,7 @@ def build_vocab(self):
if self.vocab_file:
logger.info(f"building vocab from {self.vocab_file}")
self._build_from_file(self.vocab_file)
- logger.info(f"final vocab size {len(self)}")
+ logger.info(f"Final vocab size {len(self.sym2idx)}")
else:
logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
self.idx2sym = []
@@ -337,7 +340,7 @@ def build_vocab(self):
break
self.add_symbol(sym)
- logger.info(f"final vocab size {len(self)} from {len(self.counter)} unique tokens")
+ logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")
@torch_only_method
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
@@ -406,9 +409,8 @@ def move_added_token(self, token: str, target_idx: int):
self.sym2idx[current_sym] = idx
# Delete token from added_tokens
- old_index = self.added_tokens_encoder[token]
- del self.added_tokens_decoder[old_index]
- del self.added_tokens_encoder[token]
+ old_index = self._added_tokens_encoder.pop(token)
+ self._added_tokens_decoder.pop(old_index)
def moses_punct_norm(self, text):
return self.moses_punct_normalizer.normalize(text)
@@ -463,7 +465,7 @@ def _convert_token_to_id(self, sym):
elif "" in self.sym2idx:
return self.sym2idx[""]
else:
- raise ValueError("Token not in vocabulary and no token in vocabulary for replacement")
+ raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.")
def convert_tokens_to_string(self, tokens):
"""
@@ -482,7 +484,9 @@ def vocab_size(self):
return len(self.idx2sym)
def get_vocab(self):
- return dict(self.sym2idx, **self.added_tokens_encoder)
+ vocab = self.sym2idx.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
def _tokenize(self, line, add_eos=False, add_double_eos=False):
line = line.strip()
diff --git a/src/transformers/models/vits/tokenization_vits.py b/src/transformers/models/vits/tokenization_vits.py
index f2cc6be3e43219..0563be326cdb51 100644
--- a/src/transformers/models/vits/tokenization_vits.py
+++ b/src/transformers/models/vits/tokenization_vits.py
@@ -93,17 +93,6 @@ def __init__(
is_uroman=False,
**kwargs,
) -> None:
- super().__init__(
- pad_token=pad_token,
- unk_token=unk_token,
- language=language,
- add_blank=add_blank,
- normalize=normalize,
- phonemize=phonemize,
- is_uroman=is_uroman,
- **kwargs,
- )
-
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
@@ -115,12 +104,24 @@ def __init__(
self.is_uroman = is_uroman
+ super().__init__(
+ pad_token=pad_token,
+ unk_token=unk_token,
+ language=language,
+ add_blank=add_blank,
+ normalize=normalize,
+ phonemize=phonemize,
+ is_uroman=is_uroman,
+ **kwargs,
+ )
+
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
return vocab
def normalize_text(self, input_string):
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 472fd2d649c994..dc8b9bde7e6214 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -24,7 +24,7 @@
import numpy as np
-from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
+from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken, BatchEncoding
from ...utils import (
ModelOutput,
@@ -174,18 +174,6 @@ def __init__(
target_lang=None,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- do_lower_case=do_lower_case,
- word_delimiter_token=word_delimiter_token,
- replace_word_delimiter_char=replace_word_delimiter_char,
- target_lang=target_lang,
- **kwargs,
- )
-
self._word_delimiter_token = word_delimiter_token
self.do_lower_case = do_lower_case
@@ -204,13 +192,28 @@ def __init__(
self.decoder = {v: k for k, v in self.encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ do_lower_case=do_lower_case,
+ word_delimiter_token=word_delimiter_token,
+ replace_word_delimiter_char=replace_word_delimiter_char,
+ target_lang=target_lang,
+ **kwargs,
+ )
+
# make sure that tokens made of several
# characters are not split at tokenization
+
+ # TODO @ArthurZ add them or just update the trie?
+ unique_no_split_tokens = []
for token in self.encoder.keys():
if len(token) > 1:
- self.unique_no_split_tokens.append(token)
+ unique_no_split_tokens.append(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
- self._create_trie(self.unique_no_split_tokens)
+ self.add_tokens(unique_no_split_tokens)
def set_target_lang(self, target_lang: str):
"""
@@ -266,7 +269,20 @@ def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
- return dict(self.vocab, **self.added_tokens_encoder)
+ vocab = dict(self.encoder)
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+ # Overwritten to never strip!
+ to_add = []
+ for token in new_tokens:
+ if isinstance(token, str):
+ to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=False))
+ else:
+ to_add.append(token)
+
+ return super()._add_tokens(to_add, special_tokens)
def _tokenize(self, text, **kwargs):
"""
@@ -645,64 +661,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
return (vocab_file,)
- def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
- """
- Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
- it with indices starting from length of the current vocabulary.
-
- Args:
- new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
- Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
- checking if the tokenizer assign the index of the `unk_token` to them).
- special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the tokens should be added as special tokens.
-
- Returns:
- `int`: The number of tokens actually added to the vocabulary.
-
- Example:
-
- ```python
- # Let's see how to increase the vocabulary of Bert model and tokenizer
- tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-
- num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
- print("We have added", num_added_toks, "tokens")
- # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
- model.resize_token_embeddings(len(tokenizer))
- ```"""
- new_tokens = [str(tok) for tok in new_tokens]
-
- tokens_to_add = []
- for token in new_tokens:
- assert isinstance(token, str)
- if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
- token = token.lower()
- if (
- token != self.unk_token
- and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
- and token not in tokens_to_add
- ):
- tokens_to_add.append(token)
- if self.verbose:
- logger.info(f"Adding {token} to the vocabulary")
-
- added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
- added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
- self.added_tokens_encoder.update(added_tok_encoder)
- self.added_tokens_decoder.update(added_tok_decoder)
-
- # Make sure we don't split on any special tokens (even they were already in the vocab before)
- for token in tokens_to_add:
- if len(token) > 1:
- self._additional_special_tokens.append(AddedToken(token))
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, token)
-
- self._create_trie(self.unique_no_split_tokens)
-
- return len(tokens_to_add)
-
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
"""
@@ -777,18 +735,6 @@ def __init__(
return_attention_mask=False,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- do_lower_case=do_lower_case,
- do_normalize=do_normalize,
- return_attention_mask=return_attention_mask,
- word_delimiter_token=word_delimiter_token,
- **kwargs,
- )
-
warnings.warn(
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use"
" `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
@@ -806,6 +752,18 @@ def __init__(
self.decoder = {v: k for k, v in self.encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ do_lower_case=do_lower_case,
+ do_normalize=do_normalize,
+ return_attention_mask=return_attention_mask,
+ word_delimiter_token=word_delimiter_token,
+ **kwargs,
+ )
+
@property
def word_delimiter_token(self) -> str:
"""
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index f9a1cf631cb525..bd64dcf18d97ad 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -23,7 +23,7 @@
import numpy as np
-from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
+from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import (
ModelOutput,
@@ -143,19 +143,6 @@ def __init__(
phonemizer_backend="espeak",
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- word_delimiter_token=word_delimiter_token,
- phone_delimiter_token=phone_delimiter_token,
- do_phonemize=do_phonemize,
- phonemizer_lang=phonemizer_lang,
- phonemizer_backend=phonemizer_backend,
- **kwargs,
- )
-
self._word_delimiter_token = word_delimiter_token
self._phone_delimiter_token = phone_delimiter_token
self.do_phonemize = do_phonemize
@@ -168,13 +155,38 @@ def __init__(
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ word_delimiter_token=word_delimiter_token,
+ phone_delimiter_token=phone_delimiter_token,
+ do_phonemize=do_phonemize,
+ phonemizer_lang=phonemizer_lang,
+ phonemizer_backend=phonemizer_backend,
+ **kwargs,
+ )
@property
def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
- return dict(self.encoder, **self.added_tokens_encoder)
+ vocab = dict(self.encoder)
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+ # Overwritten to never strip!
+ to_add = []
+ for token in new_tokens:
+ if isinstance(token, str):
+ to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalize=True))
+ else:
+ to_add.append(token)
+
+ return super()._add_tokens(to_add, special_tokens)
def init_backend(self, phonemizer_lang: str):
"""
@@ -576,61 +588,3 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
return (vocab_file,)
-
- def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
- """
- Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
- it with indices starting from length of the current vocabulary.
-
- Args:
- new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
- Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
- checking if the tokenizer assign the index of the `unk_token` to them).
- special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the tokens should be added as special tokens.
-
- Returns:
- `int`: The number of tokens actually added to the vocabulary.
-
- Examples:
-
- ```python
- # Let's see how to increase the vocabulary of Bert model and tokenizer
- tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
- model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
-
- num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
- print("We have added", num_added_toks, "tokens")
- # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
- model.resize_token_embeddings(len(tokenizer))
- ```"""
- new_tokens = [str(tok) for tok in new_tokens]
-
- tokens_to_add = []
- for token in new_tokens:
- if not isinstance(token, str):
- raise ValueError(f"Token {token} has to be of type string, but is of type {type(token)}.")
- assert isinstance(token, str)
- if (
- token != self.unk_token
- and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
- and token not in tokens_to_add
- ):
- tokens_to_add.append(token)
- if self.verbose:
- logger.info(f"Adding {token} to the vocabulary")
-
- added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
- added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
- self.added_tokens_encoder.update(added_tok_encoder)
- self.added_tokens_decoder.update(added_tok_decoder)
-
- # Make sure we don't split on any special tokens (even they were already in the vocab before)
- for token in tokens_to_add:
- if len(token) > 1:
- self._additional_special_tokens.append(AddedToken(token))
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, token)
-
- self._create_trie(self.unique_no_split_tokens)
-
- return len(tokens_to_add)
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index a22521b4e00dfb..6c3cebbe23d538 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -272,18 +272,25 @@ def __init__(
predict_timestamps=False,
**kwargs,
):
- bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
- eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
- super().__init__(
- errors=errors,
- unk_token=unk_token,
- bos_token=bos_token,
- eos_token=eos_token,
- pad_token=pad_token,
- add_prefix_space=add_prefix_space,
- **kwargs,
+ bos_token = (
+ AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(bos_token, str)
+ else bos_token
+ )
+ eos_token = (
+ AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(eos_token, str)
+ else eos_token
+ )
+ unk_token = (
+ AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(unk_token, str)
+ else unk_token
+ )
+ pad_token = (
+ AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(pad_token, str)
+ else pad_token
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -309,18 +316,28 @@ def __init__(
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.language = language
+ super().__init__(
+ errors=errors,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ **kwargs,
+ )
+
self.task = task
self.predict_timestamps = predict_timestamps
+ @property
+ def vocab_size(self) -> int:
+ return len(self.encoder)
+
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
- @property
- def vocab_size(self) -> int:
- return len(self.encoder)
-
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper
def bpe(self, token):
if token in self.cache:
@@ -390,11 +407,10 @@ def set_prefix_tokens(self, language: str = None, task: str = None, predict_time
@property
def prefix_tokens(self) -> List[int]:
- all_special_ids = self.all_special_ids
- bos_token_id = all_special_ids[-106]
- translate_token_id = all_special_ids[-6]
- transcribe_token_id = all_special_ids[-5]
- notimestamps_token_id = all_special_ids[-1]
+ bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
+ translate_token_id = self.convert_tokens_to_ids("<|translate|>")
+ transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
+ notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
langs = tuple(LANGUAGES.keys())
if self.language is not None:
diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py
index cb321f669c7ca6..c85b945685fa2f 100644
--- a/src/transformers/models/whisper/tokenization_whisper_fast.py
+++ b/src/transformers/models/whisper/tokenization_whisper_fast.py
@@ -19,7 +19,7 @@
from typing import List, Optional, Tuple
import numpy as np
-from tokenizers import pre_tokenizers, processors
+from tokenizers import AddedToken, pre_tokenizers, processors
from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -148,6 +148,22 @@ def __init__(
predict_timestamps=False,
**kwargs,
):
+ bos_token = (
+ AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(bos_token, str)
+ else bos_token
+ )
+ eos_token = (
+ AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(eos_token, str)
+ else eos_token
+ )
+ unk_token = (
+ AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
+ if isinstance(unk_token, str)
+ else unk_token
+ )
+
super().__init__(
vocab_file,
merges_file,
@@ -444,11 +460,10 @@ def set_prefix_tokens(self, language: str = None, task: str = None, predict_time
@property
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens
def prefix_tokens(self) -> List[int]:
- all_special_ids = self.all_special_ids
- bos_token_id = all_special_ids[-106]
- translate_token_id = all_special_ids[-6]
- transcribe_token_id = all_special_ids[-5]
- notimestamps_token_id = all_special_ids[-1]
+ bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
+ translate_token_id = self.convert_tokens_to_ids("<|translate|>")
+ transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
+ notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
langs = tuple(LANGUAGES.keys())
if self.language is not None:
diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py
index f27c827134bf37..9dd0144eafae5a 100644
--- a/src/transformers/models/xglm/tokenization_xglm.py
+++ b/src/transformers/models/xglm/tokenization_xglm.py
@@ -137,17 +137,6 @@ def __init__(
word for word in madeup_words if word not in kwargs["additional_special_tokens"]
]
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -170,6 +159,17 @@ def __init__(
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py
index 5cab4fc9967937..c0ffdae1194816 100644
--- a/src/transformers/models/xlm/tokenization_xlm.py
+++ b/src/transformers/models/xlm/tokenization_xlm.py
@@ -613,20 +613,6 @@ def __init__(
do_lowercase_and_remove_accent=True,
**kwargs,
):
- super().__init__(
- unk_token=unk_token,
- bos_token=bos_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- additional_special_tokens=additional_special_tokens,
- lang2id=lang2id,
- id2lang=id2lang,
- do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
- **kwargs,
- )
-
try:
import sacremoses
except ImportError:
@@ -660,6 +646,19 @@ def __init__(
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ additional_special_tokens=additional_special_tokens,
+ lang2id=lang2id,
+ id2lang=id2lang,
+ do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
+ **kwargs,
+ )
@property
def do_lower_case(self):
diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
index 6a4c1b9c0b6707..9cc1ae5ca08f4d 100644
--- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -145,18 +145,6 @@ def __init__(
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- sep_token=sep_token,
- unk_token=unk_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
try:
import sentencepiece as spm
except ImportError:
@@ -186,8 +174,20 @@ def __init__(
# The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
self.fairseq_offset = 12
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
- for k in self.fairseq_tokens_to_ids.keys():
- self.unique_no_split_tokens.append(k)
+
+ # TODO ArthurZ fairseq_ids_to_tokens should be removed
+
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
@property
def can_save_slow_tokenizer(self) -> bool:
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index 54a46842ff156d..299f4268e56674 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -152,18 +152,6 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
- super().__init__(
- bos_token=bos_token,
- eos_token=eos_token,
- unk_token=unk_token,
- sep_token=sep_token,
- cls_token=cls_token,
- pad_token=pad_token,
- mask_token=mask_token,
- sp_model_kwargs=self.sp_model_kwargs,
- **kwargs,
- )
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
@@ -183,6 +171,18 @@ def __init__(
self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
@@ -288,6 +288,7 @@ def get_vocab(self):
return vocab
def _tokenize(self, text: str) -> List[str]:
+ # TODO check if the t5/llama PR also applies here
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index ec72df8c8b71c4..0481fec346d437 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -152,6 +152,14 @@ def __init__(
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.do_lower_case = do_lower_case
+ self.remove_space = remove_space
+ self.keep_accents = keep_accents
+ self.vocab_file = vocab_file
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
@@ -170,14 +178,6 @@ def __init__(
self._pad_token_type_id = 3
- self.do_lower_case = do_lower_case
- self.remove_space = remove_space
- self.keep_accents = keep_accents
- self.vocab_file = vocab_file
-
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
- self.sp_model.Load(vocab_file)
-
@property
def vocab_size(self):
return len(self.sp_model)
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e26c0c6d52898e..c9d0afecf40945 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -57,6 +57,7 @@ class Trie:
def __init__(self):
self.data = {}
+ self._tokens = set()
def add(self, word: str):
"""
@@ -81,6 +82,8 @@ def add(self, word: str):
if not word:
# Prevent empty string
return
+
+ self._tokens.add(word)
ref = self.data
for char in word:
ref[char] = char in ref and ref[char] or {}
@@ -344,17 +347,48 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
"""
def __init__(self, **kwargs):
+ # 1. Init the parent class
super().__init__(**kwargs)
-
- # Added tokens - We store this for both slow and fast tokenizers
- # until the serialization of Fast tokenizers is updated
- self.added_tokens_encoder: Dict[str, int] = {}
- self.added_tokens_decoder: Dict[int, str] = {}
- self.unique_no_split_tokens: List[str] = []
self.tokens_trie = Trie()
+ # 2. init `_added_tokens_decoder` if child class did not
+ if not hasattr(self, "_added_tokens_decoder"):
+ self._added_tokens_decoder: Dict[int, AddedToken] = {}
+ # 3. if a `added_tokens_decoder` is passed, we are loading from a saved tokenizer, we overwrite
+ if "added_tokens_decoder" in kwargs:
+ # overwriting the class's added_tokens_decoder. This is the source of truth!
+ self._added_tokens_decoder.update(kwargs.get("added_tokens_decoder"))
+
+ self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
+
+ # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
+ # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
+ self._add_tokens(self.all_special_tokens_extended, special_tokens=True)
+
self._decode_use_source_tokenizer = False
+ @property
+ def added_tokens_decoder(self) -> Dict[int, AddedToken]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
+
+ Returns:
+ `Dict[str, int]`: The added tokens.
+ """
+ return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))
+
+ @added_tokens_decoder.setter
+ def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
+ # Always raise an error if string because users should define the behavior
+ for index, token in value.items():
+ if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
+ raise ValueError(
+ f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}"
+ )
+
+ self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
+ self._added_tokens_encoder[str(token)] = index
+
@property
def is_fast(self) -> bool:
return False
@@ -368,28 +402,34 @@ def vocab_size(self) -> int:
def get_added_vocab(self) -> Dict[str, int]:
"""
- Returns the added tokens in the vocabulary as a dictionary of token to index.
+ Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
+ the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
+ something we should change.
Returns:
`Dict[str, int]`: The added tokens.
"""
- return self.added_tokens_encoder
+ return self._added_tokens_encoder
def __len__(self):
"""
- Size of the full vocabulary with the added tokens.
+ Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise if
+ there is a hole in the vocab, we will add tokenizers at a wrong index.
"""
- return self.vocab_size + len(self.added_tokens_encoder)
+ return len(set(self.get_vocab().keys()))
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
- it with indices starting from length of the current vocabulary.
+ it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
+ vocab which is why they have to be handled specifically.
Args:
new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
- Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
- checking if the tokenizer assign the index of the `unk_token` to them).
+ Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
+ (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
+ of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
+ stripping and normalization of this token. This is NOT possible in `tokenizers`.
special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the tokens should be added as special tokens.
@@ -408,52 +448,52 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
- new_tokens = [str(tok) for tok in new_tokens]
-
- tokens_to_add = []
+ added_tokens = 0
+ if new_tokens is None:
+ return added_tokens
+ current_vocab = self.get_vocab().copy()
+ new_idx = len(current_vocab) # only call this once, len gives the last index + 1
for token in new_tokens:
- if not isinstance(token, str):
+ if not isinstance(token, (str, AddedToken)):
raise TypeError(f"Token {token} is not a string but a {type(token)}.")
- if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
- token = token.lower()
- if (
- token != self.unk_token
- and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
- and token not in tokens_to_add
- ):
- tokens_to_add.append(token)
- if self.verbose:
- logger.info(f"Adding {token} to the vocabulary")
-
- added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
- added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
- self.added_tokens_encoder.update(added_tok_encoder)
- self.added_tokens_decoder.update(added_tok_decoder)
-
- # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
- if special_tokens:
- if len(new_tokens) == 1:
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
- else:
- self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
- else:
- # Or on the newly added tokens
- if len(tokens_to_add) == 1:
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
+ if str(token) == "":
+ continue
+ if isinstance(token, str):
+ # for legacy AddedTokens strip left and right by default
+ # TODO this will be remove to have the same default behavior as rust
+ token = AddedToken(token, normalized=not special_tokens, rstrip=True, lstrip=True)
+ if special_tokens:
+ token.special = True
+ if token in self._added_tokens_decoder:
+ continue
+ if not token.special and token.normalized and hasattr(self, "do_lower_case") and self.do_lower_case:
+ # Normalize if requested
+ token.content = token.content.lower()
+ if token.content not in current_vocab:
+ token_index = new_idx + added_tokens
+ current_vocab[token.content] = token_index
+ added_tokens += 1
else:
- self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
- self._create_trie(self.unique_no_split_tokens)
-
- return len(tokens_to_add)
-
- def _create_trie(self, unique_no_split_tokens):
- trie = Trie()
+ token_index = current_vocab[token.content]
+
+ if token.special and str(token) not in self.all_special_tokens:
+ self._additional_special_tokens.append(token)
+ # the setter automatically updates the reverse map
+ self._added_tokens_decoder[token_index] = token
+ self._added_tokens_encoder[token.content] = token_index
+ if self.verbose:
+ logger.info(f"Adding {token} to the vocabulary")
+
+ self._update_trie()
+ return added_tokens
+
+ def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
+ for token in self._added_tokens_decoder.values():
+ if token not in self.tokens_trie._tokens:
+ self.tokens_trie.add(token.content)
for token in unique_no_split_tokens:
- if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens:
- trie.add(token.lower())
- else:
- trie.add(token)
- self.tokens_trie = trie
+ if token not in self.tokens_trie._tokens:
+ self.tokens_trie.add(token)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
"""
@@ -494,10 +534,6 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
Returns:
`List[str]`: The list of tokens.
"""
- # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
- all_special_tokens_extended = {
- str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
- }
split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
@@ -505,27 +541,29 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
if kwargs:
logger.warning(f"Keyword arguments {kwargs} not recognized.")
- # TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
- escaped_special_toks = [
- re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+ escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
+ escaped_special_toks += [
+ re.escape(s_tok.content)
+ for s_tok in (self._added_tokens_decoder.values())
+ if not s_tok.special and s_tok.normalized
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
- # split_special_tokens: empty `no_split_token`
if split_special_tokens:
no_split_token = []
tokens = [text]
else:
- no_split_token = set(self.unique_no_split_tokens)
+ no_split_token = set(self._added_tokens_encoder.keys()) # don't split on any of the added tokens
+ # "This is something else"
tokens = self.tokens_trie.split(text)
# ["This is something", "", " else"]
for i, token in enumerate(tokens):
if token in no_split_token:
- tok_extended = all_special_tokens_extended.get(token, None)
+ tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
left = tokens[i - 1] if i > 0 else None
right = tokens[i + 1] if i < len(tokens) - 1 else None
if isinstance(tok_extended, AddedToken):
@@ -536,12 +574,18 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
# Strip white spaces on the left
if tok_extended.lstrip and left:
tokens[i - 1] = left.rstrip() # Opposite here
+ if tok_extended.single_word and left and left[-1] != " ":
+ tokens[i - 1] += token
+ tokens[i] = ""
+ elif tok_extended.single_word and right and right[0] != " ":
+ tokens[i + 1] = token + tokens[i + 1]
+ tokens[i] = ""
+
else:
- # We strip left and right by default
- if right:
- tokens[i + 1] = right.lstrip()
- if left:
- tokens[i - 1] = left.rstrip()
+ raise ValueError(
+ f"{tok_extended} cannot be tokenized because it was not properly added"
+ f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
+ )
# ["This is something", "", "else"]
tokenized_text = []
for token in tokens:
@@ -590,8 +634,8 @@ def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
- if token in self.added_tokens_encoder:
- return self.added_tokens_encoder[token]
+ if token in self._added_tokens_encoder:
+ return self._added_tokens_encoder[token]
return self._convert_token_to_id(token)
def _convert_token_to_id(self, token):
@@ -904,8 +948,8 @@ def convert_ids_to_tokens(
`str` or `List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
- if ids in self.added_tokens_decoder:
- return self.added_tokens_decoder[ids]
+ if ids in self._added_tokens_decoder:
+ return self._added_tokens_decoder[ids].content
else:
return self._convert_id_to_token(ids)
tokens = []
@@ -913,8 +957,8 @@ def convert_ids_to_tokens(
index = int(index)
if skip_special_tokens and index in self.all_special_ids:
continue
- if index in self.added_tokens_decoder:
- tokens.append(self.added_tokens_decoder[index])
+ if index in self._added_tokens_decoder:
+ tokens.append(self._added_tokens_decoder[index].content)
else:
tokens.append(self._convert_id_to_token(index))
return tokens
@@ -935,19 +979,29 @@ def _decode(
) -> str:
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+ if spaces_between_special_tokens:
+ logger.warning_once(
+ "spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, "
+ "and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule."
+ )
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
+ legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
+ token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
+ }
# To avoid mixing byte-level and unicode for byte-level BPT
# we need to build string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/transformers/issues/1133
sub_texts = []
current_sub_text = []
+ # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
- if token in self.added_tokens_encoder:
+ if token in legacy_added_tokens:
if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+ string = self.convert_tokens_to_string(current_sub_text)
+ if len(string) > 0:
+ sub_texts.append(string)
current_sub_text = []
sub_texts.append(token)
else:
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a65f799a724b13..b936adc36bb6da 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -23,10 +23,10 @@
import os
import re
import warnings
-from collections import OrderedDict, UserDict
+from collections import UserDict
from collections.abc import Mapping, Sized
from contextlib import contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
@@ -78,18 +78,25 @@
from tokenizers import Encoding as EncodingFast
else:
- @dataclass(frozen=True, eq=True)
+ @dataclass(frozen=False, eq=True)
class AddedToken:
"""
AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the
way it should behave.
+
+ The `normalized` will default to `not special` if it is not specified, similarly to the definition in
+ `tokenizers`.
"""
- content: str = field(default_factory=str)
- single_word: bool = False
- lstrip: bool = False
- rstrip: bool = False
- normalized: bool = True
+ def __init__(
+ self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None
+ ):
+ self.content = content
+ self.single_word = single_word
+ self.lstrip = lstrip
+ self.rstrip = rstrip
+ self.special = special
+ self.normalized = normalized if normalized is not None else not special
def __getstate__(self):
return self.__dict__
@@ -806,7 +813,8 @@ class SpecialTokensMixin:
A special token representing a masked token (used by masked-language modeling pretraining objectives, like
BERT).
additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
- A tuple or a list of additional special tokens.
+ A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+ skipped when decoding if `skip_special_tokens` is set to `True`.
"""
SPECIAL_TOKENS_ATTRIBUTES = [
@@ -845,21 +853,20 @@ def __init__(self, verbose=True, **kwargs):
isinstance(t, (str, AddedToken)) for t in value
), "One of the tokens is not a string or an AddedToken"
setattr(self, key, value)
- elif isinstance(value, (str, AddedToken)):
+ elif isinstance(value, (str)):
+ value = AddedToken(value, normalized=False, special=True)
+ setattr(self, key, value)
+ elif isinstance(value, AddedToken):
setattr(self, key, value)
else:
- raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")
+ raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
def sanitize_special_tokens(self) -> int:
"""
- Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
- `tokenizer.cls_token`, etc.) are in the vocabulary.
-
- Add the missing ones to the vocabulary if needed.
-
- Return:
- `int`: The number of tokens added in the vocabulary during the operation.
+ The `sanitize_special_tokens` is now deprecated kept for backward compatibility and will be removed in
+ transformers v5.
"""
+ logger.warning_once("The `sanitize_special_tokens` will be removed in transformers v5.")
return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
def add_special_tokens(
@@ -870,14 +877,15 @@ def add_special_tokens(
special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
current vocabulary).
- Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
- matrix of the model so that its embedding matrix matches the tokenizer.
+ When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
+ model so that its embedding matrix matches the tokenizer.
In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
Using `add_special_tokens` will ensure your special tokens can be used in several ways:
- - Special tokens are carefully handled by the tokenizer (they are never split).
+ - Special tokens can be skipped when decoding using `skip_special_tokens = True`.
+ - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`.
- You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
makes it easy to develop model-agnostic training and fine-tuning scripts.
@@ -893,10 +901,12 @@ def add_special_tokens(
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
assign the index of the `unk_token` to them).
replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`):
- If `True`, the existing list of additional special tokens will be replaced by the one specified in
- `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is updated. In the former case, the
- tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged as
- non-special tokens.
+ If `True`, the existing list of additional special tokens will be replaced by the list provided in
+ `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is just extended. In the former
+ case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
+ as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
+ `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
+ `additional_special_tokens` are still added tokens, and will not be split by the model.
Returns:
`int`: Number of tokens added to the vocabulary.
@@ -920,7 +930,7 @@ def add_special_tokens(
if not special_tokens_dict:
return 0
- added_tokens = 0
+ added_tokens = []
for key, value in special_tokens_dict.items():
assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
@@ -932,28 +942,32 @@ def add_special_tokens(
isinstance(t, (str, AddedToken)) for t in value
), f"Tokens {value} for key {key} should all be str or AddedToken instances"
+ to_add = set()
+ for token in value:
+ if isinstance(token, str):
+ # for legacy purpose we default to stripping. `test_add_tokens_tokenizer` depends on this
+ token = AddedToken(token, normalized=False, rstrip=True, lstrip=True)
+ if str(token) not in self.additional_special_tokens:
+ to_add.add(token)
if replace_additional_special_tokens:
- setattr(self, key, value)
+ setattr(self, key, list(to_add))
else:
- # This is a copy of `self._additional_special_tokens`
- additional_special_tokens = getattr(self, key)
- additional_special_tokens_set = set(additional_special_tokens)
- to_add = []
- for token in value:
- if str(token) not in additional_special_tokens_set and str(token) not in to_add:
- to_add.append(token)
- # update the property
- additional_special_tokens.extend(to_add)
- self.additional_special_tokens = additional_special_tokens
-
- added_tokens += self.add_tokens(value, special_tokens=True)
+ self._additional_special_tokens.extend(to_add)
+ added_tokens += to_add
+
else:
- assert isinstance(
- value, (str, AddedToken)
- ), f"Token {value} for key {key} should be a str or an AddedToken instance"
- setattr(self, key, value)
- added_tokens += self.add_tokens([value], special_tokens=True)
+ if not isinstance(value, (str, AddedToken)):
+ raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance")
+ if isinstance(value, (str)):
+ # for legacy purpose we default to stripping. `test_add_tokens_tokenizer` depends on this
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True)
+ if isinstance(value, AddedToken):
+ setattr(self, key, value)
+ if value not in added_tokens:
+ added_tokens.append(value)
+ # if we are adding tokens that were not part of the vocab, we ought to add them
+ added_tokens = self.add_tokens(added_tokens, special_tokens=True)
return added_tokens
def add_tokens(
@@ -1102,35 +1116,74 @@ def additional_special_tokens(self) -> List[str]:
@bos_token.setter
def bos_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the BOS token")
self._bos_token = value
@eos_token.setter
def eos_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the EOS token")
self._eos_token = value
@unk_token.setter
def unk_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the UNK token")
self._unk_token = value
@sep_token.setter
def sep_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the SEP token")
self._sep_token = value
@pad_token.setter
def pad_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the PAD token")
self._pad_token = value
@cls_token.setter
def cls_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the CLS token")
self._cls_token = value
@mask_token.setter
def mask_token(self, value):
+ if isinstance(value, str) and value != "":
+ value = AddedToken(value, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(value, AddedToken) and value is not None:
+ raise ValueError("Cannot set a non-string value as the MASK token")
self._mask_token = value
@additional_special_tokens.setter
def additional_special_tokens(self, value):
- self._additional_special_tokens = value
+ if value is None:
+ self._additional_special_tokens = value
+ return
+ if self._additional_special_tokens is None:
+ self._additional_special_tokens = []
+ # We store the `AddedToken` to allow adding tokens via `tokenizer.add_special_tokens`
+ for token in value:
+ if isinstance(token, str) and token != "":
+ token = AddedToken(token, normalized=False, rstrip=True, lstrip=True, special=True)
+ elif not isinstance(token, AddedToken):
+ raise ValueError(f"Cannot add instance of type {type(value)} to additional_special_tokens!")
+ self._additional_special_tokens.append(token)
@property
def bos_token_id(self) -> Optional[int]:
@@ -1259,13 +1312,9 @@ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
- attr_value = getattr(self, "_" + attr)
+ attr_value = getattr(self, attr)
if attr_value:
- set_attr[attr] = (
- type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value)
- if isinstance(attr_value, (list, tuple))
- else str(attr_value)
- )
+ set_attr[attr] = attr_value
return set_attr
@property
@@ -1285,29 +1334,34 @@ def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[U
return set_attr
@property
- def all_special_tokens(self) -> List[str]:
+ def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
"""
- `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes.
+ `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.), the order has
+ nothing to do with the index of each tokens. If you want to know the correct indices, check
+ `self.added_tokens_encoder`. We can't create an order anymore as the keys are `AddedTokens` and not `Strings`.
- Convert tokens of `tokenizers.AddedToken` type to string.
+ Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
+ special tokens are tokenized.
"""
- all_toks = [str(s) for s in self.all_special_tokens_extended]
- return all_toks
+ all_tokens = []
+ seen = set()
+ for value in self.special_tokens_map_extended.values():
+ if isinstance(value, (list, tuple)):
+ tokens_to_add = [token for token in value if str(token) not in seen]
+ else:
+ tokens_to_add = [value] if str(value) not in seen else []
+ seen.update(map(str, tokens_to_add))
+ all_tokens.extend(tokens_to_add)
+ return all_tokens
@property
- def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
+ def all_special_tokens(self) -> List[str]:
"""
- `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.) mapped to class
- attributes.
+ `List[str]`: A list of the unique special tokens (`''`, `''`, ..., etc.).
- Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
- special tokens are tokenized.
+ Convert tokens of `tokenizers.AddedToken` type to string.
"""
- all_toks = []
- set_attr = self.special_tokens_map_extended
- for attr_value in set_attr.values():
- all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
- all_toks = list(OrderedDict.fromkeys(all_toks))
+ all_toks = [str(s) for s in self.all_special_tokens_extended]
return all_toks
@property
@@ -1322,7 +1376,10 @@ def all_special_ids(self) -> List[int]:
ENCODE_KWARGS_DOCSTRING = r"""
add_special_tokens (`bool`, *optional*, defaults to `True`):
- Whether or not to encode the sequences with the special tokens relative to their model.
+ Whether or not to add special tokens when encoding the sequences. This will use the underlying
+ `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
+ automatically added to the input ids. This is usefull if you want to add `bos` or `eos` tokens
+ automatically.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
@@ -1492,9 +1549,9 @@ def all_special_ids(self) -> List[int]:
A special token representing a masked token (used by masked-language modeling pretraining objectives, like
BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
- A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
- tokenization process. Will be associated to `self.additional_special_tokens` and
- `self.additional_special_tokens_ids`.
+ A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with
+ `skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end
+ of the vocabulary.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not the model should cleanup the spaces that were added when splitting the input text during the
tokenization process.
@@ -1614,12 +1671,26 @@ def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class
+ @property
+ def added_tokens_encoder(self) -> Dict[str, int]:
+ """
+ Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
+ optimisation in `self._added_tokens_encoder` for the slow tokenizers.
+ """
+ return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}
+
+ @property
+ def added_tokens_decoder(self) -> Dict[int, AddedToken]:
+ raise NotImplementedError()
+
def __repr__(self) -> str:
+ added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()])
return (
f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
- f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})"
+ f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
+ " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
)
def __len__(self) -> int:
@@ -1878,12 +1949,13 @@ def from_pretrained(
else:
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
additional_files_names = {
- "added_tokens_file": ADDED_TOKENS_FILE,
- "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
+ "added_tokens_file": ADDED_TOKENS_FILE, # kept only for legacy
+ "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, # kept only for legacy
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
+ # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
+ "tokenizer_file": FULL_TOKENIZER_FILE,
}
vocab_files = {**cls.vocab_files_names, **additional_files_names}
-
if "tokenizer_file" in vocab_files:
# Try to get the tokenizer config to see if there are versioned tokenizer files.
fast_tokenizer_file = FULL_TOKENIZER_FILE
@@ -2019,6 +2091,8 @@ def _from_pretrained(
# First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
config_tokenizer_class = init_kwargs.get("tokenizer_class")
init_kwargs.pop("tokenizer_class", None)
+ if not has_tokenizer_file:
+ init_kwargs.pop("tokenizer_file", None)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
@@ -2084,19 +2158,6 @@ def _from_pretrained(
# Update with newly provided kwargs
init_kwargs.update(kwargs)
- # Convert AddedTokens serialized as dict to class instances
- def convert_added_tokens(obj: Union[AddedToken, Any]):
- if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
- obj.pop("__type")
- return AddedToken(**obj)
- elif isinstance(obj, (list, tuple)):
- return [convert_added_tokens(o) for o in obj]
- elif isinstance(obj, dict):
- return {k: convert_added_tokens(v) for k, v in obj.items()}
- return obj
-
- init_kwargs = convert_added_tokens(init_kwargs)
-
# Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer
@@ -2116,16 +2177,75 @@ def convert_added_tokens(obj: Union[AddedToken, Any]):
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
+ special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
-
init_kwargs["name_or_path"] = pretrained_model_name_or_path
- # Instantiate tokenizer.
+ additional_special_tokens = init_kwargs.pop("additional_special_tokens", None) or []
+ added_tokens_decoder = {}
+ legacy_saved = "added_tokens_decoder" not in init_kwargs
+ if not legacy_saved:
+ for idx, token in init_kwargs["added_tokens_decoder"].items():
+ if isinstance(token, dict):
+ token = AddedToken(**token)
+
+ if isinstance(token, AddedToken):
+ added_tokens_decoder[int(idx)] = token
+ else:
+ raise ValueError(
+ f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
+ )
+ else:
+ logger.warning_once(
+ "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
+ " it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
+ " You will see the new `added_tokens_decoder` attribute that will store the relevant information."
+ )
+
+ # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
+ if special_tokens_map_file is not None:
+ with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
+ special_tokens_map = json.load(special_tokens_map_handle)
+ for key, value in special_tokens_map.items():
+ if key in kwargs and kwargs[key]:
+ # This value has already been redefined by the kwargs
+ # We keep this new value and ignore the one stored in the special_tokens_map_file
+ continue
+ if isinstance(value, dict):
+ value = AddedToken(**value)
+ elif key == "additional_special_tokens" and isinstance(value, list):
+ for token in value:
+ token = AddedToken(**token) if isinstance(token, dict) else token
+ if token not in additional_special_tokens:
+ additional_special_tokens.append(token)
+ else:
+ init_kwargs[key] = value
+ # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
+ if added_tokens_file is not None:
+ with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
+ added_tok_encoder = json.load(added_tokens_handle)
+ # legacy: we have to init with (rstrip=True, lstrip=True)
+ added_tokens_decoder = {
+ index: AddedToken(token, rstrip=True, lstrip=True) for token, index in added_tok_encoder.items()
+ }
+ # end legacy
+
+ # slow -> fast, non-legacy: we need to make sure the `added_tokens_decoder` is used to add tokens if the `fast` was not properly saved!
+ # thus we delay adding special tokens in the init using `slow_to_fast` flag.
+ if added_tokens_decoder is not {} and "Fast" in cls.__name__:
+ init_kwargs["slow_to_fast"] = True
+ if len(additional_special_tokens) > 0:
+ init_kwargs["additional_special_tokens"] = additional_special_tokens
+ init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+
+ # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens
+ init_kwargs = cls.convert_added_tokens(init_kwargs, False)
+ # Instantiate the tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
except OSError:
@@ -2134,79 +2254,43 @@ def convert_added_tokens(obj: Union[AddedToken, Any]):
"Please check that the provided vocabulary is accessible and not corrupted."
)
- # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
- # Removed: Now done at the base class level
- # tokenizer.init_inputs = init_inputs
- # tokenizer.init_kwargs = init_kwargs
-
- # If there is a complementary special token map, load it
- special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
- if special_tokens_map_file is not None:
- with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
- special_tokens_map = json.load(special_tokens_map_handle)
- for key, value in special_tokens_map.items():
- if key in kwargs and kwargs[key]:
- # This value has already been redefined by the kwargs
- # We keep this new value and ignore the one stored in the special_tokens_map_file
-
- continue
-
- if isinstance(value, dict):
- value = AddedToken(**value)
- elif isinstance(value, list):
- value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
- setattr(tokenizer, key, value)
-
- # Add supplementary tokens.
- special_tokens = tokenizer.all_special_tokens
- if added_tokens_file is not None:
- with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
- added_tok_encoder = json.load(added_tokens_handle)
-
- # Sort added tokens by index
- added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])
-
- # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
- # individual tokens would repeatedly rebuild a trie, which can be slow.
- is_last_special = None
- tokens = []
-
- for token, index in added_tok_encoder_sorted:
- current_index = len(tokenizer) + len(tokens)
- if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids(token) != index:
- # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
- # index is the current length of the tokenizer (not in vocabulary)
- raise ValueError(
- f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
- f"{index}."
- )
- elif not has_tokenizer_file and index != current_index:
- # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
- # current length of the tokenizer.
- raise ValueError(
- f"Non-consecutive added token '{token}' found. "
- f"Should have index {current_index} but has index {index} in saved vocabulary."
- )
-
- is_special = bool(token in special_tokens)
- if is_last_special is None or is_last_special == is_special:
- tokens.append(token)
- else:
- tokenizer.add_tokens(tokens, special_tokens=is_last_special)
- tokens = [token]
- is_last_special = is_special
-
- if tokens:
- tokenizer.add_tokens(tokens, special_tokens=is_last_special)
+ # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
+ # if `added_tokens_decoder` not in `tokenizer_config.json` and `added_tokens.json` is `None`
+ tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)
+ if legacy_saved and "Fast" not in cls.__name__ and added_tokens_file is None and tokenizer_file is not None:
+ tokens_to_add_from_fast = []
+ with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
+ tokenizer_file_handle = json.load(tokenizer_file_handle)
+ added_tokens = tokenizer_file_handle.pop("added_tokens")
+ for serialized_tokens in added_tokens:
+ serialized_tokens.pop("id")
+ # for legacy purpose, we ignore whether or not these tokens are special.
+ serialized_tokens.pop("special")
+ tokens_to_add_from_fast.append(AddedToken(**serialized_tokens))
+ tokenizer.add_tokens(tokens_to_add_from_fast)
+
+ # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
+ # uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
+ if init_kwargs.get("slow_to_fast", False):
+ tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
+ warnings = ""
+ for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
+ if tokenizer.convert_tokens_to_ids(str(token)) != index:
+ warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
+ if len(warnings) > 1:
+ logger.warn(
+ f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
+ f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
+ f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
+ )
+ # finally we add all the special_tokens to make sure eveything is initialized
+ tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)
- # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
- added_tokens = tokenizer.sanitize_special_tokens()
- if added_tokens:
+ if len(added_tokens_decoder) > 0:
logger.warning_advice(
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
" fine-tuned or trained."
)
-
return tokenizer
@staticmethod
@@ -2217,6 +2301,21 @@ def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_l
# which we will correct in Transformers v5.
return max_model_length
+ @classmethod
+ def convert_added_tokens(cls, obj: Union[AddedToken, Any], add_type_field=True):
+ if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
+ obj.pop("__type")
+ return AddedToken(**obj)
+ if isinstance(obj, AddedToken):
+ if add_type_field:
+ obj = obj.content
+ return obj
+ elif isinstance(obj, (list, tuple)):
+ return [cls.convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
+ elif isinstance(obj, dict):
+ return {k: cls.convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
+ return obj
+
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
@@ -2295,7 +2394,7 @@ def save_pretrained(
# TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
# target_keys = self.init_kwargs.keys()
- target_keys = ["model_max_length", "clean_up_tokenization_spaces"]
+ target_keys = ["model_max_length", "clean_up_tokenization_spaces", "additional_special_tokens"]
for k in target_keys:
if hasattr(self, k):
tokenizer_config[k] = getattr(self, k)
@@ -2308,21 +2407,13 @@ def save_pretrained(
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
- # Sanitize AddedTokens
- def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
- if isinstance(obj, AddedToken):
- out = obj.__getstate__()
- if add_type_field:
- out["__type"] = "AddedToken"
- return out
- elif isinstance(obj, (list, tuple)):
- return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
- elif isinstance(obj, dict):
- return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
- return obj
-
# add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
- tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)
+ tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True)
+
+ added_tokens = {}
+ for key, value in self.added_tokens_decoder.items():
+ added_tokens[key] = value.__getstate__()
+ tokenizer_config["added_tokens_decoder"] = added_tokens
# Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
tokenizer_class = self.__class__.__name__
@@ -2351,7 +2442,9 @@ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
# Sanitize AddedTokens in special_tokens_map
- write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
+
+ # kept for forward compatibility, will be removed in transoformers 5
+ write_dict = self.convert_added_tokens(self.special_tokens_map_extended, add_type_field=True)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index ac413d29b4bc0d..45a6639e1caab8 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -96,6 +96,7 @@ def __init__(self, *args, **kwargs):
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
from_slow = kwargs.pop("from_slow", False)
+ slow_to_fast = kwargs.pop("slow_to_fast", False)
if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
raise ValueError(
@@ -154,6 +155,10 @@ def __init__(self, *args, **kwargs):
# We call this after having initialized the backend tokenizer because we update it.
super().__init__(**kwargs)
+ # We add the additional tokens that are not part of the vocab
+ if not slow_to_fast:
+ self._add_tokens(self.all_special_tokens_extended, special_tokens=True)
+
@property
def is_fast(self) -> bool:
return True
@@ -180,6 +185,16 @@ def get_vocab(self) -> Dict[str, int]:
def vocab(self) -> Dict[str, int]:
return self.get_vocab()
+ @property
+ def added_tokens_decoder(self) -> Dict[int, AddedToken]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
+
+ Returns:
+ `Dict[str, int]`: The added tokens.
+ """
+ return self._tokenizer.get_added_tokens_decoder()
+
def get_added_vocab(self) -> Dict[str, int]:
"""
Returns the added tokens in the vocabulary as a dictionary of token to index.
@@ -779,6 +794,7 @@ def train_new_from_iterator(
lstrip=special_token_full.lstrip,
rstrip=special_token_full.rstrip,
normalized=special_token_full.normalized,
+ special=True,
)
else:
kwargs[token] = special_token
diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py
index 5607d1d3d2e113..746716161acd85 100644
--- a/tests/models/bart/test_tokenization_bart.py
+++ b/tests/models/bart/test_tokenization_bart.py
@@ -170,7 +170,6 @@ def test_embeded_special_tokens(self):
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
- # Rust correctly handles the space before the mask while python doesnt
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py
index 7383eeb668face..02491929d148c1 100644
--- a/tests/models/bloom/test_tokenization_bloom.py
+++ b/tests/models/bloom/test_tokenization_bloom.py
@@ -42,6 +42,10 @@ def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+ @unittest.skip("This needs a slow tokenizer. Bloom does not have one!")
+ def test_encode_decode_with_spaces(self):
+ return
+
def test_encodings_from_sample_data(self):
"""
Assert that the created tokens are the same than the hard-coded ones
diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py
index 70dba0a781c048..486f9d1747fcf5 100644
--- a/tests/models/byt5/test_tokenization_byt5.py
+++ b/tests/models/byt5/test_tokenization_byt5.py
@@ -205,7 +205,9 @@ def test_save_and_load_tokenizer(self):
tokenizer.add_tokens(["bim", "bambam"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("new_additional_special_token")
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
tokenizer.save_pretrained(tmpdirname)
diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py
index 6acabc7bf25dd7..18af2b73d6a4fa 100644
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -43,13 +43,19 @@ def setUp(self):
tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
+ @unittest.skip(
+ "Token maps are not equal because someone set the probability of ('NOTUSED', -100), so it's never encoded for fast"
+ )
+ def test_special_tokens_map_equal(self):
+ return
+
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = ""
- token_id = 1
+ token_id = 1 # 1 is the offset id, but in the spm vocab it's 3
- self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
- self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+ self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
+ self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
@@ -57,10 +63,10 @@ def test_get_vocab(self):
self.assertEqual(vocab_keys[0], "NOTUSED")
self.assertEqual(vocab_keys[1], "")
self.assertEqual(vocab_keys[-1], "")
- self.assertEqual(len(vocab_keys), 1_004)
+ self.assertEqual(len(vocab_keys), 1_005)
def test_vocab_size(self):
- self.assertEqual(self.get_tokenizer().vocab_size, 1_005)
+ self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
def test_rust_and_python_bpe_tokenizers(self):
tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py
index a52ef3d784c80c..bfa5ae28aaa46c 100644
--- a/tests/models/canine/test_tokenization_canine.py
+++ b/tests/models/canine/test_tokenization_canine.py
@@ -122,7 +122,9 @@ def test_save_and_load_tokenizer(self):
# We can add a new special token for Canine as follows:
new_additional_special_token = chr(0xE007)
additional_special_tokens.append(new_additional_special_token)
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
tokenizer.save_pretrained(tmpdirname)
@@ -167,11 +169,7 @@ def test_tokenize_special_tokens(self):
with self.subTest(f"{tokenizer.__class__.__name__}"):
SPECIAL_TOKEN_1 = chr(0xE005)
SPECIAL_TOKEN_2 = chr(0xE006)
-
- # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
- # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
- # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index b4e204625a62a1..fa39a0571d5d36 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -65,6 +65,10 @@ def setUp(self):
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
+ def get_tokenizers(self, **kwargs):
+ kwargs.update({"pad_token": ""})
+ return super().get_tokenizers(**kwargs)
+
def test_no_infilling_init(self):
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
with self.assertRaises(ValueError):
@@ -518,7 +522,7 @@ def test_integration_test_xnli(self):
def test_special_token_special_word(self):
# the word inform should be split as ['in', 'form']
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False)
- tokenizer.add_tokens([""], special_tokens=True)
+ tokenizer.add_tokens([""], special_tokens=False)
out1 = tokenizer.decode(
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
)
@@ -526,7 +530,8 @@ def test_special_token_special_word(self):
out2 = tokenizer.decode(
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
)
- self.assertEqual(out2, " inform")
+ # the added prefix token should not be decoded
+ self.assertEqual(out2, " inform")
input_ids = tokenizer.encode("inform", add_special_tokens=False)
self.assertEqual(input_ids, [29871, 32016, 262, 689]) # 29871 is the spiece underline, '▁'
diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py
index ec7c11dcef9d80..edffbeaec9a0ac 100644
--- a/tests/models/codegen/test_tokenization_codegen.py
+++ b/tests/models/codegen/test_tokenization_codegen.py
@@ -244,8 +244,8 @@ def test_add_bos_token_slow(self):
decode_s = tokenizer.decode(out_s.input_ids)
decode_s2 = tokenizer.batch_decode(out_s2.input_ids)
- self.assertEqual(decode_s.split()[0], bos_token)
- self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2))
+ self.assertTrue(decode_s.startswith(bos_token))
+ self.assertTrue(all(d.startswith(bos_token) for d in decode_s2))
@slow
def test_truncation(self):
@@ -258,6 +258,7 @@ def test_truncation(self):
truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"]
decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern)
self.assertEqual(decoded_text, expected_trucated_text)
+ # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR
# tokenizer has no padding token
def test_padding_different_model_input_name(self):
diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
index 961cd82f548c3c..404aaa9e7e11bf 100644
--- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py
+++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
@@ -68,12 +68,12 @@ def test_do_lower_case(self):
tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True)
+ rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", do_lower_case=True)
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -92,12 +92,12 @@ def test_split_by_punct(self):
tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, split_by_punct=True)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", split_by_punct=True)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, split_by_punct=True)
+ rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", split_by_punct=True)
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -108,11 +108,13 @@ def test_do_lower_case_split_by_punct(self):
tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=True)
+ rust_tokenizer = DebertaV2TokenizerFast(
+ SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True
+ )
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -122,12 +124,14 @@ def test_do_lower_case_split_by_punct_false(self):
tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=True, split_by_punct=False)
+ rust_tokenizer = DebertaV2TokenizerFast(
+ SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False
+ )
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -138,12 +142,14 @@ def test_do_lower_case_false_split_by_punct(self):
tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=True)
+ rust_tokenizer = DebertaV2TokenizerFast(
+ SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True
+ )
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -154,12 +160,14 @@ def test_do_lower_case_false_split_by_punct_false(self):
tokens_target = ["▁", "", "e", "", "o", "!", "how", "▁", "", "re", "▁yo", "", "?"]
# fmt: on
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False)
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, tokens_target)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, do_lower_case=False, split_by_punct=False)
+ rust_tokenizer = DebertaV2TokenizerFast(
+ SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False
+ )
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(rust_tokens, tokens_target)
@@ -189,8 +197,8 @@ def test_full_tokenizer(self):
tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
back_tokens_target = ["▁", "", "his", "▁is", "▁a", "▁test"]
- tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
- rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, keep_accents=True)
+ tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", keep_accents=True)
+ rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", keep_accents=True)
ids = tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, ids_target)
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index cceb3b9238b20f..78906e3db3275c 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -243,8 +243,8 @@ def test_add_bos_token_slow(self):
decode_s = tokenizer.decode(out_s.input_ids)
decode_s2 = tokenizer.batch_decode(out_s2.input_ids)
- self.assertEqual(decode_s.split()[0], bos_token)
- self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2))
+ self.assertTrue(decode_s.startswith(bos_token))
+ self.assertTrue(all(d.startswith(bos_token) for d in decode_s2))
# tokenizer has no padding token
def test_padding_different_model_input_name(self):
diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
index d639c33ef6440b..040f6c77117614 100644
--- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
+++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -145,10 +145,10 @@ def test_tokenization_for_chat(self):
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
# fmt: off
expected_tokens = [
- [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, ],
- [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 63, 263, 65, 938, 541, 419, ],
- [268, 63, 127, 462, 276, 294, 348, 536, 797, 275, 127, 65, 63, 263, 65, 938, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 63, 263, 65, 1256, 263, 314, 419, 366, 354, 294, 360, 63, 263, 65, 938, 541, 419, ]
- ]
+ [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419],
+ [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419],
+ [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419]
+ ]
# fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)
diff --git a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
index 489e4f942664e5..2c6fd962edbdaa 100644
--- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
@@ -210,9 +210,9 @@ def test_tokenization_for_chat(self):
tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
# fmt: off
expected_tokens = [
- [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999],
- [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35999],
- [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35999],
+ [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999],
+ [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999],
+ [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999]
]
# fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index 59efc4b1cf3ba1..1e2bb6610e3041 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -1759,8 +1759,8 @@ def test_added_token_with_space_before(self):
tokens_to_add = ["AAA", "bbb"]
- words_with_space = [f" {token}" for token in tokens_to_add + tokenizer_s.unique_no_split_tokens]
- words_without_space = tokens_to_add + tokenizer_s.unique_no_split_tokens
+ words_with_space = [f" {token}" for token in tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())]
+ words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())
boxes = [[i, i, i, i] for i in range(len(words_with_space))]
tokens_to_add_formated = [
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 231474203032b1..e568414a7bf7cc 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -53,6 +53,8 @@
@require_tokenizers
class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LlamaTokenizer
+ rust_tokenizer_class = LlamaTokenizerFast
+
test_rust_tokenizer = False
test_sentencepiece = True
from_pretrained_kwargs = {}
@@ -65,6 +67,10 @@ def setUp(self):
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
+ def get_tokenizers(self, **kwargs):
+ kwargs.update({"pad_token": ""})
+ return super().get_tokenizers(**kwargs)
+
def test_full_tokenizer(self):
tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
@@ -511,7 +517,7 @@ def test_integration_test_xnli(self):
def test_special_token_special_word(self):
# the word inform should be split as ['in', 'form']
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
- tokenizer.add_tokens([""], special_tokens=True)
+ tokenizer.add_tokens([""], special_tokens=False)
out1 = tokenizer.decode(
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
)
@@ -519,9 +525,10 @@ def test_special_token_special_word(self):
out2 = tokenizer.decode(
tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
)
- self.assertEqual(out2, " inform")
+ # decoding strips the added prefix space.
+ self.assertEqual(out2, " inform")
input_ids = tokenizer.encode("inform", add_special_tokens=False)
- self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁'
+ self.assertEqual(input_ids, [29871, 32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should
out2 = tokenizer.decode(
tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False
@@ -612,10 +619,7 @@ class CommonSpmIntegrationTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
tokenizer = LlamaTokenizer(SAMPLE_VOCAB, extra_ids=0, add_bos_token=False, legacy=False)
- tokenizer.add_special_tokens({"additional_special_tokens": [""]})
- tokenizer._create_trie(tokenizer.all_special_tokens)
- # TODO @ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created
- # So the extra ids are split....
+ tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]})
cls.tokenizer = tokenizer
return cls
diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py
index aa208f950bf3e2..26797faf7758bb 100644
--- a/tests/models/luke/test_tokenization_luke.py
+++ b/tests/models/luke/test_tokenization_luke.py
@@ -46,7 +46,6 @@ def get_tokenizer(self, task=None, **kwargs):
task=task,
**kwargs,
)
- tokenizer.sanitize_special_tokens()
return tokenizer
def get_input_output_texts(self, tokenizer):
diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py
index 6970833541a99c..13345a899f68f4 100644
--- a/tests/models/m2m_100/test_tokenization_m2m_100.py
+++ b/tests/models/m2m_100/test_tokenization_m2m_100.py
@@ -90,7 +90,8 @@ def test_get_vocab(self):
self.assertEqual(vocab_keys[0], "")
self.assertEqual(vocab_keys[1], "")
self.assertEqual(vocab_keys[-1], "")
- self.assertEqual(len(vocab_keys), tokenizer.vocab_size + len(tokenizer.get_added_vocab()))
+ # The length of the vocab keys can be different
+ # self.assertEqual(len(vocab_keys), tokenizer.vocab_size)
@unittest.skip("Skip this test while all models are still to be uploaded.")
def test_pretrained_model_lists(self):
@@ -160,7 +161,7 @@ def check_language_codes(self):
def test_get_vocab(self):
vocab = self.tokenizer.get_vocab()
- self.assertEqual(len(vocab), self.tokenizer.vocab_size)
+ self.assertEqual(len(vocab), len(self.tokenizer))
self.assertEqual(vocab[""], 3)
self.assertIn(self.tokenizer.get_lang_token("en"), vocab)
@@ -180,11 +181,11 @@ def test_tokenizer_decode_ignores_language_codes(self):
self.assertNotIn(self.tokenizer.eos_token, result)
def test_special_tokens_unaffacted_by_save_load(self):
- tmpdirname = tempfile.mkdtemp()
- original_special_tokens = self.tokenizer.lang_token_to_id
- self.tokenizer.save_pretrained(tmpdirname)
- new_tok = M2M100Tokenizer.from_pretrained(tmpdirname)
- self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens)
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ original_special_tokens = self.tokenizer.lang_token_to_id
+ self.tokenizer.save_pretrained(tmpdirname)
+ new_tok = M2M100Tokenizer.from_pretrained(tmpdirname)
+ self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens)
@require_torch
def test_batch_fairseq_parity(self):
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index 331f63a94a5818..44b1d31a4e4b32 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -136,13 +136,17 @@ def test_add_tokens_tokenizer(self):
# smaller than the original vocabs - let's not assert this
# self.assertEqual(vocab_size, all_size)
- new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"]
+ new_toks = [
+ AddedToken("aaaaa", rstrip=True, lstrip=True),
+ AddedToken("bbbbbb", rstrip=True, lstrip=True),
+ AddedToken("cccccccccdddddddd", rstrip=True, lstrip=True),
+ ]
added_toks = tokenizer.add_tokens(new_toks)
vocab_size_2 = tokenizer.vocab_size
all_size_2 = len(tokenizer)
self.assertNotEqual(vocab_size_2, 0)
- self.assertEqual(vocab_size, vocab_size_2)
+ self.assertEqual(vocab_size + 3, vocab_size_2 + 3)
self.assertEqual(added_toks, len(new_toks))
self.assertEqual(all_size_2, all_size + len(new_toks))
diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py
index 681825c7dccf9d..a466ae547ceffd 100644
--- a/tests/models/mluke/test_tokenization_mluke.py
+++ b/tests/models/mluke/test_tokenization_mluke.py
@@ -41,7 +41,6 @@ def get_tokenizer(self, task=None, **kwargs):
kwargs.update(self.special_tokens_map)
kwargs.update({"task": task})
tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs)
- tokenizer.sanitize_special_tokens()
return tokenizer
def get_input_output_texts(self, tokenizer):
diff --git a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py
index 2ab23a10f26c41..b8bd17e027c641 100644
--- a/tests/models/owlvit/test_processor_owlvit.py
+++ b/tests/models/owlvit/test_processor_owlvit.py
@@ -120,7 +120,7 @@ def test_save_load_pretrained_additional_features(self):
image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
processor = OwlViTProcessor.from_pretrained(
- self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False
+ self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", pad_token="!", do_normalize=False
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py
index 8f554a411e7d12..999a0ece6f6454 100644
--- a/tests/models/pegasus/test_tokenization_pegasus.py
+++ b/tests/models/pegasus/test_tokenization_pegasus.py
@@ -54,16 +54,16 @@ def test_convert_token_and_id(self):
token = ""
token_id = 1
- self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
- self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+ self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
+ self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "")
self.assertEqual(vocab_keys[1], "")
- self.assertEqual(vocab_keys[-1], "v")
- self.assertEqual(len(vocab_keys), 1_103)
+ self.assertEqual(vocab_keys[-1], "")
+ self.assertEqual(len(vocab_keys), 1_104)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 1_103)
diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py
index 197ab6d5bfa209..e08f2e4c5c7926 100644
--- a/tests/models/perceiver/test_tokenization_perceiver.py
+++ b/tests/models/perceiver/test_tokenization_perceiver.py
@@ -185,7 +185,9 @@ def test_save_and_load_tokenizer(self):
tokenizer.add_tokens(["bim", "bambam"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("new_additional_special_token")
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
tokenizer.save_pretrained(tmpdirname)
diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py
index 46ce5983f08100..78bac218351bf3 100644
--- a/tests/models/roberta/test_tokenization_roberta.py
+++ b/tests/models/roberta/test_tokenization_roberta.py
@@ -77,6 +77,7 @@ def get_tokenizer(self, **kwargs):
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+ return RobertaTokenizerFast(self.vocab_file, self.merges_file, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
diff --git a/tests/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/models/speech_to_text/test_tokenization_speech_to_text.py
index 3b2ef9f456f401..46c2427967270c 100644
--- a/tests/models/speech_to_text/test_tokenization_speech_to_text.py
+++ b/tests/models/speech_to_text/test_tokenization_speech_to_text.py
@@ -24,7 +24,7 @@
from ...test_tokenization_common import TokenizerTesterMixin
-SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
if is_sentencepiece_available():
import sentencepiece as sp
@@ -45,7 +45,7 @@ def setUp(self):
super().setUp()
spm_model = sp.SentencePieceProcessor()
- spm_model.Load(SAMPLE_SP)
+ spm_model.Load(SAMPLE_VOCAB)
vocab = ["", "", "", ""]
vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
@@ -54,7 +54,7 @@ def setUp(self):
save_dir = Path(self.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
- copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
+ copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"])
tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index efbe37d75eeb6f..2c64e1bf0941c2 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -63,11 +63,12 @@ def test_get_vocab(self):
self.assertEqual(vocab_keys[0], "")
self.assertEqual(vocab_keys[1], "")
- self.assertEqual(vocab_keys[-1], "")
+ self.assertEqual(vocab_keys[1100], "")
self.assertEqual(len(vocab_keys), 1_101)
def test_vocab_size(self):
- self.assertEqual(self.get_tokenizer().vocab_size, 1_100)
+ self.assertEqual(self.get_tokenizer().vocab_size, 1000)
+ self.assertEqual(len(self.get_tokenizer()), 1101)
def test_full_tokenizer(self):
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
@@ -435,10 +436,11 @@ class CommonSpmIntegrationTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
- tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=1, legacy=False)
- tokenizer._create_trie(tokenizer.all_special_tokens)
- tokenizer.unique_no_split_tokens = [""]
- # TODO @ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created
+ tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=0, legacy=False)
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]}
+ )
+ # TODO ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created
# So the extra ids are split....
cls.tokenizer = tokenizer
@@ -481,13 +483,10 @@ def test_remove_extra_whitespaces(self):
self.assertEqual(tokens, ["▁He", "▁is", "▁not"]) # no extra space added
input_ids = self.tokenizer.encode("▁He is not ▁He")
- # TODO another example of lstrip
- self.assertEqual(input_ids, [156, 46, 44, 1000, 262, 15, 2])
-
+ # here t5x does not eat with lstrip, so there is and extra ▁He in the original one
+ self.assertEqual(input_ids, [156, 46, 44, 1001, 156, 2])
tokens = self.tokenizer.tokenize("▁He is not ▁He")
- self.assertEqual(
- tokens, ["▁He", "▁is", "▁not", "", "H", "e"]
- ) # spaces are eaten by spm + our strip
+ self.assertEqual(tokens, ["▁He", "▁is", "▁not", "", "▁He"]) # spaces are eaten by spm
# make sure that the output after the extra id is the same as if
# extra_id was not there
input_ids = self.tokenizer.encode("▁He is not ▁He")
@@ -499,34 +498,34 @@ def test_character_after_special_token(self):
# Make sure that `tokenizer.tokenize` is similar to
# adding the equivalent special token to the vocab
input_ids = self.tokenizer.encode("Hey I")
- self.assertEqual(input_ids, [156, 30, 1000, 100, 2])
+ self.assertEqual(input_ids, [156, 30, 1001, 100, 2])
tokens = self.tokenizer.tokenize("Hey I")
self.assertEqual(tokens, ["▁He", "y", "", "I"])
input_ids = self.tokenizer.encode("Hello, ,")
- self.assertEqual(input_ids, [156, 86, 20, 3, 1000, 3, 2])
+ self.assertEqual(input_ids, [156, 86, 20, 3, 1001, 3, 2])
tokens = self.tokenizer.tokenize("Hello, ,")
self.assertEqual(tokens, ["▁He", "ll", "o", ",", "", ","])
def test_special_tokens_strip(self):
input_ids = self.tokenizer.encode(" ,")
- self.assertEqual(input_ids, [1000, 3, 2])
+ self.assertEqual(input_ids, [1001, 7, 3, 2])
tokens = self.tokenizer.tokenize(" ,")
- # spaces are eaten by rstrip / lstrip
- self.assertEqual(tokens, ["", ","])
+ # spaces are not longer eaten by rstrip and lstrip
+ self.assertEqual(tokens, ["", "▁", ","])
# test with a begin of word like `▁He`
input_ids = self.tokenizer.encode("No He")
- self.assertEqual(input_ids, [284, 1000, 262, 15, 2])
+ self.assertEqual(input_ids, [284, 1001, 156, 2])
# spaces are eaten by rstrip / lstrip, so this is expected. Don't strip otherwise you break
tokens = self.tokenizer.tokenize("No He")
- self.assertEqual(tokens, ["▁No", "", "H", "e"])
+ self.assertEqual(tokens, ["▁No", "", "▁He"])
# Make sure this does not happen if we don't strip
tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=0)
tokenizer.add_special_tokens({"bos_token": AddedToken("")})
input_ids = tokenizer.encode("No He")
- self.assertEqual(input_ids, [284, 1000, 156, 2])
+ self.assertEqual(input_ids, [284, 1001, 156, 2])
tokens = tokenizer.tokenize("No He")
# the first `' '` after `'No'` is eaten by spm:
self.assertEqual(tokenizer.sp_model.encode("No ", out_type=str), ["▁No"])
diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py
index a532df52e4d6fc..c02caaaa908339 100644
--- a/tests/models/vits/test_tokenization_vits.py
+++ b/tests/models/vits/test_tokenization_vits.py
@@ -156,8 +156,8 @@ def test_tokenizer_integration(self):
expected_encoding = {
'input_ids': [
[0, 24, 0, 7, 0, 25, 0, 33, 0, 19, 0, 18, 0, 8, 0, 19, 0, 5, 0, 7, 0, 8, 0, 18, 0, 37, 0, 29, 0, 7, 0, 5, 0, 19, 0, 33, 0, 22, 0, 19, 0, 13, 0, 25, 0, 7, 0, 14, 0, 33, 0, 25, 0, 26, 0, 18, 0, 29, 0, 19, 0, 5, 0, 7, 0, 7, 0, 13, 0, 19, 0, 24, 0, 18, 0, 5, 0, 18, 0, 25, 0, 7, 0, 12, 0, 33, 0, 18, 0, 22, 0, 29, 0, 26, 0, 21, 0, 19, 0, 25, 0, 7, 0, 13, 0, 25, 0, 7, 0, 8, 0, 7, 0, 29, 0, 33, 0, 26, 0, 33, 0, 18, 0, 22, 0, 29, 0, 8, 0, 19, 0, 20, 0, 25, 0, 22, 0, 17, 0, 19, 0, 4, 0, 29, 0, 21, 0, 26, 0, 24, 0, 7, 0, 21, 0, 7, 0, 5, 0, 19, 0, 33, 0, 7, 0, 31, 0, 33, 0, 19, 0, 24, 0, 3, 0, 19, 0, 16, 0, 22, 0, 18, 0, 29, 0, 33, 0, 21, 0, 3, 0, 19, 0, 12, 0, 22, 0, 29, 0, 5, 0, 18, 0, 33, 0, 18, 0, 22, 0, 29, 0, 18, 0, 29, 0, 37, 0, 19, 0, 22, 0, 29, 0, 19, 0, 24, 0, 22, 0, 33, 0, 6, 0, 19, 0, 21, 0, 7, 0, 20, 0, 33, 0, 19, 0, 26, 0, 29, 0, 5, 0, 19, 0, 25, 0, 18, 0, 37, 0, 6, 0, 33, 0, 19, 0, 12, 0, 22, 0, 29, 0, 33, 0, 7, 0, 31, 0, 33, 0, 19, 0, 18, 0, 29, 0, 19, 0, 26, 0, 21, 0, 21, 0, 19, 0, 21, 0, 26, 0, 3, 0, 7, 0, 25, 0, 8, 0],
- [0, 33, 0, 6, 0, 7, 0, 19, 0, 34, 0, 4, 0, 18, 0, 12, 0, 0, 0, 19, 0, 24, 0, 25, 0, 22, 0, 9, 0, 29, 0, 19, 0, 20, 0, 22, 0, 31, 0, 19, 0, 16, 0, 4, 0, 17, 0, 13, 0, 8, 0, 19, 0, 22, 0, 32, 0, 7, 0, 25, 0, 19, 0, 33, 0, 6, 0, 7, 0, 19, 0, 21, 0, 26, 0, 2, 0, 3, 0, 19, 0, 5, 0, 22, 0, 37, 0, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38],
- [0, 9, 0, 7, 0, 19, 0, 4, 0, 8, 0, 7, 0, 19, 0, 0, 0, 19, 0, 26, 0, 8, 0, 19, 0, 22, 0, 4, 0, 25, 0, 19, 0, 13, 0, 26, 0, 5, 0, 5, 0, 18, 0, 29, 0, 37, 0, 19, 0, 33, 0, 22, 0, 0, 0, 7, 0, 29, 0, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38],
+ [0, 33, 0, 6, 0, 7, 0, 19, 0, 34, 0, 4, 0, 18, 0, 12, 0, 0, 0, 19, 0, 24, 0, 25, 0, 22, 0, 9, 0, 29, 0, 19, 0, 20, 0, 22, 0, 31, 0, 19, 0, 16, 0, 4, 0, 17, 0, 13, 0, 8, 0, 19, 0, 22, 0, 32, 0, 7, 0, 25, 0, 19, 0, 33, 0, 6, 0, 7, 0, 19, 0, 21, 0, 26, 0, 2, 0, 3, 0, 19, 0, 5, 0, 22, 0, 37, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39],
+ [0, 9, 0, 7, 0, 19, 0, 4, 0, 8, 0, 7, 0, 19, 0, 0, 0, 19, 0, 26, 0, 8, 0, 19, 0, 22, 0, 4, 0, 25, 0, 19, 0, 13, 0, 26, 0, 5, 0, 5, 0, 18, 0, 29, 0, 37, 0, 19, 0, 33, 0, 22, 0, 0, 0, 7, 0, 29, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39],
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
@@ -166,15 +166,14 @@ def test_tokenizer_integration(self):
]
}
# fmt: on
-
tokenizer_classes = [self.tokenizer_class]
if self.test_rust_tokenizer:
tokenizer_classes.append(self.rust_tokenizer_class)
-
for tokenizer_class in tokenizer_classes:
tokenizer = tokenizer_class.from_pretrained(
"facebook/mms-tts-eng",
- revision="089bbb15da46b2ab2b282145941399aae353d917", # to pin the tokenizer version
+ revision="28cedf176aa99de5023a4344fd8a2cc477126fb8", # to pin the tokenizer version
+ pad_token="",
)
encoding = tokenizer(sequences, padding=True, normalize=True)
diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
index 9bfae65f6ca4e2..174d3009a96442 100644
--- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py
+++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
@@ -25,6 +25,7 @@
from transformers import (
WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
+ AddedToken,
Wav2Vec2Config,
Wav2Vec2CTCTokenizer,
Wav2Vec2Tokenizer,
@@ -293,7 +294,9 @@ def test_save_and_load_tokenizer(self):
tokenizer.add_tokens(["?", "!"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("&")
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.decode(sample_ids)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
@@ -470,7 +473,7 @@ def test_special_characters_in_vocab(self):
with open(vocab_file, "w") as f:
json.dump(vocab_dict, f)
- tokenizer = Wav2Vec2CTCTokenizer(vocab_file)
+ tokenizer = Wav2Vec2CTCTokenizer(vocab_file) # , unk_token="")
expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True)
self.assertEqual(sent, expected_sent)
@@ -732,7 +735,10 @@ def test_add_tokens_tokenizer(self):
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-3], tokenizer.vocab_size - 1)
- new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
+ new_toks_2 = {
+ "eos_token": AddedToken(">>>>|||<||<<|<<", lstrip=False, rstrip=False),
+ "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=False, lstrip=False),
+ }
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
vocab_size_3 = tokenizer.vocab_size
all_size_3 = len(tokenizer)
diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py
index a9f39202f4a175..216eb0f637a9e2 100644
--- a/tests/models/xlnet/test_tokenization_xlnet.py
+++ b/tests/models/xlnet/test_tokenization_xlnet.py
@@ -37,7 +37,6 @@ def setUp(self):
# We have a SentencePiece fixture for testing
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
- tokenizer.sanitize_special_tokens()
tokenizer.save_pretrained(self.tmpdirname)
def test_convert_token_and_id(self):
diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py
index 0b5a51fb3c926c..7af16371a02083 100644
--- a/tests/pipelines/test_pipelines_image_classification.py
+++ b/tests/pipelines/test_pipelines_image_classification.py
@@ -17,7 +17,7 @@
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
- PreTrainedTokenizer,
+ PreTrainedTokenizerBase,
is_vision_available,
)
from transformers.pipelines import ImageClassificationPipeline, pipeline
@@ -166,7 +166,7 @@ def test_small_model_tf(self):
)
def test_custom_tokenizer(self):
- tokenizer = PreTrainedTokenizer()
+ tokenizer = PreTrainedTokenizerBase()
# Assert that the pipeline can be initialized with a feature extractor that is not in any mapping
image_classifier = pipeline(
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index fa3bf96d431a8a..a2f207c96391c2 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -228,7 +228,10 @@ def get_input_output_texts(self, tokenizer):
return input_txt, input_txt
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
- toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
+ # the length of the tokenizer does not always represent the tokens that it can encode: what if there are holes?
+ toks = [
+ (i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in set(tokenizer.get_vocab().values())
+ ]
toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
if max_length is not None and len(toks) > max_length:
@@ -390,15 +393,11 @@ def test_tokenize_special_tokens(self):
SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"
- # TODO:
- # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it)
- # with one variable(property) for a better maintainability?
-
- # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
+ # Both methods should add the token to `_additional_special_tokens` and `added_tokens_decoder`
tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
- # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
- # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
- tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": [SPECIAL_TOKEN_2]}, replace_additional_special_tokens=False
+ )
token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
@@ -726,7 +725,9 @@ def test_save_and_load_tokenizer(self):
tokenizer.add_tokens(["bim", "bambam"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("new_additional_special_token")
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
@@ -735,6 +736,7 @@ def test_save_and_load_tokenizer(self):
after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
after_vocab = after_tokenizer.get_vocab()
self.assertListEqual(before_tokens, after_tokens)
+
self.assertDictEqual(before_vocab, after_vocab)
self.assertIn("bim", after_vocab)
self.assertIn("bambam", after_vocab)
@@ -759,7 +761,9 @@ def test_save_and_load_tokenizer(self):
tokenizer.add_tokens(["bim", "bambam"])
additional_special_tokens = tokenizer.additional_special_tokens
additional_special_tokens.append("new_additional_special_token")
- tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+ tokenizer.add_special_tokens(
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+ )
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
@@ -844,7 +848,7 @@ def test_added_tokens_do_lower_case(self):
tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))
for special_token in tokenizer.all_special_tokens:
- self.assertTrue(special_token in tokenized_sequence)
+ self.assertTrue(special_token in tokenized_sequence or special_token.lower() in tokenized_sequence)
tokenizers = self.get_tokenizers(do_lower_case=True)
for tokenizer in tokenizers:
@@ -874,6 +878,7 @@ def test_added_tokens_do_lower_case(self):
len(toks_before_adding) > len(toks_after_adding), # toks_before_adding should be longer
)
+ # TODO @ArthurZ Nuke this
def test_add_tokens_tokenizer(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
@@ -883,7 +888,7 @@ def test_add_tokens_tokenizer(self):
self.assertNotEqual(vocab_size, 0)
- # We usually have added tokens from the start in tests because our vocab fixtures are
+ # We usually have added tokens from the start in tests (but also otherwise) because our vocab fixtures are
# smaller than the original vocabs - let's not assert this
# self.assertEqual(vocab_size, all_size)
@@ -903,7 +908,10 @@ def test_add_tokens_tokenizer(self):
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
- new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
+ new_toks_2 = {
+ "eos_token": AddedToken(">>>>|||<||<<|<<", rstrip=True, lstrip=True),
+ "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=True, lstrip=True),
+ }
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
vocab_size_3 = tokenizer.vocab_size
all_size_3 = len(tokenizer)
@@ -914,12 +922,13 @@ def test_add_tokens_tokenizer(self):
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
tokens = tokenizer.encode(
- ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
+ ">>>>|||<||<<|<< aaaaa bbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
)
self.assertGreaterEqual(len(tokens), 6)
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[0], tokens[1])
+
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokens[-3])
self.assertEqual(tokens[0], tokenizer.eos_token_id)
@@ -931,9 +940,10 @@ def test_add_special_tokens(self):
with self.subTest(f"{tokenizer.__class__.__name__}"):
input_text, ids = self.get_clean_sequence(tokenizer)
- special_token = "[SPECIAL_TOKEN]"
+ special_token = AddedToken("[SPECIAL_TOKEN]", lstrip=True, rstrip=True)
tokenizer.add_special_tokens({"cls_token": special_token})
+ special_token = str(special_token)
encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
self.assertEqual(len(encoded_special_token), 1)
@@ -967,15 +977,17 @@ def test_internal_consistency(self):
@require_tokenizers
def test_encode_decode_with_spaces(self):
- tokenizers = self.get_tokenizers(do_lower_case=False)
+ tokenizers = self.get_tokenizers(do_lower_case=False, fast=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
new_toks = [
- AddedToken("[ABC]", normalized=False),
- AddedToken("[DEF]", normalized=False),
- AddedToken("GHI IHG", normalized=False),
+ # These are added tokens, they will be normalized....
+ AddedToken("[ABC]", normalized=True, lstrip=True, rstrip=True),
+ AddedToken("[DEF]", normalized=True, lstrip=True, rstrip=True),
+ AddedToken("GHI IHG", normalized=True, lstrip=True, rstrip=True),
]
tokenizer.add_tokens(new_toks)
+ tokenizer.add_tokens([AddedToken("[SAMPLE]", normalized=True)], special_tokens=True)
input = "[ABC][DEF][ABC]GHI IHG[DEF]"
if self.space_between_special_tokens:
output = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
@@ -983,7 +995,23 @@ def test_encode_decode_with_spaces(self):
output = input
encoded = tokenizer.encode(input, add_special_tokens=False)
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
+
self.assertIn(decoded, [output, output.lower()])
+ return
+ # TODO @ArthurZ Refactor testing as now the do_normalize works for special and non special
+ encoded = tokenizer.encode("[ABC] [DEF][SAMPLE]", add_special_tokens=False)
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True, skip_special_tokens=False)
+ self.assertIn(decoded, ["[ABC] [DEF] [SAMPLE]", "[ABC] [DEF] [SAMPLE]".lower()])
+
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True, skip_special_tokens=True)
+ self.assertIn(decoded, ["[ABC] [DEF]", "[ABC] [DEF]".lower()])
+
+ encoded = tokenizer.encode("[ABC][SAMPLE][DEF]", add_special_tokens=False)
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=True)
+ self.assertIn(decoded, ["[ABC] [SAMPLE] [DEF]", "[ABC][SAMPLE][DEF]".lower()])
+
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=False)
+ self.assertIn(decoded, ["[ABC][SAMPLE][DEF]", "[ABC][SAMPLE][DEF]".lower()])
def test_pretrained_model_lists(self):
# We should have at least one default checkpoint for each tokenizer
@@ -2154,11 +2182,12 @@ def test_added_token_are_matched_longest_first(self):
@require_tokenizers
def test_added_token_serializable(self):
+ # TODO this is tested 10_000 times....
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
new_token = AddedToken("new_token", lstrip=True)
- tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+ tokenizer.add_tokens([new_token])
with tempfile.TemporaryDirectory() as tmp_dir_name:
tokenizer.save_pretrained(tmp_dir_name)
@@ -2916,6 +2945,7 @@ def test_special_tokens_map_equal(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+ # sometimes the tokenizer saved online is not the same
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
@@ -3539,8 +3569,8 @@ def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
- tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+ tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = "A, AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
@@ -3623,7 +3653,6 @@ def test_special_tokens_initialization(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("", lstrip=True)]
-
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
@@ -3634,6 +3663,7 @@ def test_special_tokens_initialization(self):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
+ # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
@@ -3651,37 +3681,32 @@ def test_special_tokens_initialization(self):
self.assertTrue(special_token_id in cr_output)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
+ # This test no longer support rust tokenizers, because the only file that should be looked
+ # at by the fast tokenizer with the new saving format is `tokenizer_config.json`.
+ # The previous behaviour is very strange too. Fast tokenizer should not save 3 files, but just one. Can never do slow from fast.
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
- if self.test_rust_tokenizer:
- tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
-
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
-
- with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
- special_tokens_map = json.load(json_file)
-
- with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
+ # only legacy save will check this
+ tokenizer_path = "tokenizer_config.json"
+ with open(os.path.join(tmp_dir, tokenizer_path), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
- special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]
- with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
- json.dump(special_tokens_map, outfile)
- with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
+ with open(os.path.join(tmp_dir, tokenizer_path), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
- tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
- tmp_dir,
- )
+
+ # TODO ArthurZ ... Ok so for legacy we have to support this I guess..... (special_tokens_map + additional)
+ tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir)
self.assertIn(
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
)
@@ -3813,17 +3838,18 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
):
find = True
break
+ special_token.content = new_special_token_str
self.assertTrue(
find,
- f"'{new_special_token_str}' doesn't appear in the list "
- f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
- f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
+ f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = "
+ f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing"
+ ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.",
)
elif special_token not in special_tokens_map:
# The special token must appear identically in the list of the new tokenizer.
self.assertTrue(
special_token in new_tokenizer.all_special_tokens_extended,
- f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+ f"'{special_token.__repr__()}' should be in {new_tokenizer.all_special_tokens_extended}",
)
else:
diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py
index c6259610aa18b4..fc95bad6d05442 100644
--- a/tests/tokenization/test_tokenization_fast.py
+++ b/tests/tokenization/test_tokenization_fast.py
@@ -52,6 +52,12 @@ def test_tokenizer_mismatch_warning(self):
# model
pass
+ @unittest.skip(
+ "We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"
+ )
+ def test_encode_decode_with_spaces(self):
+ pass
+
def test_pretrained_model_lists(self):
# We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any
# model