Adding support for BPE merge creation from scores instead of ids. #22582

Merged · 4 commits merged on Apr 5, 2023. Changes shown from all commits.
39 changes: 27 additions & 12 deletions in src/transformers/convert_slow_tokenizer.py

@@ -40,21 +40,28 @@ def __init__(self, model: str):
         self.sp = SentencePieceProcessor()
         self.sp.Load(model)

-    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
+    def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]:
[Review comment] Collaborator: Maybe add a little doc on the vocab, what happens when it defaults

[Reply] Contributor Author: It's added underneath

Narsil marked this conversation as resolved.
"""
By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
order the merges with respect to the piece scores instead.
"""
sp = self.sp
vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
if vocab_scores is not None:
vocab_scores, reverse = dict(vocab_scores), True
else:
vocab_scores, reverse = vocab, False

# Merges
merges = []
for piece_l in vocab.keys():
for piece_r in vocab.keys():
merge = f"{piece_l}{piece_r}"
piece_id = vocab.get(merge, None)
if piece_id:
merges += [(piece_l, piece_r, piece_id)]
merges = sorted(merges, key=lambda val: val[2])
piece_score = vocab_scores.get(merge, None)
if piece_score:
merges += [(piece_l, piece_r, piece_score)]
merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
merges = [(val[0], val[1]) for val in merges]

return vocab, merges
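For readers skimming the diff, here is a minimal, self-contained sketch of what the new `vocab_scores` argument changes. The toy vocabulary and scores below are invented for illustration and are not taken from the PR; the logic mirrors the `extract` loop above: by default, merges are ranked by the merged piece's vocabulary id (ascending), while passing `(piece, score)` pairs ranks them by SentencePiece score (highest first), which is what the PR title means by "merge creation from scores instead of ids".

```python
from typing import Dict, List, Optional, Tuple

# Toy data, invented for illustration only (not from the PR).
toy_vocab: Dict[str, int] = {"l": 0, "o": 1, "w": 2, "lo": 3, "low": 4, "ow": 5}
toy_scores: List[Tuple[str, float]] = [("lo", -1.5), ("low", -0.5), ("ow", -3.0)]


def extract_merges(vocab: Dict[str, int], vocab_scores: Optional[List[Tuple[str, float]]] = None):
    """Stand-alone copy of the merge-building loop from SentencePieceExtractor.extract."""
    if vocab_scores is not None:
        scores, reverse = dict(vocab_scores), True   # rank by score, highest first
    else:
        scores, reverse = vocab, False               # rank by piece id, lowest first
    merges = []
    for piece_l in vocab:
        for piece_r in vocab:
            merge = f"{piece_l}{piece_r}"
            rank = scores.get(merge, None)
            if rank:
                merges.append((piece_l, piece_r, rank))
    merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
    return [(left, right) for left, right, _ in merges]


print(extract_merges(toy_vocab))
# [('l', 'o'), ('l', 'ow'), ('lo', 'w'), ('o', 'w')]   <- ordered by id of the merged piece
print(extract_merges(toy_vocab, toy_scores))
# [('l', 'ow'), ('lo', 'w'), ('l', 'o'), ('o', 'w')]   <- ordered by score, highest first
```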


@@ -458,14 +465,14 @@ def unk_id(self, proto):

     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type
-        vocab = self.vocab(proto)
+        vocab_scores = self.vocab(proto)
         unk_id = self.unk_id(proto)

         if model_type == 1:
-            tokenizer = Tokenizer(Unigram(vocab, unk_id))
+            tokenizer = Tokenizer(Unigram(vocab_scores, unk_id))
         elif model_type == 2:
             _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
-            bpe_vocab = {word: i for i, (word, score) in enumerate(vocab)}
+            bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
             tokenizer = Tokenizer(
                 BPE(
                     bpe_vocab,
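The rename from `vocab` to `vocab_scores` reflects that `self.vocab(proto)` returns `(piece, score)` pairs rather than a plain mapping: Unigram consumes the pairs directly, while the BPE path keeps only a `piece -> rank` dict. Note that the base class still calls `extract()` with no argument in this hunk; a converter that wants score-ordered merges could forward the scores itself. The snippet below is a hypothetical subclass sketch (class name, wiring, and unk handling invented for illustration), built only from calls that appear in this diff plus standard `tokenizers` constructor arguments.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

from transformers.convert_slow_tokenizer import SentencePieceExtractor, SpmConverter


class ScoreOrderedSpmConverter(SpmConverter):  # hypothetical subclass, not part of this PR
    def tokenizer(self, proto):
        vocab_scores = self.vocab(proto)  # [(piece, score), ...] pairs from the SentencePiece proto
        # Forward the scores so the merges come back ordered by score instead of by piece id.
        _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
        bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
        # unk handling kept minimal for the sketch
        return Tokenizer(BPE(bpe_vocab, merges, unk_token="<unk>"))
```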
@@ -496,16 +503,24 @@ def pre_tokenizer(self, replacement, add_prefix_space):
     def post_processor(self):
         return None

+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)

         # Tokenizer assemble
-        tokenizer.normalizer = self.normalizer(self.proto)
+        normalizer = self.normalizer(self.proto)
+        if normalizer is not None:
+            tokenizer.normalizer = normalizer

         replacement = "▁"
         add_prefix_space = True
-        tokenizer.pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
-        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
+        if pre_tokenizer is not None:
+            tokenizer.pre_tokenizer = pre_tokenizer
+
+        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
         post_processor = self.post_processor()
         if post_processor:
             tokenizer.post_processor = post_processor
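As a closing illustration, a hypothetical converter subclass (invented for this note, not part of the PR) shows how the new hooks compose: `converted()` now skips any component whose hook returns `None`, and the decoder comes from the overridable `decoder()` hook instead of being hard-coded.

```python
from tokenizers import decoders

from transformers.convert_slow_tokenizer import SpmConverter


class NoPreTokenizerConverter(SpmConverter):  # hypothetical subclass for illustration
    def pre_tokenizer(self, replacement, add_prefix_space):
        # converted() now leaves tokenizer.pre_tokenizer untouched when this returns None.
        return None

    def decoder(self, replacement, add_prefix_space):
        # Same default as the base class; a subclass could swap in a different decoder here.
        return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
```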