Skip to content

Commit

Permalink
py : improve BPE tokenizer support (#5189)
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sang-Kil Park authored Jan 29, 2024
1 parent fbe7dfa commit e76627b
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion convert.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -334,7 +334,10 @@ def load(model_plus: ModelPlus) -> Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
Expand Down

0 comments on commit e76627b

Please sign in to comment.