Skip to content

Commit

Permalink
include changes from llama (huggingface#26260)
Browse files (browse the repository at this point in the history)
* include changes from llama

* add a test
  • Loading branch information
ArthurZucker authored and EduardoPach committed Nov 18, 2023
1 parent d5f38b0 commit 9132353
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/transformers/models/code_llama/tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ def _tokenize(self, text, **kwargs):
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
# 1. Encode string + prefix ex: "<unk> Hey"
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
Expand Down
12 changes: 12 additions & 0 deletions tests/models/code_llama/test_tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,18 @@ def test_special_token_special_word(self):
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, " <s> Hello<s> how")

def test_spm_edge_cases(self):
    # A prompt wrapped in `[INST]` ... `[/INST]` with a literal `<s>` in the middle:
    # the expected output below keeps `<s>` as one token and splits the brackets
    # and the slash into separate pieces.
    prompt = "[INST] How are you doing?<s>[/INST]"
    expected_tokens = [
        "▁[",
        "INST",
        "]",
        "▁How",
        "▁are",
        "▁you",
        "▁doing",
        "?",
        "<s>",
        "[",
        "/",
        "INST",
        "]",
    ]
    # Same prompt as ids; note the extra leading id compared to the token list above.
    expected_ids = [1, 518, 25580, 29962, 1128, 526, 366, 2599, 29973, 1, 29961, 29914, 25580, 29962]

    tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False)

    self.assertEqual(tokenizer.tokenize(prompt), expected_tokens)
    self.assertEqual(tokenizer.encode(prompt), expected_ids)

def test_infilling_tokenization(self):
PROMPTS = [
'''def remove_non_ascii(s: str) -> str:
Expand Down

0 comments on commit 9132353

Please sign in to comment.