Use correct chat template

determined-ai · Jan 29, 2024 · 207555c
1 parent 72c7103
commit 207555c
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 18 deletions.
diff --git a/blog/llm-finetuning/chat_format.py b/blog/llm-finetuning/chat_format.py
@@ -0,0 +1,31 @@
+CHAT_ML_TEMPLATE = """
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
+{% elif message['role'] == 'system' %}
+{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
+{% elif message['role'] == 'assistant' %}
+{{'<|im_start|>assistant\n'  + message['content'] + '<|im_end|>' }}
+{% endif %}
+{% endfor %}
+"""
+
+ASSISTANT_PROMPT = "<|im_start|>assistant"
+
+EOS_TOKEN = "<|im_end|>"
+
+
+def get_chat_format(element):
+    system_prompt = (
+        "You are a helpful programmer assistant that excels at SQL. "
+        "When prompted with a task and a definition of an SQL table, you "
+        "respond with a SQL query to retrieve information from the table. "
+        "Don't explain your reasoning, only provide the SQL query."
+    )
+    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "
+
+    return [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt.format_map(element)},
+        {"role": "assistant", "content": element["response"]},
+    ]
diff --git a/blog/llm-finetuning/finetune.py b/blog/llm-finetuning/finetune.py
@@ -4,25 +4,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
 from trl import DataCollatorForCompletionOnlyLM
 
+from chat_format import CHAT_ML_TEMPLATE, get_chat_format
 from dataset_utils import load_or_create_dataset
 
 
-def get_chat_format(element):
-    system_prompt = (
-        "You are a helpful programmer assistant that excels at SQL. "
-        "When prompted with a task and a definition of an SQL table, you "
-        "respond with a SQL query to retrieve information from the table. "
-        "Don't explain your reasoning, only provide the SQL query."
-    )
-    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "
-
-    return [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt.format_map(element)},
-        {"role": "assistant", "content": element["response"]},
-    ]
-
-
 def preprocess_logits_for_metrics(logits, labels):
     if isinstance(logits, tuple):
         # Depending on the model and config, logits may contain extra tensors,
@@ -35,6 +20,7 @@ def main(training_args, det_callback, hparams):
     model_name = hparams["model"]
     model = AutoModelForCausalLM.from_pretrained(model_name)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer.chat_template = CHAT_ML_TEMPLATE
 
     def tokenize(element):
         formatted = tokenizer.apply_chat_template(

diff --git a/blog/llm-finetuning/test_model.py b/blog/llm-finetuning/test_model.py
@@ -4,6 +4,7 @@
 from determined.experimental import client
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from chat_format import ASSISTANT_PROMPT, CHAT_ML_TEMPLATE, EOS_TOKEN
 from dataset_utils import load_or_create_dataset
 from finetune import get_chat_format
 
@@ -22,13 +23,17 @@ def main(exp_id, dataset_subset):
         checkpoint_dir = glob.glob(f"{checkpoint_dir}/checkpoint-*")[0]
 
     model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
-    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, eos_token=EOS_TOKEN)
+    tokenizer.chat_template = CHAT_ML_TEMPLATE
 
     dataset = load_or_create_dataset(dataset_subset)["test"]
     element = dataset[0]
     formatted = tokenizer.apply_chat_template(
-        get_chat_format(element)[:2], tokenize=False
+        get_chat_format(element)[:2],
+        tokenize=False,
     )
+    formatted += ASSISTANT_PROMPT
+
     print(formatted)
 
     inputs = tokenizer(formatted, return_tensors="pt")