Skip to content

Commit

Permalink
Use correct chat template
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMusgrave committed Jan 29, 2024
1 parent 72c7103 commit 207555c
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 18 deletions.
31 changes: 31 additions & 0 deletions blog/llm-finetuning/chat_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Chat template (Jinja-style syntax) that renders a list of messages in the
# ChatML layout. Each message becomes
#     <|im_start|>{role}\n{content}<|im_end|>
# for the roles "user", "system" and "assistant"; a message with any other
# role produces no output because there is no fallback branch.
# User and system content is passed through .strip(); assistant content is
# emitted unchanged.
# finetune.py and test_model.py assign this string to tokenizer.chat_template.
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""

# Opening marker of an assistant turn. test_model.py appends it to the
# rendered system + user prompt so that generation continues as the assistant.
# NOTE(review): unlike the template above, this has no "\n" after the role
# name -- confirm that is intentional for the inference prompt.
ASSISTANT_PROMPT = "<|im_start|>assistant"

# Marker that closes every message in CHAT_ML_TEMPLATE. test_model.py passes
# it as eos_token when loading the tokenizer.
EOS_TOKEN = "<|im_end|>"


def get_chat_format(element):
    """Turn one dataset record into a system / user / assistant conversation.

    Args:
        element: Mapping that provides the keys ``"instruction"``,
            ``"input"`` and ``"response"``.

    Returns:
        A list of three ``{"role": ..., "content": ...}`` dicts, in the order
        system, user, assistant.

    Raises:
        KeyError: If ``element`` lacks one of the required keys.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are a helpful programmer assistant that excels at SQL. "
            "When prompted with a task and a definition of an SQL table, you "
            "respond with a SQL query to retrieve information from the table. "
            "Don't explain your reasoning, only provide the SQL query."
        ),
    }

    # The user turn is filled in before the response is read, so a record
    # with several missing keys reports the prompt fields first.
    user_message = {
        "role": "user",
        "content": (
            "Task: {instruction}\nSQL table: {input}\nSQL query: "
        ).format_map(element),
    }

    assistant_message = {"role": "assistant", "content": element["response"]}

    return [system_message, user_message, assistant_message]
18 changes: 2 additions & 16 deletions blog/llm-finetuning/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,10 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from trl import DataCollatorForCompletionOnlyLM

from chat_format import CHAT_ML_TEMPLATE, get_chat_format
from dataset_utils import load_or_create_dataset


def get_chat_format(element):
    """Build the chat messages for a single training example.

    ``element`` must supply ``"instruction"``, ``"input"`` and ``"response"``.
    The result is a list of role/content dicts: a fixed system instruction,
    the user prompt built from the instruction and table definition, and the
    assistant turn holding the response. A missing key raises ``KeyError``.
    """
    roles = ("system", "user", "assistant")

    instructions = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )
    question_template = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    # Tuple items are evaluated left to right: the prompt is formatted
    # before the response is looked up.
    contents = (
        instructions,
        question_template.format_map(element),
        element["response"],
    )

    return [
        {"role": role, "content": content}
        for role, content in zip(roles, contents)
    ]


def preprocess_logits_for_metrics(logits, labels):
if isinstance(logits, tuple):
# Depending on the model and config, logits may contain extra tensors,
Expand All @@ -35,6 +20,7 @@ def main(training_args, det_callback, hparams):
model_name = hparams["model"]
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.chat_template = CHAT_ML_TEMPLATE

def tokenize(element):
formatted = tokenizer.apply_chat_template(
Expand Down
9 changes: 7 additions & 2 deletions blog/llm-finetuning/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from determined.experimental import client
from transformers import AutoModelForCausalLM, AutoTokenizer

from chat_format import ASSISTANT_PROMPT, CHAT_ML_TEMPLATE, EOS_TOKEN
from dataset_utils import load_or_create_dataset
from finetune import get_chat_format

Expand All @@ -22,13 +23,17 @@ def main(exp_id, dataset_subset):
checkpoint_dir = glob.glob(f"{checkpoint_dir}/checkpoint-*")[0]

model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, eos_token=EOS_TOKEN)
tokenizer.chat_template = CHAT_ML_TEMPLATE

dataset = load_or_create_dataset(dataset_subset)["test"]
element = dataset[0]
formatted = tokenizer.apply_chat_template(
get_chat_format(element)[:2], tokenize=False
get_chat_format(element)[:2],
tokenize=False,
)
formatted += ASSISTANT_PROMPT

print(formatted)

inputs = tokenizer(formatted, return_tensors="pt")
Expand Down

0 comments on commit 207555c

Please sign in to comment.