Commit e9329d5: Separate folder for new blog post
KevinMusgrave committed Feb 21, 2024 (1 parent: f700eda)

Showing 23 changed files with 479 additions and 149 deletions.
2 changes: 2 additions & 0 deletions blog/llm-finetuning-2/.detignore
@@ -0,0 +1,2 @@
text-to-sql*
checkpoints
5 changes: 5 additions & 0 deletions blog/llm-finetuning-2/.gitignore
@@ -0,0 +1,5 @@
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
40 changes: 40 additions & 0 deletions blog/llm-finetuning-2/README.md
@@ -0,0 +1,40 @@
# LLM Finetuning using HuggingFace + Determined

In this demo, we finetune the [TinyLlama-1.1B-Chat](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4) model on a [text-to-SQL dataset](https://huggingface.co/datasets/Clinton/Text-to-sql-v1). We ran this on two 80 GB A100 GPUs.

To get started, first install Determined on your local machine:
```bash
pip install determined
```
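
If you don't already have a Determined cluster to submit experiments to, one option (a minimal single-node setup, assuming Docker is installed and you are using Determined's local deployment tooling) is:

```bash
det deploy local cluster-up
```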

Then finetune:
```bash
det e create distributed.yaml .
```

Change configuration options in `distributed.yaml`. Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.
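
For reference, here is roughly how these options map onto the config file (values shown are the ones used in this example; `slots_per_trial` sits under `resources`, while `dataset_subset` and the `training_args` sit under `hyperparameters`):

```yaml
resources:
  slots_per_trial: 2            # number of GPUs
hyperparameters:
  dataset_subset: "easy"        # "easy", "medium", or "hard"
  training_args:
    per_device_train_batch_size: 8
```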


Test your model's generation capabilities:

```bash
python test_model.py --exp_id <exp_id> --dataset_subset <dataset_subset>
```

Where:
- `<exp_id>` is the ID of your finetuning experiment in the Determined UI.
- `<dataset_subset>` is one of "easy", "medium", or "hard".

To test the pretrained model (not finetuned), leave out `--exp_id`. For example:

```bash
python test_model.py --dataset_subset easy
```

## Contributors

- [Kevin Musgrave](https://github.com/KevinMusgrave)
- [Agnieszka Ciborowska](https://github.com/aciborowska)
67 changes: 67 additions & 0 deletions blog/llm-finetuning-2/chat_format.py
@@ -0,0 +1,67 @@
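# Jinja chat template in the ChatML format. It is assigned to the TinyLlama
# tokenizer in set_special_tokens() below via tokenizer.chat_template.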
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""


CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
system_prompt = (
"You are a helpful programmer assistant that excels at SQL. "
"When prompted with a task and a definition of an SQL table, you "
"respond with a SQL query to retrieve information from the table. "
"Don't explain your reasoning, only provide the SQL query."
)

user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
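        # Mistral-Instruct's chat template has no separate system role, so the
        # system prompt is folded into the user message instead.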
user_prompt = f"{system_prompt}\n{user_prompt}"
output = [
{"role": "user", "content": user_prompt.format_map(element)},
]
else:
output = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt.format_map(element)},
]

if with_assistant_response:
output.append({"role": "assistant", "content": element["response"]})

return output


def set_special_tokens(tokenizer, model_name):
if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
tokenizer.chat_template = CHAT_ML_TEMPLATE
tokenizer.eos_token = CHAT_ML_EOS_TOKEN
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
return "<|im_start|>assistant\n"
else:
return "[/INST]"


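# The assistant prompt's token ids act as the "response template" for trl's
# DataCollatorForCompletionOnlyLM (see finetune.py): everything up to this
# template is excluded from the loss, so training only optimizes the response.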
def get_response_template_ids(tokenizer, model_name):
return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
return x + get_assistant_prompt(model_name)
else:
return x
69 changes: 69 additions & 0 deletions blog/llm-finetuning-2/dataset_utils.py
@@ -0,0 +1,69 @@
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
df = dataset.to_pandas()
df["total_length"] = 0
for column_name in ["instruction", "input", "response"]:
num_words = df[column_name].astype(str).str.split().apply(len)
df["total_length"] += num_words

return df


def filter_by_total_length(df, difficulty, number_of_samples):
if difficulty == "easy":
return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
elif difficulty == "medium":
return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
elif difficulty == "hard":
return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
seed = 123
# remove total_length column because we don't need it anymore
df = df.drop(columns=["total_length"])
dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

# split into training and "the rest"
train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

# split "the rest" into validation and testing
val_test = train_valtest["test"].train_test_split(
test_size=test_ratio / (test_ratio + val_ratio), seed=seed
)

dataset = datasets.DatasetDict(
{
"train": train_valtest["train"],
"valid": val_test["train"],
"test": val_test["test"],
}
)
dataset_name = get_dataset_subset_name(difficulty)
dataset.save_to_disk(dataset_name)
return dataset


def load_dataset(difficulty):
return datasets.load_from_disk(get_dataset_subset_name(difficulty))


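# Load the cached subset from disk if it exists; otherwise download the full
# Clinton/Text-to-sql-v1 dataset, bucket it by prompt length into the requested
# difficulty, and save train/valid/test splits to disk for reuse.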
def load_or_create_dataset(difficulty, num_samples=10000):
try:
return load_dataset(difficulty)
except FileNotFoundError:
dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
dataset = dataset["train"]
dataset = dataset.remove_columns(["text", "source"])
df = add_length_column(dataset)
df = filter_by_total_length(df, difficulty, num_samples)
return create_and_save_datasets(df, difficulty)
File renamed without changes.
35 changes: 35 additions & 0 deletions blog/llm-finetuning-2/distributed.yaml
@@ -0,0 +1,35 @@
name: mistral lora easy
debug: false
environment:
environment_variables:
- NCCL_DEBUG=INFO
image: determinedai/environments-dev:python-3.10-pytorch-2.0-deepspeed-0.10.0-smartsim
resources:
slots_per_trial: 2
searcher:
name: single
max_length:
batches: 5000
metric: eval_accuracy
smaller_is_better: false
hyperparameters:
model: "mistralai/Mistral-7B-Instruct-v0.2"
dataset_subset: "easy"
lora: true
training_args:
output_dir: "/tmp/llm_finetuning"
max_steps: 5000
per_device_train_batch_size: 8
per_device_eval_batch_size: 4
bf16: true
evaluation_strategy: "steps"
eval_steps: 1000
logging_strategy: "steps"
logging_steps: 100
save_strategy: "steps"
save_steps: 1000
learning_rate: 1e-5
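# Launch finetune.py through Determined's Torch distributed launcher (one worker process per slot).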
entrypoint: >-
python -m determined.launch.torch_distributed
python finetune.py
max_restarts: 0
168 changes: 168 additions & 0 deletions blog/llm-finetuning-2/finetune.py
@@ -0,0 +1,168 @@
import logging
import sys

import datasets
import determined as det
import evaluate
import torch
import transformers
from determined.transformers import DetCallback
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from trl import DataCollatorForCompletionOnlyLM

from chat_format import get_chat_format, get_response_template_ids, set_special_tokens
from dataset_utils import load_or_create_dataset

logger = logging.getLogger(__name__)


def get_tokenizer(model_name):
tokenizer = AutoTokenizer.from_pretrained(
model_name,
padding_side="right",
truncation_side="right",
)
set_special_tokens(tokenizer, model_name)
return tokenizer


def get_model_and_tokenizer(model_name):
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
)
tokenizer = get_tokenizer(model_name)
return model, tokenizer


def get_tokenize_fn(tokenizer):
def fn(formatted):
return tokenizer(formatted, padding=True, truncation=True, max_length=2048)

return fn


def preprocess_logits_for_metrics(logits, labels):
if isinstance(logits, tuple):
# Depending on the model and config, logits may contain extra tensors,
# like past_key_values, but logits always come first
logits = logits[0]
return logits.argmax(dim=-1)


def main(training_args, det_callback, hparams):
model_name = hparams["model"]
model, tokenizer = get_model_and_tokenizer(model_name)
tokenize_fn = get_tokenize_fn(tokenizer)

def tokenize(element):
formatted = tokenizer.apply_chat_template(
get_chat_format(element, model_name), tokenize=False
)
outputs = tokenize_fn(formatted)
return {
"input_ids": outputs["input_ids"],
"attention_mask": outputs["attention_mask"],
}

dataset = load_or_create_dataset(hparams["dataset_subset"])
column_names = list(dataset["train"].features)
for k in dataset.keys():
dataset[k] = dataset[k].map(tokenize, remove_columns=column_names)

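    # Compute loss only on the assistant's response: the completion-only collator
    # masks every label up to and including the assistant prompt with -100.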
response_template_ids = get_response_template_ids(tokenizer, model_name)
collator = DataCollatorForCompletionOnlyLM(
response_template_ids, tokenizer=tokenizer
)

bleu = evaluate.load("bleu")
acc = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
# preds have the same shape as the labels, after the argmax(-1) has been calculated
# by preprocess_logits_for_metrics but we need to shift the labels
labels = labels[:, 1:]
preds = preds[:, :-1]
# -100 is a default value for ignore_index used by DataCollatorForCompletionOnlyLM
mask = labels == -100
labels[mask] = tokenizer.pad_token_id
preds[mask] = tokenizer.pad_token_id

decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

for l, p in zip(decoded_labels, decoded_preds):
if l != p:
logging.error(f"decoded_label:{l}")
logging.error(f"decoded_pred:{p}")

bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)
accuracy = acc.compute(predictions=preds[~mask], references=labels[~mask])

return {**bleu_score, **accuracy}

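    # Optionally wrap the model with LoRA adapters so that only the low-rank
    # adapter weights are updated during finetuning.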
if hparams["lora"]:
peft_config = LoraConfig(
task_type="CAUSAL_LM",
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)

logging.error(f"dataset={dataset['train'][0]}")

trainer = Trainer(
args=training_args,
model=model,
tokenizer=tokenizer,
data_collator=collator,
train_dataset=dataset["train"],
eval_dataset=dataset["valid"],
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
compute_metrics=compute_metrics,
)

trainer.add_callback(det_callback)
    # The call to trainer.evaluate() below is commented out because it leads to the following error:
# [parameter_offload.py:86:_apply_to_tensors_only] A module has unknown inputs or outputs type (<class 'transformers.cache_utils.DynamicCache'>)
# and the tensors embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on
# knowing the input and output tensors and therefore may not get triggered properly.
    # The error happens because DeepSpeed is only initialized inside trainer.train(), so calling evaluate() before training fails.

# trainer.evaluate()

trainer.train()


if __name__ == "__main__":
# Setup logging
logging.basicConfig(
format=det.LOG_FORMAT, handlers=[logging.StreamHandler(sys.stdout)]
)
log_level = logging.INFO
transformers.utils.logging.set_verbosity_info()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

info = det.get_cluster_info()
hparams = info.trial.hparams
training_args = TrainingArguments(**hparams["training_args"])
if training_args.deepspeed:
distributed = det.core.DistributedContext.from_deepspeed()
else:
distributed = det.core.DistributedContext.from_torch_distributed()

with det.core.init(distributed=distributed) as core_context:
det_callback = DetCallback(
core_context,
training_args,
)
main(training_args, det_callback, hparams)
File renamed without changes.