Commit e9329d5
KevinMusgrave committed Feb 21, 2024
1 parent: f700eda
Showing 23 changed files with 479 additions and 149 deletions.
@@ -0,0 +1,2 @@
text-to-sql*
checkpoints
@@ -0,0 +1,5 @@
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
README.md
@@ -0,0 +1,40 @@
# LLM Finetuning using HuggingFace + Determined

In this demo, we finetune the [TinyLlama-1.1B-Chat](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4) model on a [text-to-SQL dataset](https://huggingface.co/datasets/Clinton/Text-to-sql-v1). We ran this on two 80 GB A100 GPUs.

To get started, first install Determined on your local machine:
```bash
pip install determined
```

Then launch the finetuning experiment:
```bash
det e create distributed.yaml .
```

Change configuration options in `distributed.yaml` (see the sketch after this list). Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.
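
For example, these options sit at the following spots in the experiment config (an illustrative sketch, not this repo's actual defaults; see `distributed.yaml` for the real values):

```yaml
# Illustrative values only -- consult distributed.yaml for the real configuration.
resources:
  slots_per_trial: 2                  # number of GPUs to use
hyperparameters:
  dataset_subset: "easy"              # "easy", "medium", or "hard"
  training_args:
    per_device_train_batch_size: 8    # batch size per GPU
```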

Test your model's generation capabilities:

```bash
python test_model.py --exp_id <exp_id> --dataset_subset <dataset_subset>
```

where
- `<exp_id>` is the ID of your finetuning experiment in the Determined UI.
- `<dataset_subset>` is one of "easy", "medium", or "hard".

To test the pretrained model (not finetuned), leave out `--exp_id`. For example:

```bash
python test_model.py --dataset_subset easy
```

## Contributors

- [Kevin Musgrave](https://github.com/KevinMusgrave)
- [Agnieszka Ciborowska](https://github.com/aciborowska)
chat_format.py
@@ -0,0 +1,67 @@
# Jinja chat template in ChatML format, used for models (like TinyLlama-Chat)
# whose tokenizer does not ship with a chat template of its own.
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""

CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )

    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    # Mistral-Instruct has no system role, so fold the system prompt into the user turn.
    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        user_prompt = f"{system_prompt}\n{user_prompt}"
        output = [
            {"role": "user", "content": user_prompt.format_map(element)},
        ]
    else:
        output = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
        ]

    if with_assistant_response:
        output.append({"role": "assistant", "content": element["response"]})

    return output


def set_special_tokens(tokenizer, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return "<|im_start|>assistant\n"
    else:
        return "[/INST]"


def get_response_template_ids(tokenizer, model_name):
    # Token IDs marking where the assistant's response begins; the collator
    # masks everything before them so the loss is computed on responses only.
    return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x + get_assistant_prompt(model_name)
    else:
        return x
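
As a quick orientation, here is a hypothetical usage sketch for these helpers (the sample table and query are made up; it assumes `transformers` is installed):

```python
# Hypothetical usage sketch -- not part of this commit.
from transformers import AutoTokenizer

from chat_format import get_chat_format, maybe_add_generation_prompt, set_special_tokens

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
tokenizer = AutoTokenizer.from_pretrained(model_name)
set_special_tokens(tokenizer, model_name)  # installs the ChatML template and EOS token

element = {
    "instruction": "List all employees hired after 2020.",
    "input": "CREATE TABLE employees (name TEXT, hire_year INT)",
    "response": "SELECT name FROM employees WHERE hire_year > 2020",
}

# Build the conversation without the assistant turn, render it with the ChatML
# template, then append the assistant prompt so generation starts in the right place.
messages = get_chat_format(element, model_name, with_assistant_response=False)
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
prompt = maybe_add_generation_prompt(prompt, model_name)
print(prompt)
```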
dataset_utils.py
@@ -0,0 +1,69 @@
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
    df = dataset.to_pandas()
    df["total_length"] = 0
    for column_name in ["instruction", "input", "response"]:
        num_words = df[column_name].astype(str).str.split().apply(len)
        df["total_length"] += num_words

    return df


def filter_by_total_length(df, difficulty, number_of_samples):
    if difficulty == "easy":
        return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
    elif difficulty == "medium":
        return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
    elif difficulty == "hard":
        return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
    return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    # remove total_length column because we don't need it anymore
    df = df.drop(columns=["total_length"])
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing; with the default ratios this
    # is 0.1 / (0.1 + 0.1) = 0.5, i.e. the remaining 20% is halved into 10% each
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    dataset_name = get_dataset_subset_name(difficulty)
    dataset.save_to_disk(dataset_name)
    return dataset


def load_dataset(difficulty):
    return datasets.load_from_disk(get_dataset_subset_name(difficulty))


def load_or_create_dataset(difficulty, num_samples=10000):
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
        dataset = dataset["train"]
        dataset = dataset.remove_columns(["text", "source"])
        df = add_length_column(dataset)
        df = filter_by_total_length(df, difficulty, num_samples)
        return create_and_save_datasets(df, difficulty)
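
For reference, a minimal sketch of how this module is driven (the count values are illustrative; the first call needs network access to download the dataset):

```python
# Hypothetical usage sketch -- not part of this commit.
from dataset_utils import load_or_create_dataset

# First call downloads Clinton/Text-to-sql-v1, keeps rows whose combined
# instruction/input/response word count falls in the "easy" band (10-100),
# splits 80/10/10, and saves to ./text-to-sql-v1-easy; later calls just
# load the saved copy from disk.
dataset = load_or_create_dataset("easy")
print({split: len(ds) for split, ds in dataset.items()})
# e.g. {'train': 8000, 'valid': 1000, 'test': 1000}
```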
File renamed without changes.
@@ -0,0 +1,35 @@
name: mistral lora easy
debug: false
environment:
  environment_variables:
    - NCCL_DEBUG=INFO
  image: determinedai/environments-dev:python-3.10-pytorch-2.0-deepspeed-0.10.0-smartsim
resources:
  slots_per_trial: 2
searcher:
  name: single
  max_length:
    batches: 5000
  metric: eval_accuracy
  smaller_is_better: false
hyperparameters:
  model: "mistralai/Mistral-7B-Instruct-v0.2"
  dataset_subset: "easy"
  lora: true
  training_args:
    output_dir: "/tmp/llm_finetuning"
    max_steps: 5000
    per_device_train_batch_size: 8
    per_device_eval_batch_size: 4
    bf16: true
    evaluation_strategy: "steps"
    eval_steps: 1000
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "steps"
    save_steps: 1000
    learning_rate: 1e-5
entrypoint: >-
  python -m determined.launch.torch_distributed
  python finetune.py
max_restarts: 0
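
Assuming this config were saved as, say, `mistral_lora_easy.yaml` (the filename is not visible in this diff), it would be launched the same way the README launches `distributed.yaml`:

```bash
det e create mistral_lora_easy.yaml .
```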
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
finetune.py
@@ -0,0 +1,168 @@
import logging
import sys

import datasets
import determined as det
import evaluate
import torch
import transformers
from determined.transformers import DetCallback
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from trl import DataCollatorForCompletionOnlyLM

from chat_format import get_chat_format, get_response_template_ids, set_special_tokens
from dataset_utils import load_or_create_dataset

logger = logging.getLogger(__name__)


def get_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="right",
        truncation_side="right",
    )
    set_special_tokens(tokenizer, model_name)
    return tokenizer


def get_model_and_tokenizer(model_name):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
    )
    tokenizer = get_tokenizer(model_name)
    return model, tokenizer


def get_tokenize_fn(tokenizer):
    def fn(formatted):
        return tokenizer(formatted, padding=True, truncation=True, max_length=2048)

    return fn


def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)


def main(training_args, det_callback, hparams):
    model_name = hparams["model"]
    model, tokenizer = get_model_and_tokenizer(model_name)
    tokenize_fn = get_tokenize_fn(tokenizer)

    def tokenize(element):
        formatted = tokenizer.apply_chat_template(
            get_chat_format(element, model_name), tokenize=False
        )
        outputs = tokenize_fn(formatted)
        return {
            "input_ids": outputs["input_ids"],
            "attention_mask": outputs["attention_mask"],
        }

    dataset = load_or_create_dataset(hparams["dataset_subset"])
    column_names = list(dataset["train"].features)
    for k in dataset.keys():
        dataset[k] = dataset[k].map(tokenize, remove_columns=column_names)

    response_template_ids = get_response_template_ids(tokenizer, model_name)
    collator = DataCollatorForCompletionOnlyLM(
        response_template_ids, tokenizer=tokenizer
    )

    bleu = evaluate.load("bleu")
    acc = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # preds have the same shape as the labels, after the argmax(-1) has been
        # calculated by preprocess_logits_for_metrics, but we need to shift the labels
        labels = labels[:, 1:]
        preds = preds[:, :-1]
        # -100 is a default value for ignore_index used by DataCollatorForCompletionOnlyLM
        mask = labels == -100
        labels[mask] = tokenizer.pad_token_id
        preds[mask] = tokenizer.pad_token_id

        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        for l, p in zip(decoded_labels, decoded_preds):
            if l != p:
                logging.error(f"decoded_label:{l}")
                logging.error(f"decoded_pred:{p}")

        bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)
        accuracy = acc.compute(predictions=preds[~mask], references=labels[~mask])

        return {**bleu_score, **accuracy}

    if hparams["lora"]:
        peft_config = LoraConfig(
            task_type="CAUSAL_LM",
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
        )

        model = get_peft_model(model, peft_config)

    logging.error(f"dataset={dataset['train'][0]}")

    trainer = Trainer(
        args=training_args,
        model=model,
        tokenizer=tokenizer,
        data_collator=collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["valid"],
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        compute_metrics=compute_metrics,
    )

    trainer.add_callback(det_callback)
    # We leave this call commented out because it fails with:
    #   [parameter_offload.py:86:_apply_to_tensors_only] A module has unknown inputs or
    #   outputs type (<class 'transformers.cache_utils.DynamicCache'>) and the tensors
    #   embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before
    #   or after backward pass of the module relies on knowing the input and output
    #   tensors and therefore may not get triggered properly.
    # The error happens because deepspeed initialization occurs inside trainer.train(),
    # so calling evaluate() before training fails.

    # trainer.evaluate()

    trainer.train()


if __name__ == "__main__":
    # Setup logging
    logging.basicConfig(
        format=det.LOG_FORMAT, handlers=[logging.StreamHandler(sys.stdout)]
    )
    log_level = logging.INFO
    transformers.utils.logging.set_verbosity_info()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    info = det.get_cluster_info()
    hparams = info.trial.hparams
    training_args = TrainingArguments(**hparams["training_args"])
    if training_args.deepspeed:
        distributed = det.core.DistributedContext.from_deepspeed()
    else:
        distributed = det.core.DistributedContext.from_torch_distributed()

    with det.core.init(distributed=distributed) as core_context:
        det_callback = DetCallback(
            core_context,
            training_args,
        )
        main(training_args, det_callback, hparams)
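
To make the shift-and-mask step in `compute_metrics` concrete, here is a toy, self-contained recreation (the token IDs are made up; 0 stands in for the pad token):

```python
# Toy illustration of the label/prediction alignment -- not part of this commit.
import numpy as np

pad_token_id = 0
# preds[t] is the model's argmax at position t, i.e. its guess for token t+1,
# so predictions must be compared against labels shifted left by one.
preds = np.array([[11, 33, 77, 44]])
labels = np.array([[-100, -100, 33, 99]])  # -100 marks prompt tokens the collator ignores

labels = labels[:, 1:]  # drop the first label ...
preds = preds[:, :-1]   # ... and the last prediction so positions line up

mask = labels == -100
labels[mask] = pad_token_id
preds[mask] = pad_token_id

accuracy = (preds[~mask] == labels[~mask]).mean()
print(accuracy)  # 0.5: one of the two scored positions matches
```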
File renamed without changes.