blog: LoRA deepdive
Summary:

Test Plan:
szewaiyuen6 committed Jul 5, 2024
1 parent d5f4b27 commit 8248344
Showing 16 changed files with 872 additions and 0 deletions.
2 changes: 2 additions & 0 deletions blog/llm-finetuning-4/.detignore
text-to-sql*
checkpoints
5 changes: 5 additions & 0 deletions blog/llm-finetuning-4/.gitignore
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
71 changes: 71 additions & 0 deletions blog/llm-finetuning-4/README.md
# Finetuning Mistral-7B using LoRA and DeepSpeed

In this demo, we finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 80 GB A100 GPUs, and DeepSpeed on two, four, and eight 80 GB A100 GPUs.

To get started, first install Determined on your local machine:
```bash
pip install determined
```

Then finetune with LoRA:
```bash
det e create lora.yaml .
```

Or finetune with DeepSpeed:
```bash
det e create deepspeed.yaml .
```

You can view the actual training code in `finetune.py`.




## Configuration

Change configuration options in `lora.yaml` or `deepspeed.yaml`. Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.

The results in [our blog post](https://www.determined.ai/blog/llm-finetuning-2) were obtained using `per_device_train_batch_size: 1` and `per_device_eval_batch_size: 4`.


DeepSpeed configuration files are in the `ds_configs` folder.
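
As a rough sketch, these options live in the experiment config roughly as follows (key names mirror `deepspeed.yaml` below; the exact layout of `lora.yaml` may differ):

```yaml
resources:
  slots_per_trial: 2            # number of GPUs
hyperparameters:
  dataset_subset: "easy"        # "easy", "medium", or "hard"
  lora: false
  training_args:
    per_device_train_batch_size: 1
    per_device_eval_batch_size: 4
```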

## Testing

Test your model's generation capabilities:

```bash
python inference.py --exp_id <exp_id> --dataset_subset <dataset_subset>
```

where:
- `<exp_id>` is the ID of your finetuning experiment in the Determined UI.
- `<dataset_subset>` is one of "easy", "medium", or "hard".

If you're testing a LoRA model, then add `--lora` to the above command.

To use CPU instead of GPU, add `--device cpu`.
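
For example, to test a LoRA checkpoint on CPU (the experiment id `1234` is a placeholder):

```bash
python inference.py --exp_id 1234 --dataset_subset medium --lora --device cpu
```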

To test the pretrained model (not finetuned), leave out `--exp_id`. For example:

```bash
python inference.py --dataset_subset easy
```

## Validating the tokenizer

Plot the distribution of dataset sample lengths, and see how many samples will be truncated by the tokenizer:

```bash
python validate_tokenizer.py
```


## Contributors

- [Kevin Musgrave](https://github.com/KevinMusgrave)
- [Agnieszka Ciborowska](https://github.com/aciborowska)
67 changes: 67 additions & 0 deletions blog/llm-finetuning-4/chat_format.py
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""


CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )

    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        # Mistral-7B-Instruct's chat template has no system role, so fold the
        # system prompt into the user message.
        user_prompt = f"{system_prompt}\n{user_prompt}"
        output = [
            {"role": "user", "content": user_prompt.format_map(element)},
        ]
    else:
        output = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
        ]

    if with_assistant_response:
        output.append({"role": "assistant", "content": element["response"]})

    return output


def set_special_tokens(tokenizer, model_name):
    # For TinyLlama, install the ChatML template and EOS token defined above;
    # every model gets a pad token if its tokenizer doesn't define one.
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return "<|im_start|>assistant\n"
    else:
        return "[/INST]"


def get_response_template_ids(tokenizer, model_name):
    return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x + get_assistant_prompt(model_name)
    else:
        return x
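
A quick usage sketch for the helpers above (not part of the commit; the table, task, and query are invented, and it assumes `transformers` is installed with access to the Mistral weights):

```python
# Sketch: build a prompt for one dataset row and render it with the
# tokenizer's chat template.
from transformers import AutoTokenizer

from chat_format import get_chat_format, maybe_add_generation_prompt, set_special_tokens

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
set_special_tokens(tokenizer, model_name)

element = {
    "instruction": "List the names of all customers from Canada.",
    "input": "CREATE TABLE customers (id INT, name TEXT, country TEXT)",
    "response": "SELECT name FROM customers WHERE country = 'Canada'",
}

# Without the assistant response, this is an inference-style prompt.
messages = get_chat_format(element, model_name, with_assistant_response=False)
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(maybe_add_generation_prompt(prompt, model_name))
```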
69 changes: 69 additions & 0 deletions blog/llm-finetuning-4/dataset_utils.py
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
    # Approximate each sample's length as the number of whitespace-separated
    # words across the instruction, input, and response columns.
    df = dataset.to_pandas()
    df["total_length"] = 0
    for column_name in ["instruction", "input", "response"]:
        num_words = df[column_name].astype(str).str.split().apply(len)
        df["total_length"] += num_words

    return df


def filter_by_total_length(df, difficulty, number_of_samples):
    if difficulty == "easy":
        return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
    elif difficulty == "medium":
        return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
    elif difficulty == "hard":
        return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
    return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    # remove total_length column because we don't need it anymore
    df = df.drop(columns=["total_length"])
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    dataset_name = get_dataset_subset_name(difficulty)
    dataset.save_to_disk(dataset_name)
    return dataset


def load_dataset(difficulty):
    return datasets.load_from_disk(get_dataset_subset_name(difficulty))


def load_or_create_dataset(difficulty, num_samples=10000):
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
        dataset = dataset["train"]
        dataset = dataset.remove_columns(["text", "source"])
        df = add_length_column(dataset)
        df = filter_by_total_length(df, difficulty, num_samples)
        return create_and_save_datasets(df, difficulty)
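
A minimal usage sketch (not part of the commit): on the first call this downloads `Clinton/Text-to-sql-v1`, filters it, and saves the split to disk; later calls load it from disk.

```python
# Sketch: build (or load) the "easy" subset and inspect its splits.
from dataset_utils import load_or_create_dataset

dataset = load_or_create_dataset("easy")
print({split: len(dataset[split]) for split in ("train", "valid", "test")})
print(dataset["train"][0])
```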
36 changes: 36 additions & 0 deletions blog/llm-finetuning-4/deepspeed.yaml
name: mistral deepspeed easy
debug: false
environment:
  environment_variables:
    - NCCL_DEBUG=INFO
  image: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
resources:
  slots_per_trial: 2
searcher:
  name: single
  max_length:
    batches: 5000
  metric: eval_accuracy
  smaller_is_better: false
hyperparameters:
  model: "mistralai/Mistral-7B-Instruct-v0.2"
  dataset_subset: "easy"
  lora: false
  training_args:
    output_dir: "/tmp/llm_finetuning"
    max_steps: 5000
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 4
    bf16: true
    evaluation_strategy: "steps"
    eval_steps: 1000
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "steps"
    save_steps: 5000
    learning_rate: 1e-5
    deepspeed: "ds_configs/ds_config_stage_3.json"
entrypoint: >-
  python -m determined.launch.deepspeed
  python finetune.py
max_restarts: 0
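
To try a different ZeRO stage, point the `deepspeed` entry of `training_args` at another file in `ds_configs`, for example (a sketch, assuming the stage-2 config shown below):

```yaml
hyperparameters:
  training_args:
    deepspeed: "ds_configs/ds_config_stage_2.json"
```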
48 changes: 48 additions & 0 deletions blog/llm-finetuning-4/ds_configs/ds_config_stage_1.json
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "flops_profiler": {
    "enabled": true,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 1,
    "detailed": true,
    "output_file": null
  }
}
48 changes: 48 additions & 0 deletions blog/llm-finetuning-4/ds_configs/ds_config_stage_2.json
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "flops_profiler": {
    "enabled": true,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 1,
    "detailed": true,
    "output_file": null
  }
}
