Commit
Showing 16 changed files with 872 additions and 0 deletions.
@@ -0,0 +1,2 @@
text-to-sql*
checkpoints
@@ -0,0 +1,5 @@
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
@@ -0,0 +1,71 @@
# Finetuning Mistral-7B using LoRA and DeepSpeed

In this demo, we finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 80 GB A100 GPUs, and DeepSpeed on two, four, and eight 80 GB A100 GPUs.

To get started, first install Determined on your local machine:
```bash
pip install determined
```

Then finetune with LoRA:
```bash
det e create lora.yaml .
```

Or finetune with DeepSpeed:
```bash
det e create deepspeed.yaml .
```

You can view the actual training code in `finetune.py`.

## Configuration

Change configuration options in `lora.yaml` or `deepspeed.yaml`. Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.

The results in [our blog post](https://www.determined.ai/blog/llm-finetuning-2) were obtained using `per_device_train_batch_size: 1` and `per_device_eval_batch_size: 4`.

DeepSpeed configuration files are in the `ds_configs` folder.

## Testing

Test your model's generation capabilities:

```bash
python inference.py --exp_id <exp_id> --dataset_subset <dataset_subset>
```

Where:
- `<exp_id>` is the ID of your finetuning experiment in the Determined UI.
- `<dataset_subset>` is one of "easy", "medium", or "hard".

If you're testing a LoRA model, add `--lora` to the above command.

To use CPU instead of GPU, add `--device cpu`.

To test the pretrained model (not finetuned), leave out `--exp_id`. For example:

```bash
python inference.py --dataset_subset easy
```

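For orientation, here is a rough, hypothetical sketch of what testing the pretrained model involves under the hood, wired together from the chat-formatting and dataset helpers added in this commit. It is not the actual `inference.py`; the module names (`chat_format`, `dataset_utils`) and the generation settings are assumptions.

```python
# Hypothetical sketch, not the repo's inference.py. Assumes the helper files in this commit
# are importable as chat_format.py and dataset_utils.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from chat_format import get_chat_format, maybe_add_generation_prompt
from dataset_utils import load_or_create_dataset

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda"  # mirror of the --device flag; use "cpu" to run without a GPU

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)

# Take one held-out example and build the prompt without the assistant's answer.
element = load_or_create_dataset("easy")["test"][0]
messages = get_chat_format(element, model_name, with_assistant_response=False)
prompt = maybe_add_generation_prompt(
    tokenizer.apply_chat_template(messages, tokenize=False), model_name
)

# The chat template already inserts special tokens, so don't add them again here.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```
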
## Validating the tokenizer

Plot the distribution of dataset sample lengths, and see how many samples will be truncated by the tokenizer:

```bash
python validate_tokenizer.py
```

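The script itself isn't reproduced here, but the kind of check it runs can be sketched as follows, assuming the helpers in this commit are importable as `chat_format` and `dataset_utils`; the `max_length` cutoff below is an illustrative assumption, not necessarily the value the repo uses.

```python
# Rough sketch of a tokenizer-length check; max_length is an assumed cutoff.
import matplotlib.pyplot as plt
from transformers import AutoTokenizer

from chat_format import get_chat_format
from dataset_utils import load_or_create_dataset

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
max_length = 2048  # assumed truncation limit

tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_or_create_dataset("easy")["train"]

# Token count per sample after applying the chat template.
lengths = [
    len(tokenizer.apply_chat_template(get_chat_format(element, model_name)))
    for element in dataset
]

truncated = sum(length > max_length for length in lengths)
print(f"{truncated}/{len(lengths)} samples exceed {max_length} tokens")

plt.hist(lengths, bins=50)
plt.xlabel("tokens per sample")
plt.ylabel("count")
plt.savefig("token_length_distribution.png")
```
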
## Contributors

- [Kevin Musgrave](https://github.com/KevinMusgrave)
- [Agnieszka Ciborowska](https://github.com/aciborowska)
@@ -0,0 +1,67 @@
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""


CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )

    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        user_prompt = f"{system_prompt}\n{user_prompt}"
        output = [
            {"role": "user", "content": user_prompt.format_map(element)},
        ]
    else:
        output = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
        ]

    if with_assistant_response:
        output.append({"role": "assistant", "content": element["response"]})

    return output


def set_special_tokens(tokenizer, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return "<|im_start|>assistant\n"
    else:
        return "[/INST]"


def get_response_template_ids(tokenizer, model_name):
    return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x + get_assistant_prompt(model_name)
    else:
        return x
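
To make these helpers concrete, here is a small hypothetical usage example (not part of the commit) that formats one text-to-SQL record and renders it through the tokenizer's chat template; the `chat_format` module name and the sample record are assumptions.

```python
# Hypothetical usage; assumes the file above is importable as chat_format.py.
from transformers import AutoTokenizer

from chat_format import get_chat_format, maybe_add_generation_prompt, set_special_tokens

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
set_special_tokens(tokenizer, model_name)  # ensures a pad token is defined

# A made-up text-to-SQL record with the columns the formatter expects.
element = {
    "instruction": "List the names of all employees hired after 2020.",
    "input": "CREATE TABLE employees (name TEXT, hire_year INT)",
    "response": "SELECT name FROM employees WHERE hire_year > 2020",
}

messages = get_chat_format(element, model_name)                 # user + assistant turns
text = tokenizer.apply_chat_template(messages, tokenize=False)  # rendered prompt string
text = maybe_add_generation_prompt(text, model_name)            # no-op for Mistral
print(text)
```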
@@ -0,0 +1,69 @@
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
    df = dataset.to_pandas()
    df["total_length"] = 0
    for column_name in ["instruction", "input", "response"]:
        num_words = df[column_name].astype(str).str.split().apply(len)
        df["total_length"] += num_words

    return df


def filter_by_total_length(df, difficulty, number_of_samples):
    if difficulty == "easy":
        return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
    elif difficulty == "medium":
        return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
    elif difficulty == "hard":
        return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
    return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    # remove total_length column because we don't need it anymore
    df = df.drop(columns=["total_length"])
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    dataset_name = get_dataset_subset_name(difficulty)
    dataset.save_to_disk(dataset_name)
    return dataset


def load_dataset(difficulty):
    return datasets.load_from_disk(get_dataset_subset_name(difficulty))


def load_or_create_dataset(difficulty, num_samples=10000):
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
        dataset = dataset["train"]
        dataset = dataset.remove_columns(["text", "source"])
        df = add_length_column(dataset)
        df = filter_by_total_length(df, difficulty, num_samples)
        return create_and_save_datasets(df, difficulty)
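
As a quick hypothetical usage example (not part of the commit), assuming the file above is importable as `dataset_utils`:

```python
# Builds the "easy" subset on the first call, then reloads it from disk afterwards.
from dataset_utils import load_or_create_dataset  # assumed module name

dataset = load_or_create_dataset("easy")
print(dataset)              # DatasetDict with "train", "valid", and "test" splits
print(dataset["train"][0])  # one record with "instruction", "input", and "response" columns
```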
@@ -0,0 +1,36 @@
name: mistral deepspeed easy
debug: false
environment:
  environment_variables:
    - NCCL_DEBUG=INFO
  image: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
resources:
  slots_per_trial: 2
searcher:
  name: single
  max_length:
    batches: 5000
  metric: eval_accuracy
  smaller_is_better: false
hyperparameters:
  model: "mistralai/Mistral-7B-Instruct-v0.2"
  dataset_subset: "easy"
  lora: false
  training_args:
    output_dir: "/tmp/llm_finetuning"
    max_steps: 5000
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 4
    bf16: true
    evaluation_strategy: "steps"
    eval_steps: 1000
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "steps"
    save_steps: 5000
    learning_rate: 1e-5
    deepspeed: "ds_configs/ds_config_stage_3.json"
entrypoint: >-
  python -m determined.launch.deepspeed
  python finetune.py
max_restarts: 0
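
For context, a minimal sketch (not the actual `finetune.py`) of how the `hyperparameters` block above typically reaches Hugging Face `TrainingArguments` inside a Determined trial, assuming the standard Determined + Trainer pattern:

```python
# Illustrative sketch only; runs inside a Determined trial, not standalone.
import determined as det
from transformers import TrainingArguments

info = det.get_cluster_info()   # populated when the experiment runs on a Determined cluster
hparams = info.trial.hparams    # the "hyperparameters" section of deepspeed.yaml / lora.yaml

training_args = TrainingArguments(**hparams["training_args"])
print(training_args.deepspeed)  # "ds_configs/ds_config_stage_3.json"
```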
@@ -0,0 +1,48 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "flops_profiler": {
        "enabled": true,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}
@@ -0,0 +1,48 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "flops_profiler": {
        "enabled": true,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}
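
The `"auto"` entries in these configs are placeholders: when the config file is handed to the Hugging Face Trainer via `TrainingArguments(deepspeed=...)`, they are filled in from the corresponding training arguments at runtime. A hypothetical illustration (the stage-2 filename is an assumption, since file names are not shown in this view):

```python
# Hypothetical illustration; the config filename is assumed.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/tmp/llm_finetuning",
    per_device_train_batch_size=2,
    learning_rate=1e-5,
    bf16=True,
    deepspeed="ds_configs/ds_config_stage_2.json",  # "auto" lr, batch size, etc. come from these args
)
```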