feat: lora blog
Summary:

Test Plan:
szewaiyuen6 committed Aug 27, 2024
1 parent 91f8812 commit 95f6740
Showing 10 changed files with 501 additions and 0 deletions.
2 changes: 2 additions & 0 deletions blog/llm-finetuning-4/.detignore
@@ -0,0 +1,2 @@
text-to-sql*
checkpoints
5 changes: 5 additions & 0 deletions blog/llm-finetuning-4/.gitignore
@@ -0,0 +1,5 @@
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
26 changes: 26 additions & 0 deletions blog/llm-finetuning-4/README.md
@@ -0,0 +1,26 @@
# Finetuning Mistral-7B using LoRA and DeepSpeed

We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) with [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed), running on two 40 GB A100 GPUs.

To get started, first install Determined on your local machine:
```bash
pip install determined
```

Then finetune with LoRA:
```bash
det e create lora.yaml .
```

You can view the actual training code in `finetune.py`.


## Configuration

Change configuration options in `lora.yaml`. Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.


DeepSpeed configuration files are in the `ds_configs` folder.
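
Options such as `dataset_subset` and `per_device_train_batch_size` reach the training code as experiment hyperparameters. Below is a rough sketch of reading them with Determined's Core API, assuming they sit directly under `hyperparameters:` in `lora.yaml`; the actual `finetune.py` may access its configuration differently:
```python
# Illustrative sketch (not part of the commit): reading lora.yaml hyperparameters
# from inside the training container via Determined's Core API.
import determined as det

info = det.get_cluster_info()
hparams = info.trial.hparams  # hyperparameters dict from lora.yaml

dataset_subset = hparams["dataset_subset"]  # e.g. "easy", "medium", or "hard"
per_device_train_batch_size = hparams["per_device_train_batch_size"]
```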
67 changes: 67 additions & 0 deletions blog/llm-finetuning-4/chat_format.py
@@ -0,0 +1,67 @@
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""


CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )

    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        # Mistral-7B-Instruct's chat template has no system role, so fold the
        # system prompt into the user turn.
        user_prompt = f"{system_prompt}\n{user_prompt}"
        output = [
            {"role": "user", "content": user_prompt.format_map(element)},
        ]
    else:
        output = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
        ]

    if with_assistant_response:
        output.append({"role": "assistant", "content": element["response"]})

    return output


def set_special_tokens(tokenizer, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return "<|im_start|>assistant\n"
    else:
        return "[/INST]"


def get_response_template_ids(tokenizer, model_name):
    return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x + get_assistant_prompt(model_name)
    else:
        return x
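
As a quick illustration (not part of the commit), here is how these helpers might be wired to a Hugging Face tokenizer at inference time, assuming the file is importable as `chat_format`; the sample element is made up:
```python
from transformers import AutoTokenizer

from chat_format import get_chat_format, maybe_add_generation_prompt, set_special_tokens

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
set_special_tokens(tokenizer, model_name)

# A made-up example in the dataset's instruction/input/response format.
element = {
    "instruction": "Count the registered users.",
    "input": "CREATE TABLE users (id INT, name TEXT)",
    "response": "SELECT COUNT(*) FROM users",
}

# Build the messages without the assistant turn, render them with the chat
# template, then add the generation prompt if the model's template needs it.
messages = get_chat_format(element, model_name, with_assistant_response=False)
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
prompt = maybe_add_generation_prompt(prompt, model_name)
print(prompt)
```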
69 changes: 69 additions & 0 deletions blog/llm-finetuning-4/dataset_utils.py
@@ -0,0 +1,69 @@
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
    df = dataset.to_pandas()
    df["total_length"] = 0
    for column_name in ["instruction", "input", "response"]:
        num_words = df[column_name].astype(str).str.split().apply(len)
        df["total_length"] += num_words

    return df


def filter_by_total_length(df, difficulty, number_of_samples):
    if difficulty == "easy":
        return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
    elif difficulty == "medium":
        return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
    elif difficulty == "hard":
        return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
    return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    # remove total_length column because we don't need it anymore
    df = df.drop(columns=["total_length"])
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    dataset_name = get_dataset_subset_name(difficulty)
    dataset.save_to_disk(dataset_name)
    return dataset


def load_dataset(difficulty):
    return datasets.load_from_disk(get_dataset_subset_name(difficulty))


def load_or_create_dataset(difficulty, num_samples=10000):
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
        dataset = dataset["train"]
        dataset = dataset.remove_columns(["text", "source"])
        df = add_length_column(dataset)
        df = filter_by_total_length(df, difficulty, num_samples)
        return create_and_save_datasets(df, difficulty)
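
For reference (not part of the commit), a minimal usage sketch, assuming the file is importable as `dataset_utils`:
```python
from dataset_utils import load_or_create_dataset

# Builds the text-to-sql-v1-easy subset from Clinton/Text-to-sql-v1 on the
# first run, then reloads it from disk on subsequent runs.
dataset = load_or_create_dataset("easy")
print(dataset)              # DatasetDict with "train", "valid", and "test" splits
print(dataset["train"][0])  # one example with instruction, input, and response
```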
47 changes: 47 additions & 0 deletions blog/llm-finetuning-4/ds_configs/ds_config_stage_3.json
@@ -0,0 +1,47 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
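
The `"auto"` entries follow the convention of Hugging Face's DeepSpeed integration: they are filled in from `TrainingArguments` at runtime. A rough sketch of how such a config is typically hooked up (the actual `finetune.py` in this commit may differ):
```python
from transformers import TrainingArguments

# Values here are illustrative; the HF DeepSpeed integration copies them into
# the "auto" fields of the config above.
training_args = TrainingArguments(
    output_dir="checkpoints",
    per_device_train_batch_size=2,   # fills train_micro_batch_size_per_gpu
    gradient_accumulation_steps=4,   # fills gradient_accumulation_steps
    learning_rate=1e-4,              # fills the optimizer's "lr"
    bf16=True,                       # resolves the bf16 "enabled": "auto" flag
    deepspeed="ds_configs/ds_config_stage_3.json",
)
```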