feat: lora blog #29

Merged 5 commits on Sep 11, 2024
1 change: 1 addition & 0 deletions README.md
@@ -15,6 +15,7 @@ This repository contains a variety of Determined examples that are not actively
| [LLM Finetuning](blog/llm-finetuning) | Finetuning TinyLlama-1.1B on Text-to-SQL. |
| [LLM Finetuning 2](blog/llm-finetuning-2) | Finetuning Mistral-7B on Text-to-SQL using LoRA and DeepSpeed. |
| [LLM Finetuning 3](blog/llm-finetuning-3) | Finetuning Gemma-2B using DPO. |
| [LoRA Parameters](blog/lora-parameters) | Finding the best LoRA parameters. |
| [Python SDK demo](blog/python_sdk_demo) | Example usage of the Determined Python SDK to run and administer experiments. |
| [Tensor Parallelism](blog/tp) | Profiling tensor parallelism in PyTorch. |

2 changes: 2 additions & 0 deletions blog/lora-parameters/.detignore
@@ -0,0 +1,2 @@
text-to-sql*
checkpoints
5 changes: 5 additions & 0 deletions blog/lora-parameters/.gitignore
@@ -0,0 +1,5 @@
__pycache__
.DS_STORE
text-to-sql*
checkpoints
*.png
34 changes: 34 additions & 0 deletions blog/lora-parameters/README.md
@@ -0,0 +1,34 @@
# Finding the best LoRA parameters

We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) with [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed), running on two 40 GB A100 GPUs.
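LoRA freezes the base model and trains small low-rank adapter matrices, and the blog sweeps over the adapter's hyperparameters. As a point of reference only, here is a minimal sketch of how such an adapter is typically attached with Hugging Face `peft`; the values of `r`, `lora_alpha`, `lora_dropout`, and `target_modules` below are illustrative placeholders, not the settings used in our experiments:

```python
# Illustrative sketch (placeholder values, not the blog's settings):
# attach a LoRA adapter to the frozen base model with Hugging Face peft.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
lora_config = LoraConfig(
    r=8,                                  # rank of the low-rank update matrices
    lora_alpha=16,                        # scaling applied to the LoRA update
    lora_dropout=0.1,                     # dropout applied in the LoRA layers
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable
```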

See our [blog post](https://www.determined.ai/blog/lora-parameters) for the experiment results.

To get started, first install Determined on your local machine:
```bash
pip install determined
```

Then finetune with LoRA:
```bash
det e create lora.yaml .
```

You can view the actual training code in `finetune.py`.


## Configuration

Change configuration options in `lora.yaml`. Some important options are:
- `slots_per_trial`: the number of GPUs to use.
- `dataset_subset`: the difficulty subset to train on.
- `per_device_train_batch_size`: the batch size per GPU.


DeepSpeed configuration files are in the `ds_configs` folder.
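As a rough sketch of how such a file is usually consumed (the actual wiring lives in `finetune.py` and may differ), a DeepSpeed JSON config is commonly passed to the Hugging Face `Trainer` through `TrainingArguments`, which fills in the `"auto"` fields:

```python
# Sketch only: how a DeepSpeed config like ds_configs/ds_config_stage_3.json is
# typically handed to the Hugging Face Trainer; the repo's finetune.py may differ.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="checkpoints",
    per_device_train_batch_size=1,                  # per-GPU micro-batch size
    deepspeed="ds_configs/ds_config_stage_3.json",  # "auto" fields are resolved from these args
    bf16=True,                                      # matches the "bf16" block in the JSON
)
# trainer = Trainer(model=model, args=training_args, train_dataset=..., ...)
```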


## Contributors

- By [Sze Wai Yuen](https://github.com/szewaiyuen6)
- Built on `llm-finetuning` code by [Agnieszka Ciborowska](https://github.com/aciborowska) and [Kevin Musgrave](https://github.com/KevinMusgrave).
67 changes: 67 additions & 0 deletions blog/lora-parameters/chat_format.py
@@ -0,0 +1,67 @@
CHAT_ML_TEMPLATE = """
{% for message in messages %}
{% if message['role'] == 'user' %}
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'system' %}
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
{% elif message['role'] == 'assistant' %}
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
{% endif %}
{% endfor %}
"""


CHAT_ML_EOS_TOKEN = "<|im_end|>"


def get_chat_format(element, model_name, with_assistant_response=True):
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )

    user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "

    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        # Mistral-7B-Instruct's chat template has no system role, so the system
        # prompt is folded into the user turn instead.
        user_prompt = f"{system_prompt}\n{user_prompt}"
        output = [
            {"role": "user", "content": user_prompt.format_map(element)},
        ]
    else:
        output = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format_map(element)},
        ]

    if with_assistant_response:
        output.append({"role": "assistant", "content": element["response"]})

    return output


def set_special_tokens(tokenizer, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id


def get_assistant_prompt(model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return "<|im_start|>assistant\n"
    else:
        return "[/INST]"


def get_response_template_ids(tokenizer, model_name):
    return tokenizer.encode(get_assistant_prompt(model_name), add_special_tokens=False)


def maybe_add_generation_prompt(x, model_name):
    if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x + get_assistant_prompt(model_name)
    else:
        return x
69 changes: 69 additions & 0 deletions blog/lora-parameters/dataset_utils.py
@@ -0,0 +1,69 @@
import datasets
import pandas as pd


def add_length_column(dataset) -> pd.DataFrame:
    df = dataset.to_pandas()
    df["total_length"] = 0
    for column_name in ["instruction", "input", "response"]:
        num_words = df[column_name].astype(str).str.split().apply(len)
        df["total_length"] += num_words

    return df


def filter_by_total_length(df, difficulty, number_of_samples):
    if difficulty == "easy":
        return df[df["total_length"].between(10, 100)].iloc[:number_of_samples]
    elif difficulty == "medium":
        return df[df["total_length"].between(101, 200)].iloc[:number_of_samples]
    elif difficulty == "hard":
        return df[df["total_length"].between(201, 800)].iloc[:number_of_samples]


def get_dataset_subset_name(difficulty: str) -> str:
    return f"text-to-sql-v1-{difficulty}"


def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    # remove total_length column because we don't need it anymore
    df = df.drop(columns=["total_length"])
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    dataset_name = get_dataset_subset_name(difficulty)
    dataset.save_to_disk(dataset_name)
    return dataset


def load_dataset(difficulty):
    return datasets.load_from_disk(get_dataset_subset_name(difficulty))


def load_or_create_dataset(difficulty, num_samples=10000):
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        dataset = datasets.load_dataset("Clinton/Text-to-sql-v1")
        dataset = dataset["train"]
        dataset = dataset.remove_columns(["text", "source"])
        df = add_length_column(dataset)
        df = filter_by_total_length(df, difficulty, num_samples)
        return create_and_save_datasets(df, difficulty)
47 changes: 47 additions & 0 deletions blog/lora-parameters/ds_configs/ds_config_stage_3.json
@@ -0,0 +1,47 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}