Commit

do not land: local setup
Summary:

Test Plan:
szewaiyuen6 committed Aug 22, 2024
1 parent 29e7250 commit c7012e1
Showing 6 changed files with 223 additions and 55 deletions.
2 changes: 1 addition & 1 deletion blog/llm-finetuning-4/README.md
@@ -1,6 +1,6 @@
# Finetuning Mistral-7B using LoRA and DeepSpeed

In this demo, we finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 80 GB A100 GPUs, and DeepSpeed on two, four, and eight 80 GB A100 GPUs.
In this demo, we finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 40 GB A100 GPUs with DeepSpeed.

To get started, first install Determined on your local machine:
```bash
pip install determined
```
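
For orientation alongside the README, the LoRA adapter that `finetune.py` (diffed below) attaches to the base model looks roughly like this. This is a minimal sketch, not part of the commit: the hyperparameter names (`r`, `lora_alpha`, `lora_dropout`, `use_rslora`) come from the diff, while the concrete values are illustrative.

```python
# Minimal sketch of the adapter setup in finetune.py; values are illustrative.
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.bfloat16,
)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,               # hparams["r"]
    lora_alpha=16,     # hparams["lora_alpha"]
    lora_dropout=0.1,  # hparams["lora_dropout"]
    use_rslora=False,  # hparams["use_rslora"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```
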
36 changes: 0 additions & 36 deletions blog/llm-finetuning-4/deepspeed.yaml

This file was deleted.

48 changes: 48 additions & 0 deletions blog/llm-finetuning-4/ds_configs/ds_config_stage_3_adam.json
@@ -0,0 +1,48 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "Adam",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
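
This config leaves the optimizer to DeepSpeed's Adam; the `"auto"` fields are resolved by the Hugging Face Trainer from `TrainingArguments`. Below is a minimal sketch of how the diff in `finetune.py` switches between this file and the existing stage-3 config; the `hparams` dict here is only a stand-in for the Determined trial hyperparameters.

```python
# Sketch of the config selection added in finetune.py (see the diff below);
# `hparams` is a stand-in for the Determined trial hyperparameters.
hparams = {
    "use_adam": True,
    "training_args": {"deepspeed": True, "learning_rate": 1e-4},
}

if hparams["training_args"]["deepspeed"]:
    if hparams["use_adam"]:
        # DeepSpeed-managed Adam (this file).
        hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3_adam.json"
    else:
        # Existing stage-3 config.
        hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3.json"

print(hparams["training_args"]["deepspeed"])
```
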

156 changes: 148 additions & 8 deletions blog/llm-finetuning-4/finetune.py
@@ -1,15 +1,22 @@
import functools
import logging
import os
import random
import sys
from itertools import chain
from typing import Dict

import datasets
import determined as det
import evaluate
import numpy as np
import torch
import transformers
import wandb

from determined.transformers import DetCallback
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from trl import DataCollatorForCompletionOnlyLM

from chat_format import get_chat_format, get_response_template_ids, set_special_tokens
@@ -18,17 +25,31 @@
logger = logging.getLogger(__name__)


def get_tokenizer(model_name, model_commit_hash):
def get_tokenizer(model_name, model_commit_hash, hparams):
tokenizer = AutoTokenizer.from_pretrained(
model_name,
padding_side="right",
truncation_side="right",
revision=model_commit_hash,
token=hparams["hf_token"],
)
set_special_tokens(tokenizer, model_name)
return tokenizer


def standardize_lora_init(lora_layer, alpha: int):
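# Rescale the initial lora_A weights of q_proj and v_proj by 1/alpha. PEFT multiplies
# the LoRA update by lora_alpha / r (or lora_alpha / sqrt(r) with rsLoRA), so this keeps
# the effective starting magnitude comparable across alpha settings (inferred intent).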
self_attn = lora_layer.self_attn
q_proj = self_attn.q_proj.lora_A.default
v_proj = self_attn.v_proj.lora_A.default
with torch.no_grad():
sd_q = q_proj.state_dict()
sd_q['weight'] = sd_q['weight'] / alpha
q_proj.load_state_dict(sd_q)
sd_v = v_proj.state_dict()
sd_v['weight'] = sd_v['weight'] / alpha
v_proj.load_state_dict(sd_v)


def get_model_and_tokenizer(model_name, use_lora, hparams, inference=False, device_map="auto", model_commit_hash=None):
if inference:
if use_lora:
@@ -47,22 +68,55 @@ def get_model_and_tokenizer(model_name, use_lora, hparams, inference=False, device_map="auto", model_commit_hash=None):
model_name,
torch_dtype=torch.bfloat16,
revision=model_commit_hash,
token=hparams["hf_token"],
)
model.enable_input_require_grads()

if use_lora:
r = hparams["r"]
lora_alpha = r * hparams["lora_alpha_in_r"]
lora_alpha = hparams["lora_alpha"]
peft_config = LoraConfig(
task_type="CAUSAL_LM",
inference_mode=False,
r=r,
lora_alpha=lora_alpha,
lora_dropout=hparams["lora_dropout"],
use_rslora=hparams["use_rslora"]
)

model = get_peft_model(model, peft_config)

tokenizer = get_tokenizer(model_name, model_commit_hash=model_commit_hash)
lora_a = model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default
print("LoRA a at initialization, before rescaling, layer 0, q_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[31].self_attn.q_proj.lora_A.default
print("LoRA a at initialization, before rescaling, layer 31, q_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[0].self_attn.v_proj.lora_A.default
print("LoRA a at initialization, before rescaling, layer 0, v_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[31].self_attn.v_proj.lora_A.default
print("LoRA a at initialization, before rescaling, layer 31, v_proj:")
print(lora_a.state_dict())

if hparams["custom_scale_init"]:
for l in model.base_model.model.model.layers:
standardize_lora_init(l, lora_alpha)

lora_a = model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default
print("LoRA a at initialization, after rescaling, layer 0, q_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[31].self_attn.q_proj.lora_A.default
print("LoRA a at initialization, after rescaling, layer 31, q_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[0].self_attn.v_proj.lora_A.default
print("LoRA a at initialization, after rescaling, layer 0, v_proj:")
print(lora_a.state_dict())
lora_a = model.base_model.model.model.layers[31].self_attn.v_proj.lora_A.default
print("LoRA a at initialization, after rescaling, layer 31, v_proj:")
print(lora_a.state_dict())

tokenizer = get_tokenizer(model_name, model_commit_hash=model_commit_hash, hparams=hparams)
return model, tokenizer


@@ -73,6 +127,23 @@ def fn(formatted):
return fn


def group_texts(examples, block_size) -> Dict:
# Concatenate all texts.
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder; we could add padding instead if the model supported it.
# You can customize this part to your needs.
if total_length >= block_size:
total_length = (total_length // block_size) * block_size
# Split by chunks of max_len.
result = {
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy()
return result
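
For intuition, here is a toy run of `group_texts` (assuming the function above is in scope; the token IDs are made up): it concatenates the already-tokenized examples, drops the remainder, re-cuts them into `block_size` chunks, and copies `input_ids` into `labels`.

```python
# Toy illustration of group_texts; token IDs are invented.
examples = {
    "input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
    "attention_mask": [[1, 1, 1], [1, 1], [1, 1, 1, 1]],
}
print(group_texts(examples, block_size=4))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]],
#  'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]],
#  'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}
```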


def preprocess_logits_for_metrics(logits, labels):
if isinstance(logits, tuple):
# Depending on the model and config, logits may contain extra tensors,
@@ -105,10 +176,18 @@ def tokenize(element):
}

dataset = load_or_create_dataset(hparams["dataset_subset"])
block_size = hparams["block_size"]
column_names = list(dataset["train"].features)
for k in dataset.keys():
dataset[k] = dataset[k].map(tokenize, remove_columns=column_names)

if hparams["group_text"]:
with training_args.main_process_first(desc="grouping texts together", local=False):
dataset = dataset.map(
functools.partial(group_texts, block_size=block_size),
batched=True,
desc=f"Grouping texts in chunks of {block_size}",
)

response_template_ids = get_response_template_ids(tokenizer, model_name)
collator = DataCollatorForCompletionOnlyLM(
response_template_ids, tokenizer=tokenizer
@@ -151,6 +230,18 @@ def compute_metrics(eval_preds):

trainer.train()

def set_seed(seed: int = 42) -> None:
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# When running on the CuDNN backend, two further options must be set
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Set a fixed value for the hash seed
os.environ["PYTHONHASHSEED"] = str(seed)
print(f"Random seed set as {seed}")


if __name__ == "__main__":
# Setup logging
@@ -169,12 +260,19 @@ def compute_metrics(eval_preds):
hparams = info.trial.hparams

if "hf_token" in hparams:
print("SWY logged flow triggered")
hf_token = hparams["hf_token"]
print(f"SWY token is {hf_token}")
import huggingface_hub

huggingface_hub.login(token=hparams["hf_token"])

if hparams["training_args"]["deepspeed"]:
hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3.json"
if not hparams["use_adam"]:
hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3.json"
print("swy not using adam")
else:
hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3_adam.json"
print("swy using adam")

training_args = TrainingArguments(**hparams["training_args"])
if training_args.deepspeed:
@@ -186,8 +284,50 @@ def compute_metrics(eval_preds):
distributed = det.core.DistributedContext.from_deepspeed()
else:
distributed = det.core.DistributedContext.from_torch_distributed()


random_seed = 42

with det.core.init(distributed=distributed) as core_context:
if core_context.distributed.rank == 0:
wandb.login(key=hparams["wandb_key"])
import uuid
# Generate a UUID
my_uuid = uuid.uuid4()
# Convert the UUID to a string and keep the first 5 characters
uuid_str = str(my_uuid)[:5]
r = hparams["r"]
lora_alpha = hparams["lora_alpha"]
lora_dropout = hparams["lora_dropout"]
dataset_subset = hparams["dataset_subset"]
lr = str(hparams["training_args"]["learning_rate"])
use_rslora = False
if "use_rslora" in hparams:
use_rslora = hparams["use_rslora"]
optimizer = "adamW"
if "use_adam" in hparams and hparams["use_adam"]:
optimizer = "adam"
run_name = f"test_lora_blog_{dataset_subset}_r_{r}_alpha_{lora_alpha}_dropout_{lora_dropout}_lr_{lr}_seed_{random_seed}_opt_{optimizer}"
if use_rslora:
run_name += "_rslora"
run_name += f"_{uuid_str}"
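# Example resulting name (illustrative values):
# test_lora_blog_easy_r_8_alpha_16_dropout_0.1_lr_0.0001_seed_42_opt_adamW_1a2b3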
run = wandb.init(
project="lora-blog-v3",
name=run_name,
config={
"r":hparams["r"],
"lora_alpha":hparams["lora_alpha"],
"dropout":hparams["lora_dropout"],
"dataset_subset":hparams["dataset_subset"],
"model":hparams["model"],
"lr": lr,
"seed": random_seed,
"optimizer": optimizer,
"use_rslora": use_rslora
}
)

set_seed(random_seed)

det_callback = DetCallback(
core_context,
training_args,