Add lr scheduler, weight decay and max_grad_norm #214

Merged · 12 commits · Nov 19, 2024
24 changes: 24 additions & 0 deletions src/together/cli/api/finetune.py
@@ -65,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="Final learning rate ratio of the initial learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=1.0,
help="Max gradient norm. Set to 0 to disable.",
)
@click.option(
"--weight-decay",
type=float,
default=0.0,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -115,7 +133,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -138,7 +159,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
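The four new CLI options above map one-to-one onto keyword arguments of the `create` command. A minimal sketch of how that wiring could be checked, assuming `create` is registered as a click command under the module path shown in this diff (the check itself is illustrative, not part of the PR):

from together.cli.api.finetune import create  # module path as shown in this diff

# click exposes declared options via Command.params; option names are derived
# from the flag names (e.g. "--min-lr-ratio" becomes "min_lr_ratio").
option_names = {param.name for param in create.params}
assert {"min_lr_ratio", "warmup_ratio", "max_grad_norm", "weight_decay"} <= option_names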
42 changes: 42 additions & 0 deletions src/together/resources/finetune.py
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@ def createFinetuneRequest(
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Min learning rate ratio should be between 0 and 1")

if max_grad_norm < 0:
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -92,7 +111,10 @@
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0. Set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0. Set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
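Taken together, the changes in this file validate the new hyperparameters, wrap min_lr_ratio into a linear FinetuneLRScheduler, and forward everything to the API. A hedged usage sketch of the public client, assuming the Together client exposes this resource as `fine_tuning`; the model name, file ID, and hyperparameter values below are placeholders, not values taken from this PR:

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",         # placeholder file ID
    model="meta-llama/Meta-Llama-3.1-8B",  # placeholder model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR is 10% of the initial LR under the linear scheduler
    warmup_ratio=0.03,   # fraction of training spent warming the LR up
    max_grad_norm=1.0,   # gradient clipping threshold; 0 disables clipping
    weight_decay=0.01,
)
print(job.id)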
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
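With these re-exports, the scheduler configuration can be built from the package's public types module. A small sketch, assuming the models are pydantic v2 (hence model_dump) like the rest of the SDK's types:

from together.types import FinetuneLinearLRSchedulerArgs, FinetuneLRScheduler

scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
print(scheduler.model_dump())
# -> {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}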
21 changes: 21 additions & 0 deletions src/together/types/finetune.py
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of checkpoints to save
n_checkpoints: int | None = None
# number of evaluation loops to run
@@ -193,8 +199,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -287,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float
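The scheduler itself runs on the training backend, so these types only carry configuration. As a rough illustration of what a linear schedule with warmup_ratio and min_lr_ratio conventionally means (an assumption about the backend's behaviour, not something this PR specifies):

def linear_lr(
    step: int,
    total_steps: int,
    base_lr: float,
    warmup_ratio: float = 0.0,
    min_lr_ratio: float = 0.0,
) -> float:
    """Linear warmup followed by linear decay from base_lr down to min_lr_ratio * base_lr."""
    warmup_steps = int(total_steps * warmup_ratio)
    if warmup_steps > 0 and step < warmup_steps:
        # Ramp the LR up linearly during the warmup phase.
        return base_lr * (step + 1) / warmup_steps
    # Decay linearly from base_lr to the floor defined by min_lr_ratio.
    decay_steps = max(total_steps - warmup_steps, 1)
    progress = min((step - warmup_steps) / decay_steps, 1.0)
    min_lr = base_lr * min_lr_ratio
    return base_lr - (base_lr - min_lr) * progress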