Nemotron Recipe #11118

Merged
merged 10 commits on Nov 4, 2024
4 changes: 2 additions & 2 deletions nemo/collections/llm/__init__.py
@@ -85,8 +85,8 @@
MixtralModel,
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
@@ -138,8 +138,8 @@
"NemotronModel",
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron3Config22B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron4Config340B",
"NemotronConfig",
"SSMConfig",
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -73,8 +73,8 @@
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
@@ -137,7 +137,7 @@
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron3Config22B",
"Nemotron4Config340B",
"NemotronModel",
"CodeLlamaConfig7B",
20 changes: 11 additions & 9 deletions nemo/collections/llm/gpt/model/nemotron.py
@@ -50,6 +50,7 @@ class NemotronConfig(GPTConfig):
persist_layer_norm: bool = True
bias_dropout_add_fusion: bool = False
layernorm_zero_centered_gamma: bool = True
cross_entropy_loss_fusion: bool = True

# Nemotron3Config4B as default configs
num_layers: int = 32
@@ -87,27 +88,27 @@ class Nemotron3Config8B(NemotronConfig):


@dataclass
class Nemotron4Config15B(NemotronConfig):
num_layers: int = 32
class Nemotron3Config22B(NemotronConfig):
num_layers: int = 40
seq_length: int = 4096
hidden_size: int = 6144
ffn_hidden_size: int = 24576
num_attention_heads: int = 48
num_query_groups: Optional[int] = 8
num_query_groups: Optional[int] = None
kv_channels: Optional[int] = None
init_method_std: float = 0.0134
init_method_std: float = 0.008


@dataclass
class Nemotron4Config22B(NemotronConfig):
num_layers: int = 40
class Nemotron4Config15B(NemotronConfig):
num_layers: int = 32
seq_length: int = 4096
hidden_size: int = 6144
ffn_hidden_size: int = 24576
num_attention_heads: int = 48
num_query_groups: Optional[int] = None
num_query_groups: Optional[int] = 8
kv_channels: Optional[int] = None
init_method_std: float = 0.008
init_method_std: float = 0.0134


@dataclass
@@ -141,6 +142,7 @@ def init(self) -> NemotronModel:
def apply(self, output_path: Path) -> Path:
from transformers import NemotronForCausalLM

print('Start converting Nemotron model..')
source = NemotronForCausalLM.from_pretrained(str(self), torch_dtype='auto')
target = self.init()
trainer = self.nemo_setup(target)
@@ -357,8 +359,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv):
"NemotronConfig",
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron3Config22B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron4Config340B",
"NemotronModel",
]
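
For quick reference (not part of the diff): the renamed `Nemotron3Config22B` keeps the 6144 hidden size but uses 40 layers and the 0.008 init scale, while `Nemotron4Config15B` keeps 32 layers, the 0.0134 init scale, and 8 query groups. A minimal sketch of inspecting both, assuming NeMo and its Megatron-Core dependency are installed and using the export path updated in `nemo/collections/llm/__init__.py` above:

```python
# Sketch: compare the renamed 22B config with the 15B config.
# Values mirror the dataclass defaults shown in nemo/collections/llm/gpt/model/nemotron.py.
from nemo.collections.llm import Nemotron3Config22B, Nemotron4Config15B

cfg_22b = Nemotron3Config22B()
cfg_15b = Nemotron4Config15B()

print(cfg_22b.num_layers, cfg_22b.hidden_size, cfg_22b.init_method_std)  # 40 6144 0.008
print(cfg_15b.num_layers, cfg_15b.hidden_size, cfg_15b.init_method_std)  # 32 6144 0.0134
print(cfg_15b.num_query_groups)  # 8 grouped-query attention groups; the 22B config leaves this as None
```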
12 changes: 6 additions & 6 deletions nemo/collections/llm/recipes/__init__.py
@@ -47,12 +47,12 @@
nemotron,
nemotron3_4b,
nemotron3_8b,
nemotron3_22b,
nemotron3_22b_16k,
nemotron3_22b_64k,
nemotron4_15b,
nemotron4_15b_16k,
nemotron4_15b_64k,
nemotron4_22b,
nemotron4_22b_16k,
nemotron4_22b_64k,
nemotron4_340b,
qwen2,
qwen2_1p5b,
@@ -100,12 +100,12 @@
"nemotron",
"nemotron3_4b",
"nemotron3_8b",
"nemotron3_22b",
"nemotron3_22b_16k",
"nemotron3_22b_64k",
"nemotron4_15b",
"nemotron4_15b_16k",
"nemotron4_15b_64k",
"nemotron4_22b",
"nemotron4_22b_16k",
"nemotron4_22b_64k",
"nemotron4_340b",
"t5_220m",
"t5_3b",
18 changes: 9 additions & 9 deletions nemo/collections/llm/recipes/nemotron.py
@@ -24,8 +24,8 @@
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronModel,
)
@@ -37,9 +37,9 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
A function to create a Nemotron models.

Args:
version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",
version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",\
"nemotron3_22b", "nemotron3_22b_16k", "nemotron3_22b_64k",
"nemotron4_15b", "nemotron4_15b_16k", "nemotron4_15b_64k",
"nemotron4_22b", "nemotron4_22b_16k", "nemotron4_22b_64k",
"nemotron4_340b"].

Returns:
@@ -50,18 +50,18 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
config = run.Config(Nemotron3Config4B)
elif version == "nemotron3_8b":
config = run.Config(Nemotron3Config8B)
elif version == "nemotron3_22b":
config = run.Config(Nemotron3Config22B)
elif version == "nemotron3_22b_16k":
config = run.Config(Nemotron3Config22B, seq_length=16384)
elif version == "nemotron3_22b_64k":
config = run.Config(Nemotron3Config22B, seq_length=65536)
elif version == "nemotron4_15b":
config = run.Config(Nemotron4Config15B)
elif version == "nemotron4_15b_16k":
config = run.Config(Nemotron4Config15B, seq_length=16384)
elif version == "nemotron4_15b_64k":
config = run.Config(Nemotron4Config15B, seq_length=65536)
elif version == "nemotron4_22b":
config = run.Config(Nemotron4Config22B)
elif version == "nemotron4_22b_16k":
config = run.Config(Nemotron4Config22B, seq_length=16384)
elif version == "nemotron4_22b_64k":
config = run.Config(Nemotron4Config22B, seq_length=65536)
elif version == "nemotron4_340b":
config = run.Config(Nemotron4Config340B)

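Illustrative sketch (not part of the diff) of how the renamed version strings resolve in the dispatcher above, assuming NeMo and `nemo_run` are installed:

```python
# Sketch: the renamed version strings accepted by nemotron_model() after this PR.
from nemo.collections.llm.recipes.nemotron import nemotron_model

model_22b = nemotron_model("nemotron3_22b")          # selects Nemotron3Config22B defaults
model_22b_16k = nemotron_model("nemotron3_22b_16k")  # Nemotron3Config22B with seq_length=16384
model_15b = nemotron_model("nemotron4_15b")          # Nemotron4Config15B defaults (unchanged name)
```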
Original file line number Diff line number Diff line change
@@ -18,28 +18,30 @@
import pytorch_lightning as pl
import torch

from nemo.collections.llm.api import pretrain
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b"
NAME = "nemotron3_22b"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model configuration.
Factory function to create a Nemotron3 22B model configuration.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b ...
$ nemo llm pretrain model=nemotron3_22b ...

Python API usage:
>>> model_config = model()
@@ -85,7 +87,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model.
Create a pre-training recipe for Nemotron3 22B model.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -124,8 +126,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b
$ nemo llm pretrain --factory "nemotron4_22b(num_nodes=1, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b
$ nemo llm pretrain --factory "nemotron3_22b(num_nodes=1, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
@@ -181,7 +183,7 @@ def pretrain_recipe(

def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Nemotron4 22B model.
Create a performance-optimized pre-training recipe for Nemotron3 22B model.

This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
@@ -214,3 +216,61 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
)
)
return recipe


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
packed_sequence: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Nemotron3 22B model.

This function sets up a complete configuration for fine-tuning, including
model, trainer, data, logging, optimization, and resumption settings.
The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.

Returns:
run.Partial: Partial configuration for fine-tuning.

Examples:
CLI usage:
$ nemo llm finetune --factory nemotron3_22b

Python API usage:
>>> recipe = finetune_recipe(name="nemotron3_22b_finetune", num_nodes=8)
>>> print(recipe)

Note:
This recipe uses the SQuAD dataset for fine-tuning. For more information
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""

recipe = default_finetune_recipe(
model(), "thhaus/nemotron3-22b-hf", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
recipe.optim.config.lr = 1e-4
else:
raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

# some settings currently do not function correctly with finetuning
recipe.model.config.cross_entropy_loss_fusion = False
return recipe
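
A hedged usage sketch for the new fine-tuning factory (not part of the diff; the run name and local executor are illustrative, and `nemo_run` is assumed to be installed):

```python
# Sketch: build and launch the new Nemotron3 22B LoRA fine-tuning recipe.
import nemo_run as run

from nemo.collections.llm.recipes import nemotron3_22b

recipe = nemotron3_22b.finetune_recipe(
    name="nemotron3_22b_lora",   # illustrative run name
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="lora",          # pass None/'none' for full fine-tuning (TP=8, lr=5e-6 per the recipe above)
)
run.run(recipe, executor=run.LocalExecutor())
```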
Original file line number Diff line number Diff line change
@@ -25,20 +25,20 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b_16k"
NAME = "nemotron3_22b_16k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model with 16k sequence length.
Factory function to create a Nemotron3 22B model with 16k sequence length.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 16k sequence length model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 16k sequence length model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b_16k ...
$ nemo llm pretrain model=nemotron3_22b_16k ...

Python API usage:
>>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b_16k
$ nemo llm pretrain --factory "nemotron4_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b_16k
$ nemo llm pretrain --factory "nemotron3_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
Original file line number Diff line number Diff line change
@@ -25,20 +25,20 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b_64k"
NAME = "nemotron3_22b_64k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model with 64k sequence length.
Factory function to create a Nemotron3 22B model with 64k sequence length.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 64k sequence length model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 64k sequence length model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b_64k ...
$ nemo llm pretrain model=nemotron3_22b_64k ...

Python API usage:
>>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b_64k
$ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b_64k
$ nemo llm pretrain --factory "nemotron3_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2)
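A matching pretraining sketch for the renamed long-context variant, mirroring the docstring examples above (not part of the diff; node count and executor are illustrative):

```python
# Sketch: pretrain the 64k-sequence-length variant via the Python API.
import nemo_run as run

from nemo.collections.llm.recipes import nemotron3_22b_64k

recipe = nemotron3_22b_64k.pretrain_recipe(name="nemotron_pretrain", num_nodes=2)
run.run(recipe, executor=run.LocalExecutor())
```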