Nemotron Recipe #11118

Merged
merged 10 commits on Nov 4, 2024
4 changes: 2 additions & 2 deletions nemo/collections/llm/__init__.py
@@ -85,8 +85,8 @@
MixtralModel,
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
@@ -138,8 +138,8 @@
"NemotronModel",
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron3Config22B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron4Config340B",
"NemotronConfig",
"SSMConfig",
4 changes: 2 additions & 2 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -73,8 +73,8 @@
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
@@ -137,7 +137,7 @@
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron3Config22B",
"Nemotron4Config340B",
"NemotronModel",
"CodeLlamaConfig7B",
20 changes: 11 additions & 9 deletions nemo/collections/llm/gpt/model/nemotron.py
@@ -50,6 +50,7 @@ class NemotronConfig(GPTConfig):
persist_layer_norm: bool = True
bias_dropout_add_fusion: bool = False
layernorm_zero_centered_gamma: bool = True
cross_entropy_loss_fusion: bool = True

# Nemotron3Config4B as default configs
num_layers: int = 32
@@ -87,27 +88,27 @@ class Nemotron3Config8B(NemotronConfig):


@dataclass
class Nemotron4Config15B(NemotronConfig):
num_layers: int = 32
class Nemotron3Config22B(NemotronConfig):
num_layers: int = 40
seq_length: int = 4096
hidden_size: int = 6144
ffn_hidden_size: int = 24576
num_attention_heads: int = 48
num_query_groups: Optional[int] = 8
num_query_groups: Optional[int] = None
kv_channels: Optional[int] = None
init_method_std: float = 0.0134
init_method_std: float = 0.008


@dataclass
class Nemotron4Config22B(NemotronConfig):
num_layers: int = 40
class Nemotron4Config15B(NemotronConfig):
num_layers: int = 32
seq_length: int = 4096
hidden_size: int = 6144
ffn_hidden_size: int = 24576
num_attention_heads: int = 48
num_query_groups: Optional[int] = None
num_query_groups: Optional[int] = 8
kv_channels: Optional[int] = None
init_method_std: float = 0.008
init_method_std: float = 0.0134


@dataclass
@@ -141,6 +142,7 @@ def init(self) -> NemotronModel:
def apply(self, output_path: Path) -> Path:
from transformers import NemotronForCausalLM

print('Start converting Nemotron model..')
source = NemotronForCausalLM.from_pretrained(str(self), torch_dtype='auto')
target = self.init()
trainer = self.nemo_setup(target)
@@ -357,8 +359,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv):
"NemotronConfig",
"Nemotron3Config4B",
"Nemotron3Config8B",
"Nemotron3Config22B",
"Nemotron4Config15B",
"Nemotron4Config22B",
"Nemotron4Config340B",
"NemotronModel",
]
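
For quick reference (not part of the diff): the renamed `Nemotron3Config22B` keeps the 6144 hidden size but uses 40 layers and the 0.008 init scale, while `Nemotron4Config15B` keeps 32 layers, the 0.0134 init scale, and 8 query groups. A minimal sketch of inspecting both, assuming NeMo and its Megatron-Core dependency are installed and using the export path updated in `nemo/collections/llm/__init__.py` above:

```python
# Sketch: compare the renamed 22B config with the 15B config.
# Values mirror the dataclass defaults shown in nemo/collections/llm/gpt/model/nemotron.py.
from nemo.collections.llm import Nemotron3Config22B, Nemotron4Config15B

cfg_22b = Nemotron3Config22B()
cfg_15b = Nemotron4Config15B()

print(cfg_22b.num_layers, cfg_22b.hidden_size, cfg_22b.init_method_std)  # 40 6144 0.008
print(cfg_15b.num_layers, cfg_15b.hidden_size, cfg_15b.init_method_std)  # 32 6144 0.0134
print(cfg_15b.num_query_groups)  # 8 grouped-query attention groups; the 22B config leaves this as None
```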
12 changes: 6 additions & 6 deletions nemo/collections/llm/recipes/__init__.py
@@ -47,12 +47,12 @@
nemotron,
nemotron3_4b,
nemotron3_8b,
nemotron3_22b,
nemotron3_22b_16k,
nemotron3_22b_64k,
nemotron4_15b,
nemotron4_15b_16k,
nemotron4_15b_64k,
nemotron4_22b,
nemotron4_22b_16k,
nemotron4_22b_64k,
nemotron4_340b,
qwen2,
qwen2_1p5b,
@@ -100,12 +100,12 @@
"nemotron",
"nemotron3_4b",
"nemotron3_8b",
"nemotron3_22b",
"nemotron3_22b_16k",
"nemotron3_22b_64k",
"nemotron4_15b",
"nemotron4_15b_16k",
"nemotron4_15b_64k",
"nemotron4_22b",
"nemotron4_22b_16k",
"nemotron4_22b_64k",
"nemotron4_340b",
"t5_220m",
"t5_3b",
18 changes: 9 additions & 9 deletions nemo/collections/llm/recipes/nemotron.py
@@ -24,8 +24,8 @@
from nemo.collections.llm.gpt.model.nemotron import (
Nemotron3Config4B,
Nemotron3Config8B,
Nemotron3Config22B,
Nemotron4Config15B,
Nemotron4Config22B,
Nemotron4Config340B,
NemotronModel,
)
@@ -37,9 +37,9 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
A function to create a Nemotron models.

Args:
version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",
version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",\
"nemotron3_22b", "nemotron3_22b_16k", "nemotron3_22b_64k",
"nemotron4_15b", "nemotron4_15b_16k", "nemotron4_15b_64k",
"nemotron4_22b", "nemotron4_22b_16k", "nemotron4_22b_64k",
"nemotron4_340b"].

Returns:
@@ -50,18 +50,18 @@ def nemotron_model(version: str) -> run.Config[pl.LightningModule]:
config = run.Config(Nemotron3Config4B)
elif version == "nemotron3_8b":
config = run.Config(Nemotron3Config8B)
elif version == "nemotron3_22b":
config = run.Config(Nemotron3Config22B)
elif version == "nemotron3_22b_16k":
config = run.Config(Nemotron3Config22B, seq_length=16384)
elif version == "nemotron3_22b_64k":
config = run.Config(Nemotron3Config22B, seq_length=65536)
elif version == "nemotron4_15b":
config = run.Config(Nemotron4Config15B)
elif version == "nemotron4_15b_16k":
config = run.Config(Nemotron4Config15B, seq_length=16384)
elif version == "nemotron4_15b_64k":
config = run.Config(Nemotron4Config15B, seq_length=65536)
elif version == "nemotron4_22b":
config = run.Config(Nemotron4Config22B)
elif version == "nemotron4_22b_16k":
config = run.Config(Nemotron4Config22B, seq_length=16384)
elif version == "nemotron4_22b_64k":
config = run.Config(Nemotron4Config22B, seq_length=65536)
elif version == "nemotron4_340b":
config = run.Config(Nemotron4Config340B)

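Illustrative sketch (not part of the diff) of how the renamed version strings resolve in the dispatcher above, assuming NeMo and `nemo_run` are installed:

```python
# Sketch: the renamed version strings accepted by nemotron_model() after this PR.
from nemo.collections.llm.recipes.nemotron import nemotron_model

model_22b = nemotron_model("nemotron3_22b")          # selects Nemotron3Config22B defaults
model_22b_16k = nemotron_model("nemotron3_22b_16k")  # Nemotron3Config22B with seq_length=16384
model_15b = nemotron_model("nemotron4_15b")          # Nemotron4Config15B defaults (unchanged name)
```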
Original file line number Diff line number Diff line change
@@ -18,28 +18,30 @@
import pytorch_lightning as pl
import torch

from nemo.collections.llm.api import pretrain
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b"
NAME = "nemotron3_22b"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model configuration.
Factory function to create a Nemotron3 22B model configuration.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b ...
$ nemo llm pretrain model=nemotron3_22b ...

Python API usage:
>>> model_config = model()
@@ -85,7 +87,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model.
Create a pre-training recipe for Nemotron3 22B model.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -124,8 +126,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b
$ nemo llm pretrain --factory "nemotron4_22b(num_nodes=1, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b
$ nemo llm pretrain --factory "nemotron3_22b(num_nodes=1, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
@@ -181,7 +183,7 @@ def pretrain_recipe(

def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Nemotron4 22B model.
Create a performance-optimized pre-training recipe for Nemotron3 22B model.

This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
@@ -214,3 +216,61 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
)
)
return recipe


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
packed_sequence: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Nemotron3 22B model.

This function sets up a complete configuration for fine-tuning, including
model, trainer, data, logging, optimization, and resumption settings.
The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training efficiency. Default sequence length is 2048.

Returns:
run.Partial: Partial configuration for fine-tuning.

Examples:
CLI usage:
$ nemo llm finetune --factory nemotron3_22b

Python API usage:
>>> recipe = finetune_recipe(name="nemotron3_22b_finetune", num_nodes=8)
>>> print(recipe)

Note:
This recipe uses the SQuAD dataset for fine-tuning. For more information
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""

recipe = default_finetune_recipe(
model(), "thhaus/nemotron3-22b-hf", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
recipe.optim.config.lr = 1e-4
else:
raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

# some settings currently do not function correctly with finetuning
recipe.model.config.cross_entropy_loss_fusion = False
return recipe
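
A hedged usage sketch for the new fine-tuning factory (not part of the diff; the run name and local executor are illustrative, and `nemo_run` is assumed to be installed):

```python
# Sketch: build and launch the new Nemotron3 22B LoRA fine-tuning recipe.
import nemo_run as run

from nemo.collections.llm.recipes import nemotron3_22b

recipe = nemotron3_22b.finetune_recipe(
    name="nemotron3_22b_lora",   # illustrative run name
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="lora",          # pass None/'none' for full fine-tuning (TP=8, lr=5e-6 per the recipe above)
)
run.run(recipe, executor=run.LocalExecutor())
```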
Original file line number Diff line number Diff line change
@@ -25,20 +25,20 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b_16k"
NAME = "nemotron3_22b_16k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model with 16k sequence length.
Factory function to create a Nemotron3 22B model with 16k sequence length.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 16k sequence length model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 16k sequence length model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b_16k ...
$ nemo llm pretrain model=nemotron3_22b_16k ...

Python API usage:
>>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b_16k
$ nemo llm pretrain --factory "nemotron4_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b_16k
$ nemo llm pretrain --factory "nemotron3_22b_16k(num_nodes=1, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
Original file line number Diff line number Diff line change
@@ -25,20 +25,20 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.utils.exp_manager import TimingCallback

NAME = "nemotron4_22b_64k"
NAME = "nemotron3_22b_64k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
Factory function to create a Nemotron4 22b model with 64k sequence length.
Factory function to create a Nemotron3 22B model with 64k sequence length.

Returns:
run.Config[pl.LightningModule]: Configuration for the Nemotron4 22b and 64k sequence length model.
run.Config[pl.LightningModule]: Configuration for the Nemotron3 22b and 64k sequence length model.

Examples:
CLI usage:
$ nemo llm pretrain model=nemotron4_22b_64k ...
$ nemo llm pretrain model=nemotron3_22b_64k ...

Python API usage:
>>> model_config = model()
@@ -83,7 +83,7 @@ def pretrain_recipe(
fn=pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Nemotron4 22b model with 16k sequence length.
Create a pre-training recipe for Nemotron3 22B model with 16k sequence length.

This function sets up a complete configuration for pre-training, including
model, trainer, data, logging, optimization, and resumption settings.
@@ -121,8 +121,8 @@ def pretrain_recipe(

Examples:
CLI usage:
$ nemo llm pretrain --factory nemotron4_22b_64k
$ nemo llm pretrain --factory "nemotron4_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"
$ nemo llm pretrain --factory nemotron3_22b_64k
$ nemo llm pretrain --factory "nemotron3_22b_64k(num_nodes=2, name='my_nemotron_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=2)
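A matching pretraining sketch for the renamed long-context variant, mirroring the docstring examples above (not part of the diff; node count and executor are illustrative):

```python
# Sketch: pretrain the 64k-sequence-length variant via the Python API.
import nemo_run as run

from nemo.collections.llm.recipes import nemotron3_22b_64k

recipe = nemotron3_22b_64k.pretrain_recipe(name="nemotron_pretrain", num_nodes=2)
run.run(recipe, executor=run.LocalExecutor())
```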