Commit

nemotron perf mode
Signed-off-by: Malay Nagda <[email protected]>
malay-nagda committed Oct 18, 2024
1 parent f8e00d9 commit 406db95
Showing 6 changed files with 48 additions and 109 deletions.
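
Taken together, the six diffs below make one change: the standalone pretrain_recipe_performance factories are removed, and each pretrain_recipe instead takes a performance_mode flag that routes the built recipe through a new pretrain_performance_optimizations helper. A minimal sketch of the resulting Python API, using only the module path and signatures visible in the diffs (the run name and node count are illustrative):

    from nemo.collections.llm.recipes import nemotron3_8b

    # Before this commit: a separately registered factory built the optimized recipe.
    # recipe = nemotron3_8b.pretrain_recipe_performance(name="my_run", num_nodes=4)

    # After this commit: one factory, gated by the performance_mode flag.
    recipe = nemotron3_8b.pretrain_recipe(
        name="my_run",
        num_nodes=4,
        num_gpus_per_node=8,
        performance_mode=True,  # appends MegatronCommOverlapCallback, as shown below
    )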
1 change: 0 additions & 1 deletion nemo/collections/llm/recipes/mixtral_8x22b.py
@@ -208,7 +208,6 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
1 change: 0 additions & 1 deletion nemo/collections/llm/recipes/mixtral_8x7b.py
@@ -205,7 +205,6 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
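
In both Mixtral recipes the deleted line was a leftover from an earlier refactor: pretrain_performance_optimizations already receives the built recipe as its only argument, so the call to pretrain_recipe(name=name, dir=dir, ...) referenced parameters that are no longer in the function's scope and would have failed at runtime. Removing it makes the helper purely additive on the recipe it is given, matching the pattern introduced for the Nemotron recipes below.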
36 changes: 12 additions & 24 deletions nemo/collections/llm/recipes/nemotron3_8b.py
@@ -83,6 +83,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=3.0e-5,
    max_lr=3e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -118,6 +119,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -135,7 +137,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -174,43 +176,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 1,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron3 8B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron3_8b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    recipe.trainer.callbacks.append(
        run.Config(
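
The flag is shorthand for composing the two public pieces; a short sketch of the equivalence implemented by the "if performance_mode:" branch above (run name and node count are illustrative):

    from nemo.collections.llm.recipes.nemotron3_8b import (
        pretrain_performance_optimizations,
        pretrain_recipe,
    )

    # One call with the flag...
    fast = pretrain_recipe(name="my_run", num_nodes=4, performance_mode=True)

    # ...builds the same configuration as composing the pieces by hand:
    base = pretrain_recipe(name="my_run", num_nodes=4)
    also_fast = pretrain_performance_optimizations(base)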
37 changes: 12 additions & 25 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -80,6 +80,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=4.5e-5,
    max_lr=4.5e-5,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -115,6 +116,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -132,7 +134,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -171,44 +173,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 8,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 15B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_15b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
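
With the dedicated factories gone, the CLI names registered as NAME + "_performance" disappear as well; the flag is passed through the remaining factory instead. A hedged sketch of the command line, assuming nemo_run's factory-argument syntax (this commit itself does not show the CLI form):

    $ nemo llm pretrain --factory "nemotron4_15b(performance_mode=True)"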
46 changes: 12 additions & 34 deletions nemo/collections/llm/recipes/nemotron4_22b.py
@@ -80,6 +80,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=1e-5,
    max_lr=1e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -115,6 +116,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -132,7 +134,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -171,57 +173,33 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 8,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 22B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_22b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
    # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
    # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed

    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
            tp_comm_overlap=True,
            defer_embedding_wgrad_compute=True,
            wgrad_deferral_limit=22,
            overlap_param_gather_with_optimizer_step=True,
            align_param_gather=True,
        )
    )
    return recipe
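
Beyond the two flags documented in the comments above, the fully shown 22B callback also turns on tensor-parallel communication overlap (tp_comm_overlap) and defers the embedding weight-gradient GEMM (defer_embedding_wgrad_compute), with wgrad_deferral_limit capping how many micro-batches may be deferred. Because the callback is appended as an ordinary run.Config, it can be adjusted after the recipe is built; a sketch assuming run.Config fields are attribute-assignable, as these recipes assume elsewhere (the specific tweak is illustrative, not from the commit):

    from nemo.collections.llm.recipes.nemotron4_22b import pretrain_recipe

    recipe = pretrain_recipe(num_nodes=8, performance_mode=True)

    # The comm-overlap config is the last callback appended by the helper above.
    comm_overlap = recipe.trainer.callbacks[-1]
    comm_overlap.defer_embedding_wgrad_compute = False  # illustrative: turn wgrad deferral off
    comm_overlap.wgrad_deferral_limit = 0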
36 changes: 12 additions & 24 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -83,6 +83,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=1.0e-5,
    max_lr=1.0e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -118,6 +119,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -135,7 +137,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -174,43 +176,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 16,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 340B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_340b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
