Commit

nemotron perf mode
Signed-off-by: Malay Nagda <[email protected]>
malay-nagda committed Oct 18, 2024
1 parent f8e00d9 commit 406db95
Showing 6 changed files with 48 additions and 109 deletions.
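
Taken together, the six diffs below make one change: the standalone pretrain_recipe_performance factories are removed, and each pretrain_recipe instead takes a performance_mode flag that routes the built recipe through a new pretrain_performance_optimizations helper. A minimal sketch of the resulting Python API, using only the module path and signatures visible in the diffs (the run name and node count are illustrative):

    from nemo.collections.llm.recipes import nemotron3_8b

    # Before this commit: a separately registered factory built the optimized recipe.
    # recipe = nemotron3_8b.pretrain_recipe_performance(name="my_run", num_nodes=4)

    # After this commit: one factory, gated by the performance_mode flag.
    recipe = nemotron3_8b.pretrain_recipe(
        name="my_run",
        num_nodes=4,
        num_gpus_per_node=8,
        performance_mode=True,  # appends MegatronCommOverlapCallback, as shown below
    )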
1 change: 0 additions & 1 deletion nemo/collections/llm/recipes/mixtral_8x22b.py
@@ -208,7 +208,6 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
1 change: 0 additions & 1 deletion nemo/collections/llm/recipes/mixtral_8x7b.py
@@ -205,7 +205,6 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
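
In both Mixtral recipes the deleted line was a leftover from an earlier refactor: pretrain_performance_optimizations already receives the built recipe as its only argument, so the call to pretrain_recipe(name=name, dir=dir, ...) referenced parameters that are no longer in the function's scope and would have failed at runtime. Removing it makes the helper purely additive on the recipe it is given, matching the pattern introduced for the Nemotron recipes below.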
36 changes: 12 additions & 24 deletions nemo/collections/llm/recipes/nemotron3_8b.py
@@ -83,6 +83,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=3.0e-5,
    max_lr=3e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -118,6 +119,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -135,7 +137,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -174,43 +176,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 1,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron3 8B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron3_8b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    recipe.trainer.callbacks.append(
        run.Config(
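
The flag is shorthand for composing the two public pieces; a short sketch of the equivalence implemented by the "if performance_mode:" branch above (run name and node count are illustrative):

    from nemo.collections.llm.recipes.nemotron3_8b import (
        pretrain_performance_optimizations,
        pretrain_recipe,
    )

    # One call with the flag...
    fast = pretrain_recipe(name="my_run", num_nodes=4, performance_mode=True)

    # ...builds the same configuration as composing the pieces by hand:
    base = pretrain_recipe(name="my_run", num_nodes=4)
    also_fast = pretrain_performance_optimizations(base)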
37 changes: 12 additions & 25 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -80,6 +80,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=4.5e-5,
    max_lr=4.5e-5,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -115,6 +116,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -132,7 +134,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -171,44 +173,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 8,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 15B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_15b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
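
With the dedicated factories gone, the CLI names registered as NAME + "_performance" disappear as well; the flag is passed through the remaining factory instead. A hedged sketch of the command line, assuming nemo_run's factory-argument syntax (this commit itself does not show the CLI form):

    $ nemo llm pretrain --factory "nemotron4_15b(performance_mode=True)"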
46 changes: 12 additions & 34 deletions nemo/collections/llm/recipes/nemotron4_22b.py
@@ -80,6 +80,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=1e-5,
    max_lr=1e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -115,6 +116,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -132,7 +134,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -171,57 +173,33 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 8,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 22B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_22b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
    # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
    # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed

    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
            tp_comm_overlap=True,
            defer_embedding_wgrad_compute=True,
            wgrad_deferral_limit=22,
            overlap_param_gather_with_optimizer_step=True,
            align_param_gather=True,
        )
    )
    return recipe
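
Beyond the two flags documented in the comments above, the fully shown 22B callback also turns on tensor-parallel communication overlap (tp_comm_overlap) and defers the embedding weight-gradient GEMM (defer_embedding_wgrad_compute), with wgrad_deferral_limit capping how many micro-batches may be deferred. Because the callback is appended as an ordinary run.Config, it can be adjusted after the recipe is built; a sketch assuming run.Config fields are attribute-assignable, as these recipes assume elsewhere (the specific tweak is illustrative, not from the commit):

    from nemo.collections.llm.recipes.nemotron4_22b import pretrain_recipe

    recipe = pretrain_recipe(num_nodes=8, performance_mode=True)

    # The comm-overlap config is the last callback appended by the helper above.
    comm_overlap = recipe.trainer.callbacks[-1]
    comm_overlap.defer_embedding_wgrad_compute = False  # illustrative: turn wgrad deferral off
    comm_overlap.wgrad_deferral_limit = 0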
36 changes: 12 additions & 24 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -83,6 +83,7 @@ def pretrain_recipe(
    constant_steps=0,
    min_lr=1.0e-5,
    max_lr=1.0e-4,
+    performance_mode: bool = False,
    # Training function
    fn=pretrain,
) -> run.Partial:
@@ -118,6 +119,7 @@ def pretrain_recipe(
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
+        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
@@ -135,7 +137,7 @@ def pretrain_recipe(
    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    """
-    return run.Partial(
+    recipe = run.Partial(
        fn,
        model=model(),
        trainer=nemotron_trainer(
@@ -174,43 +176,29 @@ def pretrain_recipe(
        resume=default_resume(),
    )

+    if performance_mode:
+        recipe = pretrain_performance_optimizations(recipe)
+
-@run.cli.factory(target=pretrain, name=NAME + "_performance")
-def pretrain_recipe_performance(
-    dir: Optional[str] = None,
-    name: str = "default",
-    num_nodes: int = 16,
-    num_gpus_per_node: int = 8,
-    fn: Callable = pretrain,
-) -> run.Partial:
+    return recipe
+
+
+def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Nemotron4 340B model.
-    This recipe enables performance optimizations that may not be suitable for all use cases.
+    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
-        dir (Optional[str]): Directory for saving logs and checkpoints.
-        name (str): Name of the pre-training run.
-        num_nodes (int): Number of compute nodes to use.
-        num_gpus_per_node (int): Number of GPUs per node.
-        fn (Callable): The pre-training function to use.
+        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
-    Examples:
-        $ nemo llm pretrain --factory nemotron4_340b_optimized
-    Python API usage:
-        >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16)
-        >>> print(recipe)
    Note:
-        Use this recipe with caution and only when you need maximum performance.
+        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
-    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

    # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
    # They are added here for user's knowledge
