Performance mode (NVIDIA#10926)
* llama3 performance mode

Signed-off-by: Malay Nagda <[email protected]>

* llama3 performance mode tests

Signed-off-by: Malay Nagda <[email protected]>

* mixtral performance mode

Signed-off-by: Malay Nagda <[email protected]>

* remove unused

Signed-off-by: Malay Nagda <[email protected]>

* nemotron perf mode

Signed-off-by: Malay Nagda <[email protected]>

* 405b, 175b perf mode

Signed-off-by: Malay Nagda <[email protected]>

* perf mode comment

Signed-off-by: Malay Nagda <[email protected]>

* Apply isort and black reformatting

Signed-off-by: malay-nagda <[email protected]>

---------

Signed-off-by: Malay Nagda <[email protected]>
Signed-off-by: malay-nagda <[email protected]>
Signed-off-by: malay-nagda <[email protected]>
Co-authored-by: malay-nagda <[email protected]>
malay-nagda and malay-nagda committed Oct 24, 2024
1 parent ca6035a commit 631ac2f
Showing 12 changed files with 193 additions and 278 deletions.
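Taken together, the diffs below fold each model's separate `pretrain_recipe_performance` factory into its standard `pretrain_recipe` factory behind a new `performance_mode` flag, with the actual tweaks moved into a per-module `pretrain_performance_optimizations(recipe)` helper. A minimal usage sketch of the post-change API follows; the module import path matches the files in this commit, but the run names and node counts are illustrative only:

# Sketch of the API introduced by this commit; names and node counts are
# illustrative, not taken from the commit itself.
from nemo.collections.llm.recipes import llama3_70b

# Standard recipe: no extra performance callbacks are attached.
recipe = llama3_70b.pretrain_recipe(name="llama3_70b_base", num_nodes=4)

# Same factory with performance_mode=True: the recipe is routed through
# pretrain_performance_optimizations(), which appends a MegatronCommOverlapCallback
# config to recipe.trainer.callbacks before returning it.
perf_recipe = llama3_70b.pretrain_recipe(
    name="llama3_70b_perf",
    num_nodes=4,
    performance_mode=True,
)

Since the dedicated `NAME + "_performance"` factories are removed, CLI invocations would presumably pass the flag through the remaining factory instead, e.g. nemo llm pretrain --factory "llama3_70b.pretrain_recipe(performance_mode=True)" rather than the old pretrain_recipe_performance form shown in the removed docstrings.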
52 changes: 22 additions & 30 deletions nemo/collections/llm/recipes/gpt3_175b.py
@@ -142,7 +142,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for GPT3 175B model.
@@ -155,6 +160,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.
Returns:
@@ -172,7 +178,7 @@ def pretrain_recipe(
Note:
This recipe is optimized for the large 175B model and requires significant computational resources.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -186,49 +192,35 @@ def pretrain_recipe(
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for GPT3 175B model.
This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
Returns:
run.Partial: Partial configuration for performance-optimized pre-training.
Examples:
CLI usage:
$ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')"
Python API usage:
>>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64)
>>> print(recipe)
Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
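The tail of gpt3_175b.py's pretrain_performance_optimizations is collapsed in the view above, so the exact MegatronCommOverlapCallback arguments are not visible here. The visible lines only establish the shape of the helper: append a run.Config(MegatronCommOverlapCallback, ...) to the recipe's trainer callbacks and return the recipe. A hedged sketch of that shape, with an assumed import path and a placeholder flag rather than the values from the commit:

# Shape of the helper only; the import path is assumed and the keyword
# argument below is a placeholder, not the configuration from this commit.
import nemo_run as run
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    # Attach communication-overlap optimizations to a base pre-training recipe.
    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
            tp_comm_overlap=True,  # placeholder; the real flags are hidden in the collapsed diff
        )
    )
    return recipe

Each recipe module in this commit carries its own copy of this helper, so the callback arguments can differ by model size.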
52 changes: 22 additions & 30 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -144,7 +144,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3.1 405B model.
@@ -157,6 +162,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.
Returns:
@@ -174,7 +180,7 @@ def pretrain_recipe(
Note:
This recipe is optimized for the large 405B model and requires significant computational resources.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -188,49 +194,35 @@ def pretrain_recipe(
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3.1 405B model.
This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
Returns:
run.Partial: Partial configuration for performance-optimized pre-training.
Examples:
CLI usage:
$ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')"
Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4)
>>> print(recipe)
Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
51 changes: 24 additions & 27 deletions nemo/collections/llm/recipes/llama3_70b.py
@@ -13,7 +13,7 @@
# limitations under the License.


from typing import Optional
from typing import Callable, Optional

import nemo_run as run
import pytorch_lightning as pl
@@ -142,7 +142,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 70B model.
@@ -155,6 +160,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.
Returns:
@@ -172,7 +178,8 @@ def pretrain_recipe(
Note:
This recipe is optimized for the large 70B model and requires significant computational resources.
"""
return run.Partial(

recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -186,45 +193,35 @@ def pretrain_recipe(
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3 70B model.
This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
Returns:
run.Partial: Partial configuration for performance-optimized pre-training.
Examples:
CLI usage:
$ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')"
Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4)
>>> print(recipe)
Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
43 changes: 17 additions & 26 deletions nemo/collections/llm/recipes/llama3_8b.py
@@ -143,7 +143,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 8B model.
@@ -156,6 +161,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.
Returns:
@@ -174,7 +180,7 @@ def pretrain_recipe(
For more details on pre-training LLMs with NeMo, see the pre-training
guide in the `examples/llm/pretrain/` directory.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -188,44 +194,29 @@ def pretrain_recipe(
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3 8B model.
This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
Returns:
run.Partial: Partial configuration for performance-optimized pre-training.
Examples:
$ nemo llm pretrain --factory llama3_8b_optimized
Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4)
>>> print(recipe)
Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
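The commit message also mentions performance-mode tests; those files are among the changed files not shown above. A minimal sketch of the kind of check the new flag enables — not the actual test from the commit — assuming the factories can be called directly and that performance mode differs from the default only by appending a callback:

# Illustrative check only, not the test added by this commit; it assumes
# performance_mode=True differs from the default solely by appending a
# MegatronCommOverlapCallback config to the trainer callbacks.
from nemo.collections.llm.recipes import llama3_8b


def test_performance_mode_appends_comm_overlap_callback():
    base = llama3_8b.pretrain_recipe(performance_mode=False)
    perf = llama3_8b.pretrain_recipe(performance_mode=True)

    # The performance recipe should carry exactly one extra trainer callback.
    assert len(perf.trainer.callbacks) == len(base.trainer.callbacks) + 1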