Commit
Merge branch 'recipe_for_25.01' into dev.0111
guyueh1 committed Jan 12, 2025
2 parents 7f22379 + e234588 commit faff9a7
Showing 4 changed files with 41 additions and 28 deletions.
6 changes: 2 additions & 4 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -25,9 +25,7 @@
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
-from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
-    userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192
-)
+from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback
 
@@ -205,7 +203,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
-            tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192,
+            tp_comm_overlap_cfg=userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
         )
     )
     return recipe
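Note: the 15B recipe now points its tensor-parallel communication overlap at a B200-tuned userbuffers layout. The config name appears to encode the dtype (bf16), GPU (b200), hidden size (6144), TP degree (2), micro-batch size (1), and sequence length (4096). Below is a minimal sketch of how such a callback is attached to a recipe; the helper name and the recipe.trainer.callbacks.append(...) wrapper are assumptions inferred from the hunk context, not code copied from the file.

# Sketch only: `enable_tp_comm_overlap`, `recipe`, and the append(...) call are
# assumptions based on the surrounding hunk, not the recipe's exact code.
import nemo_run as run
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
    userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
)
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback


def enable_tp_comm_overlap(recipe):
    # Overlap tensor-parallel collectives with GEMMs using the B200-tuned layout.
    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
            tp_comm_overlap=True,
            tp_comm_overlap_cfg=userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
        )
    )
    return recipe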
4 changes: 2 additions & 2 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -26,7 +26,7 @@
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
-    userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096
+    userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
 )
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback
@@ -212,7 +212,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
-            tp_comm_overlap_cfg=userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096,
+            tp_comm_overlap_cfg=userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
             overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
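The 340B recipe gets the TP8 variant of the same B200 layout and keeps its embedding weight-gradient deferral settings. A hedged sketch of the callback config for this recipe follows; the variable name is hypothetical and the trailing comments are my reading of the Megatron options, not text quoted from the source file.

# Sketch only: assembled from the options visible in this hunk; comments are
# interpretive, and `comm_overlap_callback` is a made-up name for illustration.
import nemo_run as run
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
    userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
)
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

comm_overlap_callback = run.Config(
    MegatronCommOverlapCallback,
    tp_comm_overlap=True,
    tp_comm_overlap_cfg=userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
    defer_embedding_wgrad_compute=True,  # defer embedding wgrad GEMMs so they overlap with the pipeline flush
    wgrad_deferral_limit=22,             # cap on how many deferred wgrad GEMMs may accumulate
    overlap_param_gather_with_optimizer_step=False,  # disabled due to a checkpointing issue (per the diff)
)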
34 changes: 17 additions & 17 deletions nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py
@@ -88,20 +88,6 @@ class TransformerLayerTPOverlapCfg:
     fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
 )
 
-# llama3 70b
-userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(),
-    fc2_fprop=RingExchangeOverlapCfg(),
-)
-
 # llama3.1 405b
 userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
     qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
@@ -183,8 +169,22 @@ class TransformerLayerTPOverlapCfg:
     fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
 )
 
+# Nemotron 15B
+userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
+    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
+    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
+    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
+    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
+    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+)
+
 # Nemotron 340B
-userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
+userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
     qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
     qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
     fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
@@ -193,6 +193,6 @@ class TransformerLayerTPOverlapCfg:
     proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
     fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
     fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=2, set_sm_margin=True, fp8_buf=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
+    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
 )
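For orientation, the overlap entries above are plain dataclass instances, one field per tensor-parallel GEMM in a transformer layer. The sketch below approximates the config types as they are used in this file; the real definitions earlier in userbuffers.py are authoritative and may carry extra fields or different defaults.

# Illustrative sketch of the config dataclasses used above; field names follow
# the call sites in this diff, but the actual classes in userbuffers.py are the
# source of truth and may differ.
from dataclasses import dataclass


@dataclass
class TPOverlapCfg:
    pass  # common base for per-GEMM overlap settings


@dataclass
class PipelineOverlapCfg(TPOverlapCfg):
    num_sm: int            # SMs reserved for the communication kernel
    cga_size: int          # cooperative grid array (CGA) size
    num_splits: int        # pipeline chunks per GEMM
    set_sm_margin: bool    # leave an SM margin for compute kernels
    fp8_buf: bool = False  # use an FP8 userbuffer


@dataclass
class RingExchangeOverlapCfg(TPOverlapCfg):
    aggregate: bool = False
    num_sm: int = 1
    set_sm_margin: bool = False


@dataclass
class BulkOverlapCfg(TPOverlapCfg):
    num_sm: int
    cga_size: int
    set_sm_margin: bool


@dataclass
class TransformerLayerTPOverlapCfg:
    # one overlap setting per tensor-parallel GEMM in a transformer layer
    qkv_dgrad: TPOverlapCfg
    qkv_wgrad: TPOverlapCfg
    fc1_dgrad: TPOverlapCfg
    fc1_wgrad: TPOverlapCfg
    qkv_fprop: TPOverlapCfg
    proj_dgrad: TPOverlapCfg
    fc1_fprop: TPOverlapCfg
    fc2_dgrad: TPOverlapCfg
    proj_fprop: TPOverlapCfg
    fc2_fprop: TPOverlapCfg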
25 changes: 20 additions & 5 deletions nemo/lightning/run/plugins.py
@@ -19,6 +19,7 @@
 from typing import Callable, Optional
 
 import nemo_run as run
+import torch
 import yaml
 from lightning.pytorch import Callback
 from lightning.pytorch.loggers import WandbLogger
@@ -27,7 +28,6 @@
 from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback
 from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
 from nemo.utils import logging
-
 from nemo.utils.import_utils import safe_import
 
 res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency')
@@ -317,7 +317,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
-    custom_cuda_device_max_connections: int = None
+    num_cuda_device_max_connections: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
         "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command"
@@ -344,9 +344,24 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Enable the performance environment settings"""
 
         if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
-            # Force program order kernel launch for TP, CP overlap
-            if self.custom_cuda_device_max_connections:
-                executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.custom_cuda_device_max_connections)
+            if torch.cuda.is_available():
+                major, _ = torch.cuda.get_device_capability()
+                if major > 9:
+                    if self.num_cuda_device_max_connections is not None:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
+                else:
+                    # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
+                    # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
+                    # overlap between communication and computation kernels.
+                    tp_size = task.trainer.strategy.tensor_model_parallel_size
+                    cp_size = task.trainer.strategy.context_parallel_size
+                    if tp_size > 1 or cp_size > 1:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+                    else:
+                        if self.num_cuda_device_max_connections is not None:
+                            executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
+                                self.num_cuda_device_max_connections
+                            )
 
             # Set LayerNorm SM margin to support the overlap with LayerNorm kernel
             if self.enable_layernorm_sm_margin:
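The new plugin logic keys CUDA_DEVICE_MAX_CONNECTIONS off the GPU's compute capability: on devices newer than capability 9 (for example B200) the variable is only set when num_cuda_device_max_connections is given explicitly, while on capability 9 and below it is forced to 1 whenever tensor or context parallelism is active, so kernels are queued to the GPU in program order and communication can overlap with compute. Below is a standalone sketch of that policy; env, tp_size, cp_size, and the function name are stand-ins for executor.env_vars and the MegatronStrategy parallel sizes, not part of the plugin's API.

# Standalone sketch of the connection-count policy added in this diff; names
# are illustrative stand-ins, not the plugin's actual interface.
from typing import Optional

import torch


def set_cuda_device_max_connections(env: dict, tp_size: int, cp_size: int,
                                     num_connections: Optional[int] = None) -> None:
    if not torch.cuda.is_available():
        return
    major, _ = torch.cuda.get_device_capability()
    if major > 9:
        # Newer GPUs (compute capability > 9): only set the variable when a
        # specific connection count was requested.
        if num_connections is not None:
            env["CUDA_DEVICE_MAX_CONNECTIONS"] = str(num_connections)
    elif tp_size > 1 or cp_size > 1:
        # Older GPUs with TP/CP: a single connection enforces the host-to-GPU
        # kernel queuing order needed for good comm/compute overlap.
        env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
    elif num_connections is not None:
        env["CUDA_DEVICE_MAX_CONNECTIONS"] = str(num_connections)


# Example: TP=2 on an H100-class device (capability 9.0) forces one connection.
# env_vars = {}; set_cuda_device_max_connections(env_vars, tp_size=2, cp_size=1)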
