Commit
Merge branch 'recipe_for_25.01' into dev.0111
guyueh1 committed Jan 12, 2025
2 parents 7f22379 + e234588 commit faff9a7
Showing 4 changed files with 41 additions and 28 deletions.
6 changes: 2 additions & 4 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -25,9 +25,7 @@
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
-from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
-    userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192
-)
+from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback
 
@@ -205,7 +203,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
-            tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192,
+            tp_comm_overlap_cfg=userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
         )
     )
     return recipe
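Note: the 15B recipe now points its tensor-parallel communication overlap at a B200-tuned userbuffers layout. The config name appears to encode the dtype (bf16), GPU (b200), hidden size (6144), TP degree (2), micro-batch size (1), and sequence length (4096). Below is a minimal sketch of how such a callback is attached to a recipe; the helper name and the recipe.trainer.callbacks.append(...) wrapper are assumptions inferred from the hunk context, not code copied from the file.

# Sketch only: `enable_tp_comm_overlap`, `recipe`, and the append(...) call are
# assumptions based on the surrounding hunk, not the recipe's exact code.
import nemo_run as run
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
    userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
)
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback


def enable_tp_comm_overlap(recipe):
    # Overlap tensor-parallel collectives with GEMMs using the B200-tuned layout.
    recipe.trainer.callbacks.append(
        run.Config(
            MegatronCommOverlapCallback,
            tp_comm_overlap=True,
            tp_comm_overlap_cfg=userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096,
        )
    )
    return recipe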
4 changes: 2 additions & 2 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -26,7 +26,7 @@
 from nemo.collections.llm.recipes.nemotron import nemotron_model, nemotron_trainer
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
-    userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096
+    userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
 )
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback
@@ -212,7 +212,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
         run.Config(
             MegatronCommOverlapCallback,
             tp_comm_overlap=True,
-            tp_comm_overlap_cfg=userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096,
+            tp_comm_overlap_cfg=userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
             overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
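The 340B recipe gets the TP8 variant of the same B200 layout and keeps its embedding weight-gradient deferral settings. A hedged sketch of the callback config for this recipe follows; the variable name is hypothetical and the trailing comments are my reading of the Megatron options, not text quoted from the source file.

# Sketch only: assembled from the options visible in this hunk; comments are
# interpretive, and `comm_overlap_callback` is a made-up name for illustration.
import nemo_run as run
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
    userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
)
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

comm_overlap_callback = run.Config(
    MegatronCommOverlapCallback,
    tp_comm_overlap=True,
    tp_comm_overlap_cfg=userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096,
    defer_embedding_wgrad_compute=True,  # defer embedding wgrad GEMMs so they overlap with the pipeline flush
    wgrad_deferral_limit=22,             # cap on how many deferred wgrad GEMMs may accumulate
    overlap_param_gather_with_optimizer_step=False,  # disabled due to a checkpointing issue (per the diff)
)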
34 changes: 17 additions & 17 deletions nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py
@@ -88,20 +88,6 @@ class TransformerLayerTPOverlapCfg:
     fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
 )
 
-# llama3 70b
-userbuffers_bf16_h100_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(),
-    fc2_fprop=RingExchangeOverlapCfg(),
-)
-
 # llama3.1 405b
 userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
     qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
@@ -183,8 +169,22 @@ class TransformerLayerTPOverlapCfg:
     fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
 )
 
+# Nemotron 15B
+userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
+    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
+    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
+    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
+    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
+    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
+    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
+    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+)
+
 # Nemotron 340B
-userbuffers_bf16_h100_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
+userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
     qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
     qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
     fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
@@ -193,6 +193,6 @@ class TransformerLayerTPOverlapCfg:
     proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
     fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
     fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=2, set_sm_margin=True, fp8_buf=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
+    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
+    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
 )
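For orientation, the overlap entries above are plain dataclass instances, one field per tensor-parallel GEMM in a transformer layer. The sketch below approximates the config types as they are used in this file; the real definitions earlier in userbuffers.py are authoritative and may carry extra fields or different defaults.

# Illustrative sketch of the config dataclasses used above; field names follow
# the call sites in this diff, but the actual classes in userbuffers.py are the
# source of truth and may differ.
from dataclasses import dataclass


@dataclass
class TPOverlapCfg:
    pass  # common base for per-GEMM overlap settings


@dataclass
class PipelineOverlapCfg(TPOverlapCfg):
    num_sm: int            # SMs reserved for the communication kernel
    cga_size: int          # cooperative grid array (CGA) size
    num_splits: int        # pipeline chunks per GEMM
    set_sm_margin: bool    # leave an SM margin for compute kernels
    fp8_buf: bool = False  # use an FP8 userbuffer


@dataclass
class RingExchangeOverlapCfg(TPOverlapCfg):
    aggregate: bool = False
    num_sm: int = 1
    set_sm_margin: bool = False


@dataclass
class BulkOverlapCfg(TPOverlapCfg):
    num_sm: int
    cga_size: int
    set_sm_margin: bool


@dataclass
class TransformerLayerTPOverlapCfg:
    # one overlap setting per tensor-parallel GEMM in a transformer layer
    qkv_dgrad: TPOverlapCfg
    qkv_wgrad: TPOverlapCfg
    fc1_dgrad: TPOverlapCfg
    fc1_wgrad: TPOverlapCfg
    qkv_fprop: TPOverlapCfg
    proj_dgrad: TPOverlapCfg
    fc1_fprop: TPOverlapCfg
    fc2_dgrad: TPOverlapCfg
    proj_fprop: TPOverlapCfg
    fc2_fprop: TPOverlapCfg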
25 changes: 20 additions & 5 deletions nemo/lightning/run/plugins.py
@@ -19,6 +19,7 @@
 from typing import Callable, Optional
 
 import nemo_run as run
+import torch
 import yaml
 from lightning.pytorch import Callback
 from lightning.pytorch.loggers import WandbLogger
@@ -27,7 +28,6 @@
 from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback
 from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
 from nemo.utils import logging
-
 from nemo.utils.import_utils import safe_import
 
 res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency')
@@ -317,7 +317,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
-    custom_cuda_device_max_connections: int = None
+    num_cuda_device_max_connections: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
         "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command"
@@ -344,9 +344,24 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Enable the performance environment settings"""
 
         if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
-            # Force program order kernel launch for TP, CP overlap
-            if self.custom_cuda_device_max_connections:
-                executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.custom_cuda_device_max_connections)
+            if torch.cuda.is_available():
+                major, _ = torch.cuda.get_device_capability()
+                if major > 9:
+                    if self.num_cuda_device_max_connections is not None:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
+                else:
+                    # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
+                    # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
+                    # overlap between communication and computation kernels.
+                    tp_size = task.trainer.strategy.tensor_model_parallel_size
+                    cp_size = task.trainer.strategy.context_parallel_size
+                    if tp_size > 1 or cp_size > 1:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+                    else:
+                        if self.num_cuda_device_max_connections is not None:
+                            executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
+                                self.num_cuda_device_max_connections
+                            )
 
             # Set LayerNorm SM margin to support the overlap with LayerNorm kernel
             if self.enable_layernorm_sm_margin:
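The new plugin logic keys CUDA_DEVICE_MAX_CONNECTIONS off the GPU's compute capability: on devices newer than capability 9 (for example B200) the variable is only set when num_cuda_device_max_connections is given explicitly, while on capability 9 and below it is forced to 1 whenever tensor or context parallelism is active, so kernels are queued to the GPU in program order and communication can overlap with compute. Below is a standalone sketch of that policy; env, tp_size, cp_size, and the function name are stand-ins for executor.env_vars and the MegatronStrategy parallel sizes, not part of the plugin's API.

# Standalone sketch of the connection-count policy added in this diff; names
# are illustrative stand-ins, not the plugin's actual interface.
from typing import Optional

import torch


def set_cuda_device_max_connections(env: dict, tp_size: int, cp_size: int,
                                     num_connections: Optional[int] = None) -> None:
    if not torch.cuda.is_available():
        return
    major, _ = torch.cuda.get_device_capability()
    if major > 9:
        # Newer GPUs (compute capability > 9): only set the variable when a
        # specific connection count was requested.
        if num_connections is not None:
            env["CUDA_DEVICE_MAX_CONNECTIONS"] = str(num_connections)
    elif tp_size > 1 or cp_size > 1:
        # Older GPUs with TP/CP: a single connection enforces the host-to-GPU
        # kernel queuing order needed for good comm/compute overlap.
        env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
    elif num_connections is not None:
        env["CUDA_DEVICE_MAX_CONNECTIONS"] = str(num_connections)


# Example: TP=2 on an H100-class device (capability 9.0) forces one connection.
# env_vars = {}; set_cuda_device_max_connections(env_vars, tp_size=2, cp_size=1)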
