From 2b9c5f40564f1208aa06aa135b9bab29004bbb26 Mon Sep 17 00:00:00 2001
From: jbaczek <45043825+jbaczek@users.noreply.github.com>
Date: Sat, 12 Aug 2023 03:28:36 +0200
Subject: [PATCH] Load ub_cfg from hydra config (#7003)

* Pass tp config via hydra

Signed-off-by: Jan Baczek

* Remove self.ub_cfgs field - it isn't used anywhere else

Signed-off-by: Jan Baczek

* Allow tp_overlap tree substitution in hydra config

Signed-off-by: Jan Baczek

* Add warning in case of usage of the default tp config

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change warning message

Signed-off-by: Jan Baczek

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add compute capability resolver

Signed-off-by: Jan Baczek

* Bugfix

Signed-off-by: Jan Baczek

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add guards to pynvml import

Signed-off-by: Jan Baczek

---------

Signed-off-by: Jan Baczek
Signed-off-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com>
---
 .../conf/megatron_gpt_config.yaml             |  3 +
 ...b_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml | 53 +++++++++++++++++
 ...b_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml | 53 +++++++++++++++++
 ...b_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml | 59 +++++++++++++++++++
 ...b_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml | 59 +++++++++++++++++++
 .../language_modeling/megatron_gpt_model.py   | 17 +++---
 nemo/core/config/hydra_runner.py              | 21 +++++++
 7 files changed, 255 insertions(+), 10 deletions(-)
 create mode 100644 examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml
 create mode 100644 examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml
 create mode 100644 examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
 create mode 100644 examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 575ac7d3d1fc..303e47d8088e 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -1,3 +1,6 @@
+defaults:
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg:
+
 name: megatron_gpt
 restore_from_path: null # used when starting from a .nemo file
 
diff --git a/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml
new file mode 100644
index 000000000000..c6e25c087ffc
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml
@@ -0,0 +1,53 @@
+# UB communicator configurations
+# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16
+
+# Bulk overlap with AllGather
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+## Ring-exchange overlap with AllGather
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked-collective overlap with ReduceScatter
+proj_fprop:
+  method: pipeline
+  num_sm: 4
+  num_splits: 4
+  set_sm_margin: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 4
+  num_splits: 4
+  set_sm_margin: 0
diff --git a/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml
new file mode 100644
index 000000000000..434e0a29f42c
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml
@@ -0,0 +1,53 @@
+# UB communicator configurations
+# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16
+
+# Bulk overlap with AllGather
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  set_sm_margin: 0
+
+## Ring-exchange overlap with AllGather
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked-collective overlap with ReduceScatter
+proj_fprop:
+  method: pipeline
+  num_sm: 8
+  num_splits: 4
+  set_sm_margin: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 4
+  num_splits: 4
+  set_sm_margin: 0
diff --git a/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
new file mode 100644
index 000000000000..21d02f3dd22c
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
@@ -0,0 +1,59 @@
+# UB communicator configurations
+# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 8
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring-exchange overlap with AllGather
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+# Chunked-collective overlap with ReduceScatter
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 20
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
diff --git a/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml
new file mode 100644
index 000000000000..444c8245e02c
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml
@@ -0,0 +1,59 @@
+# UB communicator configurations
+# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8
+
+# Bulk overlap with AllGather
+qkv_dgrad:
+  method: bulk
+  num_sm: 8
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 16
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 16
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring-exchange overlap with AllGather
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked-collective overlap with ReduceScatter
+proj_fprop:
+  method: pipeline
+  num_sm: 16
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index b40fc9398e8a..eadefdcf40ee 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -522,20 +522,17 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
         return loss_mean
 
     def initialize_ub_func(self):
+        ub_cfgs = self.cfg.get('ub_tp_comm_overlap_cfg', None)
+        if ub_cfgs is None:
+            warnings.warn(
+                "Couldn't find TP config. Please check the path correctness. Initializing TP comm overlap with the default config."
+            )
+
         input_shape = [
             self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'),
             self.cfg.get('hidden_size'),
         ]
-        ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None)
-        ub_cfgs = None
-        if ub_cfg_file_name is not None:
-            try:
-                import yaml
-
-                with open(ub_cfg_file_name, 'r') as ub_cfg_file:
-                    ub_cfgs = yaml.safe_load(ub_cfg_file)
-            except (ImportError, TypeError):
-                logging.error(f"Fail to read ub_tp_comm_overlap config file: {ub_cfg_file_name}.")
+
         te_module.base.initialize_ub(
             shape=input_shape,
             tp_size=self.cfg.get('tensor_model_parallel_size'),
diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py
index 6c6c9b47e0fd..9cabc45042f7 100644
--- a/nemo/core/config/hydra_runner.py
+++ b/nemo/core/config/hydra_runner.py
@@ -23,6 +23,27 @@
 from hydra.types import TaskFunction
 from omegaconf import DictConfig, OmegaConf
 
+
+def _get_gpu_name():
+    try:
+        import pynvml
+    except (ImportError, ModuleNotFoundError):
+        return None
+
+    pynvml.nvmlInit()
+    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+    cuda_capability, _ = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+    pynvml.nvmlShutdown()
+    if cuda_capability == 8:
+        return "a100"
+    elif cuda_capability == 9:
+        return "h100"
+    else:
+        return None
+
+
+OmegaConf.register_new_resolver("gpu_name", _get_gpu_name)
+
 # multiple interpolated values in the config
 OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)
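
Usage sketch (not part of the patch above; the launch command, script name, and the model.ub_tp_comm_overlap flag are assumptions about the surrounding example setup, not something this diff introduces). With the "optional tp_overlap@model.ub_tp_comm_overlap_cfg:" defaults entry, Hydra composes one of the conf/tp_overlap/*.yaml files under model.ub_tp_comm_overlap_cfg, and initialize_ub_func now reads that subtree straight from self.cfg instead of opening a YAML file by path. Selecting a config at launch time is expected to look roughly like this:

    # Illustrative only: pick a tp_overlap file via a Hydra config-group override.
    # python megatron_gpt_pretraining.py \
    #     model.ub_tp_comm_overlap=True \
    #     tp_overlap@model.ub_tp_comm_overlap_cfg=ub_cfg_h100_h12288_tp8_mbs2_seqlen2048
    #
    # After composition, model.ub_tp_comm_overlap_cfg should hold the selected
    # file's tree, e.g. (first entry of the H100/TP8 file shown):
    model:
      ub_tp_comm_overlap: true
      ub_tp_comm_overlap_cfg:
        qkv_dgrad:
          method: bulk
          num_sm: 8
          cga_size: 2
          set_sm_margin: 0
        # ... remaining per-GEMM entries from the selected file ...

The gpu_name resolver registered in hydra_runner.py returns "a100" or "h100" (or None when pynvml or a GPU is unavailable), so an interpolated value such as ub_cfg_${gpu_name:}_h12288_tp4_mbs1_seqlen2048 can resolve to an architecture-specific file name at runtime; how that name is wired into the group selection is outside this patch.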