From 71c66e1579aaf49075c26b9f5c4ef761c960fc76 Mon Sep 17 00:00:00 2001
From: David
Date: Tue, 7 Mar 2023 11:23:07 -0700
Subject: [PATCH] Davidm/cherrypick r1.16.0 (#6082)

* gpt fix

Signed-off-by: David Mosallanezhad

* per-micro-batch input loader (#5635)

* per-micro-batch input loader

* per-micro-batch input loader set arg default val

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix

* apply per-microbatch-loader to only GPT

* update docstring on micro-batch input loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed the default arg val

* fix batch size to 1 at log stat registration

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update container for CI

Signed-off-by: ericharper

* update container in jenkinsfile

Signed-off-by: ericharper

* update container for CI

Signed-off-by: ericharper

fix merge conflict

* revert Jenkinsfile

* Revert "revert Jenkinsfile"

This reverts commit d23b7757e0f935dacde2840f234193c632a2b3be.

* Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py

Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

* add GradScaler

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: ericharper
Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ericharper
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

* added PR#5995

Signed-off-by: David Mosallanezhad

* Distributed Adam optimizer overlaps param all-gather with forward compute (#5684)

* Add distopt support for overlapping param all-gather with forward compute

Signed-off-by: Tim Moon

* Update Apex commit

Signed-off-by: Tim Moon

---------

Signed-off-by: Tim Moon
Co-authored-by: Eric Harper

* per-micro-batch input loader (#5635)

* per-micro-batch input loader

* per-micro-batch input loader set arg default val

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix

* apply per-microbatch-loader to only GPT

* update docstring on micro-batch input loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed the default arg val

* fix batch size to 1 at log stat registration

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update container for CI

Signed-off-by: ericharper

* update container in jenkinsfile

Signed-off-by: ericharper

* update container for CI

Signed-off-by: ericharper

fix merge conflict

* revert Jenkinsfile

* Revert "revert Jenkinsfile"

This reverts commit d23b7757e0f935dacde2840f234193c632a2b3be.
* Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py

Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

* add GradScaler

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: ericharper
Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ericharper
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

* adding early stop callback to ptuning (#6028)

* patch to allow using tokenizers without additional_special_tokens_ids attribute

Signed-off-by: arendu

* early stop callback for prompt/p tuning

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

Signed-off-by: arendu

* added exp manager config for early stop

Signed-off-by: arendu

* pushed logic for creating early stopping inside exp manager

Signed-off-by: arendu

* pushed logic for creating early stopping inside exp manager

Signed-off-by: arendu

* minor updates and added dataclass check

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* more args

Signed-off-by: arendu

* more args

Signed-off-by: arendu

---------

Signed-off-by: arendu
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: David Mosallanezhad
Signed-off-by: ericharper
Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Signed-off-by: Tim Moon
Signed-off-by: arendu
Co-authored-by: David Mosallanezhad
Co-authored-by: Sangkug Lym
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ericharper
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com>
---
 .../megatron_gpt_pretraining.py               |  3 +
 .../megatron/indexed_dataset.py               |  6 +-
 .../language_modeling/megatron_gpt_model.py   | 56 ++++++++-----------
 3 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py
index 1ac12b422108..596e3c59e753 100644
--- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py
+++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
+import torch.multiprocessing as mp
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins.environments import TorchElasticEnvironment
@@ -29,6 +30,8 @@ from nemo.utils import logging
 from nemo.utils.exp_manager import exp_manager
 
+mp.set_start_method("spawn", force=True)
+
 
 @hydra_runner(config_path="conf", config_name="megatron_gpt_config")
 def main(cfg) -> None:
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py
index f45d9b7155a8..b2c5f4976f32 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py
@@ -500,10 +500,10 @@ def __init__(self, path, skip_warmup=False):
 
     def __getstate__(self):
         return self._path
 
-    # def __setstate__(self, state):
-    #     self._do_init(state)
+    def __setstate__(self, state):
+        self._do_init(state)
 
-    def _do_init(self, path, skip_warmup):
+    def _do_init(self, path, skip_warmup=True):
         self._path = path
         self._index = self.Index(index_file_path(self._path), skip_warmup)
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index db3f37ae3c90..2b1edc954972 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import itertools
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -149,8 +149,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
             self._nsys_profile_start_step *= grad_accum_steps
             self._nsys_profile_end_step *= grad_accum_steps
 
-        self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', False)
-
     def set_inference_config(self, inference_config):
         self._inference_config = inference_config
 
@@ -231,6 +229,18 @@ def setup_optimizer_param_groups(self):
         else:
             self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model)
 
+    def setup_optimization(
+        self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()
+        if self.with_distributed_adam:
+
+            # Enable overlapped param sync by default
+            if 'overlap_param_sync' not in optim_kwargs:
+                optim_kwargs['overlap_param_sync'] = True
+
+        return super().setup_optimization(optim_config=optim_config, optim_kwargs=optim_kwargs)
+
     def configure_optimizers(self):
 
         if self.with_distributed_adam:
@@ -522,43 +532,25 @@ def allreduce_first_last_embeddings(self):
 
     def get_forward_output_and_loss_func(self, validation_step=False):
         def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None):
-            batch = next(dataloader_iter)
+            # GPT3 uses only causal mask, which doesn't need attention mask
             if parallel_state.get_pipeline_model_parallel_world_size() == 1:
+                batch = next(dataloader_iter)
                 for k in batch.keys():
-                    if self.get_attention_mask_from_fusion:
-                        batch[k] = batch[k].cuda(non_blocking=True) if k not in ['attention_mask'] else None
-                    else:
-                        batch[k] = batch[k].cuda(non_blocking=True)
+                    batch[k] = batch[k].cuda(non_blocking=True) if k not in ['attention_mask'] else None
             else:
                 if parallel_state.is_pipeline_first_stage():
-                    # First pipeline stage needs tokens, position_ids, and attention_mask
+                    batch = next(dataloader_iter)
+                    # First pipeline stage needs only the tokens and position_ids
                     for k in batch.keys():
-                        if self.get_attention_mask_from_fusion:
-                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids'] else None
-                        else:
-                            batch[k] = (
-                                batch[k].cuda(non_blocking=True)
-                                if k in ['tokens', 'position_ids', 'attention_mask']
-                                else None
-                            )
+                        batch[k] = batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids'] else None
                 elif parallel_state.is_pipeline_last_stage():
-                    # Last pipeline stage needs the labels, loss_mask, and attention_mask
+                    batch = next(dataloader_iter)
+                    # Last pipeline stage needs only the labels and loss_mask
                     for k in batch.keys():
-                        if self.get_attention_mask_from_fusion:
-                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None
-                        else:
-                            batch[k] = (
-                                batch[k].cuda(non_blocking=True)
-                                if k in ['labels', 'loss_mask', 'attention_mask']
-                                else None
-                            )
+                        batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None
                 else:
-                    # Intermediate pipeline stage only needs attention_mask
-                    if self.get_attention_mask_from_fusion:
-                        batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels']}
-                    else:
-                        for k in batch.keys():
-                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['attention_mask'] else None
+                    # Intermediate pipeline stage doesn't need any inputs
+                    batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels']}
 
             output_tensor = model(
                 batch['tokens'],
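
A note on why the indexed_dataset.py hunk re-enables __setstate__: once megatron_gpt_pretraining.py forces mp.set_start_method("spawn", force=True), dataloader worker processes no longer inherit the parent's memory-mapped dataset via fork; they receive a pickled copy and must rebuild their state on unpickling. Because __getstate__ returns only the path, _do_init also gains a default for skip_warmup. The sketch below is illustrative only, a minimal stand-in under those assumptions rather than the NeMo class; the PathBackedDataset name and the example path are hypothetical.

import pickle


class PathBackedDataset:
    """Minimal stand-in for a lazily memory-mapped dataset (not the NeMo class)."""

    def __init__(self, path, skip_warmup=False):
        self._do_init(path, skip_warmup)

    def _do_init(self, path, skip_warmup=True):
        # The real dataset would mmap its index/data files here; we only record state.
        self._path = path
        self._warmed_up = not skip_warmup

    def __getstate__(self):
        # Pickle only the path, never unpicklable mmap/file handles.
        return self._path

    def __setstate__(self, state):
        # Runs in the receiving ("spawn"-started) worker process: re-open from the path.
        self._do_init(state)


if __name__ == "__main__":
    ds = PathBackedDataset("/data/gpt_dataset_prefix")  # hypothetical path
    clone = pickle.loads(pickle.dumps(ds))  # what a spawned worker effectively does
    print(clone._path)  # -> /data/gpt_dataset_prefix

The same round trip is what torch DataLoader workers perform under the spawn start method, which is why the previously commented-out __setstate__ hook comes back in this patch.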