From ad50e7e4a2b96ced834dd4bb1440c9bec60c7034 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:47:03 -0700 Subject: [PATCH] Add NLPDDPStrategyNotebook and change trainer gpus to devices (#7741) * Add NLPDDPStrategyNotebook and change trainer gpus to devices Signed-off-by: Abhishree * Add NLPDDPStrategyNotebook for strategy_eval in lora.ipynb Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../asr/experimental/k2/align_speech_parallel.py | 2 +- scripts/nemo_legacy_import/nlp_checkpoint_port.py | 2 +- tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb | 6 +++--- tutorials/nlp/lora.ipynb | 13 ++++++++++--- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/asr/experimental/k2/align_speech_parallel.py b/examples/asr/experimental/k2/align_speech_parallel.py index bd03420e94c1..abfffa0cdfdb 100644 --- a/examples/asr/experimental/k2/align_speech_parallel.py +++ b/examples/asr/experimental/k2/align_speech_parallel.py @@ -101,7 +101,7 @@ class ParallelAlignmentConfig: output_path: str = MISSING model_stride: int = 8 - trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(gpus=-1, accelerator="ddp")) + trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(devices=-1, accelerator="ddp")) # there arguments will be ignored return_predictions: bool = False diff --git a/scripts/nemo_legacy_import/nlp_checkpoint_port.py b/scripts/nemo_legacy_import/nlp_checkpoint_port.py index 162e4e4bef7a..909c1b7562c9 100644 --- a/scripts/nemo_legacy_import/nlp_checkpoint_port.py +++ b/scripts/nemo_legacy_import/nlp_checkpoint_port.py @@ -81,7 +81,7 @@ def nemo_convert(argv): # Create a PL trainer object which is required for restoring Megatron models cfg_trainer = 
TrainerConfig( - gpus=1, + devices=1, accelerator="ddp", num_nodes=1, # Need to set the following two to False as ExpManager will take care of them differently. diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index 004014ebdeeb..076a8ffad3df 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -596,7 +596,7 @@ "source": [ "import torch\n", "import pytorch_lightning as pl\n", - "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n", + "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n", "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n", "\n", "# let's modify some trainer configs\n", @@ -618,7 +618,7 @@ "os.environ[\"RANK\"] = '0'\n", "os.environ[\"WORLD_SIZE\"] = '1'\n", "\n", - "strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n", + "strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n", "plugins = [TorchElasticEnvironment()]\n", "trainer = pl.Trainer(plugins= plugins, strategy=strategy, **config.trainer)\n", "\n", @@ -783,4 +783,4 @@ }, "nbformat": 4, "nbformat_minor": 5 - } \ No newline at end of file + } diff --git a/tutorials/nlp/lora.ipynb b/tutorials/nlp/lora.ipynb index 8603bbb62411..21f99f2b8e69 100644 --- a/tutorials/nlp/lora.ipynb +++ b/tutorials/nlp/lora.ipynb @@ -423,7 +423,8 @@ "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n", "import torch\n", "import pytorch_lightning as pl\n", - "from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder\n", + "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n", + "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n", "\n", "# let's modify some trainer configs\n", "# check if we have GPU available and uses it\n", @@ -441,7 +442,9 
@@ "os.environ[\"RANK\"] = '0'\n", "os.environ[\"WORLD_SIZE\"] = '1'\n", "\n", - "trainer = MegatronTrainerBuilder(config).create_trainer()\n", + "strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n", + "plugins = [TorchElasticEnvironment()]\n", + "trainer = pl.Trainer(plugins= plugins, strategy=strategy, **config.trainer)\n", "\n", "print(\"Trainer config - \\n\")\n", "print(OmegaConf.to_yaml(config.trainer))" @@ -685,7 +688,11 @@ "metadata": {}, "outputs": [], "source": [ - "trainer_eval = MegatronTrainerBuilder(config_eval).create_trainer()" + "strategy_eval = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n", + "plugins_eval = [TorchElasticEnvironment()]\n", + "# notice the plugins, strategy and config.trainer args are the same as in the training portion of this tutorial\n", + "# we just create a new object with no overlap from the training section of this tutorial\n", + "trainer_eval = pl.Trainer(plugins= plugins_eval, strategy=strategy_eval, **config_eval.trainer) " ] }, {