Add NLPDDPStrategyNotebook and change trainer gpus to devices (#7741)
* Add NLPDDPStrategyNotebook and change trainer gpus to devices

Signed-off-by: Abhishree <[email protected]>

* Add NLPDDPStrategyNotebook for strategy_eval in lora.ipynb

Signed-off-by: Abhishree <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Abhishree <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
athitten and pre-commit-ci[bot] authored Oct 17, 2023
1 parent 9c955c4 commit 670a583
Showing 5 changed files with 20 additions and 8 deletions.
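For background: PyTorch Lightning 2.0 removed the Trainer's gpus argument in favor of devices (paired with accelerator), which is what drives the config changes below. A minimal sketch of the migration, with illustrative values only:

import pytorch_lightning as pl

# PTL 1.x style (removed in 2.0):
# trainer = pl.Trainer(gpus=1)

# PTL 2.0 style: devices + accelerator replace gpus.
trainer = pl.Trainer(devices=1, accelerator="gpu")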
examples/asr/experimental/k2/align_speech_parallel.py (1 addition, 1 deletion)
@@ -101,7 +101,7 @@ class ParallelAlignmentConfig:
     output_path: str = MISSING
     model_stride: int = 8
 
-    trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(gpus=-1, accelerator="ddp"))
+    trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(devices=-1, accelerator="ddp"))
 
     # these arguments will be ignored
     return_predictions: bool = False
nemo/collections/nlp/parts/nlp_overrides.py (12 additions)
@@ -436,6 +436,18 @@ def restore_checkpoint_after_setup(self) -> bool:
         return True
 
 
+class NLPDDPStrategyNotebook(NLPDDPStrategy):
+    """Version of NLPDDPStrategy to be used in a Jupyter notebook.
+    A large portion of Megatron code has a DDP dependency, so it has been necessary to use
+    NLPDDPStrategy even for single-GPU training (e.g. in a Jupyter notebook).
+    A PTL 2.0 change has prevented DDPStrategy from being used in a notebook.
+    This version of NLPDDPStrategy enables Megatron training in a notebook with PTL 2.0.
+    """
+
+    def _configure_launcher(self):
+        # Skip launcher creation so training runs in the current (notebook) process.
+        self._launcher = None
+
+
 class NLPSaveRestoreConnector(SaveRestoreConnector):
     def __init__(self) -> None:
         if not HAVE_APEX:
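A rough usage sketch of the new class, assuming a single-GPU notebook session (the environment-variable setup mirrors the tutorial cells further down):

import os
import pytorch_lightning as pl
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook

# Megatron code still expects a distributed environment, even with one process.
os.environ["RANK"] = '0'
os.environ["WORLD_SIZE"] = '1'

strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)
trainer = pl.Trainer(devices=1, accelerator="gpu", strategy=strategy)

With _configure_launcher returning no launcher, the strategy never spawns worker processes, so the trainer runs Megatron's DDP-dependent code path directly in the notebook kernel.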
scripts/nemo_legacy_import/nlp_checkpoint_port.py (1 addition, 1 deletion)
@@ -81,7 +81,7 @@ def nemo_convert(argv):

     # Create a PL trainer object which is required for restoring Megatron models
     cfg_trainer = TrainerConfig(
-        gpus=1,
+        devices=1,
         accelerator="ddp",
         num_nodes=1,
         # Need to set the following two to False as ExpManager will take care of them differently.
tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb (3 additions, 3 deletions)
@@ -596,7 +596,7 @@
 "source": [
 "import torch\n",
 "import pytorch_lightning as pl\n",
-"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
+"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n",
 "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n",
 "\n",
 "# let's modify some trainer configs\n",
@@ -618,7 +618,7 @@
 "os.environ[\"RANK\"] = '0'\n",
 "os.environ[\"WORLD_SIZE\"] = '1'\n",
 "\n",
-"strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins = [TorchElasticEnvironment()]\n",
 "trainer = pl.Trainer(plugins=plugins, strategy=strategy, **config.trainer)\n",
 "\n",
@@ -783,4 +783,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
\ No newline at end of file
+}
tutorials/nlp/lora.ipynb (3 additions, 3 deletions)
@@ -673,7 +673,7 @@
 "source": [
 "import torch\n",
 "import pytorch_lightning as pl\n",
-"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
+"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n",
 "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n",
 "\n",
 "# let's modify some trainer configs\n",
@@ -695,7 +695,7 @@
 "os.environ[\"RANK\"] = '0'\n",
 "os.environ[\"WORLD_SIZE\"] = '1'\n",
 "\n",
-"strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins = [TorchElasticEnvironment()]\n",
 "trainer = pl.Trainer(plugins=plugins, strategy=strategy, **config.trainer)\n",
 "\n",
@@ -1311,7 +1311,7 @@
 }
 ],
 "source": [
-"strategy_eval = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy_eval = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins_eval = [TorchElasticEnvironment()]\n",
 "# notice the plugins, strategy and config.trainer args are the same as in the training portion of this tutorial\n",
 "# we just create a new object with no overlap from the training section of this tutorial\n",
