Add NLPDDPStrategyNotebook and change trainer gpus to devices (#7741)
* Add NLPDDPStrategyNotebook and change trainer gpus to devices

Signed-off-by: Abhishree <[email protected]>

* Add NLPDDPStrategyNotebook for strategy_eval in lora.ipynb

Signed-off-by: Abhishree <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Abhishree <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
athitten and pre-commit-ci[bot] authored Oct 17, 2023
1 parent 9c955c4 commit 670a583
Showing 5 changed files with 20 additions and 8 deletions.
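For background: PyTorch Lightning 2.0 removed the Trainer's gpus argument in favor of devices (paired with accelerator), which is what drives the config changes below. A minimal sketch of the migration, with illustrative values only:

import pytorch_lightning as pl

# PTL 1.x style (removed in 2.0):
# trainer = pl.Trainer(gpus=1)

# PTL 2.0 style: devices + accelerator replace gpus.
trainer = pl.Trainer(devices=1, accelerator="gpu")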
examples/asr/experimental/k2/align_speech_parallel.py (1 addition, 1 deletion)
@@ -101,7 +101,7 @@ class ParallelAlignmentConfig:
     output_path: str = MISSING
     model_stride: int = 8
 
-    trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(gpus=-1, accelerator="ddp"))
+    trainer: TrainerConfig = field(default_factory=lambda: TrainerConfig(devices=-1, accelerator="ddp"))
 
     # these arguments will be ignored
     return_predictions: bool = False
nemo/collections/nlp/parts/nlp_overrides.py (12 additions)
@@ -436,6 +436,18 @@ def restore_checkpoint_after_setup(self) -> bool:
         return True
 
 
+class NLPDDPStrategyNotebook(NLPDDPStrategy):
+    """Version of NLPDDPStrategy to be used in a Jupyter notebook.
+    A large portion of Megatron code has a DDP dependency, so it has been necessary to use
+    NLPDDPStrategy even for single-GPU training (e.g. in a Jupyter notebook).
+    A PTL 2.0 change has prevented DDPStrategy from being used in a notebook.
+    This version of NLPDDPStrategy enables Megatron training in a notebook with PTL 2.0.
+    """
+
+    def _configure_launcher(self):
+        # Skip launcher creation so training runs in the current (notebook) process.
+        self._launcher = None
+
+
 class NLPSaveRestoreConnector(SaveRestoreConnector):
     def __init__(self) -> None:
         if not HAVE_APEX:
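A rough usage sketch of the new class, assuming a single-GPU notebook session (the environment-variable setup mirrors the tutorial cells further down):

import os
import pytorch_lightning as pl
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook

# Megatron code still expects a distributed environment, even with one process.
os.environ["RANK"] = '0'
os.environ["WORLD_SIZE"] = '1'

strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)
trainer = pl.Trainer(devices=1, accelerator="gpu", strategy=strategy)

With _configure_launcher returning no launcher, the strategy never spawns worker processes, so the trainer runs Megatron's DDP-dependent code path directly in the notebook kernel.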
scripts/nemo_legacy_import/nlp_checkpoint_port.py (1 addition, 1 deletion)
@@ -81,7 +81,7 @@ def nemo_convert(argv):

     # Create a PL trainer object which is required for restoring Megatron models
     cfg_trainer = TrainerConfig(
-        gpus=1,
+        devices=1,
         accelerator="ddp",
         num_nodes=1,
         # Need to set the following two to False as ExpManager will take care of them differently.
tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb (3 additions, 3 deletions)
@@ -596,7 +596,7 @@
 "source": [
 "import torch\n",
 "import pytorch_lightning as pl\n",
-"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
+"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n",
 "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n",
 "\n",
 "# let's modify some trainer configs\n",
@@ -618,7 +618,7 @@
 "os.environ[\"RANK\"] = '0'\n",
 "os.environ[\"WORLD_SIZE\"] = '1'\n",
 "\n",
-"strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins = [TorchElasticEnvironment()]\n",
 "trainer = pl.Trainer(plugins=plugins, strategy=strategy, **config.trainer)\n",
 "\n",
@@ -783,4 +783,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
\ No newline at end of file
+}
tutorials/nlp/lora.ipynb (3 additions, 3 deletions)
@@ -673,7 +673,7 @@
 "source": [
 "import torch\n",
 "import pytorch_lightning as pl\n",
-"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
+"from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategyNotebook\n",
 "from pytorch_lightning.plugins.environments import TorchElasticEnvironment\n",
 "\n",
 "# let's modify some trainer configs\n",
@@ -695,7 +695,7 @@
 "os.environ[\"RANK\"] = '0'\n",
 "os.environ[\"WORLD_SIZE\"] = '1'\n",
 "\n",
-"strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins = [TorchElasticEnvironment()]\n",
 "trainer = pl.Trainer(plugins=plugins, strategy=strategy, **config.trainer)\n",
 "\n",
@@ -1311,7 +1311,7 @@
 }
 ],
 "source": [
-"strategy_eval = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+"strategy_eval = NLPDDPStrategyNotebook(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
 "plugins_eval = [TorchElasticEnvironment()]\n",
 "# notice the plugins, strategy and config.trainer args are the same as in the training portion of this tutorial\n",
 "# we just create a new object with no overlap from the training section of this tutorial\n",
