Update setup logic in training type plugins [1 / n] #9994

Merged · 21 commits · Oct 19, 2021
Changes from 2 commits
9 changes: 6 additions & 3 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -27,7 +27,9 @@
 import numpy as np
 import torch
 import torch.distributed
+from torch.nn import Module
 from torch.nn.parallel.distributed import DistributedDataParallel
+from torch.utils.data import DataLoader, DistributedSampler

 import pytorch_lightning as pl
 from pytorch_lightning.core.optimizer import LightningOptimizer
@@ -181,6 +183,9 @@ def setup_environment(self) -> None:

         self.setup_distributed()

+    def setup_model(self, model: Module) -> Module:
+        return DistributedDataParallel(module=model, device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs)
+
     def _call_children_scripts(self):
         # bookkeeping of spawned processes
         self._check_can_spawn_children()
@@ -355,9 +360,7 @@ def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int):

     def configure_ddp(self) -> None:
         self.pre_configure_ddp()
-        self._model = DistributedDataParallel(
-            LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs
-        )
+        self._model = self.setup_model(LightningDistributedModule(self.model))
         self._register_ddp_hooks()

     def determine_ddp_device_ids(self):
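For reviewers trying the new hook out: a minimal sketch of how a subclass could now customize the wrapping by overriding only `setup_model` (the `CustomDDPPlugin` class below is illustrative, not part of this PR):

```python
import torch
from torch.nn import Module

from pytorch_lightning.plugins import DDPPlugin


class CustomDDPPlugin(DDPPlugin):
    """Hypothetical subclass: adjust the module right before the default DDP wrapping."""

    def setup_model(self, model: Module) -> Module:
        # Convert BatchNorm layers to SyncBatchNorm, then reuse the plugin's
        # default DistributedDataParallel wrapping added in this PR.
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        return super().setup_model(model)
```

Because `configure_ddp()` now routes through `setup_model()`, an override like this is the only change needed; previously the `DistributedDataParallel(...)` call would have had to be duplicated inside `configure_ddp()`.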
8 changes: 5 additions & 3 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -21,6 +21,7 @@
 import torch
 import torch.distributed
 import torch.multiprocessing as mp
+from torch.nn import Module
 from torch.nn.parallel.distributed import DistributedDataParallel

 import pytorch_lightning as pl
@@ -147,6 +148,9 @@ def setup(self) -> None:
         smp = mp.get_context("spawn")
         self.mp_queue = smp.SimpleQueue()

+    def setup_model(self, model: Module) -> Module:
+        return DistributedDataParallel(module=model, device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs)
+
     def set_world_ranks(self, process_idx: int = 0) -> None:
         self._local_rank = process_idx
         if self.cluster_environment is None:
@@ -256,9 +260,7 @@ def _register_ddp_hooks(self) -> None:

     def configure_ddp(self) -> None:
         self.pre_configure_ddp()
-        self._model = DistributedDataParallel(
-            LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs
-        )
+        self._model = self.setup_model(LightningDistributedModule(self.model))
         self._register_ddp_hooks()

     def determine_ddp_device_ids(self):
10 changes: 7 additions & 3 deletions pytorch_lightning/plugins/training_type/dp.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional
+from typing import List, Optional, Sequence

 import torch
-from torch.nn import DataParallel
+from torch.nn import DataParallel, Module
+from torch.optim import Optimizer

 from pytorch_lightning.overrides.data_parallel import LightningParallelModule
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
@@ -54,7 +55,10 @@ def world_size(self) -> int:
     def setup(self) -> None:
         # model needs to be moved to the device before it is wrapped
         self.model_to_device()
-        self._model = DataParallel(LightningParallelModule(self._model), self.parallel_devices)
+        self._model = self.setup_model(LightningParallelModule(self._model))
+
+    def setup_model(self, model: Module) -> Module:
+        return DataParallel(module=model, device_ids=self.parallel_devices)

     def reduce(self, collection: _METRIC_COLLECTION, *args, **kwargs) -> _METRIC_COLLECTION:
         """Reduces a collection of tensors from all processes. It can be applied to just a single tensor.
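For context on the DP side, this is roughly what the new `setup()` → `setup_model()` sequence reduces to at runtime; a standalone sketch assuming two visible GPUs (the `Linear` model is a placeholder, not part of the PR):

```python
import torch
from torch.nn import DataParallel, Linear

# plugin.parallel_devices is a list of torch.device objects
parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)]

# model_to_device() moves the module to the root device before wrapping
model = Linear(32, 2).to(parallel_devices[0])

# equivalent of the new DataParallelPlugin.setup_model()
wrapped = DataParallel(module=model, device_ids=parallel_devices)
```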
pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 import contextlib
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, Union
+from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Sequence, Tuple, Union

 import torch
 from torch import Tensor
 from torch.nn import Module
+from torch.optim import Optimizer
 from torch.utils.data import DataLoader

 import pytorch_lightning as pl
@@ -60,6 +61,19 @@ def setup_environment(self) -> None:
     def setup(self) -> None:
         """Called by the accelerator to finish setup."""

+    def setup_models_and_optimizers(
+        self, models: List[Module], optimizers: List[Optimizer]
+    ) -> Tuple[List[Module], List[Optimizer]]:
+        models = [self.setup_model(model) for model in models]
+        optimizers = [self.setup_optimizer(optimizer) for optimizer in optimizers]
+        return models, optimizers
+
+    def setup_model(self, model: Module) -> Module:
+        return model
+
+    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
+        return optimizer
+
     @property
     @abstractmethod
     def on_gpu(self) -> bool:
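The new base-class contract in one self-contained sketch; `ToyPlugin` below is a stand-in for the base plugin, shown only to illustrate the default pass-through behaviour added in this diff:

```python
from typing import List, Tuple

from torch.nn import Linear, Module
from torch.optim import SGD, Optimizer


class ToyPlugin:
    """Mirrors the default hooks added to the base training type plugin in this PR."""

    def setup_models_and_optimizers(
        self, models: List[Module], optimizers: List[Optimizer]
    ) -> Tuple[List[Module], List[Optimizer]]:
        models = [self.setup_model(m) for m in models]
        optimizers = [self.setup_optimizer(o) for o in optimizers]
        return models, optimizers

    def setup_model(self, model: Module) -> Module:
        return model  # subclasses wrap the module here (DDP, DataParallel, ...)

    def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
        return optimizer  # subclasses may rebuild or wrap the optimizer here


model = Linear(32, 2)
[model], [optimizer] = ToyPlugin().setup_models_and_optimizers([model], [SGD(model.parameters(), lr=0.1)])
```

The plugins touched in this PR only override `setup_model()`; `setup_optimizer()` stays a pass-through here, presumably so later PRs in this "[1 / n]" series can let optimizer-wrapping plugins override it as well.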