2/n Move Precision Plugin into strategy - move optimizer-related logic

Lightning-AI · Nov 18, 2021 · 19530c3 · 19530c3
1 parent 8eccdc0
commit 19530c3
Show file tree

Hide file tree

Showing 37 changed files with 275 additions and 270 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -245,7 +245,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Loop customization:
     * Added `Closure` and `AbstractClosure` classes ([#8642](https://github.com/PyTorchLightning/pytorch-lightning/pull/8642))
     * Refactored `TrainingBatchLoop` and extracted `OptimizerLoop`, splitting off automatic optimization into its own loop ([#9191](https://github.com/PyTorchLightning/pytorch-lightning/pull/9191))
-    * Removed `TrainingBatchLoop.backward()`; manual optimization now calls directly into `Accelerator.backward()` and automatic optimization handles backward in new `OptimizerLoop` ([#9265](https://github.com/PyTorchLightning/pytorch-lightning/pull/9265))
+    * Removed `TrainingBatchLoop.backward()`; manual optimization now calls directly into `training_type_plugin.backward()` and automatic optimization handles backward in new `OptimizerLoop` ([#9265](https://github.com/PyTorchLightning/pytorch-lightning/pull/9265))
     * Extracted `ManualOptimization` logic from `TrainingBatchLoop` into its own separate loop class ([#9266](https://github.com/PyTorchLightning/pytorch-lightning/pull/9266))
     * Added `OutputResult` and `ManualResult` classes ([#9437](https://github.com/PyTorchLightning/pytorch-lightning/pull/9437), [#9424](https://github.com/PyTorchLightning/pytorch-lightning/pull/9424))
     * Marked `OptimizerLoop.backward` as protected ([#9514](https://github.com/PyTorchLightning/pytorch-lightning/pull/9514))

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
@@ -11,23 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
 from abc import abstractmethod
-from typing import Any, Callable, Dict, Generator, List, Optional, Union
+from typing import Any, Dict, Optional, Union
 
 import torch
-from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn import Module
-from torch.optim import Optimizer
 
 import pytorch_lightning as pl
-from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
-from pytorch_lightning.plugins.training_type import DataParallelPlugin, TrainingTypePlugin
-from pytorch_lightning.trainer.states import TrainerFn
-from pytorch_lightning.utilities import rank_zero_deprecation
-from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
-from pytorch_lightning.utilities.enums import AMPType, LightningEnum
+from pytorch_lightning.plugins.precision import PrecisionPlugin
+from pytorch_lightning.plugins.training_type import TrainingTypePlugin
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 
 
@@ -66,10 +58,6 @@ def __init__(self, precision_plugin: Optional[PrecisionPlugin], training_type_pl
             """
             self.training_type_plugin._precision_plugin = precision_plugin
 
-        self.optimizers: List = []
-        self.lr_schedulers: List = []
-        self.optimizer_frequencies: List = []
-
     def setup_environment(self) -> None:
         """Setup any processes or distributed connections.
 
@@ -84,28 +72,18 @@ def setup(self, trainer: "pl.Trainer") -> None:
         Args:
             trainer: the trainer instance
         """
-        self.setup_training_type_plugin()
-        if not self.training_type_plugin.setup_optimizers_in_pre_dispatch:
-            self.setup_optimizers(trainer)
-        self.setup_precision_plugin()
+        self.training_type_plugin.setup(trainer)
 
     def pre_dispatch(self, trainer: "pl.Trainer") -> None:
         """Hook to do something before the training/evaluation/prediction starts."""
-        self._move_optimizer_state()
+        self.training_type_plugin._move_optimizer_state()
 
         self.training_type_plugin.pre_dispatch()
         if self.training_type_plugin.setup_optimizers_in_pre_dispatch:
-            self.setup_optimizers(trainer)
+            self.training_type_plugin.setup_optimizers(trainer)
 
         self.training_type_plugin.precision_plugin.pre_dispatch()
 
-    def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
-        """Moves the state of the optimizers to the GPU if needed."""
-        device = device or self.root_device
-        for opt in self.optimizers:
-            for p, v in opt.state.items():
-                opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, device)
-
     def dispatch(self, trainer: "pl.Trainer") -> None:
         """Hook to do something before the training/evaluation/prediction starts."""
         self.training_type_plugin.dispatch(trainer)
@@ -137,36 +115,13 @@ def lightning_module(self) -> "pl.LightningModule":
         """
         return self.training_type_plugin.lightning_module
 
-    @property
-    def root_device(self) -> torch.device:
-        """Returns the root device."""
-        return self.training_type_plugin.root_device
-
     def teardown(self) -> None:
         """This method is called to teardown the training process.
 
         It is the right place to release memory and free other resources.
         """
         self.training_type_plugin.teardown()
 
-    def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any:
-        """Moves the batch to the correct device. The returned batch is of the same type as the input batch, just
-        having all tensors on the correct device.
-
-        Args:
-            batch: The batch of samples to move to the correct device
-            device: The target device
-            dataloader_idx: The index of the dataloader to which the batch belongs.
-        """
-        model = self.lightning_module
-        device = device or self.root_device
-
-        if model is not None and not isinstance(self.training_type_plugin, DataParallelPlugin):
-            # no need to transfer batch to device in DP mode
-            return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx)
-
-        return move_data_to_device(batch, device)
-
     def training_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
         """The actual training step.
 
@@ -199,123 +154,6 @@ def predict_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
         with self.training_type_plugin.precision_plugin.predict_step_context():
             return self.training_type_plugin.predict_step(*step_kwargs.values())
 
-    def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor:
-        """Forwards backward-calls to the precision plugin.
-
-        Args:
-            closure_loss: a tensor holding the loss value to backpropagate
-        """
-        self.training_type_plugin.pre_backward(closure_loss)
-        closure_loss = self.training_type_plugin.precision_plugin.pre_backward(self.lightning_module, closure_loss)
-
-        self.training_type_plugin.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
-
-        closure_loss = self.training_type_plugin.precision_plugin.post_backward(self.lightning_module, closure_loss)
-        self.training_type_plugin.post_backward(closure_loss)
-
-        return closure_loss
-
-    def optimizer_step(
-        self,
-        optimizer: Optimizer,
-        opt_idx: int,
-        closure: Callable[[], Any],
-        model: Optional[Union["pl.LightningModule", Module]] = None,
-        **kwargs: Any,
-    ) -> None:
-        """performs the actual optimizer step.
-
-        Args:
-            optimizer: the optimizer performing the step
-            opt_idx: index of the current optimizer
-            closure: closure calculating the loss value
-            model: reference to the model, optionally defining optimizer step related hooks
-            **kwargs: Any extra arguments to ``optimizer.step``
-        """
-        model = model or self.lightning_module
-        self.training_type_plugin.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
-
-    def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:
-        """Zeros all model parameter's gradients."""
-        model_ref = self.lightning_module
-        model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx)
-
-    def setup_optimizers(self, trainer: "pl.Trainer") -> None:
-        """Creates optimizers and schedulers.
-
-        Args:
-            trainer: the Trainer, these optimizers should be connected to
-        """
-        if trainer.state.fn not in (TrainerFn.FITTING, TrainerFn.TUNING):
-            return
-        optimizers, lr_schedulers, optimizer_frequencies = self.training_type_plugin.init_optimizers(
-            trainer=trainer, model=self.lightning_module
-        )
-        self.optimizers = optimizers
-        self.lr_schedulers = lr_schedulers
-        self.optimizer_frequencies = optimizer_frequencies
-
-    def setup_training_type_plugin(self) -> None:
-        """Attaches the training type plugin to the accelerator."""
-        self.training_type_plugin.setup()
-
-    def setup_precision_plugin(self) -> None:
-        """Attaches the precision plugin to the accelerator."""
-        model, optimizers, schedulers = self.training_type_plugin.precision_plugin.connect(
-            self.model, self.optimizers, self.lr_schedulers
-        )
-        self.model = model
-        self.optimizers = optimizers
-        self.lr_schedulers = schedulers
-
-    @property
-    def amp_backend(self) -> Optional[LightningEnum]:
-        if isinstance(self.training_type_plugin.precision_plugin, ApexMixedPrecisionPlugin):
-            return AMPType.APEX
-        if isinstance(self.training_type_plugin.precision_plugin, NativeMixedPrecisionPlugin):
-            return AMPType.NATIVE
-        return None
-
-    @property
-    def precision(self) -> Union[str, int]:
-        """The type of precision being used with this accelerator.
-
-        Use `training_type_plugin.precision_plugin.precision` instead.
-
-        .. deprecated::
-            The ``precision_plugin`` parameter has been deprecated and will be removed soon.
-            Pass the precision plugin as a parameter to the ``TrainingTypePlugin`` instead.
-        """
-        rank_zero_deprecation(
-            f"`{self.__class__.__name__}.precision` has been deprecated and will be removed soon"
-            f" Use `training_type_plugin.precision_plugin.precision` instead."
-        )
-        return self.training_type_plugin.precision_plugin.precision
-
-    @property
-    def scaler(self) -> Optional["GradScaler"]:
-        return getattr(self.training_type_plugin.precision_plugin, "scaler", None)
-
-    def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
-        """Returns state of an optimizer.
-
-        Allows for syncing/collating optimizer state from processes in custom plugins.
-        """
-        return getattr(self.training_type_plugin, "optimizer_state", lambda x: x.state_dict())(optimizer)
-
-    @contextlib.contextmanager
-    def model_sharded_context(self) -> Generator[None, None, None]:
-        """Provide hook to create modules in a distributed aware context. This is useful for when we'd like to.
-
-        shard the model instantly - useful for extremely large models. Can save memory and
-        initialization time.
-
-        Returns:
-            Model parallel context.
-        """
-        with self.training_type_plugin.model_sharded_context():
-            yield
-
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for a given device.
 

diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py
@@ -29,8 +29,10 @@ def setup(self, trainer: "pl.Trainer") -> None:
             MisconfigurationException:
                 If the selected device is not CPU.
         """
-        if "cpu" not in str(self.root_device):
-            raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.")
+        if "cpu" not in str(self.training_type_plugin.root_device):
+            raise MisconfigurationException(
+                f"Device should be CPU, got {self.training_type_plugin.root_device} instead."
+            )
 
         return super().setup(trainer)
 

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
@@ -37,12 +37,14 @@ def setup_environment(self) -> None:
                 If the selected device is not GPU.
         """
         super().setup_environment()
-        if "cuda" not in str(self.root_device):
-            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
-        torch.cuda.set_device(self.root_device)
+        if "cuda" not in str(self.training_type_plugin.root_device):
+            raise MisconfigurationException(
+                f"Device should be GPU, got {self.training_type_plugin.root_device} instead"
+            )
+        torch.cuda.set_device(self.training_type_plugin.root_device)
 
     def setup(self, trainer: "pl.Trainer") -> None:
-        self.set_nvidia_flags(trainer.local_rank)
+        self.set_nvidia_flags(getattr(self.training_type_plugin, "local_rank", 0))
         return super().setup(trainer)
 
     def on_train_start(self) -> None:
@@ -77,7 +79,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
 
     def teardown(self) -> None:
         super().teardown()
-        self._move_optimizer_state(torch.device("cpu"))
+        self.training_type_plugin._move_optimizer_state(torch.device("cpu"))
 
     @staticmethod
     def auto_device_count() -> int:

diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py
@@ -15,25 +15,12 @@
 
 import torch
 
-import pytorch_lightning as pl
 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class IPUAccelerator(Accelerator):
     """Accelerator for IPUs."""
 
-    def setup_optimizers(self, trainer: "pl.Trainer") -> None:
-        """
-        Raises:
-            MisconfigurationException:
-                If multiple optimizers are provided.
-        """
-        super().setup_optimizers(trainer)
-
-        if len(self.optimizers) > 1:
-            raise MisconfigurationException("IPUs currently only support one optimizer.")
-
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """IPU device stats aren't supported yet."""
         return {}

diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Union
 
 import torch
 
@@ -21,7 +21,6 @@
 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin
 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin
 from pytorch_lightning.utilities import _XLA_AVAILABLE
-from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 
 if _XLA_AVAILABLE:
     import torch_xla.core.xla_model as xm
@@ -49,14 +48,6 @@ def setup(self, trainer: "pl.Trainer") -> None:
             )
         return super().setup(trainer)
 
-    def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
-        """Moves the state of the optimizers to the TPU if needed."""
-        # TODO: `self.root_device` would raise error if called outside the spawn process
-        # while training on 8 and more cores.
-        for opt in self.optimizers:
-            for p, v in opt.state.items():
-                opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
-
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given TPU device.
 

diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
@@ -1360,7 +1360,7 @@ def training_step(...):
             **kwargs: Additional keyword arguments to be forwarded to :meth:`~torch.Tensor.backward`
         """
         self._verify_is_manual_optimization("manual_backward")
-        self.trainer.accelerator.backward(loss, None, None, *args, **kwargs)
+        self.trainer.training_type_plugin.backward(loss, None, None, *args, **kwargs)
 
     def backward(
         self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs

diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py
@@ -161,4 +161,4 @@ def closure_dis():
         trainer = self._trainer
         assert trainer is not None
         with trainer.profiler.profile(profiler_action):
-            trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
+            trainer.training_type_plugin.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py
@@ -120,7 +120,7 @@ def device(self) -> torch.device:
 
         Use this to create tensors directly on the device if needed.
         """
-        return self._accelerator.root_device
+        return self._strategy.root_device
 
     @property
     def global_rank(self) -> int:

diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py
@@ -50,18 +50,20 @@ def __init__(self, optimizer: Optimizer, accelerator: Accelerator) -> None:
         self.__class__ = type("Lite" + optimizer.__class__.__name__, (self.__class__, optimizer.__class__), {})
         self._optimizer = optimizer
         self._accelerator = accelerator
+        # TODO: refactor to take Strategy as a param; is this an API-breaking change for Lite?
+        self._strategy = self._accelerator.training_type_plugin
 
     @property
     def optimizer(self) -> Optimizer:
         return self._optimizer
 
     def step(self, closure: Optional[Callable] = None) -> None:
         closure = closure or _do_nothing_closure
-        self._accelerator.optimizer_step(
+        self._strategy.optimizer_step(
             self.optimizer,
             opt_idx=0,
             closure=closure,
-            model=self._accelerator.model,
+            model=self._strategy.model,
         )
 
 

diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
@@ -109,7 +109,7 @@ def advance(
 
         if not self.trainer._data_connector.evaluation_data_fetcher.store_on_device:
             with self.trainer.profiler.profile("evaluation_batch_to_device"):
-                batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx)
+                batch = self.trainer.training_type_plugin.batch_to_device(batch, dataloader_idx=dataloader_idx)
 
         self.batch_progress.increment_ready()