Lightning-AI · carmocca · Jul 1, 2021 · Jun 29, 2021 · Jun 29, 2021 · Jun 29, 2021
@@ -123,6 +123,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added XLA Profiler ([#8014](https://github.com/PyTorchLightning/pytorch-lightning/pull/8014))
 
 
+- Added `state_dict` and `load_state_dict` functions to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197))
+
+
 - Added `should_raise_exception` parameter to `parse_gpu_ids`, `parse_tpu_cores` and `_sanitize_gpu_ids` utility functions ([#8194](https://github.com/PyTorchLightning/pytorch-lightning/pull/8194))
 
 

@@ -13,11 +13,12 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Dict, Optional
 
 from deprecate import void
 
 import pytorch_lightning as pl
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class Loop(ABC):
@@ -46,6 +47,10 @@ def __init__(self) -> None:
         self.iteration_count: int = 0
         self.trainer: Optional['pl.Trainer'] = None
 
+    @property
+    def is_connected(self) -> bool:
+        return self.trainer is not None
+
     @property
     @abstractmethod
     def done(self) -> bool:
@@ -59,6 +64,10 @@ def skip(self) -> bool:
     def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
         """Connects Loop with all the necessary things like connectors and accelerators."""
         # TODO(@justusschock): Make the trainer a weakref/proxy
+        if not isinstance(trainer, pl.Trainer):
+            raise MisconfigurationException(
+                f"Loop {self.__class__.__name__} should be connected to a :class:`~pytorch_lightning.Trainer` instance."
+            )
         self.trainer = trainer
 
     def on_skip(self) -> Optional[Any]:
@@ -128,3 +137,9 @@ def on_run_end(self) -> Any:
 
     def teardown(self) -> None:
         """The very last method called inside :meth:`run`. Use to release memory etc."""
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        """Restore the loop state from the provided state_dict."""
+
+    def state_dict(self) -> Dict:
+        """Return the current state of the loop."""
diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -674,3 +674,9 @@ def _truncated_bptt_steps(self) -> int:
         if lightning_module.truncated_bptt_steps > 0:
             return lightning_module.truncated_bptt_steps
         return self.trainer.truncated_bptt_steps or 0
+
+    def state_dict(self) -> Dict:
+        return {}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        pass
diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, List, Optional, Sequence, Union
+from typing import Any, Dict, List, Optional, Sequence, Union
 
 from deprecate.utils import void
 from torch.utils.data.dataloader import DataLoader
@@ -70,6 +70,7 @@ def predictions(self):
     def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None:
         """Connects the loop to everything necessary (like trainer and accelerators)"""
         super().connect(trainer, *args, **kwargs)
+        self.epoch_loop = EvaluationEpochLoop()
         self.epoch_loop.connect(trainer)
 
     @property
@@ -266,3 +267,9 @@ def on_evaluation_epoch_end(self) -> None:
         self.trainer.call_hook(hook_name)
         self.trainer.call_hook("on_epoch_end")
         self.trainer.logger_connector.on_epoch_end()
+
+    def state_dict(self) -> Dict:
+        return {}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        pass
@@ -47,8 +47,8 @@ def __init__(self, min_steps: int, max_steps: int):
         self.batches_seen: int = 0
         self.is_last_batch: Optional[bool] = None
 
-        self.batch_loop: Optional[TrainingBatchLoop] = None
-        self.val_loop: Optional[loops.EvaluationLoop] = None
+        self.batch_loop = TrainingBatchLoop()
+        self.val_loop = loops.EvaluationLoop()
 
         self._dataloader_idx: Optional[int] = None
         self._warning_cache: WarningCache = WarningCache()
@@ -425,3 +425,10 @@ def _save_loggers_on_train_batch_end(self) -> None:
         should_flush_logs = self.trainer.logger_connector.should_flush_logs
         if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None:
             self.trainer.logger.save()
+
+    def state_dict(self) -> Dict:
+        return {"batch_loop": self.batch_loop.state_dict(), "validation_loop": self.val_loop.state_dict()}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        self.batch_loop.load_state_dict(state_dict["batch_loop"])
+        self.val_loop.load_state_dict(state_dict["validation_loop"])
@@ -14,14 +14,15 @@
 
 import logging
 from contextlib import suppress
-from typing import Any, Optional
+from typing import Any, Dict, Optional
 
 import pytorch_lightning as pl
 from pytorch_lightning.loops import Loop
 from pytorch_lightning.loops.epoch import TrainingEpochLoop
 from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import rank_zero_info
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 log = logging.getLogger(__name__)
 
@@ -97,6 +98,12 @@ def min_steps(self) -> int:
         """Returns the minimum numnber of steps to run"""
         return self.epoch_loop.min_steps
 
+    @min_steps.setter
+    def min_steps(self, value: int) -> None:
+        """Sets the minimum number of steps (forwards to epoch_loop)"""
+        # TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided
+        self.epoch_loop.min_steps = value
+
     @property
     def max_steps(self) -> int:
         """Returns the maximum number of steps to run"""
@@ -159,6 +166,7 @@ def skip(self) -> bool:
     def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
         """Connects the loop with necessary arguments like the trainer"""
         super().connect(trainer, *args, **kwargs)
+        self.epoch_loop = TrainingEpochLoop(self.min_steps, self.max_steps)
         self.epoch_loop.connect(trainer)
 
     def reset(self) -> None:
@@ -274,3 +282,11 @@ def _check_checkpoint_callback(self, should_update: bool, is_last: bool = False)
 
             for cb in callbacks:
                 cb.on_validation_end(self.trainer, model)
+
+    def state_dict(self) -> Dict:
+        if not self.is_connected:
+            raise MisconfigurationException("The Trainer should be connected to loop to retrieve the state_dict.")
+        return {"epoch_loop": self.epoch_loop.state_dict()}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        self.epoch_loop.load_state_dict(state_dict["epoch_loop"])
@@ -16,7 +16,7 @@
 from abc import ABC
 from argparse import ArgumentParser, Namespace
 from pathlib import Path
-from typing import cast, List, Optional, Type, TypeVar, Union
+from typing import Any, cast, Dict, List, Optional, Type, TypeVar, Union
 
 import torch
 from torch.optim import Optimizer
@@ -555,6 +555,13 @@ def _results(self) -> Optional[ResultCollection]:
         if active_loop is not None:
             return active_loop.results
 
+    def get_loops_state_dict(self) -> Dict[str, Any]:
+        return {
+            "fit_loop": self.fit_loop.state_dict(),
+            "validate_loop": self.validation_loop.state_dict(),
+            "test_loop": self.test_loop.state_dict(),
+        }
+
     """
     Other
     """

diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py
@@ -0,0 +1,57 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest.mock import ANY
+
+import pytest
+
+from pytorch_lightning.loops import FitLoop
+from pytorch_lightning.trainer.trainer import Trainer
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+def test_loops_state_dict_structure():
+
+    fit_loop = FitLoop()
+    with pytest.raises(
+        MisconfigurationException, match="The Trainer should be connected to loop to retrieve the state_dict."
+    ):
+        state_dict = fit_loop.state_dict()
+    with pytest.raises(
+        MisconfigurationException,
+        match="Loop FitLoop should be connected to a :class:`~pytorch_lightning.Trainer` instance."
+    ):
+        fit_loop.connect(object())
+    fit_loop.connect(Trainer())
+    state_dict = fit_loop.state_dict()
+    expected = {'epoch_loop': {'batch_loop': ANY, 'validation_loop': ANY}}
+    assert state_dict == expected
+
+    fit_loop.load_state_dict(state_dict)
+
+
+def test_loops_state_dict_structure_with_trainer():
+
+    trainer = Trainer()
+    state_dict = trainer.get_loops_state_dict()
+    expected = {
+        "fit_loop": {
+            'epoch_loop': {
+                'batch_loop': ANY,
+                'validation_loop': ANY
+            }
+        },
+        "validate_loop": ANY,
+        "test_loop": ANY
+    }
+    assert state_dict == expected
Original file line number	Diff line number	Diff line change
Expand Up		@@ -123,6 +123,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
		- Added XLA Profiler ([#8014](https://github.com/PyTorchLightning/pytorch-lightning/pull/8014))


		- Added `state_dict` and `load_state_dict` functions to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197))


		- Added `should_raise_exception` parameter to `parse_gpu_ids`, `parse_tpu_cores` and `_sanitize_gpu_ids` utility functions ([#8194](https://github.com/PyTorchLightning/pytorch-lightning/pull/8194))


Expand Down