Lightning-AI · DuYicong515 · Mar 5, 2022 · Mar 5, 2022
@@ -755,6 +755,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131))
 
 
+- Fixed an issue where metric states were incorrectly saved and restored on checkpointing when `_fault_tolerant_training()` is `False` ([#12236](https://github.com/PyTorchLightning/pytorch-lightning/pull/12236))
+
+
 ## [1.5.10] - 2022-02-08
 
 ### Fixed

@@ -27,7 +27,6 @@
 from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _fault_tolerant_training
 from pytorch_lightning.utilities.migration import pl_legacy_patch
 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info
 from pytorch_lightning.utilities.types import _PATH
@@ -408,11 +407,7 @@ def save_checkpoint(self, filepath: _PATH, weights_only: bool = False) -> None:
         self.trainer.strategy.save_checkpoint(_checkpoint, filepath)
 
     def _get_lightning_module_state_dict(self) -> Dict[str, torch.Tensor]:
-        metrics = (
-            [m for m in self.trainer.lightning_module.modules() if isinstance(m, Metric)]
-            if _fault_tolerant_training()
-            else []
-        )
+        metrics = [m for m in self.trainer.lightning_module.modules() if isinstance(m, Metric)]
 
         for metric in metrics:
             metric.persistent(True)

@@ -16,6 +16,7 @@
 
 import pytest
 import torch
+from torchmetrics import Metric
 from torchmetrics.functional import mean_absolute_percentage_error as mape
 
 from pytorch_lightning import seed_everything, Trainer
@@ -243,5 +244,8 @@ def test_quantization_val_test_predict(tmpdir):
 
     expected_state_dict = expected_qmodel.state_dict()
     for key, value in val_test_predict_qmodel.state_dict().items():
-        expected_value = expected_state_dict[key]
-        assert torch.allclose(value, expected_value)
+        # validate and test will affect metric module state, other modules should be comparable.
+        submodule = getattr(val_test_predict_qmodel, key.split(".")[0])
+        if not isinstance(submodule, Metric):
+            expected_value = expected_state_dict[key]
+            assert torch.allclose(value, expected_value)
@@ -17,12 +17,14 @@
 
 import pytest
 import torch
+from torchmetrics import Metric
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.plugins.environments import SLURMEnvironment
 from pytorch_lightning.trainer.states import TrainerFn
 from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
 
 
 # TODO: remove HPCHookedModel in v1.8
@@ -191,3 +193,46 @@ def test_loops_restore(tmpdir):
             if fn2 not in (fn, TrainerFn.TUNING):
                 trainer_loop2 = getattr(trainer, f"{fn2}_loop")
                 trainer_loop2.load_state_dict.assert_not_called()
+
+
+class DummyMetric(Metric):
+    def __init__(self):
+        super().__init__()
+        self.add_state("x", torch.tensor(0))
+
+    def update(self, x):
+        self.x += x
+
+    def compute(self):
+        return self.x
+
+
+class BoringModelWithMetric(BoringModel):
+    def __init__(self):
+        super().__init__()
+        self.dummy_metric = DummyMetric()
+
+    def training_step(self, batch, batch_idx):
+        self.dummy_metric.update(batch_idx)
+        return super().training_step(batch, batch_idx)
+
+
+@RunIf(min_gpus=2)
+def test_metric_load_and_save(tmpdir):
+    model = BoringModelWithMetric()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_steps=1,
+        accelerator="gpu",
+        devices=2,
+        logger=False,
+        callbacks=[ModelCheckpoint(dirpath=tmpdir, save_weights_only=True)],
+    )
+    trainer.fit(model)
+    checkpoint_path = trainer.checkpoint_callback.best_model_path
+    rank_0_local_state = model.dummy_metric.x
+    restored_model = BoringModelWithMetric.load_from_checkpoint(checkpoint_path)
+
+    # Rank 0 should restore from the synced state, which is a tensor stacked across processes
+    assert torch.equal(restored_model.dummy_metric.x, torch.tensor([0, 0]))
+    assert torch.equal(rank_0_local_state, torch.tensor(0))