Merge branch 'master' into plot/text1

Lightning-AI · Mar 21, 2023 · 24af473 · 24af473
2 parents ac7de67 + f735df1
commit 24af473
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 122 deletions.
diff --git a/docs/source/pages/lightning.rst b/docs/source/pages/lightning.rst
@@ -45,29 +45,31 @@ The example below shows how to use a metric in your `LightningModule <https://py
             self.log('train_acc_step', self.accuracy)
             ...
 
-        def training_epoch_end(self, outs):
+        def on_train_epoch_end(self):
             # log epoch metric
             self.log('train_acc_epoch', self.accuracy)
 
-Metric logging in Lightning happens through the ``self.log`` or ``self.log_dict`` method. Both methods only support the logging of *scalar-tensors*.
-While the vast majority of metrics in torchmetrics returns a scalar tensor, some metrics such as :class:`~torchmetrics.ConfusionMatrix`, :class:`~torchmetrics.ROC`,
-:class:`~torchmetrics.MeanAveragePrecision`, :class:`~torchmetrics.ROUGEScore` return outputs that are non-scalar tensors (often dicts or list of tensors) and should therefore be
-dealt with separately. For info about the return type and shape please look at the documentation for the ``compute`` method for each metric you want to log.
+Metric logging in Lightning happens through the ``self.log`` or ``self.log_dict`` method. Both methods only support the
+logging of *scalar-tensors*. While the vast majority of metrics in torchmetrics returns a scalar tensor, some metrics
+such as :class:`~torchmetrics.ConfusionMatrix`, :class:`~torchmetrics.ROC`, :class:`~torchmetrics.MeanAveragePrecision`,
+:class:`~torchmetrics.ROUGEScore` return outputs that are non-scalar tensors (often dicts or lists of tensors) and should
+therefore be dealt with separately. For info about the return type and shape please look at the documentation for the
+``compute`` method for each metric you want to log.
 
 ********************
 Logging TorchMetrics
 ********************
 
-Logging metrics can be done in two ways: either logging the metric object directly or the computed metric values. When :class:`~torchmetrics.Metric` objects, which return a scalar tensor
-are logged directly in Lightning using the LightningModule `self.log <https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html#logging-from-a-lightningmodule>`_ method,
-Lightning will log the metric based on ``on_step`` and ``on_epoch`` flags present in ``self.log(...)``. If ``on_epoch`` is True, the logger automatically logs the end of epoch metric
-value by calling ``.compute()``.
+Logging metrics can be done in two ways: either logging the metric object directly or the computed metric values.
+When :class:`~torchmetrics.Metric` objects, which return a scalar tensor, are logged directly in Lightning using the
+LightningModule `self.log <https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html#logging-from-a-lightningmodule>`_
+method, Lightning will log the metric based on ``on_step`` and ``on_epoch`` flags present in ``self.log(...)``. If
+``on_epoch`` is True, the logger automatically logs the end of epoch metric value by calling ``.compute()``.
 
 .. note::
 
-    ``sync_dist``, ``sync_dist_op``, ``sync_dist_group``, ``reduce_fx`` and ``tbptt_reduce_fx``
-    flags from ``self.log(...)`` don't affect the metric logging in any manner. The metric class
-    contains its own distributed synchronization logic.
+    ``sync_dist``, ``sync_dist_group`` and ``reduce_fx`` flags from ``self.log(...)`` don't affect the metric logging
+    in any manner. The metric class contains its own distributed synchronization logic.
 
     This however is only true for metrics that inherit the base class ``Metric``,
     and thus the functional metric API provides no support for in-built distributed synchronization
@@ -96,8 +98,8 @@ value by calling ``.compute()``.
             self.valid_acc(logits, y)
             self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True)
 
-As an alternative to logging the metric object and letting Lightning take care of when to reset the metric etc. you can also manually log the output
-of the metrics.
+As an alternative to logging the metric object and letting Lightning take care of when to reset the metric etc., you can
+also manually log the output of the metrics.
 
 .. testcode:: python
 
@@ -115,27 +117,28 @@ of the metrics.
             batch_value = self.train_acc(preds, y)
             self.log('train_acc_step', batch_value)
 
-        def training_epoch_end(self, outputs):
+        def on_train_epoch_end(self):
             self.train_acc.reset()
 
         def validation_step(self, batch, batch_idx):
             logits = self(x)
             ...
             self.valid_acc.update(logits, y)
 
-        def validation_epoch_end(self, outputs):
+        def on_validation_epoch_end(self):
             self.log('valid_acc_epoch', self.valid_acc.compute())
             self.valid_acc.reset()
 
-Note that logging metrics this way will require you to manually reset the metrics at the end of the epoch yourself. In general, we recommend logging
-the metric object to make sure that metrics are correctly computed and reset. Additionally, we highly recommend that the two ways of logging are not
-mixed as it can lead to wrong results.
+Note that logging metrics this way will require you to manually reset the metrics at the end of the epoch yourself.
+In general, we recommend logging the metric object to make sure that metrics are correctly computed and reset.
+Additionally, we highly recommend that the two ways of logging are not mixed as it can lead to wrong results.
 
 .. note::
 
-    When using any Modular metric, calling ``self.metric(...)`` or ``self.metric.forward(...)`` serves the dual purpose of calling ``self.metric.update()``
-    on its input and simultaneously returning the metric value over the provided input. So if you are logging a metric *only* on epoch-level (as in the
-    example above), it is recommended to call ``self.metric.update()`` directly to avoid the extra computation.
+    When using any Modular metric, calling ``self.metric(...)`` or ``self.metric.forward(...)`` serves the dual purpose
+    of calling ``self.metric.update()`` on its input and simultaneously returning the metric value over the provided
+    input. So if you are logging a metric *only* on epoch-level (as in the example above), it is recommended to call
+    ``self.metric.update()`` directly to avoid the extra computation.
 
     .. testcode:: python
 
@@ -158,25 +161,6 @@ Common Pitfalls
 
 The following contains a list of pitfalls to be aware of:
 
-* If using metrics in data parallel mode (dp), the metric update/logging should be done
-  in the ``<mode>_step_end`` method (where ``<mode>`` is either ``training``, ``validation``
-  or ``test``). This is because ``dp`` split the batches during the forward pass and metric states are destroyed after each forward pass, thus leading to wrong accumulation. In practice do the following:
-
-.. testcode:: python
-
-    class MyModule(LightningModule):
-
-        def training_step(self, batch, batch_idx):
-            data, target = batch
-            preds = self(data)
-            # ...
-            return {'loss': loss, 'preds': preds, 'target': target}
-
-        def training_step_end(self, outputs):
-            # update and log
-            self.metric(outputs['preds'], outputs['target'])
-            self.log('metric', self.metric)
-
 * Modular metrics contain internal states that should belong to only one DataLoader. In case you are using multiple DataLoaders,
   it is recommended to initialize separate modular metric instances for each DataLoader and use them separately. The same holds
   for using separate metrics for training, validation and testing.

diff --git a/docs/source/pages/overview.rst b/docs/source/pages/overview.rst
@@ -130,32 +130,6 @@ the native `MetricCollection`_ module can also be used to wrap multiple metrics.
 
 You can always check which device the metric is located on using the `.device` property.
 
-Metrics in Dataparallel (DP) mode
-=================================
-
-When using metrics in `Dataparallel (DP) <https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html#torch.nn.DataParallel>`_
-mode, one should be aware DP will both create and clean-up replicas of Metric objects during a single forward pass.
-This has the consequence, that the metric state of the replicas will as default be destroyed before we can sync
-them. It is therefore recommended, when using metrics in DP mode, to initialize them with ``dist_sync_on_step=True``
-such that metric states are synchonized between the main process and the replicas before they are destroyed.
-
-Addtionally, if metrics are used together with a `LightningModule` the metric update/logging should be done
-in the ``<mode>_step_end`` method (where ``<mode>`` is either ``training``, ``validation`` or ``test``), else
-it will lead to wrong accumulation. In practice do the following:
-
-.. testcode::
-
-    def training_step(self, batch, batch_idx):
-        data, target = batch
-        preds = self(data)
-        ...
-        return {'loss': loss, 'preds': preds, 'target': target}
-
-    def training_step_end(self, outputs):
-        #update and log
-        self.metric(outputs['preds'], outputs['target'])
-        self.log('metric', self.metric)
-
 Metrics in Distributed Data Parallel (DDP) mode
 ===============================================
 

diff --git a/tests/integrations/lightning/boring_model.py b/tests/integrations/lightning/boring_model.py
@@ -90,55 +90,32 @@ def training_step(self, batch, batch_idx):
         loss = self.loss(batch, output)
         return {"loss": loss}
 
-    @staticmethod
-    def training_step_end(training_step_outputs):
-        """Run at the end of a training step. Needed when using multiple devices."""
-        return training_step_outputs
-
-    @staticmethod
-    def training_epoch_end(outputs) -> None:
-        """Run at the end of a training epoch."""
-        torch.stack([x["loss"] for x in outputs]).mean()
-
     def validation_step(self, batch, batch_idx):
         """Single validation step in the model."""
         output = self.layer(batch)
         loss = self.loss(batch, output)
         return {"x": loss}
 
-    @staticmethod
-    def validation_epoch_end(outputs) -> None:
-        """Run at the end of each validation epoch."""
-        torch.stack([x["x"] for x in outputs]).mean()
-
     def test_step(self, batch, batch_idx):
         """Single test step in the model."""
         output = self.layer(batch)
         loss = self.loss(batch, output)
         return {"y": loss}
 
-    @staticmethod
-    def test_epoch_end(outputs) -> None:
-        """Run at the end of each test epoch."""
-        torch.stack([x["y"] for x in outputs]).mean()
-
     def configure_optimizers(self):
         """Configure which optimizer to use when training the model."""
         optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
-        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
-        return [optimizer], [lr_scheduler]
+        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
+        return {"optimizer": optimizer, "scheduler": lr_scheduler}
 
-    @staticmethod
-    def train_dataloader():
+    def train_dataloader(self):
         """Define train dataloader used for training the model."""
         return torch.utils.data.DataLoader(RandomDataset(32, 64))
 
-    @staticmethod
-    def val_dataloader():
+    def val_dataloader(self):
         """Define validation dataloader used for validating the model."""
         return torch.utils.data.DataLoader(RandomDataset(32, 64))
 
-    @staticmethod
-    def test_dataloader():
+    def test_dataloader(self):
         """Define test dataloader used for testing the model."""
         return torch.utils.data.DataLoader(RandomDataset(32, 64))
diff --git a/tests/integrations/test_lightning.py b/tests/integrations/test_lightning.py
@@ -49,7 +49,7 @@ def training_step(self, batch, batch_idx):
 
             return self.step(x)
 
-        def training_epoch_end(self, outs):
+        def on_training_epoch_end(self):
             if not torch.allclose(self.sum, self.metric.compute()):
                 raise ValueError("Sum and computed value must be equal")
             self.sum = 0.0
@@ -71,10 +71,10 @@ def training_epoch_end(self, outs):
 def test_metrics_reset(tmpdir):
     """Tests that metrics are reset correctly after the end of the train/val/test epoch.
 
-    Taken from:     `Metric Test for Reset`_
+    Taken from: `Metric Test for Reset`_
     """
 
-    class TestModel(LightningModule):
+    class TestModel(BoringModel):
         def __init__(self) -> None:
             super().__init__()
             self.layer = torch.nn.Linear(32, 1)
@@ -122,37 +122,20 @@ def validation_step(self, batch, batch_idx, *args, **kwargs):
         def test_step(self, batch, batch_idx, *args, **kwargs):
             return self._step("test", batch)
 
-        def configure_optimizers(self):
-            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
-            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
-            return [optimizer], [lr_scheduler]
-
-        @staticmethod
-        def train_dataloader():
-            return DataLoader(RandomDataset(32, 64), batch_size=2)
-
-        @staticmethod
-        def val_dataloader():
-            return DataLoader(RandomDataset(32, 64), batch_size=2)
-
-        @staticmethod
-        def test_dataloader():
-            return DataLoader(RandomDataset(32, 64), batch_size=2)
-
         def _assert_epoch_end(self, stage):
             acc = self._modules[f"acc_{stage}"]
             ap = self._modules[f"ap_{stage}"]
 
             acc.reset.asset_not_called()
             ap.reset.assert_not_called()
 
-        def train_epoch_end(self, outputs):
+        def on_train_epoch_end(self):
             self._assert_epoch_end("train")
 
-        def validation_epoch_end(self, outputs):
+        def on_validation_epoch_end(self):
             self._assert_epoch_end("val")
 
-        def test_epoch_end(self, outputs):
+        def on_test_epoch_end(self):
             self._assert_epoch_end("test")
 
     def _assert_called(model, stage):
@@ -194,6 +177,7 @@ def __init__(self) -> None:
             self.metric_step = SumMetric()
             self.metric_epoch = SumMetric()
             self.sum = torch.tensor(0.0)
+            self.outs = []
 
         def on_train_epoch_start(self):
             self.sum = torch.tensor(0.0)
@@ -203,10 +187,12 @@ def training_step(self, batch, batch_idx):
             self.metric_step(x.sum())
             self.sum += x.sum()
             self.log("sum_step", self.metric_step, on_epoch=True, on_step=False)
-            return {"loss": self.step(x), "data": x}
+            self.outs.append(x)
+            return self.step(x)
 
-        def training_epoch_end(self, outs):
-            self.log("sum_epoch", self.metric_epoch(torch.stack([o["data"] for o in outs]).sum()))
+        def on_train_epoch_end(self):
+            self.log("sum_epoch", self.metric_epoch(torch.stack(self.outs)))
+            self.outs = []
 
     model = TestModel()
 
@@ -246,7 +232,7 @@ def training_step(self, batch, batch_idx):
             self.log_dict({f"{k}_step": v for k, v in metric_vals.items()})
             return self.step(x)
 
-        def training_epoch_end(self, outputs):
+        def on_train_epoch_end(self):
             metric_vals = self.metric.compute()
             self.log_dict({f"{k}_epoch": v for k, v in metric_vals.items()})