From 940b910d271b827d472f394d1d8eea5bd9e35ca4 Mon Sep 17 00:00:00 2001 From: Danielle Pintz <38207072+daniellepintz@users.noreply.github.com> Date: Wed, 13 Oct 2021 11:29:36 -0700 Subject: [PATCH] [2/4] Add DeviceStatsMonitor callback (#9712) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: ananthsub Co-authored-by: thomas chaton Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Kaushik B Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- CHANGELOG.md | 3 + dockers/tpu-tests/tpu_test_cases.jsonnet | 1 + docs/source/extensions/accelerators.rst | 1 + docs/source/extensions/callbacks.rst | 1 + pyproject.toml | 1 + pytorch_lightning/accelerators/cpu.py | 2 +- pytorch_lightning/accelerators/ipu.py | 7 +- pytorch_lightning/callbacks/__init__.py | 2 + .../callbacks/device_stats_monitor.py | 82 +++++++++++ tests/accelerators/test_tpu.py | 16 --- tests/callbacks/test_device_stats_monitor.py | 130 ++++++++++++++++++ 11 files changed, 228 insertions(+), 18 deletions(-) create mode 100644 pytorch_lightning/callbacks/device_stats_monitor.py delete mode 100644 tests/accelerators/test_tpu.py create mode 100644 tests/callbacks/test_device_stats_monitor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ba1be2433463f..6f12dda513629 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -163,6 +163,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a warning when an unknown key is encountered in optimizer configuration, and when `OneCycleLR` is used with `"interval": "epoch"` ([#9666](https://github.com/PyTorchLightning/pytorch-lightning/pull/9666)) +- Added `DeviceStatsMonitor` callback ([#9712](https://github.com/PyTorchLightning/pytorch-lightning/pull/9712)) + + - Added `enable_progress_bar` to Trainer constructor ([#9664](https://github.com/PyTorchLightning/pytorch-lightning/pull/9664)) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 4a3b9728221a7..67c46e8a53ec8 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -36,6 +36,7 @@ local tputests = base.BaseTest { tests/profiler/test_xla_profiler.py \ pytorch_lightning/utilities/xla_device.py \ tests/accelerators/test_tpu_backend.py \ + tests/callbacks/test_device_stats_monitor.py \ tests/models/test_tpu.py test_exit_code=$? echo "\n||| END PYTEST LOGS |||\n" diff --git a/docs/source/extensions/accelerators.rst b/docs/source/extensions/accelerators.rst index 6bd9ebd8267bd..61aad18eea4f8 100644 --- a/docs/source/extensions/accelerators.rst +++ b/docs/source/extensions/accelerators.rst @@ -14,6 +14,7 @@ Currently there are accelerators for: - CPU - GPU - TPU +- IPU Each Accelerator gets two plugins upon initialization: One to handle differences from the training routine and one to handle different precisions. diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index ad61c10a7bd3b..f8088d1d4153e 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -99,6 +99,7 @@ Lightning has a few built-in callbacks. 
BaseFinetuning BasePredictionWriter Callback + DeviceStatsMonitor EarlyStopping GPUStatsMonitor GradientAccumulationScheduler diff --git a/pyproject.toml b/pyproject.toml index be5b5fe4c571a..94ae7d502256c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ ignore_errors = "True" [[tool.mypy.overrides]] module = [ + "pytorch_lightning.callbacks.device_stats_monitor", "pytorch_lightning.callbacks.model_summary", "pytorch_lightning.callbacks.pruning", "pytorch_lightning.callbacks.rich_model_summary", diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index baa922b6d796b..d16e8b6a8b1ac 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -35,5 +35,5 @@ def setup(self, trainer: "pl.Trainer") -> None: return super().setup(trainer) def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """Returns dummy implementation for now.""" + """CPU device stats aren't supported yet.""" return {} diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index 1456847a6ab4a..fbd23b5f2a217 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable +from typing import Any, Callable, Dict, Union +import torch from torch.optim import Optimizer import pytorch_lightning as pl @@ -37,3 +38,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None: # Optimizer step is handled by the IPU accelerator. lambda_closure() + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + """IPU device stats aren't supported yet.""" + return {} diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index 98cf5df7cafda..b94fa969f6ac9 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.callbacks.device_stats_monitor import DeviceStatsMonitor from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor @@ -33,6 +34,7 @@ "BackboneFinetuning", "BaseFinetuning", "Callback", + "DeviceStatsMonitor", "EarlyStopping", "GPUStatsMonitor", "XLAStatsMonitor", diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py new file mode 100644 index 0000000000000..b743ed3e1bbeb --- /dev/null +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -0,0 +1,82 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Device Stats Monitor
+====================
+
+Monitors and logs device stats during training.
+
+"""
+from typing import Any, Dict, Optional
+
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.types import STEP_OUTPUT
+
+
+class DeviceStatsMonitor(Callback):
+    r"""
+    Automatically monitors and logs device stats during the training stage. ``DeviceStatsMonitor``
+    is a special callback as it requires a ``logger`` to be passed as an argument to the ``Trainer``.
+
+    Raises:
+        MisconfigurationException:
+            If ``Trainer`` has no logger.
+
+    Example:
+        >>> from pytorch_lightning import Trainer
+        >>> from pytorch_lightning.callbacks import DeviceStatsMonitor
+        >>> device_stats = DeviceStatsMonitor() # doctest: +SKIP
+        >>> trainer = Trainer(callbacks=[device_stats]) # doctest: +SKIP
+    """
+
+    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.")
+
+    def on_train_batch_start(
+        self,
+        trainer: "pl.Trainer",
+        pl_module: "pl.LightningModule",
+        batch: Any,
+        batch_idx: int,
+        unused: Optional[int] = 0,
+    ) -> None:
+        if not trainer.logger_connector.should_update_logs:
+            return
+
+        device_stats = trainer.accelerator.get_device_stats(pl_module.device)
+        prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_start")
+        trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step)
+
+    def on_train_batch_end(
+        self,
+        trainer: "pl.Trainer",
+        pl_module: "pl.LightningModule",
+        outputs: STEP_OUTPUT,
+        batch: Any,
+        batch_idx: int,
+        unused: Optional[int] = 0,
+    ) -> None:
+        if not trainer.logger_connector.should_update_logs:
+            return
+
+        device_stats = trainer.accelerator.get_device_stats(pl_module.device)
+        prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_end")
+        trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step)
+
+
+def prefix_metrics_keys(metrics_dict: Dict[str, float], prefix: str) -> Dict[str, float]:
+    return {prefix + "." + k: v for k, v in metrics_dict.items()}
diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py
deleted file mode 100644
index f3a2c50c0e347..0000000000000
--- a/tests/accelerators/test_tpu.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from pytorch_lightning.accelerators import TPUAccelerator
-from pytorch_lightning.plugins import SingleTPUPlugin
-from pytorch_lightning.plugins.training_type import TPUSpawnPlugin
-from tests.helpers.runif import RunIf
-
-
-@RunIf(tpu=True)
-def test_device_stats_tpu(tmpdir):
-    """Test TPU get_device_stats."""
-    plugin = SingleTPUPlugin(1)
-    TPUAccel = TPUAccelerator(training_type_plugin=TPUSpawnPlugin(), precision_plugin=plugin)
-    tpu_stats = TPUAccel.get_device_stats("1")
-    fields = ["avg. free memory (MB)", "avg.
peak memory (MB)"] - - for f in fields: - assert any(f in h for h in tpu_stats.keys()) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py new file mode 100644 index 0000000000000..7a771ccdc8c43 --- /dev/null +++ b/tests/callbacks/test_device_stats_monitor.py @@ -0,0 +1,130 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional + +import pytest + +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import DeviceStatsMonitor +from pytorch_lightning.loggers import CSVLogger +from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers import BoringModel +from tests.helpers.runif import RunIf + + +@RunIf(min_torch="1.8") +@RunIf(min_gpus=1) +def test_device_stats_gpu_from_torch(tmpdir): + """Test GPU stats are logged using a logger with Pytorch >= 1.8.0.""" + model = BoringModel() + device_stats = DeviceStatsMonitor() + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] + for f in fields: + assert any(f in h for h in metrics.keys()) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=7, + log_every_n_steps=1, + gpus=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + checkpoint_callback=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +@RunIf(max_torch="1.7") +@RunIf(min_gpus=1) +def test_device_stats_gpu_from_nvidia(tmpdir): + """Test GPU stats are logged using a logger with Pytorch < 1.8.0.""" + model = BoringModel() + device_stats = DeviceStatsMonitor() + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] + for f in fields: + assert any(f in h for h in metrics.keys()) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=7, + log_every_n_steps=1, + gpus=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + checkpoint_callback=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +@RunIf(tpu=True) +def test_device_stats_monitor_tpu(tmpdir): + """Test TPU stats are logged using a logger.""" + + model = BoringModel() + device_stats = DeviceStatsMonitor() + + class DebugLogger(CSVLogger): + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + fields = ["avg. free memory (MB)", "avg. 
peak memory (MB)"] + for f in fields: + assert any(f in h for h in metrics.keys()) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=1, + tpu_cores=8, + log_every_n_steps=1, + callbacks=[device_stats], + logger=DebugLogger(tmpdir), + checkpoint_callback=False, + enable_progress_bar=False, + ) + + trainer.fit(model) + + +def test_device_stats_monitor_no_logger(tmpdir): + """Test DeviceStatsMonitor with no logger in Trainer.""" + + model = BoringModel() + device_stats = DeviceStatsMonitor() + + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[device_stats], + max_epochs=1, + logger=False, + checkpoint_callback=False, + enable_progress_bar=False, + ) + + with pytest.raises(MisconfigurationException, match="Trainer that has no logger."): + trainer.fit(model)