From 4549007e6562d78e21a5a8cf96721e9f01155b93 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Mon, 28 Jun 2021 12:37:21 +0100
Subject: [PATCH 1/5] wip

---
 .../connectors/logger_connector/logger_connector.py | 6 +++---
 tests/trainer/logging_/test_train_loop_logging.py   | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 6aae89273f4fc..d2628ae94762a 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -91,14 +91,14 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
             step: Step for which metrics should be logged. Default value is `self.global_step` during training or
                 the total validation / test log step count during validation and testing.
         """
-        if self.trainer.logger is None or not metrics:
-            return
-
         # add gpu memory
         if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
             mem_map = memory.get_memory_profile(self.log_gpu_memory)
             metrics.update(mem_map)
 
+        if self.trainer.logger is None or not metrics:
+            return
+
         # turn all tensors to scalars
         scalar_metrics = metrics_to_scalars(metrics)
 
diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py
index 8ca51b2dee3ef..68472bef9558c 100644
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@@ -712,3 +712,10 @@ def training_step(self, *args):
     model = TestModel()
     with pytest.raises(MisconfigurationException, match=r'reduce_fx={min,max,mean,sum}\)` are currently supported'):
         trainer.fit(model)
+
+
+def test_log_gpu_memory_without_logging_on_step(tmpdir):
+
+    model = BoringModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, log_gpu_memory='all', log_every_n_steps=1)
+    trainer.fit(model)

From 3fc2d949fe61505ffcc08b1d1e0850e4d39d727b Mon Sep 17 00:00:00 2001
From: tchaton
Date: Mon, 28 Jun 2021 12:38:20 +0100
Subject: [PATCH 2/5] wip

---
 .../trainer/connectors/logger_connector/logger_connector.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index d2628ae94762a..ba5fcf5838b4c 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -91,12 +91,15 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
             step: Step for which metrics should be logged. Default value is `self.global_step` during training or
                 the total validation / test log step count during validation and testing.
         """
+        if self.trainer.logger is None:
+            return
+
         # add gpu memory
         if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
             mem_map = memory.get_memory_profile(self.log_gpu_memory)
             metrics.update(mem_map)
 
-        if self.trainer.logger is None or not metrics:
+        if not metrics:
             return
 
         # turn all tensors to scalars

From ead9f88b3549d8a3a86cbcdcdc34e77cfc37703e Mon Sep 17 00:00:00 2001
From: Thomas Chaton
Date: Mon, 28 Jun 2021 08:14:47 -0400
Subject: [PATCH 3/5] resolve gpu memory logging issue

---
 .../logger_connector/logger_connector.py     | 24 ++++++++++++++-----
 .../logging_/test_train_loop_logging.py      | 13 +++++++++-
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index d2628ae94762a..929e4f2394de0 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -38,6 +38,7 @@ def __init__(self, trainer: 'pl.Trainer', log_gpu_memory: Optional[str] = None)
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
+        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None
@@ -91,11 +92,6 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
             step: Step for which metrics should be logged. Default value is `self.global_step` during training or
                 the total validation / test log step count during validation and testing.
         """
-        # add gpu memory
-        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
-            mem_map = memory.get_memory_profile(self.log_gpu_memory)
-            metrics.update(mem_map)
-
         if self.trainer.logger is None or not metrics:
             return
 
@@ -213,6 +209,8 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return
 
+        self._log_gpus_metrics()
+
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:
@@ -226,6 +224,12 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)
 
+    def _log_gpus_metrics(self):
+        for key, memory in self.gpus_metrics.items():
+            gpu_id = int(key.split('/')[0].split(':')[1])
+            if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                self.trainer.lightning_module.log(key, memory, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+
     """
     Utilities and properties
     """
@@ -274,7 +278,15 @@ def reset(self, metrics: Optional[bool] = None) -> None:
     def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         """This function returns either batch or epoch metrics depending on ``_epoch_end_reached``."""
         on_step = not self._epoch_end_reached
-        return self.trainer._results.metrics(on_step)
+        metrics = self.trainer._results.metrics(on_step)
+        return metrics
+
+    @property
+    def gpus_metrics(self) -> Dict[str, str]:
+        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
+            mem_map = memory.get_memory_profile(self.log_gpu_memory)
+            self._gpus_metrics.update(mem_map)
+        return self._gpus_metrics
 
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py
index 68472bef9558c..61e7a2a37ebc6 100644
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@@ -714,8 +714,19 @@ def training_step(self, *args):
         trainer.fit(model)
 
 
+@RunIf(min_gpus=2)
 def test_log_gpu_memory_without_logging_on_step(tmpdir):
 
     model = BoringModel()
-    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, log_gpu_memory='all', log_every_n_steps=1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        log_gpu_memory='all',
+        log_every_n_steps=1,
+        gpus=[1]
+    )
     trainer.fit(model)
+
+    assert 'gpu_id: 1/memory.used (MB)' in trainer.logged_metrics

From c6e40e886c816099c5606757cb0518e3ba7b93df Mon Sep 17 00:00:00 2001
From: tchaton
Date: Mon, 28 Jun 2021 13:19:40 +0100
Subject: [PATCH 4/5] update changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 81fbefca31453..fee8f94f226b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -317,6 +317,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
 
+
+- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174))
+
+
 ## [1.3.7] - 2021-06-22
 
 - Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))

From 319ad41be1988de39bd055ae27ef95405e2c24b4 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Mon, 28 Jun 2021 13:22:38 +0100
Subject: [PATCH 5/5] remove change

---
 .../trainer/connectors/logger_connector/logger_connector.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 22c67902db80d..5e75d51fc33a6 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -278,8 +278,7 @@ def reset(self, metrics: Optional[bool] = None) -> None:
     def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         """This function returns either batch or epoch metrics depending on ``_epoch_end_reached``."""
         on_step = not self._epoch_end_reached
-        metrics = self.trainer._results.metrics(on_step)
-        return metrics
+        return self.trainer._results.metrics(on_step)
 
     @property
     def gpus_metrics(self) -> Dict[str, str]: