diff --git a/CHANGELOG.md b/CHANGELOG.md
index 81fbefca31453..fee8f94f226b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -317,6 +317,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
 
+
+- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174))
+
+
 ## [1.3.7] - 2021-06-22
 
 - Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 6aae89273f4fc..5e75d51fc33a6 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -38,6 +38,7 @@ def __init__(self, trainer: 'pl.Trainer', log_gpu_memory: Optional[str] = None)
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
+        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None
@@ -94,11 +95,6 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
         if self.trainer.logger is None or not metrics:
             return
 
-        # add gpu memory
-        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
-            mem_map = memory.get_memory_profile(self.log_gpu_memory)
-            metrics.update(mem_map)
-
         # turn all tensors to scalars
         scalar_metrics = metrics_to_scalars(metrics)
 
@@ -213,6 +209,8 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return
 
+        self._log_gpus_metrics()
+
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:
@@ -226,6 +224,12 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)
 
+    def _log_gpus_metrics(self):
+        for key, mem in self.gpus_metrics.items():
+            gpu_id = int(key.split('/')[0].split(':')[1])
+            if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+
     """
     Utilities and properties
     """
@@ -276,6 +280,13 @@ def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         on_step = not self._epoch_end_reached
         return self.trainer._results.metrics(on_step)
 
+    @property
+    def gpus_metrics(self) -> Dict[str, str]:
+        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
+            mem_map = memory.get_memory_profile(self.log_gpu_memory)
+            self._gpus_metrics.update(mem_map)
+        return self._gpus_metrics
+
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
         if self.trainer._results:
diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py
index 8ca51b2dee3ef..61e7a2a37ebc6 100644
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@@ -712,3 +712,21 @@ def training_step(self, *args):
     model = TestModel()
     with pytest.raises(MisconfigurationException, match=r'reduce_fx={min,max,mean,sum}\)` are currently supported'):
         trainer.fit(model)
+
+
+@RunIf(min_gpus=2)
+def test_log_gpu_memory_without_logging_on_step(tmpdir):
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        log_gpu_memory='all',
+        log_every_n_steps=1,
+        gpus=[1]
+    )
+    trainer.fit(model)
+
+    assert 'gpu_id: 1/memory.used (MB)' in trainer.logged_metrics
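For context (not part of the diff above): a minimal sketch of how the fixed behaviour can be exercised from user code. `TinyModel` is a stand-in module invented here for illustration; the Trainer arguments mirror the new test, and the expected metric key format follows the assertion `'gpu_id: 1/memory.used (MB)'` in that test.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning import Trainer


class TinyModel(pl.LightningModule):
    """Stand-in LightningModule that logs nothing in training_step."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        # Nothing is logged here on purpose: the fix ensures the GPU memory
        # metrics still reach the logger even when the module logs nothing.
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(8, 32)), batch_size=4)


if __name__ == "__main__":
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        log_gpu_memory='all',  # or 'min_max'
        log_every_n_steps=1,
        gpus=1,                # requires at least one visible GPU
    )
    trainer.fit(TinyModel())
    # After the fix, keys such as 'gpu_id: 0/memory.used (MB)' appear here
    # even though training_step never called self.log().
    print(trainer.logged_metrics)
```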