-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[2/4] Add DeviceStatsMonitor callback (#9712)
Co-authored-by: ananthsub <[email protected]> Co-authored-by: thomas chaton <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli <[email protected]> Co-authored-by: Kaushik B <[email protected]> Co-authored-by: Kaushik B <[email protected]>
- Loading branch information
1 parent
23e8b59
commit 940b910
Showing
11 changed files
with
228 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# Copyright The PyTorch Lightning team. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
""" | ||
Device Stats Monitor | ||
==================== | ||
Monitors and logs device stats during training. | ||
""" | ||
from typing import Any, Dict, Optional | ||
|
||
import pytorch_lightning as pl | ||
from pytorch_lightning.callbacks.base import Callback | ||
from pytorch_lightning.utilities.exceptions import MisconfigurationException | ||
from pytorch_lightning.utilities.types import STEP_OUTPUT | ||
|
||
|
||
class DeviceStatsMonitor(Callback):
    r"""
    Automatically monitors and logs device stats during training stage. ``DeviceStatsMonitor``
    is a special callback as it requires a ``logger`` to be passed as argument to the ``Trainer``.

    Stats are read from ``trainer.accelerator.get_device_stats`` for the device of the module being
    trained and are sent to ``trainer.logger`` at the start and at the end of each training batch,
    whenever ``trainer.logger_connector.should_update_logs`` is true. Metric keys are prefixed with
    the name of the hook that produced them (``on_train_batch_start`` or ``on_train_batch_end``).

    Raises:
        MisconfigurationException:
            If ``Trainer`` has no logger.

    Example:
        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import DeviceStatsMonitor
        >>> device_stats = DeviceStatsMonitor() # doctest: +SKIP
        >>> trainer = Trainer(callbacks=[device_stats]) # doctest: +SKIP
    """

    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
        """Reject a ``Trainer`` that has no logger, since the stats would have nowhere to go.

        Raises:
            MisconfigurationException: If ``trainer.logger`` is falsy.
        """
        if not trainer.logger:
            raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.")

    def on_train_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        unused: Optional[int] = 0,
    ) -> None:
        """Log the device stats under the ``on_train_batch_start`` prefix."""
        self._log_device_stats(trainer, pl_module, "on_train_batch_start")

    def on_train_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        unused: Optional[int] = 0,
    ) -> None:
        """Log the device stats under the ``on_train_batch_end`` prefix."""
        self._log_device_stats(trainer, pl_module, "on_train_batch_end")

    def _log_device_stats(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", hook_name: str) -> None:
        """Query the accelerator for device stats and send them to the trainer's logger.

        Shared by both batch hooks so that the guard, the query and the logging call stay identical.

        Args:
            trainer: Trainer whose accelerator is queried and whose logger receives the metrics.
            pl_module: Module being trained; its ``device`` is the one inspected.
            hook_name: Name of the calling hook, used as the prefix of every metric key.
        """
        # Skip steps on which the logger connector says nothing should be logged.
        if not trainer.logger_connector.should_update_logs:
            return

        device_stats = trainer.accelerator.get_device_stats(pl_module.device)
        prefixed_device_stats = prefix_metrics_keys(device_stats, hook_name)
        trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step)
|
||
|
||
def prefix_metrics_keys(metrics_dict: Dict[str, float], prefix: str, separator: str = ".") -> Dict[str, float]:
    """Return a new dictionary whose keys are the keys of ``metrics_dict`` with ``prefix`` prepended.

    Args:
        metrics_dict: Mapping from metric name to metric value. It is not modified.
        prefix: Text placed in front of every metric name.
        separator: Text placed between ``prefix`` and the metric name. Defaults to ``"."``.

    Returns:
        A dictionary with keys of the form ``prefix + separator + name`` mapped to the original values.
    """
    return {prefix + separator + name: value for name, value in metrics_dict.items()}
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
# Copyright The PyTorch Lightning team. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
from typing import Dict, Optional | ||
|
||
import pytest | ||
|
||
from pytorch_lightning import Trainer | ||
from pytorch_lightning.callbacks import DeviceStatsMonitor | ||
from pytorch_lightning.loggers import CSVLogger | ||
from pytorch_lightning.utilities.distributed import rank_zero_only | ||
from pytorch_lightning.utilities.exceptions import MisconfigurationException | ||
from tests.helpers import BoringModel | ||
from tests.helpers.runif import RunIf | ||
|
||
|
||
@RunIf(min_torch="1.8")
@RunIf(min_gpus=1)
def test_device_stats_gpu_from_torch(tmpdir):
    """Check that the GPU stat names expected with PyTorch >= 1.8.0 reach the logger."""

    class DebugLogger(CSVLogger):
        """CSV logger whose ``log_metrics`` only asserts on the keys it receives."""

        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            # Each expected name must occur as a substring of at least one logged key.
            expected_fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"]
            for field in expected_fields:
                assert any(field in key for key in metrics)

    model = BoringModel()
    monitor = DeviceStatsMonitor()
    debug_logger = DebugLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[monitor],
        logger=debug_logger,
        checkpoint_callback=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)
|
||
|
||
@RunIf(max_torch="1.7")
@RunIf(min_gpus=1)
def test_device_stats_gpu_from_nvidia(tmpdir):
    """Check that the GPU stat names expected with PyTorch < 1.8.0 reach the logger."""

    class DebugLogger(CSVLogger):
        """CSV logger whose ``log_metrics`` only asserts on the keys it receives."""

        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            # Each expected name must occur as a substring of at least one logged key.
            expected_fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
            for field in expected_fields:
                assert any(field in key for key in metrics)

    model = BoringModel()
    monitor = DeviceStatsMonitor()
    debug_logger = DebugLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[monitor],
        logger=debug_logger,
        checkpoint_callback=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)
|
||
|
||
@RunIf(tpu=True)
def test_device_stats_monitor_tpu(tmpdir):
    """Check that the expected TPU memory stat names reach the logger."""

    class DebugLogger(CSVLogger):
        """CSV logger whose ``log_metrics`` only asserts on the keys it receives."""

        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            # Each expected name must occur as a substring of at least one logged key.
            expected_fields = ["avg. free memory (MB)", "avg. peak memory (MB)"]
            for field in expected_fields:
                assert any(field in key for key in metrics)

    model = BoringModel()
    monitor = DeviceStatsMonitor()
    debug_logger = DebugLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=1,
        tpu_cores=8,
        log_every_n_steps=1,
        callbacks=[monitor],
        logger=debug_logger,
        checkpoint_callback=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)
|
||
|
||
def test_device_stats_monitor_no_logger(tmpdir):
    """Check that fitting with ``DeviceStatsMonitor`` and ``logger=False`` raises a misconfiguration error."""
    model = BoringModel()
    monitor = DeviceStatsMonitor()

    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[monitor],
        max_epochs=1,
        logger=False,
        checkpoint_callback=False,
        enable_progress_bar=False,
    )

    # Only the ``fit`` call is expected to raise, so it alone sits inside the context manager.
    expected_error = pytest.raises(MisconfigurationException, match="Trainer that has no logger.")
    with expected_error:
        trainer.fit(model)