Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1/4] Add get_device_stats to accelerator interface #9586

Merged
merged 31 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c11eb87
Add interface to accelerator to get_device_stats
daniellepintz Sep 17, 2021
cba4916
Update changelog
daniellepintz Sep 17, 2021
d4252c5
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 17, 2021
d0e1233
address comments
daniellepintz Sep 17, 2021
269f3ff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2021
4d8cc75
comments
daniellepintz Sep 18, 2021
8e37419
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
6d9cc2e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
018e5cd
fix gpu
daniellepintz Sep 18, 2021
310f254
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
ec8084d
fix
daniellepintz Sep 18, 2021
5abce11
update docstring
daniellepintz Sep 18, 2021
3936242
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
0fdd368
fix tests
daniellepintz Sep 18, 2021
32f1047
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
d8314cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
5699e85
type fix
daniellepintz Sep 18, 2021
8d66aba
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
3ac0821
fix test
daniellepintz Sep 18, 2021
1160cd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
ef5bc17
Update pytorch_lightning/accelerators/gpu.py
daniellepintz Sep 19, 2021
497680c
address comments
daniellepintz Sep 21, 2021
d3d13ec
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 21, 2021
ae7e912
Add unit tests
daniellepintz Sep 23, 2021
418e4a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
46b9f36
comments
daniellepintz Sep 23, 2021
ccadca5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
2658b4a
lint
daniellepintz Sep 23, 2021
07bc597
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 23, 2021
e19239e
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 23, 2021
c4f0d02
comments
daniellepintz Sep 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389))


- Added `get_device_stats` to the Accelerator Interface and added its implementation for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))


- Added `multifile` option to `LightningCLI` to enable/disable config save to preserve multiple files structure ([#9073](https://github.com/PyTorchLightning/pytorch-lightning/pull/9073))


Expand Down
12 changes: 12 additions & 0 deletions pytorch_lightning/accelerators/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Accelerator:
- CPU
- GPU
- TPU
- IPU

Each Accelerator gets two plugins upon initialization:
One to handle differences from the training routine and one to handle different precisions.
Expand Down Expand Up @@ -436,6 +437,17 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
"""
return self.training_type_plugin.restore_checkpoint_after_pre_dispatch

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Get stats for a given device.

    Args:
        device: device for which to get stats

    Returns:
        Dictionary of device stats

    Raises:
        NotImplementedError: always — the base class only declares the
            interface; concrete accelerators (CPU/GPU/TPU) override this.
    """
    raise NotImplementedError

def on_train_start(self) -> None:
"""Called when train begins."""
return self.training_type_plugin.on_train_start()
Expand Down
8 changes: 8 additions & 0 deletions pytorch_lightning/accelerators/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, Union
daniellepintz marked this conversation as resolved.
Show resolved Hide resolved

import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities.exceptions import MisconfigurationException
Expand All @@ -29,3 +33,7 @@ def setup(self, trainer: "pl.Trainer") -> None:
raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.")

return super().setup(trainer)

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Return device stats for the CPU accelerator.

    CPU stats collection is not implemented yet, so this is a deliberate
    no-op placeholder rather than an error.

    Args:
        device: CPU device for which to get stats

    Returns:
        An empty dictionary.
    """
    return {}
81 changes: 81 additions & 0 deletions pytorch_lightning/accelerators/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,16 @@
# limitations under the License.
import logging
import os
import shutil
import subprocess
from typing import Any, Dict, List, Union

import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -53,6 +57,83 @@ def set_nvidia_flags(local_rank: int) -> None:
devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
_log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Get stats for the given GPU device.

    Args:
        device: GPU device for which to get stats

    Returns:
        A dictionary mapping the metrics to their values.

    Raises:
        FileNotFoundError:
            If nvidia-smi installation not found
    """
    # torch >= 1.8 exposes native CUDA memory counters; older versions
    # fall back to shelling out to nvidia-smi.
    if not _TORCH_GREATER_EQUAL_1_8:
        return _get_nvidia_gpu_stats(device)
    return torch.cuda.memory_stats(device)

def teardown(self) -> None:
    """Run the base teardown, then move the optimizer state back to CPU."""
    super().teardown()
    self._move_optimizer_state(torch.device("cpu"))


def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]:
    """Get GPU stats including memory, fan speed, and temperature from nvidia-smi.

    Args:
        device: GPU device for which to get stats

    Returns:
        A dictionary mapping the metrics to their values.

    Raises:
        FileNotFoundError:
            If nvidia-smi installation not found
    """
    # (metric name, unit) pairs, in the order nvidia-smi will emit them.
    gpu_stat_metrics = [
        ("utilization.gpu", "%"),
        ("memory.used", "MB"),
        ("memory.free", "MB"),
        ("utilization.memory", "%"),
        ("fan.speed", "%"),
        ("temperature.gpu", "°C"),
        ("temperature.memory", "°C"),
    ]
    gpu_query = ",".join(name for name, _ in gpu_stat_metrics)

    gpu_id = _get_gpu_id(device.index)
    nvidia_smi_path = shutil.which("nvidia-smi")
    if nvidia_smi_path is None:
        raise FileNotFoundError("nvidia-smi: command not found")
    result = subprocess.run(
        [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"],
        encoding="utf-8",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
        check=True,
    )

    def _to_float(x: str) -> float:
        # Non-numeric fields (e.g. unsupported sensors) are mapped to 0.0.
        try:
            return float(x)
        except ValueError:
            return 0.0

    parsed = [_to_float(value) for value in result.stdout.strip().split(", ")]
    return {f"{name} ({unit})": parsed[pos] for pos, (name, unit) in enumerate(gpu_stat_metrics)}


def _get_gpu_id(device_id: int) -> str:
"""Get the unmasked real GPU IDs."""
# All devices if `CUDA_VISIBLE_DEVICES` unset
default = ",".join(str(i) for i in range(torch.cuda.device_count()))
cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
return cuda_visible_devices[device_id].strip()
20 changes: 19 additions & 1 deletion pytorch_lightning/accelerators/tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Optional
from typing import Any, Callable, Dict, Optional, Union

import torch
from torch.optim import Optimizer
Expand Down Expand Up @@ -61,3 +61,21 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
for opt in self.optimizers:
for p, v in opt.state.items():
opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Get stats for the given TPU device.

    Args:
        device: TPU device for which to get stats

    Returns:
        A dictionary mapping the metrics (free memory and peak memory) to their values.
    """
    memory_info = xm.get_memory_info(device)
    free_memory = memory_info["kb_free"]
    # NOTE(review): xm.get_memory_info keys suggest KB values, but the
    # labels below say MB — confirm intended units.
    return {
        "avg. free memory (MB)": free_memory,
        "avg. peak memory (MB)": memory_info["kb_total"] - free_memory,
    }
36 changes: 36 additions & 0 deletions tests/accelerators/test_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import torch

from pytorch_lightning.accelerators import GPUAccelerator
from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin
from tests.helpers.runif import RunIf


@RunIf(min_torch="1.8")
@RunIf(min_gpus=1)
def test_get_torch_gpu_stats(tmpdir):
    """Test GPU get_device_stats with Pytorch >= 1.8.0."""
    device = torch.device(f"cuda:{torch.cuda.current_device()}")
    accelerator = GPUAccelerator(
        training_type_plugin=DataParallelPlugin(parallel_devices=[device]), precision_plugin=PrecisionPlugin()
    )
    stats = accelerator.get_device_stats(device)
    # torch.cuda.memory_stats keys are dotted counter names; check a sample.
    expected = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"]
    for field in expected:
        assert any(field in key for key in stats)


@RunIf(max_torch="1.7")
@RunIf(min_gpus=1)
def test_get_nvidia_gpu_stats(tmpdir):
    """Test GPU get_device_stats with Pytorch < 1.8.0."""
    device = torch.device(f"cuda:{torch.cuda.current_device()}")
    accelerator = GPUAccelerator(
        training_type_plugin=DataParallelPlugin(parallel_devices=[device]), precision_plugin=PrecisionPlugin()
    )
    stats = accelerator.get_device_stats(device)
    # nvidia-smi path: keys are "<metric> (<unit>)"; check a sample of metrics.
    expected = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
    for field in expected:
        assert any(field in key for key in stats)
16 changes: 16 additions & 0 deletions tests/accelerators/test_tpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pytorch_lightning.accelerators import TPUAccelerator
from pytorch_lightning.plugins import SingleTPUPlugin
from pytorch_lightning.plugins.training_type import TPUSpawnPlugin
from tests.helpers.runif import RunIf


@RunIf(tpu=True)
def test_device_stats_tpu(tmpdir):
    """Test TPU get_device_stats."""
    plugin = SingleTPUPlugin(1)
    accelerator = TPUAccelerator(training_type_plugin=TPUSpawnPlugin(), precision_plugin=plugin)
    stats = accelerator.get_device_stats("1")
    for field in ("avg. free memory (MB)", "avg. peak memory (MB)"):
        assert any(field in key for key in stats)