Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1/4] Add get_device_stats to accelerator interface #9586

Merged
merged 31 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c11eb87
Add interface to accelerator to get_device_stats
daniellepintz Sep 17, 2021
cba4916
Update changelog
daniellepintz Sep 17, 2021
d4252c5
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 17, 2021
d0e1233
address comments
daniellepintz Sep 17, 2021
269f3ff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2021
4d8cc75
comments
daniellepintz Sep 18, 2021
8e37419
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
6d9cc2e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
018e5cd
fix gpu
daniellepintz Sep 18, 2021
310f254
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
ec8084d
fix
daniellepintz Sep 18, 2021
5abce11
update docstring
daniellepintz Sep 18, 2021
3936242
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
0fdd368
fix tests
daniellepintz Sep 18, 2021
32f1047
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
d8314cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
5699e85
type fix
daniellepintz Sep 18, 2021
8d66aba
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
3ac0821
fix test
daniellepintz Sep 18, 2021
1160cd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
ef5bc17
Update pytorch_lightning/accelerators/gpu.py
daniellepintz Sep 19, 2021
497680c
address comments
daniellepintz Sep 21, 2021
d3d13ec
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 21, 2021
ae7e912
Add unit tests
daniellepintz Sep 23, 2021
418e4a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
46b9f36
comments
daniellepintz Sep 23, 2021
ccadca5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
2658b4a
lint
daniellepintz Sep 23, 2021
07bc597
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 23, 2021
e19239e
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 23, 2021
c4f0d02
comments
daniellepintz Sep 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389))


- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))
daniellepintz marked this conversation as resolved.
Show resolved Hide resolved


- Added `RichModelSummary` callback ([#9546](https://github.com/PyTorchLightning/pytorch-lightning/pull/9546))


Expand Down
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Accelerator:
- CPU
- GPU
- TPU
- IPU

Each Accelerator gets two plugins upon initialization:
One to handle differences from the training routine and one to handle different precisions.
Expand Down Expand Up @@ -436,6 +437,10 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
"""
return self.training_type_plugin.restore_checkpoint_after_pre_dispatch

def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
    """Get stats for a given device.

    Args:
        device: the device whose stats are requested. Defaults to ``None``.

    Returns:
        A dictionary of device stats. This base implementation provides no
        stats and implicitly returns ``None``; concrete accelerators
        (GPU/TPU) override it.
    """
    # No `pass` needed: the docstring alone is a valid (empty) body.
daniellepintz marked this conversation as resolved.
Show resolved Hide resolved
daniellepintz marked this conversation as resolved.
Show resolved Hide resolved

def on_train_start(self) -> None:
"""Called when train begins."""
return self.training_type_plugin.on_train_start()
Expand Down
63 changes: 63 additions & 0 deletions pytorch_lightning/accelerators/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,16 @@
# limitations under the License.
import logging
import os
import shutil
import subprocess
from typing import Any, Dict, List, Optional

import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8

_log = logging.getLogger(__name__)

Expand All @@ -39,6 +43,13 @@ def setup(self, trainer: "pl.Trainer") -> None:
If the selected device is not GPU.
"""
self.set_nvidia_flags(trainer.local_rank)

# The logical device IDs for selected devices
self._device_ids: List[int] = sorted(set(trainer.data_parallel_device_ids))

# The unmasked real GPU IDs
self._gpu_ids: List[int] = self._get_gpu_ids(self._device_ids)

return super().setup(trainer)

def on_train_start(self) -> None:
Expand All @@ -53,6 +64,58 @@ def set_nvidia_flags(local_rank: int) -> None:
devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
_log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")

def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
    """Gets stats for the given GPU device.

    On torch >= 1.8 this delegates to ``torch.cuda.memory_stats``; older
    versions fall back to querying ``nvidia-smi``.
    """
    if _TORCH_GREATER_EQUAL_1_8:
        return torch.cuda.memory_stats(device=device)

    # (query, unit) pairs requested from nvidia-smi on the fallback path.
    stat_keys = [
        ("utilization.gpu", "%"),
        ("memory.used", "MB"),
        ("memory.free", "MB"),
        ("utilization.memory", "%"),
        ("fan.speed", "%"),
        ("temperature.gpu", "°C"),
        ("temperature.memory", "°C"),
    ]
    queries = [query for query, _ in stat_keys]
    raw_stats = self._get_gpu_stats(queries)
    return self._parse_gpu_stats(self._device_ids, raw_stats, stat_keys)
daniellepintz marked this conversation as resolved.
Show resolved Hide resolved

def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]:
if not queries:
return []

"""Run nvidia-smi to get the gpu stats"""
gpu_query = ",".join(queries)
format = "csv,nounits,noheader"
gpu_ids = ",".join(self._gpu_ids)
result = subprocess.run(
[shutil.which("nvidia-smi"), f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"],
encoding="utf-8",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE, # for backward compatibility with python version 3.6
check=True,
)

def _to_float(x: str) -> float:
try:
return float(x)
except ValueError:
return 0.0

stats = result.stdout.strip().split(os.linesep)
stats = [[_to_float(x) for x in s.split(", ")] for s in stats]
return stats

@staticmethod
def _get_gpu_ids(device_ids: List[int]) -> List[str]:
"""Get the unmasked real GPU IDs."""
# All devices if `CUDA_VISIBLE_DEVICES` unset
default = ",".join(str(i) for i in range(torch.cuda.device_count()))
cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
return [cuda_visible_devices[device_id].strip() for device_id in device_ids]

daniellepintz marked this conversation as resolved.
Show resolved Hide resolved
def teardown(self) -> None:
    """Tear down the GPU accelerator.

    Runs the base-class teardown, then moves optimizer state back to CPU —
    presumably so GPU memory is released once the run finishes (TODO confirm).
    """
    super().teardown()
    self._move_optimizer_state(torch.device("cpu"))
17 changes: 16 additions & 1 deletion pytorch_lightning/accelerators/tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Optional
from typing import Any, Callable, Dict, Optional

import torch
from torch.optim import Optimizer
Expand Down Expand Up @@ -59,3 +59,18 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
for opt in self.optimizers:
for p, v in opt.state.items():
opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)

def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
    """Gets stats for the given TPU device."""
    memory_info = xm.get_memory_info(device)

    kb_free = memory_info["kb_free"]
    kb_peak = memory_info["kb_total"] - kb_free

    # Average across processes via the plugin's reduce, then scale the
    # KB figures by 0.001 to report MB.
    avg_free = self.training_type_plugin.reduce(kb_free) * 0.001
    avg_peak = self.training_type_plugin.reduce(kb_peak) * 0.001

    return {
        "avg. free memory (MB)": avg_free,
        "avg. peak memory (MB)": avg_peak,
    }