Rework GPU API #2821

Merged · 1 commit · Apr 27, 2024
169 changes: 97 additions & 72 deletions backend/src/gpu.py
@@ -1,32 +1,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from functools import cached_property
+from typing import Callable, Sequence
 
 import pynvml as nv
 from sanic.log import logger
 
-nvidia_is_available = False
-
-try:
-    nv.nvmlInit()
-    nvidia_is_available = True
-    nv.nvmlShutdown()
-except nv.NVMLError:
-    logger.info("No Nvidia GPU found, or invalid driver installed.")
-except Exception as e:
-    logger.info(f"Unknown error occurred when trying to initialize Nvidia GPU: {e}")
-
-
-@dataclass
-class _GPU:
-    name: str
-    uuid: str
-    index: int
-    handle: int
-    arch: int
-
-
-FP16_ARCH_ABILITY_MAP = {
+_FP16_ARCH_ABILITY_MAP = {
     nv.NVML_DEVICE_ARCH_KEPLER: False,
     nv.NVML_DEVICE_ARCH_MAXWELL: False,
     nv.NVML_DEVICE_ARCH_PASCAL: False,
@@ -39,70 +20,114 @@ class _GPU:
 }
 
 
-def supports_fp16(gpu: _GPU):
-    # This generation also contains the GTX 1600 cards, which do not support FP16.
-    if gpu.arch == nv.NVML_DEVICE_ARCH_TURING:
-        # There may be a more robust way to check this, but for now I think this will do.
-        return "RTX" in gpu.name
-    # Future proofing. We can be reasonably sure that future architectures will support FP16.
-    return FP16_ARCH_ABILITY_MAP.get(gpu.arch, gpu.arch > nv.NVML_DEVICE_ARCH_HOPPER)
+@dataclass
+class MemoryUsage:
+    total: int
+    used: int
+    free: int
 
 
-class NvidiaHelper:
-    def __init__(self):
-        nv.nvmlInit()
-
-        self.__num_gpus = nv.nvmlDeviceGetCount()
-
-        self.__gpus: list[_GPU] = []
-        for i in range(self.__num_gpus):
-            handle = nv.nvmlDeviceGetHandleByIndex(i)
-            self.__gpus.append(
-                _GPU(
-                    name=nv.nvmlDeviceGetName(handle),
-                    uuid=nv.nvmlDeviceGetUUID(handle),
-                    index=i,
-                    handle=handle,  # type: ignore
-                    arch=nv.nvmlDeviceGetArchitecture(handle),
-                )
-            )
+@dataclass(frozen=True)
+class NvDevice:
+    index: int
+    handle: nv.c_nvmlDevice_t
+    name: str
+
+    @staticmethod
+    def from_index(index: int) -> NvDevice:
+        handle = nv.nvmlDeviceGetHandleByIndex(index)
+
+        return NvDevice(
+            index=index,
+            handle=handle,
+            name=nv.nvmlDeviceGetName(handle),
+        )
+
+    @cached_property
+    def architecture(self) -> int:
+        # We catch and ignore errors to support older drivers that don't have nvmlDeviceGetArchitecture
+        try:
+            return nv.nvmlDeviceGetArchitecture(self.handle)
+        except Exception:
+            return nv.NVML_DEVICE_ARCH_UNKNOWN
+
+    @property
+    def supports_fp16(self):
+        arch = self.architecture
+
+        # This generation also contains the GTX 1600 cards, which do not support FP16.
+        if arch == nv.NVML_DEVICE_ARCH_TURING:
+            return "RTX" in self.name
+
+        # Future proofing. We can be reasonably sure that future architectures will support FP16.
+        return _FP16_ARCH_ABILITY_MAP.get(arch, arch > nv.NVML_DEVICE_ARCH_HOPPER)
+
+    def get_current_vram_usage(self) -> MemoryUsage:
+        info = nv.nvmlDeviceGetMemoryInfo(self.handle)
+        return MemoryUsage(info.total, info.used, info.free)  # type: ignore
+
+
+class NvInfo:
+    def __init__(self, devices: Sequence[NvDevice], clean_up: Callable[[], None]):
+        self.__devices: Sequence[NvDevice] = devices
+        self.__clean_up = clean_up
+
+    @staticmethod
+    def unavailable():
+        return NvInfo([], lambda: None)
 
     def __del__(self):
-        nv.nvmlShutdown()
+        self.__clean_up()
 
     @property
-    def num_gpus(self):
-        return self.__num_gpus
+    def devices(self) -> Sequence[NvDevice]:
+        return self.__devices
 
-    def list_gpus(self) -> list[str]:
-        return [gpu.name for gpu in self.__gpus]
-
-    def get_current_vram_usage(self, gpu_index: int = 0) -> tuple[int, int, int]:
-        info = nv.nvmlDeviceGetMemoryInfo(self.__gpus[gpu_index].handle)
-
-        return info.total, info.used, info.free  # type: ignore
-
-    def supports_fp16(self, gpu_index: int | None = None) -> bool:
-        if gpu_index is None:
-            return all(supports_fp16(gpu) for gpu in self.__gpus)
-        gpu = self.__gpus[gpu_index]
-        return supports_fp16(gpu)
+    @property
+    def is_available(self):
+        return len(self.devices) > 0
+
+    @property
+    def all_support_fp16(self) -> bool:
+        return all(gpu.supports_fp16 for gpu in self.devices)
+
+
+def _try_nvml_init():
+    try:
+        nv.nvmlInit()
+        return True
+    except Exception as e:
+        if isinstance(e, nv.NVMLError):
+            logger.info("No Nvidia GPU found, or invalid driver installed.")
+        else:
+            logger.info(
+                f"Unknown error occurred when trying to initialize Nvidia GPU: {e}"
+            )
+        return False
+
+
+def _try_nvml_shutdown():
+    try:
+        nv.nvmlShutdown()
+    except Exception:
+        logger.warn("Failed to shut down Nvidia GPU.", exc_info=True)
+
+
+def _get_nvidia_info() -> NvInfo:
+    if not _try_nvml_init():
+        return NvInfo.unavailable()
+
+    try:
+        device_count = nv.nvmlDeviceGetCount()
+        devices = [NvDevice.from_index(i) for i in range(device_count)]
+        return NvInfo(devices, _try_nvml_shutdown)
+    except Exception as e:
+        logger.info(f"Unknown error occurred when trying to initialize Nvidia GPU: {e}")
+        _try_nvml_shutdown()
+        return NvInfo.unavailable()
 
 
-_cached_nvidia_helper = None
-
-
-def get_nvidia_helper():
-    # pylint: disable=global-statement
-    global _cached_nvidia_helper
-    if not nvidia_is_available:
-        return None
-    if not _cached_nvidia_helper:
-        _cached_nvidia_helper = NvidiaHelper()
-    return _cached_nvidia_helper
+nvidia = _get_nvidia_info()
 
 
-__all__ = [
-    "nvidia_is_available",
-    "get_nvidia_helper",
-]
+__all__ = ["nvidia", "NvInfo", "NvDevice", "MemoryUsage"]
4 changes: 2 additions & 2 deletions backend/src/packages/chaiNNer_onnx/__init__.py
@@ -1,7 +1,7 @@
 from sanic.log import logger
 
 from api import KB, MB, Dependency, add_package
-from gpu import nvidia_is_available
+from gpu import nvidia
 from system import is_arm_mac
 
 general = "ONNX uses .onnx models to upscale images."
@@ -19,7 +19,7 @@
 
 
 def get_onnx_runtime():
-    if nvidia_is_available:
+    if nvidia.is_available:
         return Dependency(
             display_name="ONNX Runtime (GPU)",
             pypi_name="onnxruntime-gpu",
12 changes: 3 additions & 9 deletions backend/src/packages/chaiNNer_onnx/settings.py
@@ -8,22 +8,18 @@
 from sanic.log import logger
 
 from api import CacheSetting, DropdownSetting, NodeContext, ToggleSetting
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from system import is_arm_mac
 
 from . import package
 
-nv = get_nvidia_helper()
-
 if not is_arm_mac:
-    gpu_list = nv.list_gpus() if nv is not None else []
-
     package.add_setting(
         DropdownSetting(
             label="GPU",
             key="gpu_index",
             description="Which GPU to use for ONNX. This is only relevant if you have multiple GPUs.",
-            options=[{"label": x, "value": str(i)} for i, x in enumerate(gpu_list)],
+            options=[{"label": d.name, "value": str(d.index)} for d in nvidia.devices],
             default="0",
         )
     )
@@ -74,9 +70,7 @@ def get_provider_label(identifier: str) -> str:
         )
     )
 
-should_fp16 = False
-if nv is not None:
-    should_fp16 = nv.supports_fp16()
+should_fp16 = nvidia.is_available and nvidia.all_support_fp16
 
 package.add_setting(
     ToggleSetting(
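
For illustration, here is what the two settings above evaluate to on a hypothetical two-GPU machine (device names invented for the example):

    # Hypothetical machine with two detected devices.
    options = [{"label": d.name, "value": str(d.index)} for d in nvidia.devices]
    # -> [{"label": "GeForce RTX 3080", "value": "0"},
    #     {"label": "GeForce GTX 1660", "value": "1"}]

    # With no Nvidia GPU, nvidia.devices is empty and nvidia.is_available is False,
    # so options == [] and should_fp16 == False with no `nv is not None` guard needed.
    should_fp16 = nvidia.is_available and nvidia.all_support_fp16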
18 changes: 9 additions & 9 deletions backend/src/packages/chaiNNer_pytorch/__init__.py
@@ -3,7 +3,7 @@
 from sanic.log import logger
 
 from api import GB, KB, MB, Dependency, add_package
-from gpu import nvidia_is_available
+from gpu import nvidia
 from system import is_arm_mac
 
 general = "PyTorch uses .pth models to upscale images."
@@ -46,26 +46,26 @@ def get_pytorch():
         Dependency(
             display_name="PyTorch",
             pypi_name="torch",
-            version="2.1.2+cu121" if nvidia_is_available else "2.1.2",
-            size_estimate=2 * GB if nvidia_is_available else 140 * MB,
+            version="2.1.2+cu121" if nvidia.is_available else "2.1.2",
+            size_estimate=2 * GB if nvidia.is_available else 140 * MB,
             extra_index_url=(
                 "https://download.pytorch.org/whl/cu121"
-                if nvidia_is_available
+                if nvidia.is_available
                 else "https://download.pytorch.org/whl/cpu"
             ),
-            auto_update=not nvidia_is_available,  # Too large to auto-update
+            auto_update=not nvidia.is_available,  # Too large to auto-update
         ),
         Dependency(
             display_name="TorchVision",
             pypi_name="torchvision",
-            version="0.16.2+cu121" if nvidia_is_available else "0.16.2",
-            size_estimate=2 * MB if nvidia_is_available else 800 * KB,
+            version="0.16.2+cu121" if nvidia.is_available else "0.16.2",
+            size_estimate=2 * MB if nvidia.is_available else 800 * KB,
             extra_index_url=(
                 "https://download.pytorch.org/whl/cu121"
-                if nvidia_is_available
+                if nvidia.is_available
                 else "https://download.pytorch.org/whl/cpu"
             ),
-            auto_update=not nvidia_is_available,  # Needs to match PyTorch version
+            auto_update=not nvidia.is_available,  # Needs to match PyTorch version
         ),
     ]
8 changes: 3 additions & 5 deletions backend/src/packages/chaiNNer_pytorch/settings.py
@@ -4,13 +4,11 @@
 from sanic.log import logger
 
 from api import DropdownSetting, NodeContext, NumberSetting, ToggleSetting
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from system import is_arm_mac
 
 from . import package
 
-nv = get_nvidia_helper()
-
 if not is_arm_mac:
     gpu_list = []
     for i in range(torch.cuda.device_count()):
@@ -43,8 +41,8 @@
 )
 
 should_fp16 = False
-if nv is not None:
-    should_fp16 = nv.supports_fp16()
+if nvidia.is_available:
+    should_fp16 = nvidia.all_support_fp16
 else:
     should_fp16 = is_arm_mac
18 changes: 8 additions & 10 deletions backend/src/server_host.py
@@ -26,7 +26,7 @@
     uninstall_dependencies,
 )
 from events import EventQueue
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from response import error_response, success_response
 from server_config import ServerConfig
 from server_process_helper import WorkerServer
@@ -167,16 +167,14 @@ async def system_usage(_request: Request):
     mem_usage = psutil.virtual_memory().percent
     stats_list.append(SystemStat("CPU", cpu_usage))
     stats_list.append(SystemStat("RAM", mem_usage))
-    nv = get_nvidia_helper()
-    if nv is not None:
-        for i in range(nv.num_gpus):
-            total, used, _ = nv.get_current_vram_usage(i)
-            stats_list.append(
-                SystemStat(
-                    f"VRAM {i}" if nv.num_gpus > 1 else "VRAM",
-                    used / total * 100,
-                )
-            )
+    for device in nvidia.devices:
+        usage = device.get_current_vram_usage()
+        stats_list.append(
+            SystemStat(
+                f"VRAM {device.index}" if len(nvidia.devices) > 1 else "VRAM",
+                usage.used / usage.total * 100,
+            )
+        )
     return json([asdict(x) for x in stats_list])
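
A quick sanity check of the VRAM arithmetic above, using a hypothetical device with 8 GiB of VRAM of which 2 GiB is in use:

    from gpu import MemoryUsage

    # Hypothetical numbers; shows what system_usage reports per device.
    usage = MemoryUsage(total=8 * 1024**3, used=2 * 1024**3, free=6 * 1024**3)
    percent = usage.used / usage.total * 100  # 25.0 -> SystemStat("VRAM", 25.0)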