Rework GPU API #2821

Merged · 1 commit · Apr 27, 2024
169 changes: 97 additions & 72 deletions backend/src/gpu.py
@@ -1,32 +1,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from functools import cached_property
+from typing import Callable, Sequence
 
 import pynvml as nv
 from sanic.log import logger
 
-nvidia_is_available = False
-
-try:
-    nv.nvmlInit()
-    nvidia_is_available = True
-    nv.nvmlShutdown()
-except nv.NVMLError:
-    logger.info("No Nvidia GPU found, or invalid driver installed.")
-except Exception as e:
-    logger.info(f"Unknown error occurred when trying to initialize Nvidia GPU: {e}")
-
-
-@dataclass
-class _GPU:
-    name: str
-    uuid: str
-    index: int
-    handle: int
-    arch: int
-
-
-FP16_ARCH_ABILITY_MAP = {
+_FP16_ARCH_ABILITY_MAP = {
     nv.NVML_DEVICE_ARCH_KEPLER: False,
     nv.NVML_DEVICE_ARCH_MAXWELL: False,
     nv.NVML_DEVICE_ARCH_PASCAL: False,
@@ -39,70 +20,114 @@ class _GPU:
 }
 
 
-def supports_fp16(gpu: _GPU):
-    # This generation also contains the GTX 1600 cards, which do not support FP16.
-    if gpu.arch == nv.NVML_DEVICE_ARCH_TURING:
-        # There may be a more robust way to check this, but for now I think this will do.
-        return "RTX" in gpu.name
-    # Future proofing. We can be reasonably sure that future architectures will support FP16.
-    return FP16_ARCH_ABILITY_MAP.get(gpu.arch, gpu.arch > nv.NVML_DEVICE_ARCH_HOPPER)
+@dataclass
+class MemoryUsage:
+    total: int
+    used: int
+    free: int
 
 
-class NvidiaHelper:
-    def __init__(self):
-        nv.nvmlInit()
-
-        self.__num_gpus = nv.nvmlDeviceGetCount()
-
-        self.__gpus: list[_GPU] = []
-        for i in range(self.__num_gpus):
-            handle = nv.nvmlDeviceGetHandleByIndex(i)
-            self.__gpus.append(
-                _GPU(
-                    name=nv.nvmlDeviceGetName(handle),
-                    uuid=nv.nvmlDeviceGetUUID(handle),
-                    index=i,
-                    handle=handle,  # type: ignore
-                    arch=nv.nvmlDeviceGetArchitecture(handle),
-                )
-            )
+@dataclass(frozen=True)
+class NvDevice:
+    index: int
+    handle: nv.c_nvmlDevice_t
+    name: str
+
+    @staticmethod
+    def from_index(index: int) -> NvDevice:
+        handle = nv.nvmlDeviceGetHandleByIndex(index)
+
+        return NvDevice(
+            index=index,
+            handle=handle,
+            name=nv.nvmlDeviceGetName(handle),
+        )
+
+    @cached_property
+    def architecture(self) -> int:
+        # We catch and ignore errors to support older drivers that don't have nvmlDeviceGetArchitecture
+        try:
+            return nv.nvmlDeviceGetArchitecture(self.handle)
+        except Exception:
+            return nv.NVML_DEVICE_ARCH_UNKNOWN
+
+    @property
+    def supports_fp16(self):
+        arch = self.architecture
+
+        # This generation also contains the GTX 1600 cards, which do not support FP16.
+        if arch == nv.NVML_DEVICE_ARCH_TURING:
+            return "RTX" in self.name
+
+        # Future proofing. We can be reasonably sure that future architectures will support FP16.
+        return _FP16_ARCH_ABILITY_MAP.get(arch, arch > nv.NVML_DEVICE_ARCH_HOPPER)
+
+    def get_current_vram_usage(self) -> MemoryUsage:
+        info = nv.nvmlDeviceGetMemoryInfo(self.handle)
+        return MemoryUsage(info.total, info.used, info.free)  # type: ignore
+
+
+class NvInfo:
+    def __init__(self, devices: Sequence[NvDevice], clean_up: Callable[[], None]):
+        self.__devices: Sequence[NvDevice] = devices
+        self.__clean_up = clean_up
+
+    @staticmethod
+    def unavailable():
+        return NvInfo([], lambda: None)
 
     def __del__(self):
-        nv.nvmlShutdown()
+        self.__clean_up()
 
     @property
-    def num_gpus(self):
-        return self.__num_gpus
+    def devices(self) -> Sequence[NvDevice]:
+        return self.__devices
 
-    def list_gpus(self) -> list[str]:
-        return [gpu.name for gpu in self.__gpus]
-
-    def get_current_vram_usage(self, gpu_index: int = 0) -> tuple[int, int, int]:
-        info = nv.nvmlDeviceGetMemoryInfo(self.__gpus[gpu_index].handle)
-
-        return info.total, info.used, info.free  # type: ignore
-
-    def supports_fp16(self, gpu_index: int | None = None) -> bool:
-        if gpu_index is None:
-            return all(supports_fp16(gpu) for gpu in self.__gpus)
-        gpu = self.__gpus[gpu_index]
-        return supports_fp16(gpu)
+    @property
+    def is_available(self):
+        return len(self.devices) > 0
+
+    @property
+    def all_support_fp16(self) -> bool:
+        return all(gpu.supports_fp16 for gpu in self.devices)
+
+
+def _try_nvml_init():
+    try:
+        nv.nvmlInit()
+        return True
+    except Exception as e:
+        if isinstance(e, nv.NVMLError):
+            logger.info("No Nvidia GPU found, or invalid driver installed.")
+        else:
+            logger.info(
+                f"Unknown error occurred when trying to initialize Nvidia GPU: {e}"
+            )
+        return False
+
+
+def _try_nvml_shutdown():
+    try:
+        nv.nvmlShutdown()
+    except Exception:
+        logger.warn("Failed to shut down Nvidia GPU.", exc_info=True)
+
+
+def _get_nvidia_info() -> NvInfo:
+    if not _try_nvml_init():
+        return NvInfo.unavailable()
+
+    try:
+        device_count = nv.nvmlDeviceGetCount()
+        devices = [NvDevice.from_index(i) for i in range(device_count)]
+        return NvInfo(devices, _try_nvml_shutdown)
+    except Exception as e:
+        logger.info(f"Unknown error occurred when trying to initialize Nvidia GPU: {e}")
+        _try_nvml_shutdown()
+        return NvInfo.unavailable()
 
 
-_cached_nvidia_helper = None
-
-
-def get_nvidia_helper():
-    # pylint: disable=global-statement
-    global _cached_nvidia_helper
-    if not nvidia_is_available:
-        return None
-    if not _cached_nvidia_helper:
-        _cached_nvidia_helper = NvidiaHelper()
-    return _cached_nvidia_helper
+nvidia = _get_nvidia_info()
 
 
-__all__ = [
-    "nvidia_is_available",
-    "get_nvidia_helper",
-]
+__all__ = ["nvidia", "NvInfo", "NvDevice", "MemoryUsage"]
4 changes: 2 additions & 2 deletions backend/src/packages/chaiNNer_onnx/__init__.py
@@ -1,7 +1,7 @@
 from sanic.log import logger
 
 from api import KB, MB, Dependency, add_package
-from gpu import nvidia_is_available
+from gpu import nvidia
 from system import is_arm_mac
 
 general = "ONNX uses .onnx models to upscale images."
@@ -19,7 +19,7 @@
 
 
 def get_onnx_runtime():
-    if nvidia_is_available:
+    if nvidia.is_available:
         return Dependency(
             display_name="ONNX Runtime (GPU)",
             pypi_name="onnxruntime-gpu",
12 changes: 3 additions & 9 deletions backend/src/packages/chaiNNer_onnx/settings.py
@@ -8,22 +8,18 @@
 from sanic.log import logger
 
 from api import CacheSetting, DropdownSetting, NodeContext, ToggleSetting
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from system import is_arm_mac
 
 from . import package
 
-nv = get_nvidia_helper()
-
 if not is_arm_mac:
-    gpu_list = nv.list_gpus() if nv is not None else []
-
     package.add_setting(
         DropdownSetting(
             label="GPU",
             key="gpu_index",
             description="Which GPU to use for ONNX. This is only relevant if you have multiple GPUs.",
-            options=[{"label": x, "value": str(i)} for i, x in enumerate(gpu_list)],
+            options=[{"label": d.name, "value": str(d.index)} for d in nvidia.devices],
             default="0",
         )
     )
@@ -74,9 +70,7 @@ def get_provider_label(identifier: str) -> str:
         )
     )
 
-should_fp16 = False
-if nv is not None:
-    should_fp16 = nv.supports_fp16()
+should_fp16 = nvidia.is_available and nvidia.all_support_fp16
 
 package.add_setting(
     ToggleSetting(
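
For illustration, here is what the two settings above evaluate to on a hypothetical two-GPU machine (device names invented for the example):

    # Hypothetical machine with two detected devices.
    options = [{"label": d.name, "value": str(d.index)} for d in nvidia.devices]
    # -> [{"label": "GeForce RTX 3080", "value": "0"},
    #     {"label": "GeForce GTX 1660", "value": "1"}]

    # With no Nvidia GPU, nvidia.devices is empty and nvidia.is_available is False,
    # so options == [] and should_fp16 == False with no `nv is not None` guard needed.
    should_fp16 = nvidia.is_available and nvidia.all_support_fp16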
18 changes: 9 additions & 9 deletions backend/src/packages/chaiNNer_pytorch/__init__.py
@@ -3,7 +3,7 @@
 from sanic.log import logger
 
 from api import GB, KB, MB, Dependency, add_package
-from gpu import nvidia_is_available
+from gpu import nvidia
 from system import is_arm_mac
 
 general = "PyTorch uses .pth models to upscale images."
@@ -46,26 +46,26 @@ def get_pytorch():
         Dependency(
             display_name="PyTorch",
             pypi_name="torch",
-            version="2.1.2+cu121" if nvidia_is_available else "2.1.2",
-            size_estimate=2 * GB if nvidia_is_available else 140 * MB,
+            version="2.1.2+cu121" if nvidia.is_available else "2.1.2",
+            size_estimate=2 * GB if nvidia.is_available else 140 * MB,
             extra_index_url=(
                 "https://download.pytorch.org/whl/cu121"
-                if nvidia_is_available
+                if nvidia.is_available
                 else "https://download.pytorch.org/whl/cpu"
             ),
-            auto_update=not nvidia_is_available,  # Too large to auto-update
+            auto_update=not nvidia.is_available,  # Too large to auto-update
         ),
         Dependency(
             display_name="TorchVision",
             pypi_name="torchvision",
-            version="0.16.2+cu121" if nvidia_is_available else "0.16.2",
-            size_estimate=2 * MB if nvidia_is_available else 800 * KB,
+            version="0.16.2+cu121" if nvidia.is_available else "0.16.2",
+            size_estimate=2 * MB if nvidia.is_available else 800 * KB,
             extra_index_url=(
                 "https://download.pytorch.org/whl/cu121"
-                if nvidia_is_available
+                if nvidia.is_available
                 else "https://download.pytorch.org/whl/cpu"
             ),
-            auto_update=not nvidia_is_available,  # Needs to match PyTorch version
+            auto_update=not nvidia.is_available,  # Needs to match PyTorch version
         ),
     ]
8 changes: 3 additions & 5 deletions backend/src/packages/chaiNNer_pytorch/settings.py
@@ -4,13 +4,11 @@
 from sanic.log import logger
 
 from api import DropdownSetting, NodeContext, NumberSetting, ToggleSetting
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from system import is_arm_mac
 
 from . import package
 
-nv = get_nvidia_helper()
-
 if not is_arm_mac:
     gpu_list = []
     for i in range(torch.cuda.device_count()):
@@ -43,8 +41,8 @@
 )
 
 should_fp16 = False
-if nv is not None:
-    should_fp16 = nv.supports_fp16()
+if nvidia.is_available:
+    should_fp16 = nvidia.all_support_fp16
 else:
     should_fp16 = is_arm_mac
18 changes: 8 additions & 10 deletions backend/src/server_host.py
@@ -26,7 +26,7 @@
     uninstall_dependencies,
 )
 from events import EventQueue
-from gpu import get_nvidia_helper
+from gpu import nvidia
 from response import error_response, success_response
 from server_config import ServerConfig
 from server_process_helper import WorkerServer
@@ -167,16 +167,14 @@ async def system_usage(_request: Request):
     mem_usage = psutil.virtual_memory().percent
     stats_list.append(SystemStat("CPU", cpu_usage))
     stats_list.append(SystemStat("RAM", mem_usage))
-    nv = get_nvidia_helper()
-    if nv is not None:
-        for i in range(nv.num_gpus):
-            total, used, _ = nv.get_current_vram_usage(i)
-            stats_list.append(
-                SystemStat(
-                    f"VRAM {i}" if nv.num_gpus > 1 else "VRAM",
-                    used / total * 100,
-                )
-            )
+    for device in nvidia.devices:
+        usage = device.get_current_vram_usage()
+        stats_list.append(
+            SystemStat(
+                f"VRAM {device.index}" if len(nvidia.devices) > 1 else "VRAM",
+                usage.used / usage.total * 100,
+            )
+        )
     return json([asdict(x) for x in stats_list])
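
A quick sanity check of the VRAM arithmetic above, using a hypothetical device with 8 GiB of VRAM of which 2 GiB is in use:

    from gpu import MemoryUsage

    # Hypothetical numbers; shows what system_usage reports per device.
    usage = MemoryUsage(total=8 * 1024**3, used=2 * 1024**3, free=6 * 1024**3)
    percent = usage.used / usage.total * 100  # 25.0 -> SystemStat("VRAM", 25.0)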