Fix NVML index usage in CUDAWorker/LocalCUDACluster (#671)
Fix an issue where device index `0` would be used in NVML functions. For CUDA runtime calls, we expect the GPU being targeted to always be at index `0`, since that index is relative to the `CUDA_VISIBLE_DEVICES` ordering. NVML, however, relies on absolute indices, so we must always use the actual index of the GPU being targeted rather than the first one in `CUDA_VISIBLE_DEVICES`.

This is normally not an issue when `CUDA_VISIBLE_DEVICES` is unset, or when it is simply set to `",".join(str(i) for i in range(get_n_gpus()))`, but it can be an issue when targeting a different list of GPUs. For example, on a DGX-1 the CPU affinity for GPUs `0-3` is `0-19,40-59` and for GPUs `4-7` it is `20-39,60-79`; if the user sets `CUDA_VISIBLE_DEVICES=4`, the CPU affinity reported for the targeted GPU would be that of device index `0`, i.e. `0-19,40-59`. This could result in lower performance, as well as a wrong computation of total GPU memory on non-homogeneous GPU systems.
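To make the failure mode concrete, the sketch below is illustration only (not part of this change): it queries CPU affinity through pynvml directly, and assumes a DGX-1-like machine with `pynvml` installed. NVML addresses the physical GPU, so with `CUDA_VISIBLE_DEVICES=4` the correct query uses index `4`, while index `0` reports the affinity of a GPU the worker never touches.

    # Illustration only, not dask-cuda code: NVML always takes physical indices.
    import math
    import os

    import pynvml

    pynvml.nvmlInit()
    # Number of 64-bit words needed to hold the CPU affinity bitmask.
    cpu_set_size = math.ceil(os.cpu_count() / 64)

    def cpus_for(nvml_index):
        handle = pynvml.nvmlDeviceGetHandleByIndex(nvml_index)
        words = pynvml.nvmlDeviceGetCpuAffinity(handle, cpu_set_size)
        # Expand the bitmask into a list of CPU ids.
        return [64 * w + b for w, word in enumerate(words) for b in range(64) if word >> b & 1]

    # With CUDA_VISIBLE_DEVICES=4 on a DGX-1, the worker targets physical GPU 4:
    print(cpus_for(0))  # wrong GPU: CPUs 0-19,40-59
    print(cpus_for(4))  # targeted GPU: CPUs 20-39,60-79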

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: #671
pentschev authored Jul 20, 2021
1 parent ded6e4e commit c04856e
Showing 4 changed files with 54 additions and 4 deletions.
7 changes: 5 additions & 2 deletions dask_cuda/cuda_worker.py
@@ -30,6 +30,7 @@
get_n_gpus,
get_ucx_config,
get_ucx_net_devices,
nvml_device_index,
parse_device_memory_limit,
)

@@ -219,7 +220,9 @@ def del_pid_file():
security=security,
env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
plugins={
CPUAffinity(get_cpu_affinity(i)),
CPUAffinity(
get_cpu_affinity(nvml_device_index(i, cuda_visible_devices(i)))
),
RMMSetup(
rmm_pool_size, rmm_managed_memory, rmm_async, rmm_log_directory,
),
@@ -236,7 +239,7 @@ def del_pid_file():
cuda_device_index=i,
)
},
data=data(i),
data=data(nvml_device_index(i, cuda_visible_devices(i))),
worker_class=worker_class,
**kwargs,
)
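For context on the `CUDAWorker` change: `cuda_visible_devices(i)` (from `dask_cuda/utils.py`) hands each worker a rotated device list so that its own GPU comes first, and `nvml_device_index` then translates an entry of that string back into the physical index NVML needs, rather than reusing the bare loop counter `i`. A rough sketch of the rotation with assumed values:

    # Illustration only; assumes the parent process sees CUDA_VISIBLE_DEVICES=4,5,6,7.
    visible = [4, 5, 6, 7]
    for i in range(len(visible)):
        rotated = visible[i:] + visible[:i]      # what cuda_visible_devices(i) produces
        print(i, ",".join(map(str, rotated)))
    # 0 -> "4,5,6,7"
    # 1 -> "5,6,7,4"
    # 2 -> "6,7,4,5"
    # 3 -> "7,4,5,6"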
7 changes: 5 additions & 2 deletions dask_cuda/local_cuda_cluster.py
@@ -17,6 +17,7 @@
get_cpu_affinity,
get_ucx_config,
get_ucx_net_devices,
nvml_device_index,
parse_cuda_visible_device,
parse_device_memory_limit,
)
@@ -215,7 +216,7 @@ def __init__(
memory_limit, threads_per_worker, n_workers
)
self.device_memory_limit = parse_device_memory_limit(
device_memory_limit, device_index=0
device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
)

self.rmm_pool_size = rmm_pool_size
@@ -361,7 +362,9 @@ def new_worker_spec(self):
{
"env": {"CUDA_VISIBLE_DEVICES": visible_devices,},
"plugins": {
CPUAffinity(get_cpu_affinity(worker_count)),
CPUAffinity(
get_cpu_affinity(nvml_device_index(0, visible_devices))
),
RMMSetup(
self.rmm_pool_size,
self.rmm_managed_memory,
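In `new_worker_spec`, `visible_devices` is the rotated per-worker string, so `nvml_device_index(0, visible_devices)` resolves to the physical GPU that worker will drive. A small illustrative check, with values assumed for an 8-GPU machine:

    from dask_cuda.utils import nvml_device_index

    # Fourth worker on an assumed 8-GPU machine:
    visible_devices = "3,4,5,6,7,0,1,2"
    assert nvml_device_index(0, visible_devices) == 3  # CPU affinity is queried for GPU 3, not GPU 0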
13 changes: 13 additions & 0 deletions dask_cuda/tests/test_utils.py
@@ -12,6 +12,7 @@
get_preload_options,
get_ucx_config,
get_ucx_net_devices,
nvml_device_index,
parse_cuda_visible_device,
parse_device_memory_limit,
unpack_bitmask,
@@ -59,6 +60,18 @@ def test_cpu_affinity():
assert os.sched_getaffinity(0) == set(affinity)


def test_cpu_affinity_and_cuda_visible_devices():
affinity = dict()
for i in range(get_n_gpus()):
# Using `device = 0` here (the relative index expected by CUDA runtime
# calls) would be wrong: NVML needs the physical device index.
device = nvml_device_index(0, cuda_visible_devices(i))
affinity[device] = get_cpu_affinity(device)

for i in range(get_n_gpus()):
assert get_cpu_affinity(i) == affinity[i]


def test_get_device_total_memory():
for i in range(get_n_gpus()):
with cuda.gpus[i]:
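The new test records each GPU's affinity keyed by its physical index and then verifies it against a direct lookup. A worked instance of what it asserts, assuming an 8-GPU machine with `CUDA_VISIBLE_DEVICES` unset:

    from dask_cuda.utils import cuda_visible_devices, get_cpu_affinity, nvml_device_index

    # Worker 2's rotated device string starts with its own GPU...
    assert cuda_visible_devices(2) == "2,3,4,5,6,7,0,1"
    # ...so the NVML lookup lands on physical GPU 2, matching a direct query.
    assert get_cpu_affinity(nvml_device_index(0, cuda_visible_devices(2))) == get_cpu_affinity(2)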
31 changes: 31 additions & 0 deletions dask_cuda/utils.py
@@ -493,6 +493,37 @@ def cuda_visible_devices(i, visible=None):
return ",".join(map(str, L))


def nvml_device_index(i, CUDA_VISIBLE_DEVICES):
"""Get the device index for NVML addressing
NVML expects the index of the physical device, unlike CUDA runtime which
expects the address relative to `CUDA_VISIBLE_DEVICES`. This function
returns the i-th device index from the `CUDA_VISIBLE_DEVICES`
comma-separated string of devices or list.
Examples
--------
>>> nvml_device_index(1, "0,1,2,3")
1
>>> nvml_device_index(1, "1,2,3,0")
2
>>> nvml_device_index(1, [0,1,2,3])
1
>>> nvml_device_index(1, [1,2,3,0])
2
>>> nvml_device_index(1, 2)
Traceback (most recent call last):
...
ValueError: CUDA_VISIBLE_DEVICES must be `str` or `list`
"""
if isinstance(CUDA_VISIBLE_DEVICES, str):
return int(CUDA_VISIBLE_DEVICES.split(",")[i])
elif isinstance(CUDA_VISIBLE_DEVICES, list):
return CUDA_VISIBLE_DEVICES[i]
else:
raise ValueError("`CUDA_VISIBLE_DEVICES` must be `str` or `list`")


def parse_device_memory_limit(device_memory_limit, device_index=0):
"""Parse memory limit to be used by a CUDA device.
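The `device_index` argument is why the `LocalCUDACluster` change above matters on non-homogeneous systems: the memory limit is now derived from the GPU actually being targeted rather than from GPU `0`. A hedged usage sketch (the fractional-limit form is assumed to behave as in `parse_device_memory_limit`'s own examples):

    from dask_cuda.utils import nvml_device_index, parse_device_memory_limit

    # With CUDA_VISIBLE_DEVICES="4,5,6,7", limit to 80% of physical GPU 4's memory,
    # not GPU 0's (they may differ in size on mixed-GPU machines).
    limit = parse_device_memory_limit(0.8, device_index=nvml_device_index(0, "4,5,6,7"))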
