Skip to content

Commit

Permalink
Added test from pull req rapidsai#671 and test to check number of mig…
Browse files Browse the repository at this point in the history
… devices if available
  • Loading branch information
akaanirban committed Jul 16, 2021
1 parent 1f1a15e commit 1452c7b
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
34 changes: 34 additions & 0 deletions dask_cuda/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_preload_options,
get_ucx_config,
get_ucx_net_devices,
nvml_device_index,
parse_cuda_visible_device,
parse_device_memory_limit,
unpack_bitmask,
Expand Down Expand Up @@ -59,6 +60,18 @@ def test_cpu_affinity():
assert os.sched_getaffinity(0) == set(affinity)


def test_cpu_affinity_and_cuda_visible_devices():
    """Check CPU affinity lookups agree with and without device reordering.

    For every GPU ``i`` the device list produced by ``cuda_visible_devices(i)``
    is resolved to an NVML index, and the affinity recorded for that index
    must equal the affinity obtained by querying index ``i`` directly.
    """
    expected_by_device = {}
    for gpu in range(get_n_gpus()):
        # Resolve position 0 of the reordered device list to an NVML index.
        # The negative here would be `device = 0` as required for CUDA
        # runtime calls.
        visible = cuda_visible_devices(gpu)
        nvml_index = nvml_device_index(0, visible)
        expected_by_device[nvml_index] = get_cpu_affinity(nvml_index)

    for gpu in range(get_n_gpus()):
        assert get_cpu_affinity(gpu) == expected_by_device[gpu]


def test_get_device_total_memory():
for i in range(get_n_gpus()):
with cuda.gpus[i]:
Expand Down Expand Up @@ -232,3 +245,24 @@ def test_parse_device_memory_limit():
assert parse_device_memory_limit(0.8) == int(total * 0.8)
assert parse_device_memory_limit(1000000000) == 1000000000
assert parse_device_memory_limit("1GB") == 1000000000


def test_parse_visible_mig_devices():
    """Check that no MIG-enabled GPU exposes more than 7 MIG devices.

    The test is skipped when ``pynvml`` cannot be imported. GPUs that do not
    support MIG, or that have MIG mode disabled, are ignored.
    """
    pynvml = pytest.importorskip("pynvml")
    pynvml.nvmlInit()
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        try:
            mig_enabled = pynvml.nvmlDeviceGetMigMode(handle)[0]
        except pynvml.NVMLError:
            # NVML reports an error (NOT_SUPPORTED) for GPUs without MIG
            # capability; such a GPU is a regular device, so skip it rather
            # than failing the whole test.
            continue
        if not mig_enabled:
            continue

        # Collect a handle for every populated MIG slot on this GPU.
        max_slots = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
        mig_handles = []
        for i in range(max_slots):
            try:
                mig_handles.append(
                    pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                        device=handle, index=i
                    )
                )
            except pynvml.NVMLError:
                # Deliberate best-effort: a slot with no MIG device behind it
                # raises, and simply does not count.
                continue

        # NOTE(review): 7 is presumably the per-GPU MIG instance limit of
        # current hardware -- confirm if newer devices allow more.
        assert len(mig_handles) <= 7
30 changes: 30 additions & 0 deletions dask_cuda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,36 @@ def get_gpu_count():
return pynvml.nvmlDeviceGetCount()


@toolz.memoize
def get_gpu_count_mig(return_uuids=False):
    """Return the number of MIG instances available.

    Every GPU reported by ``get_gpu_count()`` is inspected; GPUs that do not
    support MIG or have MIG mode disabled contribute nothing to the count.

    The function is memoized, so the result is computed once per distinct
    ``return_uuids`` value and reused on later calls.

    Parameters
    ----------
    return_uuids: bool
        If True, also return the UUIDs of the MIG instances that were found.

    Returns
    -------
    int or tuple
        The number of MIG instances, or ``(count, uuids)`` when
        ``return_uuids`` is True, where ``uuids`` is the list of values
        returned by ``pynvml.nvmlDeviceGetUUID`` for each instance.
    """
    pynvml.nvmlInit()
    uuids = []
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        try:
            mig_enabled = pynvml.nvmlDeviceGetMigMode(handle)[0]
        except pynvml.NVMLError:
            # NVML reports an error (NOT_SUPPORTED) for GPUs without MIG
            # capability; treat such a GPU as having no MIG instances
            # instead of letting the whole query fail.
            continue
        if not mig_enabled:
            continue

        for i in range(pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)):
            try:
                mighandle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                    device=handle, index=i
                )
                uuids.append(pynvml.nvmlDeviceGetUUID(mighandle))
            except pynvml.NVMLError:
                # Deliberate best-effort: slots that cannot be resolved to a
                # MIG device (or whose UUID cannot be read) are skipped.
                continue

    if return_uuids:
        return len(uuids), uuids
    return len(uuids)


def get_cpu_affinity(device_index=None):
"""Get a list containing the CPU indices to which a GPU is directly connected.
Use either the device index or the specified device identifier UUID.
Expand Down

0 comments on commit 1452c7b

Please sign in to comment.