Added test from pull request rapidsai#671 and test to check number of MIG devices if available
akaanirban · Jul 16, 2021 · 1452c7b · 1452c7b
1 parent 1f1a15e
commit 1452c7b
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 0 deletions.
diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py
@@ -12,6 +12,7 @@
     get_preload_options,
     get_ucx_config,
     get_ucx_net_devices,
+    nvml_device_index,
     parse_cuda_visible_device,
     parse_device_memory_limit,
     unpack_bitmask,
@@ -59,6 +60,18 @@ def test_cpu_affinity():
         assert os.sched_getaffinity(0) == set(affinity)
 
 
+def test_cpu_affinity_and_cuda_visible_devices():
+    """Check CPU affinity lookups made through ``nvml_device_index``.
+
+    For each GPU position, the index returned by
+    ``nvml_device_index(0, cuda_visible_devices(position))`` is used to
+    query ``get_cpu_affinity``. Afterwards every plain index in
+    ``range(get_n_gpus())`` must yield the same affinity that was recorded
+    under that index.
+    """
+    expected = dict()
+    for position in range(get_n_gpus()):
+        # Resolve slot 0 of the visible-devices value built for this
+        # position. The negative case would be a literal ``device = 0``,
+        # which is what CUDA runtime calls require.
+        visible = cuda_visible_devices(position)
+        nvml_index = nvml_device_index(0, visible)
+        expected[nvml_index] = get_cpu_affinity(nvml_index)
+
+    for gpu in range(get_n_gpus()):
+        observed = get_cpu_affinity(gpu)
+        assert observed == expected[gpu]
+
+
 def test_get_device_total_memory():
     for i in range(get_n_gpus()):
         with cuda.gpus[i]:
@@ -232,3 +245,24 @@ def test_parse_device_memory_limit():
     assert parse_device_memory_limit(0.8) == int(total * 0.8)
     assert parse_device_memory_limit(1000000000) == 1000000000
     assert parse_device_memory_limit("1GB") == 1000000000
+
+
+def test_parse_visible_mig_devices():
+    """Check that no MIG-enabled GPU yields more than 7 MIG device handles.
+
+    The test is skipped when ``pynvml`` cannot be imported. GPUs whose MIG
+    mode cannot be queried, or whose MIG mode is disabled, are ignored.
+    """
+    pynvml = pytest.importorskip("pynvml")
+    pynvml.nvmlInit()
+    for index in range(get_gpu_count()):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
+        try:
+            mig_enabled = pynvml.nvmlDeviceGetMigMode(handle)[0]
+        except pynvml.NVMLError:
+            # The MIG-mode query fails on GPUs without MIG support; those
+            # are ordinary GPUs, so there is nothing to check for them.
+            continue
+        if not mig_enabled:
+            continue
+        # For a MIG-enabled GPU, probe every possible MIG slot and verify
+        # that at most 7 MIG device handles can be obtained.
+        count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
+        mig_handles = []
+        for i in range(count):
+            try:
+                mighandle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
+                    device=handle, index=i
+                )
+                mig_handles.append(mighandle)
+            except pynvml.NVMLError:
+                # Best-effort probe: a slot that cannot be queried (e.g. no
+                # MIG device at this index) is simply not counted.
+                pass
+        assert len(mig_handles) <= 7
diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
@@ -130,6 +130,36 @@ def get_gpu_count():
     return pynvml.nvmlDeviceGetCount()
 
 
+@toolz.memoize
+def get_gpu_count_mig(return_uuids=False):
+    """Return the number of MIG instances available.
+
+    Every GPU reported by ``get_gpu_count()`` is inspected. For each GPU
+    that has MIG mode enabled, all of its MIG device slots are probed and
+    the UUID of every slot that can be queried is collected.
+
+    Parameters
+    ----------
+    return_uuids: bool
+        If True, also return the UUIDs of the MIG instances found.
+
+    Returns
+    -------
+    int or tuple
+        The number of MIG instances, or ``(count, uuids)`` when
+        ``return_uuids`` is True.
+
+    Notes
+    -----
+    The function is memoized, so the hardware is only queried on the first
+    call for a given ``return_uuids`` value. The returned list is the
+    cached object itself; treat it as read-only.
+    """
+    pynvml.nvmlInit()
+    uuids = []
+    for index in range(get_gpu_count()):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
+        try:
+            is_mig_mode = pynvml.nvmlDeviceGetMigMode(handle)[0]
+        except pynvml.NVMLError:
+            # The MIG-mode query fails on GPUs without MIG support; treat
+            # them as regular GPUs instead of aborting the whole count.
+            continue
+        if not is_mig_mode:
+            continue
+        count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
+        for i in range(count):
+            try:
+                mighandle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
+                    device=handle, index=i
+                )
+                uuids.append(pynvml.nvmlDeviceGetUUID(mighandle))
+            except pynvml.NVMLError:
+                # Best-effort probe: a slot that cannot be queried (e.g. no
+                # MIG device at this index) is simply not counted.
+                pass
+    if return_uuids:
+        return len(uuids), uuids
+    return len(uuids)
+
+
 def get_cpu_affinity(device_index=None):
     """Get a list containing the CPU indices to which a GPU is directly connected.
     Use either the device index or the specified device identifier UUID.