From a0c823d8aa67abd446a214f2de86385ebc3a3985 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 13 Jul 2021 12:53:20 -0700
Subject: [PATCH 1/6] Add nvml_device_index util function

---
 dask_cuda/utils.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
index 9c6c4dcff..de225dc30 100644
--- a/dask_cuda/utils.py
+++ b/dask_cuda/utils.py
@@ -493,6 +493,24 @@ def cuda_visible_devices(i, visible=None):
     return ",".join(map(str, L))
 
 
+def nvml_device_index(i, CUDA_VISIBLE_DEVICES):
+    """Get the device index for NVML addressing
+
+    NVML expects the index of the physical device, unlike CUDA runtime which
+    expects the address relative to `CUDA_VISIBLE_DEVICES`. This function
+    returns the i-th device index from the `CUDA_VISIBLE_DEVICES`
+    comma-separated list of devices.
+
+    Examples
+    --------
+    >>> nvml_device(1, "0,1,2,3")
+    1
+    >>> nvml_device(1, "1,2,3,0")
+    2
+    """
+    return int(CUDA_VISIBLE_DEVICES.split(",")[i])
+
+
 def parse_device_memory_limit(device_memory_limit, device_index=0):
     """Parse memory limit to be used by a CUDA device.
 

From d5227f0551c3865621eeeda92baf9f438726e868 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 13 Jul 2021 12:54:25 -0700
Subject: [PATCH 2/6] Fix NVML index usage in CUDAWorker/LocalCUDACluster

---
 dask_cuda/cuda_worker.py        | 7 +++++--
 dask_cuda/local_cuda_cluster.py | 9 +++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index fd74dddde..98333ddaa 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -30,6 +30,7 @@
     get_n_gpus,
     get_ucx_config,
     get_ucx_net_devices,
+    nvml_device_index,
     parse_device_memory_limit,
 )
 
@@ -219,7 +220,9 @@ def del_pid_file():
                 security=security,
                 env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
                 plugins={
-                    CPUAffinity(get_cpu_affinity(i)),
+                    CPUAffinity(
+                        get_cpu_affinity(nvml_device_index(i, cuda_visible_devices(i)))
+                    ),
                     RMMSetup(
                         rmm_pool_size, rmm_managed_memory, rmm_async, rmm_log_directory,
                     ),
@@ -236,7 +239,7 @@ def del_pid_file():
                         cuda_device_index=i,
                     )
                 },
-                data=data(i),
+                data=data(nvml_device_index(i, cuda_visible_devices(i))),
                 worker_class=worker_class,
                 **kwargs,
             )
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 0276a4b6b..2ede84b04 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -17,6 +17,7 @@
     get_cpu_affinity,
     get_ucx_config,
     get_ucx_net_devices,
+    nvml_device_index,
     parse_cuda_visible_device,
     parse_device_memory_limit,
 )
@@ -215,7 +216,7 @@ def __init__(
             memory_limit, threads_per_worker, n_workers
         )
         self.device_memory_limit = parse_device_memory_limit(
-            device_memory_limit, device_index=0
+            device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
         )
 
         self.rmm_pool_size = rmm_pool_size
@@ -361,7 +362,11 @@ def new_worker_spec(self):
             {
                 "env": {"CUDA_VISIBLE_DEVICES": visible_devices,},
                 "plugins": {
-                    CPUAffinity(get_cpu_affinity(worker_count)),
+                    CPUAffinity(
+                        get_cpu_affinity(
+                            nvml_device_index(worker_count, visible_devices)
+                        )
+                    ),
                     RMMSetup(
                         self.rmm_pool_size,
                         self.rmm_managed_memory,
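A note on why the indexing matters: NVML enumerates physical devices only
and ignores CUDA_VISIBLE_DEVICES, while the CUDA runtime renumbers devices
relative to that list. With CUDA_VISIBLE_DEVICES="3,0,1,2", CUDA device 0
is physical GPU 3, so querying NVML with index 0 would pin a worker to CPUs
near the wrong GPU on a multi-socket machine. The sketch below is
illustrative only and not part of the series; it assumes pynvml is
installed and at least one GPU is present, and it performs the same
physical lookup that get_cpu_affinity now receives:

    import math
    import os

    import pynvml

    pynvml.nvmlInit()

    # With CUDA_VISIBLE_DEVICES="3,0,1,2", CUDA runtime device 0 is
    # physical GPU 3; NVML must be addressed with 3, not 0.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
    physical = int(visible.split(",")[0])  # what nvml_device_index(0, visible) computes

    handle = pynvml.nvmlDeviceGetHandleByIndex(physical)
    # One 64-bit word per 64 CPUs; this is the NVML call that
    # get_cpu_affinity relies on before unpacking the bitmask.
    cpu_set_size = math.ceil(os.cpu_count() / 64)
    affinity_mask = pynvml.nvmlDeviceGetCpuAffinity(handle, cpu_set_size)
    print([hex(word) for word in affinity_mask])
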
From d6cc2c75c5b4553aff108027c556165396104315 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 13 Jul 2021 12:55:30 -0700
Subject: [PATCH 3/6] Test for CPU affinity based on appropriate NVML indexing

---
 dask_cuda/tests/test_utils.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py
index b56e1b6c1..5bb170197 100644
--- a/dask_cuda/tests/test_utils.py
+++ b/dask_cuda/tests/test_utils.py
@@ -12,6 +12,7 @@
     get_preload_options,
     get_ucx_config,
     get_ucx_net_devices,
+    nvml_device_index,
     parse_cuda_visible_device,
     parse_device_memory_limit,
     unpack_bitmask,
@@ -59,6 +60,18 @@ def test_cpu_affinity():
     assert os.sched_getaffinity(0) == set(affinity)
 
 
+def test_cpu_affinity_and_cuda_visible_devices():
+    affinity = dict()
+    for i in range(get_n_gpus()):
+        # The negative here would be `device = 0` as required for CUDA runtime
+        # calls.
+        device = nvml_device_index(0, cuda_visible_devices(i))
+        affinity[device] = get_cpu_affinity(device)
+
+    for i in range(get_n_gpus()):
+        assert get_cpu_affinity(i) == affinity[i]
+
+
 def test_get_device_total_memory():
     for i in range(get_n_gpus()):
         with cuda.gpus[i]:

From 3b1bb537ff7fda78b599ca3ea2e69b97095a7122 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 13 Jul 2021 15:21:03 -0700
Subject: [PATCH 4/6] Support for CUDA_VISIBLE_DEVICES list type in
 nvml_device_index

---
 dask_cuda/utils.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
index de225dc30..b004b9893 100644
--- a/dask_cuda/utils.py
+++ b/dask_cuda/utils.py
@@ -39,6 +39,7 @@ def __init__(self, cores):
 
     def setup(self, worker=None):
         os.sched_setaffinity(0, self.cores)
+        print(os.environ["CUDA_VISIBLE_DEVICES"], self.cores)
 
 
 class RMMSetup:
@@ -499,16 +500,29 @@ def nvml_device_index(i, CUDA_VISIBLE_DEVICES):
     NVML expects the index of the physical device, unlike CUDA runtime which
     expects the address relative to `CUDA_VISIBLE_DEVICES`. This function
     returns the i-th device index from the `CUDA_VISIBLE_DEVICES`
-    comma-separated list of devices.
+    comma-separated string of devices or list.
 
     Examples
     --------
-    >>> nvml_device(1, "0,1,2,3")
+    >>> nvml_device_index(1, "0,1,2,3")
     1
-    >>> nvml_device(1, "1,2,3,0")
+    >>> nvml_device_index(1, "1,2,3,0")
     2
+    >>> nvml_device_index(1, [0,1,2,3])
+    1
+    >>> nvml_device_index(1, [1,2,3,0])
+    2
+    >>> nvml_device_index(1, 2)
+    Traceback (most recent call last):
+    ...
+    ValueError: CUDA_VISIBLE_DEVICES must be `str` or `list`
     """
-    return int(CUDA_VISIBLE_DEVICES.split(",")[i])
+    if isinstance(CUDA_VISIBLE_DEVICES, str):
+        return int(CUDA_VISIBLE_DEVICES.split(",")[i])
+    elif isinstance(CUDA_VISIBLE_DEVICES, list):
+        return CUDA_VISIBLE_DEVICES[i]
+    else:
+        raise ValueError("`CUDA_VISIBLE_DEVICES` must be `str` or `list`")
 
 
 def parse_device_memory_limit(device_memory_limit, device_index=0):
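Because the indexing logic is pure str/list handling, the generalized
helper can be exercised without a GPU. The following standalone check is
illustrative only; it duplicates the body added in PATCH 4/6 so it runs
without dask_cuda installed:

    def nvml_device_index(i, CUDA_VISIBLE_DEVICES):
        # Mirrors the implementation added in PATCH 4/6.
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            return int(CUDA_VISIBLE_DEVICES.split(",")[i])
        elif isinstance(CUDA_VISIBLE_DEVICES, list):
            return CUDA_VISIBLE_DEVICES[i]
        else:
            raise ValueError("`CUDA_VISIBLE_DEVICES` must be `str` or `list`")

    assert nvml_device_index(1, "1,2,3,0") == 2     # comma-separated string
    assert nvml_device_index(1, [1, 2, 3, 0]) == 2  # list of ints
    try:
        nvml_device_index(1, 2)                     # unsupported type
    except ValueError as err:
        print(err)
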
From e2ca0681e15ff0c09e883e6f9302438c31894a34 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 13 Jul 2021 15:21:28 -0700
Subject: [PATCH 5/6] Fix nvml_device_index usage in LocalCUDACluster

---
 dask_cuda/local_cuda_cluster.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 2ede84b04..8d08381f1 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -363,9 +363,7 @@ def new_worker_spec(self):
                 "env": {"CUDA_VISIBLE_DEVICES": visible_devices,},
                 "plugins": {
                     CPUAffinity(
-                        get_cpu_affinity(
-                            nvml_device_index(worker_count, visible_devices)
-                        )
+                        get_cpu_affinity(nvml_device_index(0, visible_devices))
                     ),
                     RMMSetup(
                         self.rmm_pool_size,

From f84fcf427d344f4e625707ada3ecf0d53f9d74af Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 14 Jul 2021 03:33:14 -0700
Subject: [PATCH 6/6] Clean up debug code

---
 dask_cuda/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
index b004b9893..d1c581acc 100644
--- a/dask_cuda/utils.py
+++ b/dask_cuda/utils.py
@@ -39,7 +39,6 @@ def __init__(self, cores):
 
     def setup(self, worker=None):
         os.sched_setaffinity(0, self.cores)
-        print(os.environ["CUDA_VISIBLE_DEVICES"], self.cores)
 
 
 class RMMSetup:
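With the series applied, workers created from a shuffled device list are
pinned to the CPUs nearest their physical GPUs. The following usage sketch
is illustrative only and not part of the series; it assumes a machine with
at least four GPUs and dask-cuda installed:

    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if __name__ == "__main__":
        # Worker 0 owns physical GPU 3; its CPU affinity is now derived
        # from NVML index 3 rather than the CUDA-runtime-relative index 0.
        cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="3,0,1,2")
        client = Client(cluster)
        print(client.scheduler_info()["workers"])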