From ab1d35ce032f2a8faa9b92d130850fed078988c7 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 21 May 2021 16:06:47 +0200 Subject: [PATCH] Disable reuse endpoints with UCX >= 1.11 (#620) The UCX-Py endpoint reuse is no longer necessary, so we also disable it for UCX 1.11+. The primary reason it was introduced was to circumvent an issue with CUDA IPC that was resolved by https://github.com/openucx/ucx/pull/6360. Using the endpoint reuse class has also proven to be very slow, taking a long time to initialize for clusters with just a few dozen workers and being pretty much unusable for a cluster on the order of 100 workers. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/620 --- dask_cuda/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 1ac441cbf..253a41dfc 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -27,8 +27,10 @@ def nvtx_annotate(message=None, color="blue", domain=None): import ucp _ucx_110 = ucp.get_ucx_version() >= (1, 10, 0) + _ucx_111 = ucp.get_ucx_version() >= (1, 11, 0) except ImportError: _ucx_110 = False + _ucx_111 = False class CPUAffinity: @@ -247,7 +249,7 @@ def get_ucx_config( "rdmacm": None, "net-devices": None, "cuda_copy": None, - "reuse-endpoints": True, + "reuse-endpoints": not _ucx_111, } if enable_tcp_over_ucx or enable_infiniband or enable_nvlink: ucx_config["cuda_copy"] = True