From cdb38ad025e6e027b99f731112fccba2a484c95a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 21:57:31 +0200 Subject: [PATCH] Increase minimum timeout to wait for workers in CI (#1192) (#1193) We have been getting timeouts waiting for workers in CI that are not reproducible locally. The reason is probably some sort of congestion causing spinup to take longer in CI; therefore, this change introduces an environment variable that can be used to control the minimum timeout, and the minimum timeout is doubled in CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) --- ci/test_python.sh | 1 + dask_cuda/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index b9610bcaf..c988ee15e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -41,6 +41,7 @@ set +e rapids-logger "pytest dask-cuda" pushd dask_cuda DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 468c37f47..9fe31333b 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -446,7 +446,9 @@ def wait_workers( client: distributed.Client Instance of client, used to query for number of workers connected. min_timeout: float - Minimum number of seconds to wait before timeout. + Minimum number of seconds to wait before timeout. This value may be + overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT` with + a positive integer. seconds_per_gpu: float Seconds to wait for each GPU on the system. For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, @@ -463,6 +465,8 @@ def wait_workers( ------- True if all workers were started, False if a timeout occurs. 
""" + min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None) + min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env) n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus)