Skip to content

Commit

Permalink
Remove ucp.reset() requirement from test_dgx (#1269)
Browse files Browse the repository at this point in the history
By moving the `ucp.get_active_transports()` call into the subprocess, we remove the requirement to reset UCX in the `pytest` process, preventing potential interference with tests that run afterwards.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #1269
  • Loading branch information
pentschev authored Nov 6, 2023
1 parent 004185e commit e5b240c
Showing 1 changed file with 16 additions and 10 deletions.
26 changes: 16 additions & 10 deletions dask_cuda/tests/test_dgx.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,22 @@ def test_tcp_only():


def _test_ucx_infiniband_nvlink(
protocol, enable_infiniband, enable_nvlink, enable_rdmacm
skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm
):
cupy = pytest.importorskip("cupy")
if protocol == "ucx":
ucp = pytest.importorskip("ucp")
elif protocol == "ucxx":
ucp = pytest.importorskip("ucxx")

if enable_infiniband and not any(
[at.startswith("rc") for at in ucp.get_active_transports()]
):
skip_queue.put("No support available for 'rc' transport in UCX")
return
else:
skip_queue.put("ok")

if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None:
enable_tcp_over_ucx = None
cm_tls = ["all"]
Expand Down Expand Up @@ -205,17 +213,16 @@ def check_ucx_options():
)
def test_ucx_infiniband_nvlink(protocol, params):
if protocol == "ucx":
ucp = pytest.importorskip("ucp")
pytest.importorskip("ucp")
elif protocol == "ucxx":
ucp = pytest.importorskip("ucxx")
pytest.importorskip("ucxx")

if params["enable_infiniband"]:
if not any([at.startswith("rc") for at in ucp.get_active_transports()]):
pytest.skip("No support available for 'rc' transport in UCX")
skip_queue = mp.Queue()

p = mp.Process(
target=_test_ucx_infiniband_nvlink,
args=(
skip_queue,
protocol,
params["enable_infiniband"],
params["enable_nvlink"],
Expand All @@ -225,9 +232,8 @@ def test_ucx_infiniband_nvlink(protocol, params):
p.start()
p.join()

# Starting a new cluster on the same pytest process after an rdmacm cluster
# has been used may cause UCX-Py to complain about being already initialized.
if params["enable_rdmacm"] is True:
ucp.reset()
skip_msg = skip_queue.get()
if skip_msg != "ok":
pytest.skip(skip_msg)

assert not p.exitcode

0 comments on commit e5b240c

Please sign in to comment.