Skip to content

Commit

Permalink
Catch BaseException on UCX read error (dask#6996)
Browse files Browse the repository at this point in the history
  • Loading branch information
pentschev authored and gjoseph92 committed Oct 31, 2022
1 parent e05d14d commit 131aa0b
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
13 changes: 13 additions & 0 deletions distributed/comm/tests/test_ucx.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from distributed import Client, Scheduler, wait
from distributed.comm import connect, listen, parse_address, ucx
from distributed.comm.core import CommClosedError
from distributed.comm.registry import backends, get_backend
from distributed.deploy.local import LocalCluster
from distributed.diagnostics.nvml import has_cuda_context
Expand Down Expand Up @@ -367,3 +368,15 @@ async def test_ucx_unreachable(
):
with pytest.raises(OSError, match="Timed out trying to connect to"):
await Client("ucx://255.255.255.255:12345", timeout=1, asynchronous=True)


@gen_test()
async def test_comm_closed_on_read_error():
reader, writer = await get_comm_pair()

# Depending on the UCP protocol selected, it may raise either
# `asyncio.TimeoutError` or `CommClosedError`, so validate either one.
with pytest.raises((asyncio.TimeoutError, CommClosedError)):
await asyncio.wait_for(reader.read(), 0.01)

assert reader.closed()
12 changes: 7 additions & 5 deletions distributed/comm/ucx.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,14 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")):
await self.ep.recv(header)
header = struct.unpack(header_fmt, header)
cuda_frames, sizes = header[:nframes], header[nframes:]
except (
ucp.exceptions.UCXCloseError,
ucp.exceptions.UCXCanceled,
) + (getattr(ucp.exceptions, "UCXConnectionReset", ()),):
except BaseException as e:
# In addition to UCX exceptions, may be CancelledError or a another
# "low-level" exception. The only safe thing to do is to abort.
# (See also https://github.com/dask/distributed/pull/6574).
self.abort()
raise CommClosedError("Connection closed by writer")
raise CommClosedError(
f"Connection closed by writer.\nInner exception: {e!r}"
)
else:
# Recv frames
frames = [
Expand Down

0 comments on commit 131aa0b

Please sign in to comment.