Do not drop BatchedSend payload if worker reconnects #5457

Closed
Changes from 1 commit
13 changes: 9 additions & 4 deletions distributed/batched.py
@@ -58,6 +58,7 @@ def __init__(self, interval, loop=None, serializers=None):

    def start(self, comm):
        self.comm = comm
        self.please_stop = False
Collaborator:

Suggested change (replacing the line self.please_stop = False):

    self.please_stop = False
    self.stopped.clear()
    self.waker.clear()

Collaborator:

What I mean by a "clear interface" is that I'd like a docstring for start saying the contract for what it can/can't do. For example, that start on a closed BatchedSend will restart it with a new comm. And that start on a currently-running BatchedSend raises an error, etc. What are the invariants for this internal state? How do all the methods ensure they are upheld?
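A minimal sketch of what such a start() contract could look like (illustrative only; the RuntimeError guard and the wording are assumptions, not code from this PR):

    def start(self, comm):
        """Attach a new comm and (re)start the background send loop.

        Sketch of a contract:
        - start on a closed BatchedSend restarts it with the new comm; any
          messages still in self.buffer are retained and sent over that comm.
        - start on a currently running BatchedSend is an error.
        """
        if not self.closed():
            raise RuntimeError("BatchedSend is already running")
        self.comm = comm
        self.please_stop = False
        self.loop.add_callback(self._background_send)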

        self.loop.add_callback(self._background_send)
Collaborator:

One downside of using Tornado here is that we have no handle on this coroutine. If we did asyncio.create_task or similar, we could keep a Task object here.

  1. That might let us simplify the please_stop/stopped/abort logic by just letting us do task.cancel() and handling the CancelledError within _background_send. We would probably want to shield the comm.write call from cancellation if we did so.
  2. We could assert in send and elsewhere that the _background_send Task is not done(). If we're also checking please_stop, etc. this may be overkill, but it's a nice sanity check that something hasn't gone horribly wrong. The fact that the coroutine wasn't running was the cause of Properly support restarting BatchedSend #5481; a check for this would have made the bug much easier to find.
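A standalone sketch of the Task-based approach described in this comment (the class and attribute names are illustrative, not distributed's API):

    import asyncio

    class TaskBackedBatchedSend:
        """Sketch: hold an asyncio.Task instead of a fire-and-forget callback."""

        def __init__(self, interval):
            self.interval = interval
            self.buffer = []
            self.comm = None
            self._send_task = None  # handle on the background coroutine

        def start(self, comm):
            self.comm = comm
            self._send_task = asyncio.ensure_future(self._background_send())

        async def _background_send(self):
            try:
                while True:
                    await asyncio.sleep(self.interval)
                    if not self.buffer:
                        continue
                    payload, self.buffer = self.buffer, []
                    # Shield the write so a cancellation cannot interrupt it
                    # mid-frame; the shielded write runs to completion.
                    await asyncio.shield(self.comm.write(payload))
            except asyncio.CancelledError:
                pass  # graceful shutdown requested via close()

        def send(self, msg):
            # Sanity check from point 2: the background task must be alive.
            assert self._send_task is not None and not self._send_task.done()
            self.buffer.append(msg)

        async def close(self):
            if self._send_task is not None:
                self._send_task.cancel()
                await asyncio.gather(self._send_task, return_exceptions=True)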


    def closed(self):
Collaborator:

please_stop and stopped should probably factor into this as well

Collaborator:

I still think this check is insufficient for telling whether we're closed or not. While we're in the process of closing (please_stop is True, or self.stopped is set, or _background_send is no longer running, etc.), it may still return True.

From the way this is used though, in both start and write, we probably want to treat "in the process of closing" as closed, not as running. Restarting a BatchedSend that's closing should be an error. If writing to a closed BatchedSend is an error, then so should be writing to one that's in the process of closing.
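A sketch of the stricter check described here, reusing attributes that already exist on BatchedSend (the exact combination is an assumption, not the PR's implementation):

    def closed(self):
        """Treat "in the process of closing" as closed."""
        return (
            self.please_stop
            or self.stopped.is_set()
            or self.comm is None
            or self.comm.closed()
        )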

@@ -98,6 +99,7 @@ def _background_send(self):
                else:
                    self.recent_message_log.append("large-message")
                self.byte_count += nbytes
                payload = []  # lose ref
            except CommClosedError:
                logger.info("Batched Comm Closed %r", self.comm, exc_info=True)
                break
@@ -111,7 +113,9 @@
                logger.exception("Error in batched write")
                break
            finally:
                payload = None  # lose ref
                # If anything failed we should not lose the payload. If a new comm
                # is provided we can still resubmit messages
                self.buffer = payload + self.buffer
Collaborator:

Are you sure we should retain payload? Couldn't those messages have been successfully sent, and the error occurred after? Then we might be duplicating them when we reconnect.

Though maybe we'd rather duplicate messages than drop them. In that case, let's add a note saying so.

Member Author:

At this point I am assuming a certain behaviour of the comm: either the comm writes everything or nothing. That's likely not always true, but I believe we cannot do much about it at this level of abstraction. IMHO, that guarantee should be implemented by our protocol and/or the Comm interface.

Either way, I'm happy to take any suggestions for improving this.

Collaborator:

After reading through TCP comm code, I don't think there's anywhere for an exception to happen after all of the payload has been sent. An exception could still happen when part has been sent and part hasn't, but either way, since we have to assume something here, I think it's more useful to assume that the payload hasn't been sent.

@jcrist and I were discussing this, and given the way the BatchedSend interface works, it actually needs to implement some sort of protocol with an ack from the receiver for each sent batch to guarantee messages can't be dropped. Since send on a BatchedSend is nonblocking, the caller is basically handing off full responsibility for the message to BatchedSend. If the message fails to send, it's too late to raise an error and let the caller figure out what to do about it—once we've been given a message, we have to ensure it's delivered. So the logical thing to do would be for the receiving side to ack each message, and only when ack'd does the sender drop the payload (with some deduplication of course).

OTOH there are lots of protocols out there for doing things like this, more performantly, robustly, and with better testing than we'll ever have. As with other things (framing in serialization), maybe the better solution is to stop duplicating functionality at the application level that should be the transport layer's job.

Could we get rid of BatchedSend entirely with some well-tuned TCP buffering settings + a serialization scheme that was more efficient for many small messages?
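For illustration, a hypothetical ack-based scheme along these lines (none of these names exist in distributed; it only shows the retain-until-acked and dedup-by-sequence-number idea):

    class AckingSender:
        def __init__(self, comm):
            self.comm = comm
            self.seq = 0
            self.unacked = {}  # seq -> payload, retained until acknowledged

        async def send_batch(self, payload):
            self.seq += 1
            self.unacked[self.seq] = payload
            await self.comm.write({"seq": self.seq, "msgs": payload})

        def handle_ack(self, seq):
            # The receiver confirmed delivery; only now is the payload dropped.
            self.unacked.pop(seq, None)

        def pending(self):
            # On reconnect, resend everything not yet acknowledged.
            return [self.unacked[s] for s in sorted(self.unacked)]

    class DedupingReceiver:
        def __init__(self):
            self.last_seen = 0

        def receive(self, batch):
            if batch["seq"] <= self.last_seen:
                return []  # duplicate after a resend; already processed
            self.last_seen = batch["seq"]
            return batch["msgs"]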

        else:
            # nobreak. We've been gracefully closed.
            self.stopped.set()
@@ -121,7 +125,6 @@ def _background_send(self):
        # there was an exception when using `comm`.
        # We can't close gracefully via `.close()` since we can't send messages.
        # So we just abort.
        # This means that any messages in our buffer our lost.
        # To propagate exceptions, we rely on subsequent `BatchedSend.send`
        # calls to raise CommClosedErrors.
        self.stopped.set()
Collaborator:

Can't comment there, but this is a comment for send:

        if self.comm is not None and self.comm.closed():
            raise CommClosedError(f"Comm {self.comm!r} already closed.")

This check feels insufficient. A more thorough check would have caught #5481. Should we also check not self.closed()? not self.please_stop? That the _background_send coroutine is still running? Things like that? It seems inconsistent to prohibit sends when the underlying comm is closed, but still allow them if the BatchedSend itself is effectively closed.

On the other hand, we may be relying on the fact that messages can be enqueued to a BatchedSend even when it's closed. If we expect the BatchedSend to be restarted and reused soon, perhaps there should be no restrictions on when send can be called.

I'd like to:

  1. Figure out exactly what the behavior is we actually want
  2. Write a docstring explaining the contract for when send can and cannot be called, how it behaves in the case of an open vs a closed underlying comm, an open vs closed BatchedSend, etc.
  3. Make the code match this contract
  4. Test that it does so
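A sketch of what such a stricter send() could look like (it reuses attributes that already exist in batched.py; the closed()-based guard expresses the contract proposed above and is not the PR's code):

    def send(self, *msgs):
        """Schedule messages for sending to the other side.

        Sketch of a contract: enqueueing is only allowed while the
        BatchedSend is running; a closed or closing instance rejects
        messages, since nothing would ever deliver them.
        """
        if self.closed():  # would cover comm closed, please_stop, stopped
            raise CommClosedError(f"Comm {self.comm!r} already closed.")
        self.message_count += len(msgs)
        self.buffer.extend(msgs)
        if self.next_deadline is None:
            self.waker.set()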

@@ -152,6 +155,7 @@ def close(self, timeout=None):
        self.please_stop = True
        self.waker.set()
        yield self.stopped.wait(timeout=timeout)
        payload = []
Collaborator:

could move this below the if not self.comm.closed():
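The reordering being suggested would look roughly like this (the buffer swap line belongs to the existing close() and is collapsed in the view above):

    if not self.comm.closed():
        payload = []  # only needed on this branch
        try:
            if self.buffer:
                payload, self.buffer = self.buffer, []
                yield self.comm.write(
                    payload, serializers=self.serializers, on_error="raise"
                )
        except CommClosedError:
            logger.error("Lost %i payload messages.", len(payload))
        yield self.comm.close()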

        if not self.comm.closed():
            try:
                if self.buffer:
@@ -160,14 +164,15 @@
                        payload, serializers=self.serializers, on_error="raise"
                    )
            except CommClosedError:
                pass
                # If we're closing and there is an error, there is little we
                # can do to recover.
                logger.error("Lost %i payload messages.", len(payload))
            yield self.comm.close()

    def abort(self):
Collaborator:

Should abort be part of the public interface? When/how should callers use abort vs close? What can and cannot be done with a BatchedSend after abort (or close) has been called? These are all things I'd like to see documented.
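A sketch of the kind of documentation being asked for (the semantics described here are inferred from this diff, not an authoritative statement of the API):

    def abort(self):
        """Stop immediately and drop any buffered messages.

        Sketch: close() tries to flush the remaining buffer over the comm and
        then closes the comm, and the BatchedSend may later be restarted via
        start(); abort() gives up on delivery, discards the buffer and aborts
        the comm without flushing. After either call, start() with a fresh
        comm is required before the instance can be used again.
        """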

        if self.comm is None:
            return
        self.please_stop = True
        self.buffer = []
        self.waker.set()
        if not self.comm.closed():
            self.comm.abort()
Collaborator:

Comment for both here and close(): why should the BatchedSend close/abort the comm when it's closed? Does the caller expect that, by calling start, it has handed off lifecycle responsibility for the comm to the BatchedSend? If so, that should be documented ("once you call start, you must not directly use the comm object").

22 changes: 22 additions & 0 deletions distributed/tests/test_batched.py
@@ -249,3 +249,25 @@ async def test_serializers():
assert "function" in value

assert comm.closed()


@pytest.mark.asyncio
async def test_retain_buffer_commclosed():
async with EchoServer() as e:
with captured_logger("distributed.batched") as caplog:
comm = await connect(e.address)

b = BatchedSend(interval="1s", serializers=["msgpack"])
b.start(comm)
b.send("foo")
assert b.buffer
await comm.close()
await asyncio.sleep(1)

assert "Batched Comm Closed" in caplog.getvalue()
assert b.buffer

new_comm = await connect(e.address)
b.start(new_comm)
assert await new_comm.read() == ("foo",)
assert not b.buffer
43 changes: 38 additions & 5 deletions distributed/tests/test_worker.py
@@ -2568,11 +2568,9 @@ def fast_on_a(lock):

assert "Unexpected worker completed task" in s_logs.getvalue()

# Ensure that all in-memory tasks on A have been restored on the
# scheduler after reconnect
for ts in a.tasks.values():
if ts.state == "memory":
assert a.address in {ws.address for ws in s.tasks[ts.key].who_has}
sts = s.tasks[f3.key]
assert sts.state == "memory"
assert s.workers[a.address] in sts.who_has

del f1, f2, f3
while any(w.tasks for w in [a, b]):
@@ -3196,3 +3194,38 @@ async def test_deadlock_cancelled_after_inflight_before_gather_from_worker(
    args, kwargs = mocked_gather.call_args
    await Worker.gather_dep(b, *args, **kwargs)
    await fut3


@gen_cluster(nthreads=[("", 1)])
async def test_dont_loose_payload_reconnect(s, w):
    """Ensure that payload of a BatchedSend is not lost if a worker reconnects"""
    s.count = 0

    def receive(worker, msg):
        s.count += 1

    s.stream_handlers["receive-msg"] = receive
    w.batched_stream.next_deadline = w.loop.time() + 10_000

    for x in range(100):
        w.batched_stream.send({"op": "receive-msg", "msg": x})

    await s.stream_comms[w.address].comm.close()
    while not w.batched_stream.comm.closed():
        await asyncio.sleep(0.1)
    before = w.batched_stream.buffer.copy()
    w.batched_stream.next_deadline = w.loop.time()
    assert len(w.batched_stream.buffer) == 100
    with captured_logger("distributed.batched") as caplog:
        await w.batched_stream._background_send()

    assert "Batched Comm Closed" in caplog.getvalue()
    after = w.batched_stream.buffer.copy()

    # Payload that couldn't be submitted is prepended
    assert len(after) >= len(before)
    assert after[: len(before)] == before
Comment on lines +3261 to +3263

Collaborator (@crusaderky, Oct 29, 2021):
If everything went through before the close, then you're not testing anything useful.
From what I can read here it might as well be what happens every time.
The opposite might also be true - everything goes through AFTER the close.

Could you change it to

    assert before
    assert len(after) > len(before)

while possibly drastically increasing the number of messages in the pipeline, to avoid flakiness?

Member Author (@fjetter, Oct 29, 2021):
  1. assert before is already asserted, as assert len(w.batched_stream.buffer) == 100 in L3218.
  2. Increasing the number of messages sent should not affect anything, since the batched send is implemented as all or nothing.
  3. Actually, the most probable scenario is after == before, since you only get a difference if something in the scheduler sent another message while the background send is awaiting the write to the closed comm, which is an incredibly tiny time window. At first I had the equality assert only, but that caused this test to fail every now and then; I added the greater-or-equal to avoid flakiness. Equal is fine. after > before is fine iff the first len(before) messages are the same as before.

If the payload had been submitted before closing, the buffer would be empty before I try to send.

If we absolutely want >, the only way I see is to schedule futures that try to send something concurrently with the await of background_send in L3220, but that would make our buffer assert impossible, since we could no longer say for sure what the before state actually was.

Collaborator:

Can you explicitly call batched_stream.send(..) after the comm has been closed, and test that it has the correct behavior? (As I mentioned in another comment I don't know what this behavior should be; raise an error or just add to the buffer?)

Member Author:

That's been done as part of test_send_before_close. I didn't change the behaviour but extended the test a bit. The behaviour is as follows:

  1. If a send is successful, the message is appended to the buffer. The BatchedSend is now responsible for delivery. The responsibility of the server is to check periodically and restart by providing a new comm if necessary.
  2. If an exception is raised, the code should raise immediately and not append to the buffer. Whatever logic was trying to send a message can then decide how the failure should be handled.

To me, that's sensible behaviour.
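A caller-side sketch of that contract (the helper name, logger, and fallback handling are hypothetical):

    def notify_scheduler(batched_send, msg):
        try:
            # On success the message lands in the buffer and the BatchedSend
            # now owns delivery, including across a reconnect.
            batched_send.send(msg)
        except CommClosedError:
            # send() failed fast and did not buffer; the caller decides what
            # to do, e.g. retry later or drop the notification.
            logger.warning("Could not enqueue %r, stream is closed", msg)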


await w.heartbeat()
Collaborator:

I assume this is to make the worker reconnect to the scheduler. Is there a way we could wait for that to happen naturally, to test that it does so properly? Or do we have other good tests around worker reconnection that we're confident in?

Member Author:

Yes, the heartbeat triggers a reconnect. There are test_worker_reconnects_mid_compute and test_worker_reconnects_mid_compute_multiple_states_on_scheduler, which test that the worker reconnects and corrects the scheduler state accordingly if any tasks are on the cluster. I consider testing the reconnect itself out of scope for this.

    while not s.count == 100:
        await asyncio.sleep(0.1)
1 change: 1 addition & 0 deletions distributed/worker.py
@@ -1238,6 +1238,7 @@ async def handle_scheduler(self, comm):
                comm, every_cycle=[self.ensure_communicating, self.ensure_computing]
            )
        except Exception as e:
            self.batched_stream.please_stop = True
Collaborator:

Could we not mutate internal state of BatchedSend? This feels brittle. I see this is going to do something slightly different from batched_stream.close(). If we need the ability to do both things, let's have a method for doing whatever this is as well (request_stop?).

Though I'd prefer having just one close() function. If the behavior of the current close isn't useful to calling code, perhaps we should change its behavior to something that is useful.
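A sketch of what such a method could look like (the name request_stop comes from this comment; waking the loop is an added assumption, not something the inline version above does):

    def request_stop(self):
        """Ask the background send loop to wind down without flushing or
        closing the comm."""
        self.please_stop = True
        self.waker.set()  # wake _background_send so it notices promptly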

Member Author:

That was actually not even necessary: the BatchedSend automatically infers that the Comm is closed, and there is no need to close anything explicitly.

            logger.exception(e)
            raise
        finally: