Fix scheduler transition error on memory->erred #8549

Merged · 26 commits · Mar 8, 2024

Changes from 12 commits

40 changes: 32 additions & 8 deletions distributed/scheduler.py
@@ -1964,6 +1964,7 @@
)

v = a_recs.get(key, finish)
# The inner rec has higher priority? Is that always desired?

Collaborator:
I don't get this comment?

Member Author:
This is a general comment about the two-step transitions. The recommendations created by the first step are executed before the second step, which may create weird state (as it did in this case).
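
For context, here is a minimal, self-contained sketch of the two-step shape under discussion; all names are illustrative stand-ins, not the scheduler's actual transition functions. It mirrors the `v = a_recs.get(key, finish)` line above, where the inner recommendation produced by the first hop overrides the requested target:

# Toy model of a two-step transition: the first hop sends the task to
# "released", the second hop continues towards the requested `finish`.
def to_released(tasks, key):
    tasks[key] = "released"
    # The first hop may itself recommend a different target for `key`,
    # e.g. "forgotten" when nobody wants the result anymore.
    return {} if tasks.get("someone_wants_" + key) else {key: "forgotten"}

def released_to_finish(tasks, key, finish):
    tasks[key] = finish
    return {}

def transition_two_step(tasks, key, finish):
    a_recs = to_released(tasks, key)
    v = a_recs.get(key, finish)          # the inner rec wins over `finish`
    b_recs = released_to_finish(tasks, key, v)
    return {**a_recs, **b_recs}

tasks = {"x": "memory"}
recs = transition_two_step(tasks, "x", "erred")
print(tasks["x"], recs)                  # forgotten {'x': 'forgotten'}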

func = self._TRANSITIONS_TABLE["released", v]
b_recs, b_cmsgs, b_wmsgs = func(self, key, stimulus_id)

@@ -2082,7 +2083,11 @@
assert not ts.who_has
assert not ts.processing_on
for dts in ts.dependencies:
assert dts.state not in {"forgotten", "erred"}
assert dts.state not in {"forgotten", "erred"}, (
ts,
dts,
self.transition_log,
)

if ts.has_lost_dependencies:
return {key: "forgotten"}, {}, {}
@@ -2480,7 +2485,9 @@
recommendations[key] = "forgotten"
elif ts.has_lost_dependencies:
recommendations[key] = "forgotten"
elif ts.who_wants or ts.waiters:
elif (ts.who_wants or ts.waiters) and not any(
dts.state in ("erred",) for dts in ts.dependencies
):
recommendations[key] = "waiting"

for dts in ts.waiters or ():
@@ -2505,14 +2512,13 @@
assert ts.exception_blame
assert not ts.who_has
assert not ts.waiting_on
assert not ts.waiters

Member Author:
This assertion does not work in two-step transitions.


failing_ts = ts.exception_blame
assert failing_ts

for dts in ts.dependents:
dts.exception_blame = failing_ts
if not dts.who_has:
dts.exception_blame = failing_ts
recommendations[dts.key] = "erred"

report_msg = {
@@ -2547,6 +2553,9 @@

for dts in ts.dependents:
if dts.state == "erred":
# Does this make sense?
# This goes via released
# dts -> released -> waiting

Collaborator:
Agree this makes no sense to me either. Is there a unit test anywhere to shed light on it?

Member Author:
I haven't investigated this any further.

recommendations[dts.key] = "waiting"

w_msg = {
@@ -2621,8 +2630,8 @@
self,
key: Key,
stimulus_id: str,
worker: str | None = None,

Collaborator:
How can a task be processing without a worker? Is it when the worker it was processing on died and it caused the task to increase its suspicious count too much? If so, it may be a good idea to note it in a comment?

Member Author:
It appears that this has been superseded by other changes in this PR. I'll remove it and see if CI complains.

*,
worker: str,
cause: Key | None = None,
exception: Serialized | None = None,
traceback: Serialized | None = None,
@@ -2675,7 +2684,8 @@

if not ts.erred_on:
ts.erred_on = set()
ts.erred_on.add(worker)
if worker:
ts.erred_on.add(worker)

Collaborator:
I think the type in the function declaration should change to worker: str | None?

Member Author:
My bad, I didn't clean this up properly.

if exception is not None:
ts.exception = exception
ts.exception_text = exception_text
@@ -2699,8 +2709,9 @@
)

for dts in ts.dependents:
dts.exception_blame = failing_ts
recommendations[dts.key] = "erred"
if not dts.who_has:

Collaborator:
isn't this the same as saying for dts in ts.waiters?

Member Author:
Good point, looking at the code I think it should mean the same. Let's see if CI complains.
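
A rough illustration of the invariant assumed in that reply (hypothetical toy tasks, not the scheduler's TaskState): a dependent with no copy of its result in memory is still a waiter of `ts`, so filtering dependents on `not dts.who_has` should pick out the same tasks as iterating `ts.waiters`:

from dataclasses import dataclass, field

@dataclass(eq=False)
class ToyTask:
    key: str
    who_has: set = field(default_factory=set)     # workers holding this task's result
    dependents: set = field(default_factory=set)  # tasks consuming this result
    waiters: set = field(default_factory=set)     # dependents still waiting on it

ts = ToyTask("g")
h1 = ToyTask("h1")                        # result not in memory anywhere
h2 = ToyTask("h2", who_has={"worker-a"})  # result already held by a worker
ts.dependents = {h1, h2}
# Assumed invariant: waiters are exactly the dependents without data in memory.
ts.waiters = {d for d in ts.dependents if not d.who_has}

via_dependents = {d.key for d in ts.dependents if not d.who_has}
via_waiters = {d.key for d in ts.waiters}
assert via_dependents == via_waiters == {"h1"}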

dts.exception_blame = failing_ts
recommendations[dts.key] = "erred"

for dts in ts.dependencies:
if dts.waiters:
@@ -5038,6 +5049,19 @@
"stimulus_id": stimulus_id,
}
]
elif ts.state == "erred":

Collaborator:
Isn't this already covered by the next paragraph elif ts.run_id != run_id?

Member Author:
No, AFAIR, we only issue a new run_id when the task is sent to a worker again.
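
To make that distinction concrete, here is a small hedged sketch (assumed message shapes, not the actual scheduler handler) of why the run_id comparison alone would not reject this message: since the run_id only changes when the scheduler sends the task to a worker again, a late "task-finished" from the original run still carries the current run_id even though the task has already erred:

# Hypothetical, simplified view of the stimulus handling; plain dicts stand
# in for TaskState and the worker's message.
scheduler_task = {"key": "f", "state": "erred", "run_id": 7}
worker_msg = {"key": "f", "op": "task-finished", "run_id": 7}

if scheduler_task["state"] == "erred":
    # Branch added in this PR: the task already erred, so tell the worker
    # to drop its copy instead of attempting a memory -> erred transition.
    reply = {"op": "free-keys", "keys": [worker_msg["key"]]}
elif scheduler_task["run_id"] != worker_msg["run_id"]:
    # This check alone would not fire here, because no new run was started.
    reply = {"op": "free-keys", "keys": [worker_msg["key"]]}
else:
    reply = None  # accept the completion

print(reply)  # {'op': 'free-keys', 'keys': ['f']}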

Member Author (@hendrikmakait, Mar 5, 2024):
I've added another test for this. I don't like the way this handles context managers; if you have suggestions on how to simplify it, I'm all ears.

logger.debug(
"Received already erred task, worker: %s" ", key: %s",
worker,
key,
)
worker_msgs[worker] = [
{
"op": "free-keys",
"keys": [key],
"stimulus_id": stimulus_id,
}
]
elif ts.run_id != run_id:
if not ts.processing_on or ts.processing_on.address != worker:
logger.debug(
90 changes: 90 additions & 0 deletions distributed/tests/test_scheduler.py
@@ -4890,3 +4890,93 @@ async def test_resubmit_different_task_same_key_warns_only_once(

async with Worker(s.address):
assert await c.gather(zs) == [2, 3, 4] # Kept old ys


def block(x, in_event, block_event):
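# Signal that the task has started running, then block until the test
# releases it; used to control task timing from the test body.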
in_event.set()
block_event.wait()
return x


@gen_cluster(
client=True,
nthreads=[("", 1, {"resources": {"a": 1}})],
config={"distributed.scheduler.allowed-failures": 1},
)
async def test_fan_out_pattern_deadlock(c, s, a):
"""Regression test for https://github.com/dask/distributed/issues/8548

This test heavily uses resources to force scheduling decisions.
"""
in_ancestor = Event()
block_ancestor = Event()
in_on_a_descendant = Event()
in_on_b_descendant = Event()
block_on_a_descendant = Event()
block_on_b_descendant = Event()
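# Graph under test: f -> g, then fan-out g -> {h1, h2}; 'f' and 'g' are
# pinned to the ephemeral worker via resource "b", 'h2' to worker 'a'
# via resource "a".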

# Input task to 'g' that we can fail
with dask.annotate(resources={"b": 1}):
f = delayed(block)(1, in_ancestor, block_ancestor, dask_key_name="f")
g = delayed(inc)(f, dask_key_name="g")

# Fan-out from 'g' and run h1 and h2 on different workers
h1 = delayed(block)(
g, in_on_b_descendant, block_on_b_descendant, dask_key_name="h1"
)
with dask.annotate(resources={"a": 1}):
h2 = delayed(block)(
g, in_on_a_descendant, block_on_a_descendant, dask_key_name="h2"
)
del g

f, h1, h2 = c.compute([f, h1, h2])
with captured_logger("distributed.scheduler", level=logging.ERROR) as logger:
async with Worker(s.address, nthreads=1, resources={"b": 1}) as b:
await block_ancestor.set()
await asyncio.gather(
in_on_a_descendant.wait(),
in_on_b_descendant.wait(),
)
await in_ancestor.clear()

# Make sure that the scheduler knows that both workers hold 'g' in memory
while len(s.tasks["g"].who_has) < 2:
await asyncio.sleep(0.1)
# Remove worker 'b' while it's processing h1
await s.remove_worker(b.address, stimulus_id="remove_b")
await block_on_b_descendant.set()
await b.close()
await block_ancestor.clear()

# Repeatedly remove new instances of the 'b' worker while it processes 'f'
# to trigger a transition of 'f' to 'erred'
async with Worker(s.address, nthreads=1, resources={"b": 1}) as b:
await in_ancestor.wait()
await in_ancestor.clear()
await s.remove_worker(b.address, stimulus_id="remove_b")
await block_ancestor.set()
await b.close()
await block_ancestor.clear()

async with Worker(s.address, nthreads=1, resources={"b": 1}) as b:
await in_ancestor.wait()
await in_ancestor.clear()
await s.remove_worker(b.address, stimulus_id="remove_b")
await block_ancestor.set()
await b.close()

await block_on_a_descendant.set()
await h2

with pytest.raises(KilledWorker, match="Attempted to run task 'h1'"):
await h1

del h1, h2
# Make sure that h2 gets forgotten on worker 'a'
await async_poll_for(lambda: not a.state.tasks, timeout=5)
# Ensure that no other errors including transition failures were logged
assert (
logger.getvalue()
== "Task h1 marked as failed because 2 workers died while trying to run it\nTask f marked as failed because 2 workers died while trying to run it\n"
)