Fix critical race condition in graceful shutdown #8522

Merged: 9 commits, Feb 23, 2024

Changes from 1 commit
9 changes: 4 additions & 5 deletions distributed/scheduler.py
@@ -7091,12 +7091,11 @@ async def retire_workers(
             If neither ``workers`` nor ``names`` are provided, we call
             ``workers_to_close`` which finds a good set.
         close_workers: bool (defaults to False)
-            Whether or not to actually close the worker explicitly from here.
-            Otherwise we expect some external job scheduler to finish off the
-            worker.
+            Whether to actually close the worker explicitly from here.
+            Otherwise, we expect some external job scheduler to finish off the worker.
         remove: bool (defaults to True)
-            Whether or not to remove the worker metadata immediately or else
-            wait for the worker to contact us.
+            Whether to remove the worker metadata immediately or else wait for the
+            worker to contact us.

             If close_workers=False and remove=False, this method just flushes the tasks
             in memory out of the workers and then returns.
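For context, here is a minimal, hypothetical sketch of the downscaling pattern these flags support (the names `downscale_one`, `scheduler`, `worker_address`, and `external_teardown` are illustrative, not part of this PR): the cluster manager retires the worker without closing it, then lets an external system tear the process down.

    # Illustrative sketch only; names are placeholders, not from the PR.
    async def downscale_one(scheduler, worker_address, external_teardown):
        # Move in-memory tasks off the worker and drop it from the scheduler's
        # metadata, but do not close the worker process from here.
        await scheduler.retire_workers(
            [worker_address], close_workers=False, remove=True
        )
        # Some external job scheduler is now expected to finish off the worker
        # (and its nanny).
        await external_teardown(worker_address)

With close_workers=False and remove=True the scheduler forgets the worker right away, which is exactly the state the new test below exercises.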
30 changes: 30 additions & 0 deletions distributed/tests/test_nanny.py
@@ -956,3 +956,33 @@ async def test_nanny_plugin_register_nanny_killed(c, s, restart):
    finally:
        proc.kill()
    assert await register == {}


@pytest.mark.slow
@gen_cluster(
    client=True,
    Worker=Nanny,
    nthreads=[("", 1)],
    worker_kwargs={"heartbeat_interval": "10ms"},
)
async def test_nanny_does_not_restart_worker_on_graceful_retirement(c, s, a):
    """Tests https://github.com/dask/distributed/pull/8522

    Some clusters (e.g. SpecCluster) implement downscaling by calling
    `Scheduler.retire_workers()` without arguments, which defaults to
    `remove=True, close_workers=False`, and then use an external system to tear
    down the worker and the nanny. In these cases, make sure that the worker
    doesn't kill itself and that the nanny doesn't restart it after the
    heartbeat to the scheduler fails.
    """
    await s.retire_workers([a.worker_address], stimulus_id="test")
    # On Linux, it takes ~3.5s for the nanny to resuscitate a worker
    await asyncio.sleep(5)
Review thread on the `await asyncio.sleep(5)` line:

Member:

This will either be missed entirely on GitHub Actions or it will make the test very flaky. If you are waiting for some condition to occur, please wait until that condition has occurred; just having a plain sleep in here is not sufficient. Besides, this also makes the test logic much harder to understand.

Collaborator Author:

> This will either be missed entirely on GitHub Actions

Yes, getting a false negative on GH Actions is a possibility.

> or will make the test very flaky.

No, because if GH is slower than my local machine, the test will simply not exercise the use case properly and return a false negative.

> If you are waiting for some condition to occur,

No, I am waiting for a condition NOT to occur.

I will rewrite the unit test to check for the heartbeat stop. It will no longer verify that the heartbeat does not call close().

    assert not s.workers
    events = [
        ev
        for _, ev in s.events["all"]
        if isinstance(ev, dict) and ev.get("action") == "add-worker"
    ]
    assert len(events) == 1
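In the spirit of the reviewer's request above, waiting on an explicit condition rather than sleeping for a fixed interval could be done with a small polling helper along these lines; `wait_until` and the usage shown are an illustrative sketch, not code from the PR.

    # Illustrative polling helper; not part of the PR.
    import asyncio
    from time import monotonic


    async def wait_until(predicate, timeout=5, period=0.05):
        """Poll ``predicate`` until it returns True or ``timeout`` seconds elapse."""
        deadline = monotonic() + timeout
        while not predicate():
            if monotonic() > deadline:
                raise TimeoutError("condition was not met in time")
            await asyncio.sleep(period)


    # Hypothetical usage inside the test, instead of the bare asyncio.sleep(5):
    #     await wait_until(lambda: a.worker_address not in s.workers)

Waiting for something not to happen, as this test does, still needs some grace period, which is why the author proposed checking that the heartbeat callback is stopped instead.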
26 changes: 19 additions & 7 deletions distributed/worker.py
@@ -1272,12 +1272,24 @@
             self._update_latency(end - start)

             if response["status"] == "missing":
-                # Scheduler thought we left. Reconnection is not supported, so just shut down.
-                logger.error(
-                    f"Scheduler was unaware of this worker {self.address!r}. Shutting down."
-                )
-                # Something is out of sync; have the nanny restart us if possible.
-                await self.close(nanny=False)
Review thread on removed lines 1279-1280:

Member:

If I read your description correctly, this is the bug. What does it even mean that "something is out of sync"? In which cases would we want this worker to be restarted like this?

The heartbeats only start once the worker is registered to the scheduler, see

    await self._register_with_scheduler()
    self.start_periodic_callbacks()

I don't see what kind of "desync" would justify a restart, and adding more complexity to this logic feels like trouble. Your test also passes if we just shut down the nanny as well.

Collaborator Author:

The test is designed to verify that the worker is not accidentally restarted after it's retired. If something kills off the worker and the nanny, it will not perturb the test.

> In which cases would we want this worker to be restarted like this?

I cannot come up with use cases. I'll remove the branch and see if anything breaks.

+                # Scheduler thought we left.
+                # Reconnection is not supported, so just shut down.
+
+                if self.status == Status.closing_gracefully:
+                    # Called Scheduler.retire_workers(remove=True, close_workers=False)
+                    # The worker will remain indefinitely in this state, unknown to the
+                    # scheduler, until something else shuts it down.
+                    # Stopping the heartbeat is just a nice-to-have to reduce
+                    # unnecessary warnings on the scheduler log.
+                    logger.info("Stopping heartbeat to the scheduler")
+                    self.periodic_callbacks["heartbeat"].stop()
+                else:
+                    logger.error(
+                        f"Scheduler was unaware of this worker {self.address!r}. "
+                        "Shutting down."
+                    )
+                    # Have the nanny restart us if possible
+                    await self.close(nanny=False, reason="worker-heartbeat-missing")
                 return

             self.scheduler_delay = response["time"] - middle

@@ -1290,7 +1302,7 @@
logger.exception("Failed to communicate with scheduler during heartbeat.")
except Exception:
logger.exception("Unexpected exception during heartbeat. Closing worker.")
await self.close()
fjetter marked this conversation as resolved.
Show resolved Hide resolved
await self.close(reason="worker-heartbeat-error")

[Codecov (codecov/patch) annotation: added line distributed/worker.py#L1305 was not covered by tests.]
             raise

     @fail_hard