From 2eec21e04b6f23c2a72e51599bc0b4553d96cd84 Mon Sep 17 00:00:00 2001
From: Gabe Joseph
Date: Fri, 5 Aug 2022 09:13:50 -0600
Subject: [PATCH] Ensure Nanny doesn't restart workers that fail to start, and
 joins subprocess (#6427)

---
 distributed/core.py                    |  3 +-
 distributed/nanny.py                   | 60 +++++++++++++---------
 distributed/process.py                 | 14 ++----
 distributed/tests/test_asyncprocess.py |  3 +-
 distributed/tests/test_nanny.py        | 70 ++++++++++++++++++++++----
 5 files changed, 102 insertions(+), 48 deletions(-)

diff --git a/distributed/core.py b/distributed/core.py
index 14e6949278..45c9c95e5f 100644
--- a/distributed/core.py
+++ b/distributed/core.py
@@ -15,7 +15,7 @@
 from contextlib import suppress
 from enum import Enum
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, TypedDict, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, TypedDict, TypeVar, final

 import tblib
 from tlz import merge
@@ -462,6 +462,7 @@ async def start_unsafe(self):
         await self.rpc.start()
         return self

+    @final
     async def start(self):
         async with self._startup_lock:
             if self.status == Status.failed:
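Note: the `@final` on `Server.start` pins down the template-method contract that the rest of this patch leans on. Subclasses override `start_unsafe()`, while `start()` alone owns the startup lock and failure handling, converting any startup error into a `RuntimeError` chained from the original (this is what `test_worker_start_exception` below asserts). A minimal sketch of that contract, using simplified names and string statuses rather than the real `distributed.core` ones:

import asyncio
from typing import final


class MiniServer:
    """Toy stand-in for distributed.core.Server; names are illustrative only."""

    def __init__(self) -> None:
        self._startup_lock = asyncio.Lock()
        self.status = "init"

    async def start_unsafe(self):
        """Subclass hook: perform the actual startup work."""
        return self

    @final
    async def start(self):
        async with self._startup_lock:
            try:
                await self.start_unsafe()
            except Exception as exc:
                self.status = "failed"
                raise RuntimeError(f"{type(self).__name__} failed to start") from exc
            self.status = "running"
            return self


class FailingServer(MiniServer):
    async def start_unsafe(self):
        raise ValueError("broken")

With this split, `await FailingServer().start()` raises `RuntimeError("FailingServer failed to start")` with the `ValueError` attached as `__cause__`, and the server is left in the "failed" state rather than half-started.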
diff --git a/distributed/nanny.py b/distributed/nanny.py
index 0e73206339..8012a3d545 100644
--- a/distributed/nanny.py
+++ b/distributed/nanny.py
@@ -398,7 +398,7 @@ async def instantiate(self) -> Status:
             self.process = WorkerProcess(
                 worker_kwargs=worker_kwargs,
                 silence_logs=self.silence_logs,
-                on_exit=self._on_exit_sync,
+                on_exit=self._on_worker_exit_sync,
                 worker=self.Worker,
                 env=self.env,
                 config=self.config,
@@ -490,19 +490,20 @@ def is_alive(self):
     def run(self, comm, *args, **kwargs):
         return run(self, comm, *args, **kwargs)

-    def _on_exit_sync(self, exitcode):
+    def _on_worker_exit_sync(self, exitcode):
         try:
-            self._ongoing_background_tasks.call_soon(self._on_exit, exitcode)
+            self._ongoing_background_tasks.call_soon(self._on_worker_exit, exitcode)
         except AsyncTaskGroupClosedError:
             # Async task group has already been closed, so the nanny is already clos(ed|ing).
             pass

     @log_errors
-    async def _on_exit(self, exitcode):
+    async def _on_worker_exit(self, exitcode):
         if self.status not in (
             Status.init,
             Status.closing,
             Status.closed,
             Status.closing_gracefully,
+            Status.failed,
         ):
             try:
                 await self._unregister()
@@ -517,6 +518,7 @@ async def _on_exit(self, exitcode):
             Status.closing,
             Status.closed,
             Status.closing_gracefully,
+            Status.failed,
         ):
             logger.warning("Restarting worker")
             await self.instantiate()
@@ -577,7 +579,7 @@ async def close(self, timeout=5):
             if self.process is not None:
                 await self.kill(timeout=timeout)
         except Exception:
-            pass
+            logger.exception("Error in Nanny killing Worker subprocess")
         self.process = None
         await self.rpc.close()
         self.status = Status.closed
@@ -662,15 +664,15 @@ async def start(self) -> Status:
             await self.process.start()
         except OSError:
             logger.exception("Nanny failed to start process", exc_info=True)
-            self.process.terminate()
+            # NOTE: doesn't wait for process to terminate, just for terminate signal to be sent
+            await self.process.terminate()
             self.status = Status.failed
-            return self.status
         try:
             msg = await self._wait_until_connected(uid)
         except Exception:
-            logger.exception("Failed to connect to process")
+            # NOTE: doesn't wait for process to terminate, just for terminate signal to be sent
+            await self.process.terminate()
             self.status = Status.failed
-            self.process.terminate()
             raise
         if not msg:
             return self.status
@@ -731,7 +733,12 @@ def mark_stopped(self):
     async def kill(self, timeout: float = 2, executor_wait: bool = True) -> None:
         """
         Ensure the worker process is stopped, waiting at most
-        *timeout* seconds before terminating it abruptly.
+        ``timeout * 0.8`` seconds before killing it abruptly.
+
+        When `kill` returns, the worker process has been joined.
+
+        If the worker process does not terminate within ``timeout`` seconds,
+        even after being killed, `asyncio.TimeoutError` is raised.
         """
         deadline = time() + timeout

@@ -740,32 +747,38 @@ async def kill(self, timeout: float = 2, executor_wait: bool = True) -> None:
         if self.status == Status.stopping:
             await self.stopped.wait()
             return
-        assert self.status in (Status.starting, Status.running)
+        assert self.status in (
+            Status.starting,
+            Status.running,
+            Status.failed,  # process failed to start, but hasn't been joined yet
+        ), self.status
         self.status = Status.stopping
         logger.info("Nanny asking worker to close")

         process = self.process
+        assert self.process
+        wait_timeout = timeout * 0.8
         self.child_stop_q.put(
             {
                 "op": "stop",
-                "timeout": max(0, deadline - time()) * 0.8,
+                "timeout": wait_timeout,
                 "executor_wait": executor_wait,
             }
         )
         await asyncio.sleep(0)  # otherwise we get broken pipe errors
         self.child_stop_q.close()

-        while process.is_alive() and time() < deadline:
-            await asyncio.sleep(0.05)
+        try:
+            await process.join(wait_timeout)
+            return
+        except asyncio.TimeoutError:
+            pass

-        if process.is_alive():
-            logger.warning(
-                f"Worker process still alive after {timeout} seconds, killing"
-            )
-            try:
-                await process.terminate()
-            except Exception as e:
-                logger.error("Failed to kill worker process: %s", e)
+        logger.warning(
+            f"Worker process still alive after {wait_timeout} seconds, killing"
+        )
+        await process.kill()
+        await process.join(max(0, deadline - time()))

     async def _wait_until_connected(self, uid):
         while True:
@@ -783,9 +796,6 @@ async def _wait_until_connected(self, uid):
                 continue

             if "exception" in msg:
-                logger.error(
-                    "Failed while trying to start worker process: %s", msg["exception"]
-                )
                 raise msg["exception"]
             else:
                 return msg
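Note: the new `kill()` above splits its time budget two ways: 80% of `timeout` goes to asking the worker to stop gracefully, the remainder to a hard kill plus join, and it now lets `asyncio.TimeoutError` propagate instead of returning with the process possibly still alive. A runnable sketch of that two-phase shutdown against a stub process; the stub class and its `ask_to_stop` method are assumptions for illustration, not the real `WorkerProcess` API:

import asyncio
from time import time


class StubProc:
    """Stand-in for the worker subprocess; only the methods used below."""

    def __init__(self) -> None:
        self._exited = asyncio.Event()

    def ask_to_stop(self, timeout: float) -> None:
        pass  # ignores the request, like a hung worker

    async def kill(self) -> None:
        self._exited.set()  # pretend SIGKILL worked

    async def join(self, timeout: float) -> None:
        # Mirrors AsyncProcess.join above: raises asyncio.TimeoutError on expiry
        await asyncio.wait_for(self._exited.wait(), timeout)


async def two_phase_stop(process, timeout: float = 2.0) -> None:
    deadline = time() + timeout
    wait_timeout = timeout * 0.8  # 80% of the budget for a graceful stop
    process.ask_to_stop(timeout=wait_timeout)
    try:
        await process.join(wait_timeout)  # phase 1: wait for a clean exit
        return
    except asyncio.TimeoutError:
        pass
    await process.kill()  # phase 2: hard kill
    await process.join(max(0, deadline - time()))  # still raises if it won't die


asyncio.run(two_phase_stop(StubProc(), timeout=0.5))

When `two_phase_stop` returns, the process has been joined; a process that survives even the kill surfaces as `asyncio.TimeoutError` at the caller, matching the documented contract of `WorkerProcess.kill`.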
""" deadline = time() + timeout @@ -740,32 +747,38 @@ async def kill(self, timeout: float = 2, executor_wait: bool = True) -> None: if self.status == Status.stopping: await self.stopped.wait() return - assert self.status in (Status.starting, Status.running) + assert self.status in ( + Status.starting, + Status.running, + Status.failed, # process failed to start, but hasn't been joined yet + ), self.status self.status = Status.stopping logger.info("Nanny asking worker to close") process = self.process + assert self.process + wait_timeout = timeout * 0.8 self.child_stop_q.put( { "op": "stop", - "timeout": max(0, deadline - time()) * 0.8, + "timeout": wait_timeout, "executor_wait": executor_wait, } ) await asyncio.sleep(0) # otherwise we get broken pipe errors self.child_stop_q.close() - while process.is_alive() and time() < deadline: - await asyncio.sleep(0.05) + try: + await process.join(wait_timeout) + return + except asyncio.TimeoutError: + pass - if process.is_alive(): - logger.warning( - f"Worker process still alive after {timeout} seconds, killing" - ) - try: - await process.terminate() - except Exception as e: - logger.error("Failed to kill worker process: %s", e) + logger.warning( + f"Worker process still alive after {wait_timeout} seconds, killing" + ) + await process.kill() + await process.join(max(0, deadline - time())) async def _wait_until_connected(self, uid): while True: @@ -783,9 +796,6 @@ async def _wait_until_connected(self, uid): continue if "exception" in msg: - logger.error( - "Failed while trying to start worker process: %s", msg["exception"] - ) raise msg["exception"] else: return msg diff --git a/distributed/process.py b/distributed/process.py index 7f43b5beca..ca7a9b705e 100644 --- a/distributed/process.py +++ b/distributed/process.py @@ -17,7 +17,7 @@ import dask -from distributed.utils import TimeoutError, get_mp_context +from distributed.utils import get_mp_context logger = logging.getLogger(__name__) @@ -299,15 +299,9 @@ async def join(self, timeout=None): assert self._state.pid is not None, "can only join a started process" if self._state.exitcode is not None: return - if timeout is None: - await self._exit_future - else: - try: - # Shield otherwise the timeout cancels the future and our - # on_exit callback will try to set a result on a canceled future - await asyncio.wait_for(asyncio.shield(self._exit_future), timeout) - except TimeoutError: - pass + # Shield otherwise the timeout cancels the future and our + # on_exit callback will try to set a result on a canceled future + await asyncio.wait_for(asyncio.shield(self._exit_future), timeout) def close(self): """ diff --git a/distributed/tests/test_asyncprocess.py b/distributed/tests/test_asyncprocess.py index 65aa0e303c..b0cd9848fb 100644 --- a/distributed/tests/test_asyncprocess.py +++ b/distributed/tests/test_asyncprocess.py @@ -80,7 +80,8 @@ async def test_simple(): assert proc.exitcode is None t1 = time() - await proc.join(timeout=0.02) + with pytest.raises(asyncio.TimeoutError): + await proc.join(timeout=0.02) dt = time() - t1 assert 0.2 >= dt >= 0.001 assert proc.is_alive() diff --git a/distributed/tests/test_nanny.py b/distributed/tests/test_nanny.py index 504ade763c..b9c6b2377f 100644 --- a/distributed/tests/test_nanny.py +++ b/distributed/tests/test_nanny.py @@ -13,6 +13,8 @@ import psutil import pytest +from distributed.diagnostics.plugin import WorkerPlugin + pytestmark = pytest.mark.gpu from tlz import first, valmap @@ -27,7 +29,7 @@ from distributed.diagnostics import SchedulerPlugin from 
diff --git a/distributed/tests/test_nanny.py b/distributed/tests/test_nanny.py
index 504ade763c..b9c6b2377f 100644
--- a/distributed/tests/test_nanny.py
+++ b/distributed/tests/test_nanny.py
@@ -13,6 +13,8 @@
 import psutil
 import pytest

+from distributed.diagnostics.plugin import WorkerPlugin
+
 pytestmark = pytest.mark.gpu

 from tlz import first, valmap
@@ -27,7 +29,7 @@
 from distributed.diagnostics import SchedulerPlugin
 from distributed.metrics import time
 from distributed.protocol.pickle import dumps
-from distributed.utils import TimeoutError, parse_ports
+from distributed.utils import TimeoutError, get_mp_context, parse_ports
 from distributed.utils_test import (
     captured_logger,
     gen_cluster,
@@ -474,21 +476,29 @@ async def test_nanny_closed_by_keyboard_interrupt(ucx_loop, protocol):
     assert "remove-worker" in str(s.events)


-class StartException(Exception):
-    pass
-
-
 class BrokenWorker(worker.Worker):
-    async def start(self):
-        raise StartException("broken")
+    async def start_unsafe(self):
+        raise ValueError("broken")


 @gen_cluster(nthreads=[])
 async def test_worker_start_exception(s):
-    # make sure this raises the right Exception:
-    with raises_with_cause(RuntimeError, None, StartException, None):
-        async with Nanny(s.address, worker_class=BrokenWorker) as n:
-            pass
+    nanny = Nanny(s.address, worker_class=BrokenWorker)
+    with captured_logger(logger="distributed.nanny", level=logging.WARNING) as logs:
+        with raises_with_cause(
+            RuntimeError,
+            "Nanny failed to start",
+            RuntimeError,
+            "BrokenWorker failed to start",
+        ):
+            async with nanny:
+                pass
+    assert nanny.status == Status.failed
+    # ^ NOTE: `Nanny.close` sets it to `closed`, then `Server.start._close_on_failure` sets it to `failed`
+    assert nanny.process is None
+    assert "Restarting worker" not in logs.getvalue()
+    # Avoid excessive spewing. (It's also printed once extra within the subprocess, which is okay.)
+    assert logs.getvalue().count("ValueError: broken") == 1, logs.getvalue()


 @gen_cluster(nthreads=[])
@@ -571,6 +581,44 @@ async def test_restart_memory(c, s, n):
         await asyncio.sleep(0.1)


+class BlockClose(WorkerPlugin):
+    def __init__(self, close_happened):
+        self.close_happened = close_happened
+
+    async def teardown(self, worker):
+        # Never let the worker cleanly shut down, so it has to be killed
+        self.close_happened.set()
+        while True:
+            await asyncio.sleep(10)
+
+
+@pytest.mark.slow
+@gen_cluster(nthreads=[])
+async def test_close_joins(s):
+    close_happened = get_mp_context().Event()
+
+    nanny = Nanny(s.address, plugins=[BlockClose(close_happened)])
+    async with nanny:
+        p = nanny.process
+        assert p
+        close_t = asyncio.create_task(nanny.close())
+
+        while not close_happened.wait(0):
+            await asyncio.sleep(0.01)
+
+        assert not close_t.done()
+        assert nanny.status == Status.closing
+        assert nanny.process and nanny.process.status == Status.stopping
+
+        await close_t
+
+        assert nanny.status == Status.closed
+        assert not nanny.process
+
+        assert p.status == Status.stopped
+        assert not p.process
+
+
 @gen_cluster(Worker=Nanny, nthreads=[("", 1)])
 async def test_scheduler_crash_doesnt_restart(s, a):
     # Simulate a scheduler crash by disconnecting it first
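Note: taken together, the patch changes what a caller of `Nanny` observes when its worker cannot start: instead of an unregister/restart loop, startup now fails fast with a chained `RuntimeError` and the nanny ends up in `Status.failed` with its subprocess joined. A hedged usage sketch of that behavior, assuming a `distributed` version that includes this patch:

import asyncio

from distributed import Nanny, Scheduler, Worker


class BrokenWorker(Worker):
    async def start_unsafe(self):
        raise ValueError("broken")


async def main() -> None:
    async with Scheduler(dashboard_address=":0") as s:
        try:
            async with Nanny(s.address, worker_class=BrokenWorker):
                pass
        except RuntimeError as e:
            # "Nanny failed to start" chained from "BrokenWorker failed to start",
            # which in turn chains the original ValueError("broken")
            print(e, "<-", e.__cause__)


asyncio.run(main())

This mirrors `test_worker_start_exception` above: the error surfaces exactly once to the caller, no "Restarting worker" log line is emitted, and `nanny.process` is cleaned up rather than left behind.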