Follow-up on removing report and safe from Worker.close #6423
@@ -568,3 +568,19 @@ async def test_restart_memory(c, s, n):
 
     while not s.workers:
         await asyncio.sleep(0.1)
+
+
+@gen_cluster(Worker=Nanny, nthreads=[("", 1)])
+async def test_scheduler_crash_doesnt_restart(s, a):
+    # Simulate a scheduler crash by disconnecting it first
+    # (`s.close()` would tell workers to cleanly shut down)
+    bcomm = next(iter(s.stream_comms.values()))
+    bcomm.abort()
+    await s.close()
+
+    while a.status != Status.closing_gracefully:
+        await asyncio.sleep(0.01)
+
+    await a.finished()
+    assert a.status == Status.closed
+    assert a.process is None

Comment on test_scheduler_crash_doesnt_restart: This fails on main.
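The polling loops above (`while not s.workers`, `while a.status != Status.closing_gracefully`) follow the usual wait-until pattern in these tests. Below is a minimal sketch of a reusable helper built only on asyncio; `poll_for` and its parameters are hypothetical names, not part of distributed:

import asyncio
from typing import Callable


async def poll_for(
    predicate: Callable[[], bool], timeout: float = 5, interval: float = 0.01
) -> None:
    """Wait until ``predicate()`` returns True, checking every ``interval`` seconds."""

    async def _poll() -> None:
        while not predicate():
            await asyncio.sleep(interval)

    # Bound the wait so a broken condition fails the test instead of hanging it.
    await asyncio.wait_for(_poll(), timeout)

With such a helper, the loop in the test could read `await poll_for(lambda: a.status == Status.closing_gracefully)`.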
@@ -221,7 +221,7 @@ async def _force_close(self):
         2. If it doesn't, log and kill the process
         """
         try:
-            await asyncio.wait_for(self.close(executor_wait=False), 30)
+            await asyncio.wait_for(self.close(nanny=False, executor_wait=False), 30)
         except (Exception, BaseException):  # <-- include BaseException here or not??
             # Worker is in a very broken state if closing fails. We need to shut down immediately,
             # to ensure things don't get even worse and this worker potentially deadlocks the cluster.
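The changed line keeps the same guard pattern: give the graceful close a bounded window, then escalate. Here is a minimal, self-contained sketch of that pattern that uses nothing from distributed; `GracefulThing`, `hard_kill`, and `force_close` are illustrative names only:

import asyncio


class GracefulThing:
    async def close(self) -> None:
        await asyncio.sleep(60)  # stand-in for a clean shutdown that hangs

    def hard_kill(self) -> None:
        print("graceful close timed out or failed; escalating to a hard stop")


async def force_close(thing: GracefulThing, timeout: float = 30) -> None:
    try:
        # Bound the graceful path, exactly like the asyncio.wait_for(...) above.
        await asyncio.wait_for(thing.close(), timeout)
    except BaseException:
        # Catch broadly (mirroring the diff's open question about BaseException)
        # so a broken close() can never block the escalation path.
        thing.hard_kill()


asyncio.run(force_close(GracefulThing(), timeout=0.1))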
@@ -785,7 +785,7 @@ def __init__(
             "get_data": self.get_data,
             "update_data": self.update_data,
             "free_keys": self.handle_free_keys,
-            "terminate": self.terminate,
+            "terminate": self.close,
             "ping": pingpong,
             "upload_file": self.upload_file,
             "call_stack": self.get_call_stack,
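With the handler change above, an RPC call to "terminate" dispatches straight to Worker.close, which now defaults to nanny=True. A hedged sketch of how such a call might be issued with `distributed.core.rpc`; the function name `terminate_worker` is illustrative, and the error suppression is an assumption since the worker may tear down its comms while replying:

from contextlib import suppress

from distributed.comm.core import CommClosedError
from distributed.core import rpc


async def terminate_worker(worker_address: str) -> None:
    # Open a one-off RPC connection to the worker and invoke its "terminate"
    # handler, which with this change calls Worker.close(nanny=True) by default.
    async with rpc(worker_address) as worker_rpc:
        with suppress(CommClosedError, OSError):
            await worker_rpc.terminate()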
@@ -807,7 +807,6 @@ def __init__(
 
         stream_handlers = {
             "close": self.close,
-            "terminate": self.terminate,
             "cancel-compute": self.handle_cancel_compute,
             "acquire-replicas": self.handle_acquire_replicas,
             "compute-task": self.handle_compute_task,
@@ -1440,24 +1439,32 @@ async def start_unsafe(self):
         self.start_periodic_callbacks()
         return self
 
-    async def terminate(self, **kwargs):
-        return await self.close(nanny=True, **kwargs)
-

Comment on lines -1443 to -1445: I opted to remove … If others would prefer to see …

     @log_errors
     async def close(
         self,
         timeout=30,
         executor_wait=True,
-        nanny=False,
+        nanny=True,

Comment: By changing the default back to True, resolves: …

     ):
         # FIXME: The worker should not be allowed to close the nanny. Ownership
         # is the other way round. If an external caller wants to close
         # nanny+worker, the nanny must be notified first. ==> Remove kwarg
         # nanny, see also Scheduler.retire_workers
-        if self.status in (Status.closed, Status.closing):
+        if self.status in (Status.closed, Status.closing, Status.failed):
             await self.finished()
             return
 
+        if self.status == Status.init:
+            # If the worker is still in startup/init and is started by a nanny,
+            # this means the nanny itself is not up yet. If the Nanny isn't up
+            # yet, its server will not accept any incoming RPC requests and
+            # will block until the startup is finished.
+            # Therefore, this worker trying to communicate with the Nanny during
+            # startup is not possible and we cannot close it.
+            # In this case, the Nanny will automatically close after inspecting
+            # the worker status.
+            nanny = False

Comment on lines +1453 to +1466: I added another commit to be more explicit about the statuses and added a more thorough explanation of why this is happening. cc @gjoseph92

 
         disable_gc_diagnosis()
 
         try:
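The status short-circuit above makes close() idempotent: a second caller simply waits for the first close to finish. Here is a minimal illustration of that pattern, deliberately not using distributed's classes; the `Status` enum and `Closable` below are stand-ins:

import asyncio
from enum import Enum


class Status(Enum):  # stand-in for distributed's Status enum
    running = "running"
    closing = "closing"
    closed = "closed"


class Closable:
    def __init__(self) -> None:
        self.status = Status.running
        self._finished = asyncio.Event()

    async def finished(self) -> None:
        await self._finished.wait()

    async def close(self) -> None:
        if self.status in (Status.closing, Status.closed):
            # Someone is already closing (or has closed) us: just wait for that.
            await self.finished()
            return
        self.status = Status.closing
        await asyncio.sleep(0.1)  # stand-in for the real teardown work
        self.status = Status.closed
        self._finished.set()


async def main() -> None:
    c = Closable()
    # Two concurrent close() calls; the second simply awaits the first.
    await asyncio.gather(c.close(), c.close())
    assert c.status is Status.closed


asyncio.run(main())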
Comment: Resolves https://github.com/dask/distributed/pull/6363/files#r878475185
This is equivalent to main before the PR: distributed/distributed/scheduler.py, lines 4747 to 4777 in 41a54ee.
Comment: I think this is required for some tests: https://github.com/dask/distributed/pull/6363/files#r879719934
I agree that an explicit test would be better, but I expect something to break. If CI is happy, I'm happy, though.
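As a closing illustration of the default change discussed above: a bare Worker.close() now behaves like close(nanny=True), and callers that only want to stop the worker itself (such as _force_close) must pass nanny=False explicitly. A hedged, test-style sketch using the same gen_cluster harness as the new test; the test name is hypothetical, and since this toy cluster has no Nanny attached, nanny=True is effectively a no-op here:

from distributed.core import Status
from distributed.utils_test import gen_cluster


@gen_cluster(nthreads=[("", 1)])
async def test_close_default_sketch(s, a):
    # Equivalent to a.close(nanny=True) after this change; without a Nanny
    # attached, only the worker itself is shut down.
    await a.close()
    assert a.status == Status.closed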