-
-
Notifications
You must be signed in to change notification settings - Fork 719
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Restart workers when worker-ttl expires #8538
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -547,7 +547,7 @@ def __init__( | |
self._memory_unmanaged_old = 0 | ||
self._memory_unmanaged_history = deque() | ||
self.metrics = {} | ||
self.last_seen = 0 | ||
self.last_seen = time() | ||
self.time_delay = 0 | ||
self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth")) | ||
self.actors = set() | ||
|
@@ -6336,7 +6336,10 @@ async def restart_workers( | |
# FIXME does not raise if the process fails to shut down, | ||
# see https://github.com/dask/distributed/pull/6427/files#r894917424 | ||
# NOTE: Nanny will automatically restart worker process when it's killed | ||
nanny.kill(reason=stimulus_id, timeout=timeout), | ||
# NOTE: Don't propagate timeout to kill(): we don't want to | ||
# spend (.8*.8)=64% of our end-to-end timeout waiting for a hung | ||
# process to restart. | ||
nanny.kill(reason=stimulus_id), | ||
timeout, | ||
) | ||
for nanny in nannies | ||
|
@@ -8404,19 +8407,29 @@ async def get_worker_monitor_info(self, recent=False, starts=None): | |
# Cleanup # | ||
########### | ||
|
||
async def check_worker_ttl(self): | ||
@log_errors | ||
async def check_worker_ttl(self) -> None: | ||
now = time() | ||
stimulus_id = f"check-worker-ttl-{now}" | ||
assert self.worker_ttl | ||
ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers))) | ||
to_restart = [] | ||
|
||
for ws in self.workers.values(): | ||
if (ws.last_seen < now - self.worker_ttl) and ( | ||
ws.last_seen < now - 10 * heartbeat_interval(len(self.workers)) | ||
): | ||
last_seen = now - ws.last_seen | ||
if last_seen > ttl: | ||
to_restart.append(ws.address) | ||
logger.warning( | ||
"Worker failed to heartbeat within %s seconds. Closing: %s", | ||
self.worker_ttl, | ||
ws, | ||
f"Worker failed to heartbeat for {last_seen:.0f}s; " | ||
f"{'attempting restart' if ws.nanny else 'removing'}: {ws}" | ||
) | ||
await self.remove_worker(address=ws.address, stimulus_id=stimulus_id) | ||
|
||
if to_restart: | ||
await self.restart_workers( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you briefly check that the tracking in https://github.com/coiled/platform/blob/4dbd6f449884464caaba09b470aa06394a22d024/analytics/preload_scripts/telemetry.py#L772 still works? I don't see a reason why not, but I want to be sure. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm afraid I can no longer do that. Is there anything else I can do to get this PR over the finish line? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should still work from what I can tell. Closing workers is a bit brittle, though, so it's not impossible that there is some kind of race condition under which that condition would no longer work. If that's the case, we can look into it later. |
||
to_restart, | ||
wait_for_workers=False, | ||
stimulus_id=stimulus_id, | ||
) | ||
|
||
def check_idle(self) -> float | None: | ||
if self.status in (Status.closing, Status.closed): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Align behaviour to ClientState