Add no worker timeout for scheduler #8371
```diff
@@ -3517,6 +3517,12 @@
     default_port = 8786
     _instances: ClassVar[weakref.WeakSet[Scheduler]] = weakref.WeakSet()

+    worker_ttl: float | None
+    idle_since: float | None
+    idle_timeout: float | None
+    _no_workers_since: float | None  # Note: not None iff there are pending tasks
+    no_workers_timeout: float | None
+
     def __init__(
         self,
         loop=None,
```
```diff
@@ -3578,16 +3584,19 @@
         self.service_kwargs = service_kwargs or {}
         self.services = {}
         self.scheduler_file = scheduler_file
-        worker_ttl = worker_ttl or dask.config.get("distributed.scheduler.worker-ttl")
-        self.worker_ttl = parse_timedelta(worker_ttl) if worker_ttl else None
-        idle_timeout = idle_timeout or dask.config.get(
-            "distributed.scheduler.idle-timeout"
-        )
-        if idle_timeout:
-            self.idle_timeout = parse_timedelta(idle_timeout)
-        else:
-            self.idle_timeout = None
+        self.worker_ttl = parse_timedelta(
+            worker_ttl or dask.config.get("distributed.scheduler.worker-ttl")
+        )
+        self.idle_timeout = parse_timedelta(
+            idle_timeout or dask.config.get("distributed.scheduler.idle-timeout")
+        )
         self.idle_since = time()
+        self.no_workers_timeout = parse_timedelta(
+            dask.config.get("distributed.scheduler.no-workers-timeout")
+        )
+        self._no_workers_since = None

         self.time_started = self.idle_since  # compatibility for dask-gateway
         self._replica_lock = RLock()
         self.bandwidth_workers = defaultdict(float)
```
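The refactor above folds the old `if idle_timeout: ... else: None` branches into a single call, which relies on `parse_timedelta` passing `None` through unchanged. A quick illustrative sketch of that behaviour (the example values are arbitrary):

```python
# Sketch of the dask.utils.parse_timedelta behaviour the refactor relies on:
# human-readable strings become seconds, and None stays None, so an unset
# idle-timeout / no-workers-timeout config key yields a disabled timeout.
from dask.utils import parse_timedelta

assert parse_timedelta("100ms") == 0.1   # string with unit -> seconds
assert parse_timedelta(30) == 30         # bare numbers are taken as seconds
assert parse_timedelta(None) is None     # None (unset config key) passes through
```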
```diff
@@ -3860,9 +3869,12 @@
         pc = PeriodicCallback(self.check_worker_ttl, self.worker_ttl * 1000)
         self.periodic_callbacks["worker-ttl"] = pc

-        pc = PeriodicCallback(self.check_idle, (self.idle_timeout or 1) * 1000 / 4)
+        pc = PeriodicCallback(self.check_idle, 250)
         self.periodic_callbacks["idle-timeout"] = pc

+        pc = PeriodicCallback(self._check_no_workers, 250)
+        self.periodic_callbacks["no-workers-timeout"] = pc
+
         if extensions is None:
             extensions = DEFAULT_EXTENSIONS.copy()
         if not dask.config.get("distributed.scheduler.work-stealing"):
```

Review comment on the new `pc = PeriodicCallback(self.check_idle, 250)` line: These methods are tiny and running them 4 times per second is inconsequential.
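For context on the numbers above: tornado-style `PeriodicCallback` intervals are given in milliseconds, so the old `(self.idle_timeout or 1) * 1000 / 4` and the new flat `250` both amount to roughly four checks per second. A small sketch, with a hypothetical stub standing in for the scheduler's check methods:

```python
# Illustrative only: PeriodicCallback takes (callback, interval_in_milliseconds).
from tornado.ioloop import PeriodicCallback

def check_stub() -> None:
    """Hypothetical stand-in for Scheduler.check_idle / _check_no_workers."""

pc = PeriodicCallback(check_stub, 250)  # 250 ms -> about 4 invocations per second
# pc.start() / pc.stop() control polling once an event loop is running.
```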
```diff
@@ -8141,7 +8153,7 @@

     def check_idle(self) -> float | None:
         if self.status in (Status.closing, Status.closed):
-            return None
+            return None  # pragma: nocover

         if self.transition_counter != self._idle_transition_counter:
             self._idle_transition_counter = self.transition_counter
```
```diff
@@ -8178,6 +8190,33 @@
             self._ongoing_background_tasks.call_soon(self.close)
         return self.idle_since

+    def _check_no_workers(self) -> None:
+        if self.status in (Status.closing, Status.closed):
+            return  # pragma: nocover
+
+        if (not self.queued and not self.unrunnable) or (self.queued and self.workers):
+            self._no_workers_since = None
+            return
+
+        # 1. There are queued or unrunnable tasks and no workers at all
+        # 2. There are unrunnable tasks and no workers satisfy their restrictions
+        #    (Only rootish tasks can be queued, and rootish tasks can't have restrictions)
+
+        if not self._no_workers_since:
+            self._no_workers_since = time()
+            return
+
+        if (
+            self.no_workers_timeout
+            and time() > self._no_workers_since + self.no_workers_timeout
+        ):
+            logger.info(
+                "Tasks have been without any workers to run them for %s; "
+                "shutting scheduler down",
+                format_time(self.no_workers_timeout),
+            )
+            self._ongoing_background_tasks.call_soon(self.close)
+
     def adaptive_target(self, target_duration=None):
         """Desired number of workers based on the current workload

```

Review thread on the `if (not self.queued and not self.unrunnable) or (self.queued and self.workers)` check:

Should we also check whether we have tasks in processing (regardless of […]

I don't think so. "there's a task to run" translates to […]

Doesn't that translate to: […]

Also, let me rephrase: […]

Do you see a use case that the current logic doesn't cover? I can't think of any...

```python
@gen_cluster(
    client=True,
    nthreads=[("", 1)],
    config={"distributed.scheduler.no-workers-timeout": "100ms"},
)
async def test_no_workers_timeout_with_worker(c, s, a):
    """Do not trip no-workers-timeout when there are tasks processing"""
    import time

    s._check_no_workers()
    await asyncio.sleep(0.2)
    assert s.status == Status.running

    f1 = c.submit(time.sleep, 2)
    f2 = c.submit(inc, 1, key="x", workers=["127.0.0.2:1234"])
    await f1
    assert s.status == Status.running
```

would kill the scheduler before we're able to complete […]

👀 I had not seen that use case. Thank you. Fixed.
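To make the early-exit condition discussed in the thread above concrete, here is a minimal standalone sketch; it is not the scheduler's code, and plain sets stand in for `Scheduler.queued`, `Scheduler.unrunnable`, and `Scheduler.workers`:

```python
# Minimal sketch of when the no-workers timer is (dis)armed, mirroring the
# condition in _check_no_workers above.
def timer_should_arm(queued: set, unrunnable: set, workers: set) -> bool:
    if (not queued and not unrunnable) or (queued and workers):
        return False  # nothing is waiting, or queued (rootish) tasks have workers
    return True  # pending tasks exist and no worker can currently take them

assert not timer_should_arm(set(), set(), set())    # idle cluster: disarmed
assert timer_should_arm({"x"}, set(), set())        # queued tasks, no workers at all
assert not timer_should_arm({"x"}, set(), {"w1"})   # queued tasks, a worker is present
assert timer_should_arm(set(), {"y"}, {"w1"})       # unrunnable despite workers
                                                    # (worker restrictions unmet)
```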
Review thread on the default value for `no-workers-timeout`:

Given that `idle-timeout` is currently `null`, I'd also default to `null` for `no-workers-timeout`. If I don't want to shut down my cluster when there's nothing at all to be done, I probably also don't want to shut it down if I have something to be done but lack the means to do so.

Well, there's the use case of adaptive clusters that scale down to zero or almost zero. There you likely want to keep the scheduler always running, but if the cluster hangs, e.g. while 100 CPU workers are up because a single GPU worker failed to start, you want to tear it down quickly. However, I agree that None is generally a more desirable default, particularly for non-adaptive situations.

Fair point
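For deployments that do want the shutdown behaviour (such as the adaptive, scale-to-zero case mentioned above), the timeout can be enabled through dask's ordinary config mechanism. A hedged usage sketch, not part of the PR; the "10 minutes" value is only an example:

```python
# Illustrative usage: enable the new timeout via dask config before the
# scheduler starts (Scheduler.__init__ reads the key, as shown in the diff
# above). Any parse_timedelta-compatible value works.
import dask

dask.config.set({"distributed.scheduler.no-workers-timeout": "10 minutes"})
```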