-
-
Notifications
You must be signed in to change notification settings - Fork 719
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Validate and debug state machine on handle_compute_task #6327
Changes from all commits
87ff61b
3ef5ba7
9408cab
49d9a5b
8adc739
070d2ef
329d1b0
79402f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1855,6 +1855,10 @@ def handle_acquire_replicas( | |
who_has: dict[str, Collection[str]], | ||
stimulus_id: str, | ||
) -> None: | ||
if self.validate: | ||
assert set(keys) == who_has.keys() | ||
assert all(who_has.values()) | ||
|
||
recommendations: Recs = {} | ||
for key in keys: | ||
ts = self.ensure_task_exists( | ||
|
@@ -1872,6 +1876,10 @@ def handle_acquire_replicas( | |
self.update_who_has(who_has) | ||
self.transitions(recommendations, stimulus_id=stimulus_id) | ||
|
||
if self.validate: | ||
for key in keys: | ||
assert self.tasks[key].state != "released", self.story(key) | ||
|
||
def ensure_task_exists( | ||
self, key: str, *, priority: tuple[int, ...], stimulus_id: str | ||
) -> TaskState: | ||
|
@@ -1892,19 +1900,18 @@ def handle_compute_task( | |
*, | ||
key: str, | ||
who_has: dict[str, Collection[str]], | ||
nbytes: dict[str, int], | ||
priority: tuple[int, ...], | ||
duration: float, | ||
function=None, | ||
args=None, | ||
kwargs=None, | ||
task=no_value, # distributed.scheduler.TaskState.run_spec | ||
nbytes: dict[str, int] | None = None, | ||
resource_restrictions: dict[str, float] | None = None, | ||
actor: bool = False, | ||
annotations: dict | None = None, | ||
stimulus_id: str, | ||
) -> None: | ||
self.log.append((key, "compute-task", stimulus_id, time())) | ||
try: | ||
ts = self.tasks[key] | ||
logger.debug( | ||
|
@@ -1913,47 +1920,14 @@ def handle_compute_task( | |
) | ||
except KeyError: | ||
self.tasks[key] = ts = TaskState(key) | ||
|
||
ts.run_spec = SerializedTask(function, args, kwargs, task) | ||
|
||
assert isinstance(priority, tuple) | ||
priority = priority + (self.generation,) | ||
self.generation -= 1 | ||
|
||
if actor: | ||
self.actors[ts.key] = None | ||
|
||
ts.exception = None | ||
ts.traceback = None | ||
ts.exception_text = "" | ||
ts.traceback_text = "" | ||
ts.priority = priority | ||
ts.duration = duration | ||
if resource_restrictions: | ||
ts.resource_restrictions = resource_restrictions | ||
ts.annotations = annotations | ||
self.log.append((key, "compute-task", ts.state, stimulus_id, time())) | ||
|
||
recommendations: Recs = {} | ||
instructions: Instructions = [] | ||
for dependency in who_has: | ||
dep_ts = self.ensure_task_exists( | ||
key=dependency, | ||
priority=priority, | ||
stimulus_id=stimulus_id, | ||
) | ||
|
||
# link up to child / parents | ||
ts.dependencies.add(dep_ts) | ||
dep_ts.dependents.add(ts) | ||
|
||
if nbytes is not None: | ||
for key, value in nbytes.items(): | ||
self.tasks[key].nbytes = value | ||
|
||
if ts.state in READY | {"executing", "waiting", "resumed"}: | ||
if ts.state in READY | {"executing", "long-running", "waiting", "resumed"}: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I omitted a unit test for this - something to write after the state machine refactor for sure |
||
pass | ||
elif ts.state == "memory": | ||
recommendations[ts] = "memory" | ||
instructions.append( | ||
self._get_task_finished_msg(ts, stimulus_id=stimulus_id) | ||
) | ||
|
@@ -1966,12 +1940,56 @@ def handle_compute_task( | |
"error", | ||
}: | ||
recommendations[ts] = "waiting" | ||
else: # pragma: no cover | ||
|
||
ts.run_spec = SerializedTask(function, args, kwargs, task) | ||
|
||
assert isinstance(priority, tuple) | ||
priority = priority + (self.generation,) | ||
self.generation -= 1 | ||
|
||
if actor: | ||
self.actors[ts.key] = None | ||
|
||
ts.exception = None | ||
ts.traceback = None | ||
ts.exception_text = "" | ||
ts.traceback_text = "" | ||
ts.priority = priority | ||
ts.duration = duration | ||
if resource_restrictions: | ||
ts.resource_restrictions = resource_restrictions | ||
ts.annotations = annotations | ||
|
||
if self.validate: | ||
assert who_has.keys() == nbytes.keys() | ||
assert all(who_has.values()) | ||
|
||
for dep_key, dep_workers in who_has.items(): | ||
dep_ts = self.ensure_task_exists( | ||
key=dep_key, | ||
priority=priority, | ||
stimulus_id=stimulus_id, | ||
) | ||
# link up to child / parents | ||
ts.dependencies.add(dep_ts) | ||
dep_ts.dependents.add(ts) | ||
|
||
for dep_key, value in nbytes.items(): | ||
self.tasks[dep_key].nbytes = value | ||
|
||
self.update_who_has(who_has) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This move prevents deps to be created in resumed state by ensure_task_exists and then remain there because there's nothing actually needing them. |
||
else: # pragma: nocover | ||
raise RuntimeError(f"Unexpected task state encountered {ts} {stimulus_id}") | ||
|
||
self._handle_instructions(instructions) | ||
self.update_who_has(who_has) | ||
self.transitions(recommendations, stimulus_id=stimulus_id) | ||
self._handle_instructions(instructions) | ||
|
||
if self.validate: | ||
# All previously unknown tasks that were created above by | ||
# ensure_tasks_exists() have been transitioned to fetch or flight | ||
assert all( | ||
ts2.state != "released" for ts2 in (ts, *ts.dependencies) | ||
), self.story(ts, *ts.dependencies) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the moment of writing, this assertion fails in test_stress_scatter_death 0.4% of the times on a fast desktop. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI this assert is not 100% correct. There is a case for valid tasks left in released in the case of cancelled/resumed tasks. I'll open a follow up PR with a case reproducing this condition There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. flowchart TD
A1[A1 - forgotten / not known] --> B1[B1 - flight]
A2[A2 - forgotten / not known] --> B1[B1 - flight]
B1 --> C1[C1 - waiting]
free-keys / cancel B1 flowchart TD
A1[A1 - forgotten / not known] --> B1[B1 - cancelled]
A2[A2 - forgotten / not known] --> B1[B1 - cancelled]
B1 --> C1[C1 - forgotten]
compute-task B1 flowchart TD
A1[A1 - released] --> B1[B1 - resumed]
A2[A2 - released] --> B1[B1 - resumed]
B1 --> C1[C1 - forgotten]
gather-dep finishes w/ Error flowchart TD
A1[A1 - fetch] --> B1[B1 - waiting]
A2[A2 - fetch] --> B1[B1 - waiting]
B1 --> C1[C1 - forgotten]
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think I understand from the above diagram how this can end up with a released state by the end of compute-task? |
||
|
||
######################## | ||
# Worker State Machine # | ||
|
@@ -3436,7 +3454,6 @@ async def find_missing(self) -> None: | |
self.scheduler.who_has, | ||
keys=[ts.key for ts in self._missing_dep_flight], | ||
) | ||
who_has = {k: v for k, v in who_has.items() if v} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Redundant - update_who_has already throws away empty lists of workers |
||
self.update_who_has(who_has) | ||
recommendations: Recs = {} | ||
for ts in self._missing_dep_flight: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test is functionally identical to before - all changes are just cosmetic.