-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Endpoint support for automatic task retries (#1342)
This PR adds automatic retries to the GlobusComputeEngine via a new kwarg option `max_retries_on_system_failure`. If set, the engine will automatically resubmit any tasks that failed because of an infrastructure failure. This is designed to handle *only* infrastructure-level failures such as `ManagerLost` (often from walltime-truncated batch jobs) and does not handle task failures (e.g., `KeyError` during function execution). If the task fails to complete even after the maximum allowed retries, the full exception history will be reported. The core functionality is implemented via `GlobusComputeEngineBase._retry_table`, which tracks exception history, and the engine-specific error handler `GlobusComputeEngine._handle_task_exception`. By default, `GlobusComputeEngine.max_retries_on_system_failure` is set to 0, since retrying compute-intensive tasks could unintentionally waste the user's resource allocation. Here's a YAML config example that uses this engine setting:
```
engine:
  type: GlobusComputeEngine
  max_retries_on_system_failure: 2
```
- Loading branch information
Showing
6 changed files
with
268 additions
and
23 deletions.
There are no files selected for viewing
14 changes: 14 additions & 0 deletions
14
changelog.d/20231031_230517_yadudoc1729_task_retries_1.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
New Functionality | ||
^^^^^^^^^^^^^^^^^ | ||
|
||
- ``GlobusComputeEngine`` can now be configured to automatically retry tasks that fail | ||
because of node failures (e.g., nodes lost when a batch job reaches its walltime). This option | ||
is set to 0 by default to avoid unintentionally wasting resources by retrying tasks. | ||
The traceback history from all prior attempts is supplied if the last retry attempt fails. | ||
Here's a snippet from config.yaml: | ||
|
||
.. code-block:: yaml | ||
engine: | ||
type: GlobusComputeEngine | ||
max_retries_on_system_failure: 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
139 changes: 139 additions & 0 deletions
139
compute_endpoint/tests/integration/endpoint/executors/test_gcengine_retries.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
import uuid | ||
from queue import Queue | ||
|
||
import pytest | ||
from globus_compute_common import messagepack | ||
from globus_compute_endpoint.engines import GlobusComputeEngine | ||
from globus_compute_endpoint.strategies import SimpleStrategy | ||
from globus_compute_sdk.serialize import ComputeSerializer | ||
from parsl.executors.high_throughput.interchange import ManagerLost | ||
from parsl.providers import LocalProvider | ||
from tests.utils import ez_pack_function, kill_manager, succeed_after_n_runs | ||
|
||
|
||
@pytest.fixture
def gc_engine_with_retries(tmp_path):
    """Yield a started ``GlobusComputeEngine``; shut it down on teardown.

    The engine is built with a single worker, a ``LocalProvider`` that starts
    with no blocks and may scale to one, and a ``SimpleStrategy``.  It is
    created with ``max_retries_on_system_failure=0``; tests in this module
    set the attribute on the yielded engine to the value they need.

    Results are delivered on a fresh ``Queue`` handed to ``engine.start`` as
    ``results_passthrough``.
    """
    endpoint_id = uuid.uuid4()
    local_provider = LocalProvider(
        init_blocks=0,
        min_blocks=0,
        max_blocks=1,
    )
    scaling_strategy = SimpleStrategy(interval=0.1, max_idletime=0)

    gce = GlobusComputeEngine(
        address="127.0.0.1",
        max_workers=1,
        heartbeat_period=1,
        heartbeat_threshold=1,
        max_retries_on_system_failure=0,
        provider=local_provider,
        strategy=scaling_strategy,
    )
    gce._status_report_thread.reporting_period = 1

    results_queue = Queue()
    gce.start(
        endpoint_id=endpoint_id,
        run_dir=tmp_path,
        results_passthrough=results_queue,
    )
    yield gce
    gce.shutdown()
|
||
|
||
def test_gce_kill_manager(gc_engine_with_retries):
    """With retries set to 0, a ``kill_manager`` task ends in ``ManagerLost``.

    Checks both the future (which must raise ``ManagerLost``) and the results
    queue (which must carry a ``Result`` whose data mentions "ManagerLost").
    """
    engine = gc_engine_with_retries
    engine.max_retries_on_system_failure = 0
    results_q = engine.results_passthrough
    task_id = uuid.uuid1()

    # Confirm error message for ManagerLost
    packed_fn = ez_pack_function(ComputeSerializer(), kill_manager, (), {})
    task = messagepack.message_types.Task(task_id=task_id, task_buffer=packed_fn)
    task_message = messagepack.pack(task)

    future = engine.submit(task_id, task_message)
    with pytest.raises(ManagerLost):
        future.result()

    saw_manager_lost = False
    # Look at up to four queue messages; anything that is not a Result
    # message is skipped.
    for _ in range(4):
        envelope = results_q.get(timeout=2)
        assert isinstance(envelope, dict)

        result = messagepack.unpack(envelope["message"])
        if not isinstance(result, messagepack.message_types.Result):
            continue

        assert result.task_id == task_id
        if result.error_details and "ManagerLost" in result.data:
            saw_manager_lost = True
            break

    assert saw_manager_lost, "Result message missing"
|
||
|
||
def test_success_after_1_fail(gc_engine_with_retries, tmp_path):
    """With two retries allowed and ``fail_count=1``, expect a clean result.

    NOTE(review): ``succeed_after_n_runs`` is defined in ``tests.utils``;
    presumably it fails for the first ``fail_count`` runs -- confirm there.
    """
    engine = gc_engine_with_retries
    engine.max_retries_on_system_failure = 2
    fail_count = 1
    results_q = engine.results_passthrough
    task_id = uuid.uuid1()

    packed_fn = ez_pack_function(
        ComputeSerializer(),
        succeed_after_n_runs,
        (tmp_path,),
        {"fail_count": fail_count},
    )
    task = messagepack.message_types.Task(task_id=task_id, task_buffer=packed_fn)
    engine.submit(task_id, messagepack.pack(task))

    got_result = False
    # Look at up to ten queue messages; the first Result message decides
    # the test.
    for _ in range(10):
        envelope = results_q.get(timeout=5)
        assert isinstance(envelope, dict)

        result = messagepack.unpack(envelope["message"])
        if not isinstance(result, messagepack.message_types.Result):
            continue

        assert result.task_id == task_id
        assert result.error_details is None
        got_result = True
        break

    assert got_result, "Expected result packet, but none received"
|
||
|
||
def test_repeated_fail(gc_engine_with_retries, tmp_path):
    """With two retries allowed and ``fail_count=3``, expect a failed result.

    The failed ``Result`` must mention "ManagerLost", contain one
    "Traceback from attempt" entry per failure (``fail_count`` in total), and
    include the "final attempt" marker.

    NOTE(review): ``succeed_after_n_runs`` is defined in ``tests.utils``;
    presumably it fails for the first ``fail_count`` runs -- confirm there.
    """
    engine = gc_engine_with_retries
    engine.max_retries_on_system_failure = 2
    fail_count = 3
    results_q = engine.results_passthrough
    task_id = uuid.uuid1()

    packed_fn = ez_pack_function(
        ComputeSerializer(),
        succeed_after_n_runs,
        (tmp_path,),
        {"fail_count": fail_count},
    )
    task = messagepack.message_types.Task(task_id=task_id, task_buffer=packed_fn)
    engine.submit(task_id, messagepack.pack(task))

    got_failure_report = False
    # Look at up to ten queue messages; the first Result message decides
    # the test.
    for _ in range(10):
        envelope = results_q.get(timeout=5)
        assert isinstance(envelope, dict)

        result = messagepack.unpack(envelope["message"])
        if not isinstance(result, messagepack.message_types.Result):
            continue

        assert result.task_id == task_id
        assert result.error_details
        assert "ManagerLost" in result.data
        attempt_reports = result.data.count("Traceback from attempt")
        assert attempt_reports == fail_count, "Got incorrect # of failure reports"
        assert "final attempt" in result.data
        got_failure_report = True
        break

    assert (
        got_failure_report
    ), "Expected ManagerLost in failed result.data, but none received"
|
||
|
||
def test_default_retries_is_0():
    """An engine built without the retry kwarg must default to zero retries."""
    default_engine = GlobusComputeEngine(address="127.0.0.1")
    configured_retries = default_engine.max_retries_on_system_failure
    assert configured_retries == 0, "Users must knowingly opt-in"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters