Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: increase timeouts for running experiments on k8s after env split #9530

Merged
merged 2 commits on Jun 17, 2024 (source and target branch names were not captured in this export)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions e2e_tests/tests/cluster/managed_cluster_k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,18 @@ def __init__(self) -> None:

# Verify we have pulled our image.
# TODO this won't work if we have multiple nodes.
utils.wait_for_command_state(sess, utils.run_command(sess, 0, slots=0), "TERMINATED", 300)
utils.wait_for_command_state(sess, utils.run_command(sess, 0, slots=1), "TERMINATED", 300)
utils.wait_for_command_state(
sess,
utils.run_command(sess, 0, slots=0),
"TERMINATED",
utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)
utils.wait_for_command_state(
sess,
utils.run_command(sess, 0, slots=1),
"TERMINATED",
utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)

def kill_master(self) -> None:
self._scale_master(up=False)
Expand Down
12 changes: 9 additions & 3 deletions e2e_tests/tests/cluster/test_agent_disable.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ def test_disable_agent_experiment_resume() -> None:
["--config", "max_restarts=0"],
)
exp.wait_for_experiment_state(
sess, exp_id, bindings.experimentv1State.RUNNING, max_wait_secs=300
sess,
exp_id,
bindings.experimentv1State.RUNNING,
max_wait_secs=utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)

with _disable_agent(admin, agent_id):
Expand Down Expand Up @@ -123,7 +126,7 @@ def test_disable_agent_zero_slots() -> None:

command_id = utils.run_zero_slot_command(sess, sleep=180)
# Wait for it to run.
utils.wait_for_command_state(sess, command_id, "RUNNING", 300)
utils.wait_for_command_state(sess, command_id, "RUNNING", utils.KUBERNETES_EXPERIMENT_TIMEOUT)

try:
with _disable_agent(admin, agent_id):
Expand Down Expand Up @@ -155,7 +158,10 @@ def test_drain_agent() -> None:
["--config", "hyperparameters.training_batch_seconds=0.15"], # Take 15 seconds.
)
exp.wait_for_experiment_state(
sess, experiment_id, bindings.experimentv1State.RUNNING, max_wait_secs=300
sess,
experiment_id,
bindings.experimentv1State.RUNNING,
max_wait_secs=utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)
exp.wait_for_experiment_active_workload(sess, experiment_id)
exp.wait_for_experiment_workload_progress(sess, experiment_id)
Expand Down
2 changes: 2 additions & 0 deletions e2e_tests/tests/cluster/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from tests import config as conf
from tests import detproc

KUBERNETES_EXPERIMENT_TIMEOUT = 600


class _HTTPServerWithRequest(http.server.HTTPServer):
def __init__(
Expand Down
Loading