Skip to content

Commit

Permalink
test: increase timeouts for running experiments on k8s after env split
Browse files Browse the repository at this point in the history
  • Loading branch information
NicholasBlaskey authored Jun 17, 2024
1 parent 0f6eb24 commit edbeee9
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
14 changes: 12 additions & 2 deletions e2e_tests/tests/cluster/managed_cluster_k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,18 @@ def __init__(self) -> None:

# Verify we have pulled our image.
# TODO: this won't work if we have multiple nodes.
utils.wait_for_command_state(sess, utils.run_command(sess, 0, slots=0), "TERMINATED", 300)
utils.wait_for_command_state(sess, utils.run_command(sess, 0, slots=1), "TERMINATED", 300)
utils.wait_for_command_state(
sess,
utils.run_command(sess, 0, slots=0),
"TERMINATED",
utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)
utils.wait_for_command_state(
sess,
utils.run_command(sess, 0, slots=1),
"TERMINATED",
utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)

def kill_master(self) -> None:
self._scale_master(up=False)
Expand Down
12 changes: 9 additions & 3 deletions e2e_tests/tests/cluster/test_agent_disable.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ def test_disable_agent_experiment_resume() -> None:
["--config", "max_restarts=0"],
)
exp.wait_for_experiment_state(
sess, exp_id, bindings.experimentv1State.RUNNING, max_wait_secs=300
sess,
exp_id,
bindings.experimentv1State.RUNNING,
max_wait_secs=utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)

with _disable_agent(admin, agent_id):
Expand Down Expand Up @@ -123,7 +126,7 @@ def test_disable_agent_zero_slots() -> None:

command_id = utils.run_zero_slot_command(sess, sleep=180)
# Wait for it to run.
utils.wait_for_command_state(sess, command_id, "RUNNING", 300)
utils.wait_for_command_state(sess, command_id, "RUNNING", utils.KUBERNETES_EXPERIMENT_TIMEOUT)

try:
with _disable_agent(admin, agent_id):
Expand Down Expand Up @@ -155,7 +158,10 @@ def test_drain_agent() -> None:
["--config", "hyperparameters.training_batch_seconds=0.15"], # Take 15 seconds.
)
exp.wait_for_experiment_state(
sess, experiment_id, bindings.experimentv1State.RUNNING, max_wait_secs=300
sess,
experiment_id,
bindings.experimentv1State.RUNNING,
max_wait_secs=utils.KUBERNETES_EXPERIMENT_TIMEOUT,
)
exp.wait_for_experiment_active_workload(sess, experiment_id)
exp.wait_for_experiment_workload_progress(sess, experiment_id)
Expand Down
2 changes: 2 additions & 0 deletions e2e_tests/tests/cluster/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from tests import config as conf
from tests import detproc

KUBERNETES_EXPERIMENT_TIMEOUT = 600


class _HTTPServerWithRequest(http.server.HTTPServer):
def __init__(
Expand Down

0 comments on commit edbeee9

Please sign in to comment.