Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: extend experiment timeout for slurm test #9601

Merged
merged 10 commits on Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2966,6 +2966,9 @@ jobs:
agent-use:
type: string
default: ""
collect-det-job-logs:
type: boolean
default: true
# Following https://circleci.com/docs/2.0/runner-installation-linux/index.html#start-the-service
machine: true
resource_class: <<parameters.runner_class>>
Expand Down Expand Up @@ -3180,6 +3183,7 @@ jobs:
master-host: localhost
managed-devcluster: false
extra-pytest-flags: <<parameters.extra-pytest-flags>>
collect-det-job-logs: <<parameters.collect-det-job-logs>>

- store_test_results:
path: /tmp/test-results/
Expand Down Expand Up @@ -5275,6 +5279,18 @@ workflows:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}

- test-e2e-slurm:
name: test-e2e-slurm-restart
context:
- dev-ci-cluster-default-user-credentials
mark: "e2e_slurm_restart"
filters: *upstream-feature-branch
requires:
- package-and-push-system-local-ee
- request-hpc-tests
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e-slurm:
name: test-e2e-slurm-gpu
context:
Expand Down Expand Up @@ -5527,6 +5543,7 @@ workflows:
requires:
- package-and-push-system-local-ee
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false
- test-e2e-slurm:
name: test-e2e-slurm-preemption
context:
Expand Down
12 changes: 9 additions & 3 deletions e2e_tests/tests/cluster/test_master_restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ def test_master_restart_generic_task_pause(

@pytest.mark.managed_devcluster
def _test_master_restart_reattach_recover_experiment(
restartable_managed_cluster: abstract_cluster.Cluster, downtime: int, exp_timeout: int = 60
restartable_managed_cluster: abstract_cluster.Cluster,
downtime: int,
exp_timeout: int = 60,
max_workload_ticks: int = conf.MAX_TRIAL_BUILD_SECS,
) -> None:
sess = api_utils.user_session()
try:
Expand All @@ -151,15 +154,18 @@ def _test_master_restart_reattach_recover_experiment(
)

# TODO(ilia): don't wait for progress.
exp.wait_for_experiment_workload_progress(sess, exp_id)
exp.wait_for_experiment_workload_progress(sess, exp_id, max_workload_ticks)

if downtime >= 0:
restartable_managed_cluster.kill_master()
time.sleep(downtime)
restartable_managed_cluster.restart_master()

exp.wait_for_experiment_state(
sess, exp_id, bindings.experimentv1State.COMPLETED, max_wait_secs=downtime + exp_timeout
sess,
exp_id,
bindings.experimentv1State.COMPLETED,
max_wait_secs=downtime + exp_timeout,
)
trials = exp.experiment_trials(sess, exp_id)

Expand Down
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_master_restart_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_master_restart_reattach_recover_experiment_slurm(
managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int
) -> None:
test_master_restart._test_master_restart_reattach_recover_experiment(
managed_slurm_cluster_restarts, downtime
managed_slurm_cluster_restarts, downtime, max_workload_ticks=500
)


Expand Down
2 changes: 1 addition & 1 deletion e2e_tests/tests/experiment/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def wait_for_at_least_n_trials(
def wait_for_experiment_workload_progress(
sess: api.Session, experiment_id: int, max_ticks: int = conf.MAX_TRIAL_BUILD_SECS
) -> None:
for _ in range(conf.MAX_TRIAL_BUILD_SECS):
for _ in range(max_ticks):
trials = experiment_trials(sess, experiment_id)
if len(trials) > 0:
only_trial = trials[0]
Expand Down
Loading