From 4695c253a5a6c3f1781a068e8b6895bd6a495fb4 Mon Sep 17 00:00:00 2001 From: Rohan Vaidya <52589173+rohanvaidya45@users.noreply.github.com> Date: Tue, 5 Dec 2023 20:31:24 -0800 Subject: [PATCH] Remove Exponential Backoff for Retry Until Up (#2821) * remove exponential backoff * add back random jitter * format * use backoff * format * add comment * rerun checks * format * pylint * form * change to 30 --- sky/backends/cloud_vm_ray_backend.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6007bf377266..dfcbe32df904 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -87,7 +87,7 @@ # Time gap between retries after failing to provision in all possible places. # Used only if --retry-until-up is set. -_RETRY_UNTIL_UP_INIT_GAP_SECONDS = 60 +_RETRY_UNTIL_UP_INIT_GAP_SECONDS = 30 # The maximum retry count for fetching IP address. _FETCH_IP_MAX_ATTEMPTS = 3 @@ -2873,7 +2873,13 @@ def _provision( # TODO(suquark): once we have sky on PyPI, we should directly # install sky from PyPI. local_wheel_path, wheel_hash = wheel_utils.build_sky_wheel() - backoff = common_utils.Backoff(_RETRY_UNTIL_UP_INIT_GAP_SECONDS) + # The most frequent reason for the failure of a provision + # request is resource unavailability instead of rate + # limiting; to make users wait shorter, we do not make + # backoffs exponential. + backoff = common_utils.Backoff( + initial_backoff=_RETRY_UNTIL_UP_INIT_GAP_SECONDS, + max_backoff_factor=1) attempt_cnt = 1 while True: # For on-demand instances, RetryingVmProvisioner will retry @@ -2927,7 +2933,7 @@ def _provision( f'{colorama.Style.BRIGHT}=== Retry until up ===' f'{colorama.Style.RESET_ALL}\n' f'Retrying provisioning after {gap_seconds:.0f}s ' - '(exponential backoff with random jittering). ' + '(backoff with random jittering). ' f'Already tried {attempt_cnt} attempt{plural}.') attempt_cnt += 1 time.sleep(gap_seconds)