[core] Avoid terminating cluster for resources unavailability #2170

Merged
18 changes: 11 additions & 7 deletions sky/backends/backend_utils.py
@@ -2047,7 +2047,7 @@ def _update_cluster_status(
def _refresh_cluster_record(
cluster_name: str,
*,
force_refresh: bool = False,
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
acquire_per_cluster_status_lock: bool = True
) -> Optional[Dict[str, Any]]:
"""Refresh the cluster, and return the possibly updated record.
@@ -2058,8 +2058,10 @@ def _refresh_cluster_record(

Args:
cluster_name: The name of the cluster.
force_refresh: if True, refresh the cluster status even if it may be
skipped. Otherwise (the default), only refresh if the cluster:
force_refresh_statuses: if specified, refresh the cluster if it has one of
the specified statuses. Additionally, clusters satisfying the
following conditions will always be refreshed whether or not this
argument is specified:
1. is a spot cluster, or
2. is a non-spot cluster, is not STOPPED, and autostop is set.
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
@@ -2089,7 +2091,9 @@ def _refresh_cluster_record(
use_spot = handle.launched_resources.use_spot
has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
record['autostop'] >= 0)
if force_refresh or has_autostop or use_spot:
force_refresh_for_cluster = (force_refresh_statuses is not None and
record['status'] in force_refresh_statuses)
if force_refresh_for_cluster or has_autostop or use_spot:
record = _update_cluster_status(
cluster_name,
acquire_per_cluster_status_lock=acquire_per_cluster_status_lock)
@@ -2100,7 +2104,7 @@ def _refresh_cluster_record(
def refresh_cluster_status_handle(
cluster_name: str,
*,
force_refresh: bool = False,
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
acquire_per_cluster_status_lock: bool = True,
) -> Tuple[Optional[status_lib.ClusterStatus],
Optional[backends.ResourceHandle]]:
@@ -2112,7 +2116,7 @@
"""
record = _refresh_cluster_record(
cluster_name,
force_refresh=force_refresh,
force_refresh_statuses=force_refresh_statuses,
acquire_per_cluster_status_lock=acquire_per_cluster_status_lock)
if record is None:
return None, None
@@ -2331,7 +2335,7 @@ def _refresh_cluster(cluster_name):
try:
record = _refresh_cluster_record(
cluster_name,
force_refresh=True,
force_refresh_statuses=set(status_lib.ClusterStatus),
acquire_per_cluster_status_lock=True)
except (exceptions.ClusterStatusFetchingError,
exceptions.CloudUserIdentityError,
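To summarize the new behavior, below is a minimal, standalone sketch of the refresh decision added in _refresh_cluster_record above. The should_refresh helper and its string-valued statuses are illustrative only and are not part of the SkyPilot codebase.

from typing import Optional, Set


def should_refresh(status: str,
                   autostop: int,
                   use_spot: bool,
                   force_refresh_statuses: Optional[Set[str]] = None) -> bool:
    # Mirrors the condition in _refresh_cluster_record: refresh if the
    # recorded status is in force_refresh_statuses, if the cluster uses spot
    # instances, or if it is not STOPPED and has autostop set.
    has_autostop = status != 'STOPPED' and autostop >= 0
    forced = (force_refresh_statuses is not None and
              status in force_refresh_statuses)
    return forced or has_autostop or use_spot


# Default (None): a STOPPED on-demand cluster without autostop is not refreshed.
assert should_refresh('STOPPED', autostop=-1, use_spot=False) is False
# Passing every status reproduces the old force_refresh=True behavior.
assert should_refresh('STOPPED', autostop=-1, use_spot=False,
                      force_refresh_statuses={'INIT', 'UP', 'STOPPED'}) is True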
18 changes: 17 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
@@ -3746,7 +3746,23 @@ def _check_existing_cluster(
cluster_name)
prev_cluster_status, handle = (
backend_utils.refresh_cluster_status_handle(
cluster_name, acquire_per_cluster_status_lock=False))
cluster_name,
# We force refresh for the init status to determine the actual
# state of a previous cluster in INIT state.
#
# This is important for the case where an existing cluster is
# transitioned into INIT state due to a keyboard interruption during
# launching, with the following steps:
# (1) launch; immediately after answering the prompt, ctrl-c;
Member

What happens if we do these two steps for a new cluster name? I imagine with this PR, at step 2 we should not set it to STOPPED and we should do the provisioning loop as usual.

Collaborator Author

We won't set it to STOPPED for a new cluster, because a new cluster can only be in one of the following two cases:

  1. the cluster is provisioned, but not correctly set up yet. Then the cluster will be in INIT state, and our failover will still be triggered.
  2. the cluster is not provisioned. Then the cluster will be removed from the cluster table when we refresh the status, so the failover will be correctly triggered as well.

Member

This repro gave

Running task on cluster dbg2...
I 07-03 09:45:21 cloud_vm_ray_backend.py:3788] The cluster 'dbg2' was autodowned or manually terminated on the cloud console. Using the same resources as the previously terminated one to provision a new cluster.
I 07-03 09:45:21 cloud_vm_ray_backend.py:3813] Creating a new cluster: "dbg2" [1x AWS(m6i.large)].

Maybe we should change L3788's logging to the following (or something clearer):

The cluster 'dbg2' (status: XXX) was not found on the cloud: it may be autodowned, manually terminated, or its launch never succeeded. Provisioning a new cluster by using the same resources as its original launch.

Collaborator Author (@Michaelvll, Jul 3, 2023)

Good point! Updated the logging. Tested again with:

  • sky launch -c min --cloud gcp --cpus 2; manually terminate the cluster on the console; python -c 'import sky; sky.launch(sky.Task(), cluster_name="min")' again

# (2) launch again.
# If we don't refresh the state of the cluster and reset it back
# to STOPPED, our failover logic will consider it an abnormal
# cluster after hitting a resource capacity limit on the cloud,
# and will start failover. This is not desired, because the user
# may want to keep the data on the disk of that cluster.
force_refresh_statuses={status_lib.ClusterStatus.INIT},
acquire_per_cluster_status_lock=False,
))
if prev_cluster_status is not None:
assert handle is not None
# Cluster already exists.
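For illustration, a hedged sketch of the call pattern used above for the ctrl-c scenario described in the inline comment. The cluster name is hypothetical and the import paths are assumed; running this requires a SkyPilot installation and an existing cluster record.

from sky import status_lib
from sky.backends import backend_utils

# Scenario from the comment above: a launch on an existing cluster is
# interrupted with ctrl-c right after the confirmation prompt, leaving the
# record in INIT; on the next launch, forcing a refresh for INIT clusters
# recovers the real state (e.g. STOPPED) instead of treating the cluster as
# abnormal and failing over to new resources, which could discard its disk.
prev_status, handle = backend_utils.refresh_cluster_status_handle(
    'dbg',  # hypothetical cluster name
    force_refresh_statuses={status_lib.ClusterStatus.INIT},
    acquire_per_cluster_status_lock=False,
)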
3 changes: 2 additions & 1 deletion sky/spot/controller.py
@@ -201,7 +201,8 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
# determine whether the cluster is preempted.
(cluster_status,
handle) = backend_utils.refresh_cluster_status_handle(
cluster_name, force_refresh=True)
cluster_name,
force_refresh_statuses=set(status_lib.ClusterStatus))

if cluster_status != status_lib.ClusterStatus.UP:
# The cluster is (partially) preempted. It can be down, INIT
3 changes: 2 additions & 1 deletion sky/spot/recovery_strategy.py
@@ -182,7 +182,8 @@ def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
try:
cluster_status, _ = (
backend_utils.refresh_cluster_status_handle(
self.cluster_name, force_refresh=True))
self.cluster_name,
force_refresh_statuses=set(status_lib.ClusterStatus)))
except Exception as e: # pylint: disable=broad-except
# If any unexpected error happens, retry the job checking
# loop.
13 changes: 7 additions & 6 deletions sky/spot/spot_utils.py
@@ -749,13 +749,14 @@ def is_spot_controller_up(
identity.
"""
try:
# Set force_refresh=False to make sure the refresh only happens when the
# controller is INIT/UP. This optimization avoids unnecessary costly
# refresh when the controller is already stopped. This optimization is
# based on the assumption that the user will not start the controller
# manually from the cloud console.
# Set force_refresh_statuses=None to make sure the refresh only happens
# when the controller is INIT/UP (the refresh is triggered in these
# statuses because autostop is always set for the spot controller).
# This optimization avoids an unnecessary, costly refresh when the
# controller is already stopped.
# This optimization is based on the assumption that the user will not
# start the controller manually from the cloud console.
controller_status, handle = backend_utils.refresh_cluster_status_handle(
SPOT_CONTROLLER_NAME, force_refresh=False)
SPOT_CONTROLLER_NAME, force_refresh_statuses=None)
except exceptions.ClusterStatusFetchingError as e:
# We do not catch the exceptions related to the cluster owner identity
# mismatch, please refer to the comment in
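For contrast, a rough sketch of how the two extremes of the new parameter map onto the old boolean flag across the call sites in this PR. Import paths are assumed and the cluster names are illustrative.

from sky import status_lib
from sky.backends import backend_utils

# Roughly the old force_refresh=True: always refresh, as the spot controller
# and recovery strategy do above.
backend_utils.refresh_cluster_status_handle(
    'some-cluster', force_refresh_statuses=set(status_lib.ClusterStatus))

# Roughly the old force_refresh=False: refresh only spot clusters and
# clusters with autostop set. The spot controller always has autostop
# configured, so it is still refreshed while INIT/UP, but a STOPPED
# controller skips the costly cloud query.
backend_utils.refresh_cluster_status_handle(
    'sky-spot-controller', force_refresh_statuses=None)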
2 changes: 1 addition & 1 deletion tests/test_spot.py
@@ -103,7 +103,7 @@ def test_down_spot_controller(self, _mock_cluster_state, monkeypatch):
def mock_cluster_refresh_up(
cluster_name: str,
*,
force_refresh: bool = False,
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
acquire_per_cluster_status_lock: bool = True,
):
record = global_user_state.get_cluster_from_name(cluster_name)