[Spot] Add eager failover strategy #2234

Merged (10 commits) on Aug 3, 2023
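
For orientation before the diff: a minimal usage sketch of how a spot task could opt into the new strategy. This is not part of the PR; the 'EAGER_FAILOVER' value comes from the name registered in the diff below, and the sky.Resources(spot_recovery=...) parameter and sky.spot_launch() call are assumed to match SkyPilot's public API of that time.

# Illustrative only: request spot instances and select the eager failover
# strategy added by this PR.
import sky

task = sky.Task(run='python train.py')
task.set_resources(
    sky.Resources(
        cloud=sky.GCP(),
        use_spot=True,                   # spot/preemptible instances
        spot_recovery='EAGER_FAILOVER',  # strategy registered in this PR
    ))

sky.spot_launch(task, name='train-spot')
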
20 changes: 16 additions & 4 deletions sky/backends/cloud_vm_ray_backend.py
@@ -623,11 +623,18 @@ class GangSchedulingStatus(enum.Enum):
GANG_FAILED = 1
HEAD_FAILED = 2

def __init__(self, log_dir: str, dag: 'dag.Dag',
def __init__(self,
log_dir: str,
dag: 'dag.Dag',
optimize_target: 'optimizer.OptimizeTarget',
requested_features: Set[clouds.CloudImplementationFeatures],
local_wheel_path: pathlib.Path, wheel_hash: str):
local_wheel_path: pathlib.Path,
wheel_hash: str,
blocked_resources: Optional[Iterable[
resources_lib.Resources]] = None):
self._blocked_resources: Set[resources_lib.Resources] = set()
if blocked_resources:
self._blocked_resources.update(blocked_resources)

self.log_dir = os.path.expanduser(log_dir)
self._dag = dag
@@ -2452,8 +2459,13 @@ def _provision(
# of optimization infinitely.
try:
provisioner = RetryingVmProvisioner(
self.log_dir, self._dag, self._optimize_target,
self._requested_features, local_wheel_path, wheel_hash)
self.log_dir,
self._dag,
self._optimize_target,
self._requested_features,
local_wheel_path,
wheel_hash,
blocked_resources=task.blocked_resources)
config_dict = provisioner.provision_with_retries(
task, to_provision_config, dryrun, stream_logs)
break
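
The hunk above only threads blocked_resources into RetryingVmProvisioner's constructor; how the set is consulted is not shown here. As a rough mental model only (the Candidate type and matches_block() helper below are hypothetical, not SkyPilot code), a failover loop can skip any launch candidate that matches an entry in the blocked set:

# Hypothetical sketch: skipping blocked cloud/region/zone candidates during
# failover. None of these names are SkyPilot APIs.
from dataclasses import dataclass
from typing import List, Optional, Set


@dataclass(frozen=True)
class Candidate:
    cloud: str
    region: str
    zone: Optional[str] = None


def matches_block(candidate: Candidate, blocked: Candidate) -> bool:
    # A blocked entry with zone=None blocks the whole region.
    return (candidate.cloud == blocked.cloud and
            candidate.region == blocked.region and
            blocked.zone in (None, candidate.zone))


def next_launchable(candidates: List[Candidate],
                    blocked: Set[Candidate]) -> Optional[Candidate]:
    for cand in candidates:
        if not any(matches_block(cand, b) for b in blocked):
            return cand
    return None  # everything is blocked; the caller falls back or retries
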
86 changes: 83 additions & 3 deletions sky/spot/recovery_strategy.py
@@ -1,15 +1,21 @@
"""The strategy to handle launching/recovery/termination of spot clusters."""
"""The strategy to handle launching/recovery/termination of spot clusters.

In the YAML file, the user can specify the strategy to use for spot jobs.

resources:
  spot_recovery: EAGER_FAILOVER
"""
import time
import traceback
import typing
from typing import Optional, Tuple

import sky
from sky import backends
from sky import exceptions
from sky import global_user_state
from sky import sky_logging
from sky import status_lib
from sky import backends
from sky.backends import backend_utils
from sky.skylet import job_lib
from sky.spot import spot_utils
@@ -356,7 +362,7 @@ def _launch(self,
time.sleep(gap_seconds)


class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER', default=True):
class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER', default=False):
"""Failover strategy: wait in same region and failover after timout."""

_MAX_RETRY_CNT = 240 # Retry for 4 hours.
@@ -444,3 +450,77 @@ def recover(self) -> float:
f'{self._MAX_RETRY_CNT} times.')

return job_submitted_at


class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
name='EAGER_FAILOVER',
default=True):
"""Aggressive failover strategy
concretevitamin marked this conversation as resolved.
Show resolved Hide resolved

This strategy is an extension of the failover strategy. Instead of waiting
in the same region when a preemption happens, it immediately terminates the
cluster and relaunches it in a different region. This is based on the
observation that a preemption is likely to happen again shortly in the same
region, so trying other regions first is more likely to yield a
longer-running cluster.
"""

def recover(self) -> float:
# 1. Terminate the current cluster
# 2. Launch the cluster without retrying the previously launched region
# 3. Launch the cluster with no cloud/region constraint or respect the
#    original user specification.

[Review thread on the step comments above]
Member: What does "or respect the original user specification." mean? It seems like we should respect the original requirements.
Collaborator (author): Changed to resources requirements. Thanks!
Member: How about:
  2. Launch again by explicitly blocking the previously launched region (this will failover through the entire search space except the previously launched region)
  3. (If step 2 failed) Retry forever: Launch again with no blocked locations (this will failover through the entire search space)
  The entire search space is defined by the original task request, task.resources.
Collaborator (author): Good point! Updated. Thanks!

# Step 1
logger.debug('Terminating unhealthy spot cluster and '
             'resetting cloud region.')
terminate_cluster(self.cluster_name)

# Step 2
logger.debug('Relaunch the cluster skipping the previously launched '
'cloud/region.')
if self._launched_cloud_region is not None:
task = self.dag.tasks[0]
resources = list(task.resources)[0]
if resources.region is None and resources.zone is None:
# Optimization: we only block the previously launched region if
# the task does not specify a region or zone; otherwise we would
# spend unnecessary time skipping the only specified region/zone.
launched_cloud, launched_region = self._launched_cloud_region
task.blocked_resources = {
resources.copy(cloud=launched_cloud, region=launched_region)
}
# Not using self.launch to avoid the retry until up logic.
job_submitted_at = self._launch(raise_on_failure=False)
task.blocked_resources = None
# Restore the original dag, i.e. reset the region constraint.
if job_submitted_at is not None:
return job_submitted_at
self._launched_cloud_region = None
terminate_cluster(self.cluster_name)

# Retry the entire block until the cluster is up, so that the ratio of
# the time spent in the current region and the time spent in the other
# region is consistent during the retry.
while True:
# Step 3
logger.debug('Relaunch the cluster without constraining to prior '
'cloud/region.')
# Not using self.launch to avoid the retry until up logic.
job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
raise_on_failure=False)
if job_submitted_at is None:
# Failed to launch the cluster.
[Review thread on resetting self._launched_cloud_region]
Member: We should think about who should set self._launched_cloud_region to None reliably. In _launch()?
Collaborator (author): I don't think we should let _launch() set self._launched_cloud_region to None, because then we would not be able to control how many retries of the exhausted failover happen without the current region before we start failing over with the current region included.
Member: (discussed offline) Maybe we should put it in _launch() to make it the sole accessor.

if self.retry_until_up:
gap_seconds = self.RETRY_INIT_GAP_SECONDS
logger.info('Retrying to recover the spot cluster in '
f'{gap_seconds:.1f} seconds.')
time.sleep(gap_seconds)
continue
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesUnavailableError(
f'Failed to recover the spot cluster after retrying '
f'{self._MAX_RETRY_CNT} times.')

return job_submitted_at
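
To make the control flow of EagerFailoverStrategyExecutor.recover() above easier to follow, here is a self-contained simplified model. The three callables are injected stand-ins for terminate_cluster() and the two _launch() calls; none of the names below are SkyPilot APIs, and error handling is reduced to a plain RuntimeError.

# Simplified model of the eager-failover recovery loop shown above.
import time
from typing import Callable, Optional


def eager_failover_recover(
        terminate: Callable[[], None],
        launch_excluding_last_region: Callable[[], Optional[float]],
        launch_anywhere: Callable[[], Optional[float]],
        retry_until_up: bool,
        retry_gap_seconds: float = 60.0) -> float:
    # Step 1: tear down the preempted cluster right away.
    terminate()

    # Step 2: one pass over the search space with the preempted region blocked.
    job_submitted_at = launch_excluding_last_region()
    if job_submitted_at is not None:
        return job_submitted_at
    terminate()

    # Step 3: keep launching over the full search space (task.resources),
    # retrying forever only if retry_until_up is set.
    while True:
        job_submitted_at = launch_anywhere()
        if job_submitted_at is not None:
            return job_submitted_at
        if not retry_until_up:
            raise RuntimeError('Failed to recover the spot cluster.')
        time.sleep(retry_gap_seconds)
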
4 changes: 4 additions & 0 deletions sky/task.py
@@ -118,6 +118,7 @@ def __init__(
# Advanced:
docker_image: Optional[str] = None,
event_callback: Optional[str] = None,
blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
):
"""Initializes a Task.

Expand Down Expand Up @@ -194,6 +195,9 @@ def __init__(
self.estimated_outputs_size_gigabytes = None
# Default to CPUNode
self.resources = {sky.Resources()}
# Resources that this task cannot run on.
self.blocked_resources = blocked_resources

[Review comment]
Consider adding a comment to explain the purpose of the blocked_resources attribute in the Task class. This will help developers understand its role and how it is used. [medium]


self.time_estimator_func: Optional[Callable[['sky.Resources'],
int]] = None
self.file_mounts: Optional[Dict[str, str]] = None
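
As a closing note, Task.blocked_resources is an internal attribute (it is forwarded to RetryingVmProvisioner in the first file of this diff), but conceptually it is just a collection of Resources that the next launch must avoid. A hedged illustration, mirroring what EagerFailoverStrategyExecutor.recover() does above; the concrete cloud/region values are made up:

# Illustrative only: mark a just-preempted location as off-limits for the
# next launch attempt, then restore the full search space.
import sky

task = sky.Task(run='echo hello')
task.set_resources(sky.Resources(cloud=sky.GCP(), use_spot=True))

# Suppose the last launch landed in GCP us-central1 and was preempted.
resources = list(task.resources)[0]
task.blocked_resources = {
    resources.copy(cloud=sky.GCP(), region='us-central1')
}
# ... relaunch here; the provisioner skips the blocked region ...
task.blocked_resources = None  # afterwards, allow every location again
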