Add safe guard for provisioning/terminating TPU VM and fix spot launch TPU resource leak #1500

Merged · 16 commits · Dec 15, 2022
81 changes: 57 additions & 24 deletions sky/backends/backend_utils.py
@@ -1228,34 +1228,67 @@ def _get_tpu_vm_pod_ips(ray_config: Dict[str, Any],
query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
f'\\(labels.ray-cluster-name={cluster_name}\\) '
f'--zone={zone} --format=value\\(name\\)')
if not get_internal_ips:
tpuvm_cmd = (f'gcloud compute tpus tpu-vm describe $({query_cmd})'
f' --zone {zone} --format="value[delimiter=\'\\n\']'
'(networkEndpoints.accessConfig.externalIp)"')
else:
tpuvm_cmd = (f'gcloud compute tpus tpu-vm describe $({query_cmd})'
f' --zone {zone} --format="value[delimiter=\'\\n\']'
'(networkEndpoints.ipAddress)"')

rcode, stdout, stderr = log_lib.run_with_log(tpuvm_cmd,
'/dev/null',
shell=True,
stream_logs=False,
require_outputs=True)
if rcode != 0:
failure_massage = ('Failed to run gcloud to get TPU VM Pod IPs.\n'
returncode, stdout, stderr = log_lib.run_with_log(query_cmd,
'/dev/null',
shell=True,
stream_logs=False,
require_outputs=True)
if returncode != 0:
failure_massage = ('Failed to run gcloud to get TPU VM IDs.\n'
'**** STDOUT ****\n'
'{stdout}\n'
f'{stdout}\n'
'**** STDERR ****\n'
'{stderr}\n'
f'{stderr}\n'
'**** CMD ****\n'
'{tpuvm_cmd}')
f'{query_cmd}\n')
with ux_utils.print_exception_no_traceback():
raise RuntimeError(
failure_massage.format(stdout=stdout,
stderr=stderr,
tpuvm_cmd=tpuvm_cmd))
all_ips = re.findall(IP_ADDR_REGEX, stdout)
raise RuntimeError(failure_massage)
if len(stdout) == 0:
logger.warning('No TPU VMs found with cluster name '
f'{cluster_name} in zone {zone}.')
if len(stdout.splitlines()) > 1:
logger.warning('Found more than one TPU VM with cluster name '
f'{cluster_name} in zone {zone}.')
Collaborator:

Should this be a warning? Also, we must be careful about the logs printed during the status refresh, since they will corrupt the progress bar output of sky status -r. How about we change them to logger.debug?

Member Author:

Ah, I chose logger.warning because multiple TPU VMs/Pods with the same cluster name is an abnormal case that is not supposed to happen. When it does happen, it means there is a resource leak. I think in this case we'd like to let the user know?

Collaborator (@Michaelvll), Dec 14, 2022:

Isn't it a normal case for a spot VM? I think for a non-TPU cluster we don't show the warning, and we handle the case where the number of IPs does not match the expected amount in the caller function.

Also, is it true that a user can have multiple TPU VMs with the same name in the same zone?

Member Author (@infwinston), Dec 14, 2022:

Ah, sorry for the confusion, let me explain again. For spot VMs it also shouldn't happen that multiple Spot TPU VMs have the same labels.ray-cluster-name.

Basically, the query command should only return one VM/Pod in the normal case:

query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
             f'\\(labels.ray-cluster-name={cluster_name}\\) '
             f'--zone={zone} --format=value\\(name\\)')

But if there is a leaked resource (e.g., the controller failed to terminate a preempted spot TPU), then this query command will return two VMs, which is the abnormal case.

> Also, is it true that a user can have multiple TPU VMs with the same name in the same zone?

Note that I was not referring to the "TPU name" shown on the console but to labels.ray-cluster-name, so yes, multiple TPU VMs can have the same labels.ray-cluster-name.
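To make the leak scenario concrete, here is a small illustrative sketch in Python (not part of the PR; the TPU names are made up, and the real code obtains the listing by running the gcloud query via log_lib.run_with_log):

    # Hypothetical output of the list query when a preempted Spot TPU VM leaked:
    # two TPU names carry the same labels.ray-cluster-name.
    stdout = 'ray-mycluster-head-1a2b\nray-mycluster-head-3c4d\n'
    tpu_ids = stdout.splitlines()
    if len(tpu_ids) > 1:
        # Abnormal case: more than one TPU VM/Pod for one cluster name,
        # i.e., a leaked (preempted but never terminated) resource.
        print(f'Found {len(tpu_ids)} TPU VMs with the same cluster name.')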

Member Author:

I'm fine with changing it to logger.debug, but I'm also afraid that the user will never find out there is a leaked resource unless they manually check the console.

Collaborator:

I am confused, then: why does the problem not happen for a non-TPU VM cluster? What ensures those clusters are not leaked?

Member Author (@infwinston), Dec 14, 2022:

For a non-TPU VM cluster, since it doesn't require manual cleanup after preemption, resources won't be leaked this way? But I'm not sure if there are other scenarios that could trigger leakage. Also, we mostly rely on ray up to handle non-TPU VM clusters (probably irrelevant).


all_ips = []
for tpu_id in stdout.splitlines():
tpuvm_cmd = (f'gcloud compute tpus tpu-vm describe {tpu_id}'
f' --zone {zone} --format=json')
returncode, stdout, stderr = log_lib.run_with_log(tpuvm_cmd,
'/dev/null',
shell=True,
stream_logs=False,
require_outputs=True)
if returncode != 0:
failure_massage = ('Failed to run gcloud tpu-vm describe.\n'
'**** STDOUT ****\n'
f'{stdout}\n'
'**** STDERR ****\n'
f'{stderr}\n'
'**** CMD ****\n'
f'{tpuvm_cmd}\n')
with ux_utils.print_exception_no_traceback():
raise RuntimeError(failure_massage)

tpuvm_json = json.loads(stdout)
if tpuvm_json['state'] != 'READY':
# May be a leaked preempted resource.
logger.warning(f'TPU VM {tpu_id} is not in READY state. '
'Could be a garbage resource. Skipping...')
continue
Collaborator (comment on lines +1264 to +1269):

Will this state be different for different tpu_id?

Member Author:

Each TPU VM or TPU Pod maps to a single tpu_id, so yes, different tpu_id can have different states. But when this multiple-tpu_id situation happens, it means there is a leaked resource with the same cluster name as the current one. That's why I print the garbage-resource message.

Normally only one tpu_id should be returned by the query command below.

query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
             f'\\(labels.ray-cluster-name={cluster_name}\\) '
             f'--zone={zone} --format=value\\(name\\)')


if not get_internal_ips:
ips = [
endpoint['accessConfig']['externalIp']
for endpoint in tpuvm_json['networkEndpoints']
]
else:
ips = [
endpoint['ipAddress']
for endpoint in tpuvm_json['networkEndpoints']
]
all_ips.extend(ips)

return all_ips


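For reference, a minimal self-contained sketch of how the per-TPU describe output maps to IP addresses. The JSON payload below is invented for illustration (the field names mirror the accesses in the hunk above: state, networkEndpoints, accessConfig.externalIp, ipAddress); it is not captured from a real TPU VM.

    import json

    # Hypothetical, trimmed output of
    # `gcloud compute tpus tpu-vm describe <tpu_id> --zone <zone> --format=json`.
    sample = json.loads('''
    {
      "name": "projects/my-proj/locations/us-central1-b/nodes/ray-mycluster-head",
      "state": "READY",
      "networkEndpoints": [
        {"ipAddress": "10.128.0.5",
         "accessConfig": {"externalIp": "35.222.0.10"}}
      ]
    }
    ''')

    if sample['state'] == 'READY':
        external_ips = [
            endpoint['accessConfig']['externalIp']
            for endpoint in sample['networkEndpoints']
        ]
        internal_ips = [
            endpoint['ipAddress'] for endpoint in sample['networkEndpoints']
        ]
        print(external_ips, internal_ips)  # ['35.222.0.10'] ['10.128.0.5']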
34 changes: 23 additions & 11 deletions sky/backends/cloud_vm_ray_backend.py
@@ -2657,6 +2657,7 @@ def teardown_no_lock(self,
elif (terminate and
(prev_status == global_user_state.ClusterStatus.STOPPED or
use_tpu_vm)):
terminate_cmds = []
# For TPU VMs, gcloud CLI is used for VM termination.
if isinstance(cloud, clouds.AWS):
# TODO(zhwu): Room for optimization. We can move these cloud
@@ -2669,7 +2670,7 @@
f'Name=tag:ray-cluster-name,Values={handle.cluster_name} '
f'--query Reservations[].Instances[].InstanceId '
'--output text')
terminate_cmd = (
terminate_cmds.append(
f'aws ec2 terminate-instances --region {region} '
f'--instance-ids $({query_cmd})')
elif isinstance(cloud, clouds.GCP):
@@ -2684,15 +2685,25 @@
f'gcloud compute tpus tpu-vm list --filter='
f'\\(labels.ray-cluster-name={cluster_name}\\) '
f'--zone={zone} --format=value\\(name\\)')
terminate_cmd = (
f'gcloud compute tpus tpu-vm delete --zone={zone}'
f' --quiet $({query_cmd})')
returncode, stdout, stderr = log_lib.run_with_log(
query_cmd,
log_abs_path,
shell=True,
stream_logs=False,
require_outputs=True)

# Need to create a list, as GCP does not allow deleting
# multiple TPU VMs at once.
for tpu_id in stdout.splitlines():
terminate_cmds.append(
f'gcloud compute tpus tpu-vm delete --zone={zone} '
f'--quiet {tpu_id}')
else:
query_cmd = (
f'gcloud compute instances list --filter='
f'\\(labels.ray-cluster-name={cluster_name}\\) '
f'--zones={zone} --format=value\\(name\\)')
terminate_cmd = (
terminate_cmds.append(
f'gcloud compute instances delete --zone={zone}'
f' --quiet $({query_cmd})')
else:
@@ -2701,12 +2712,13 @@
f'cluster {cluster_name!r}.')
with backend_utils.safe_console_status(f'[bold cyan]Terminating '
f'[green]{cluster_name}'):
returncode, stdout, stderr = log_lib.run_with_log(
terminate_cmd,
log_abs_path,
shell=True,
stream_logs=False,
require_outputs=True)
for terminate_cmd in terminate_cmds:
returncode, stdout, stderr = log_lib.run_with_log(
terminate_cmd,
log_abs_path,
shell=True,
stream_logs=False,
require_outputs=True)
else:
config['provider']['cache_stopped_nodes'] = not terminate
with tempfile.NamedTemporaryFile('w',
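The switch from a single terminate_cmd to a terminate_cmds list follows from the comment in the hunk above: gcloud cannot delete multiple TPU VMs in one call, so teardown issues one delete per TPU ID returned by the list query. A minimal sketch of that fan-out (zone and TPU names are made up; the real code runs each command via log_lib.run_with_log):

    zone = 'us-central1-b'
    # Hypothetical output of the tpu-vm list query: one leaked duplicate.
    stdout = 'ray-mycluster-head-1a2b\nray-mycluster-head-3c4d\n'
    terminate_cmds = [
        f'gcloud compute tpus tpu-vm delete --zone={zone} --quiet {tpu_id}'
        for tpu_id in stdout.splitlines()
    ]
    for cmd in terminate_cmds:
        print(cmd)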
9 changes: 9 additions & 0 deletions sky/clouds/aws.py
@@ -351,3 +351,12 @@ def accelerator_in_region_or_zone(self,
zone: Optional[str] = None) -> bool:
return service_catalog.accelerator_in_region_or_zone(
accelerator, acc_count, region, zone, 'aws')

def need_cleanup_after_preemption(
self, resources: 'resources_lib.Resources') -> bool:
"""Returns whether a spot resource needs cleanup after preemption."""
# By default, AWS Spot instances are not restartable after preemption.
# "Terminate interrupted Spot Instances (this is the default behavior)"
# See: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/interruption-behavior.html # pylint: disable=line-too-long
del resources # unused
return False
10 changes: 10 additions & 0 deletions sky/clouds/azure.py
@@ -305,6 +305,16 @@ def accelerator_in_region_or_zone(self,
return service_catalog.accelerator_in_region_or_zone(
accelerator, acc_count, region, zone, 'azure')

def need_cleanup_after_preemption(self,
resources: 'resources.Resources') -> bool:
"""Returns whether a spot resource needs cleanup after preemption."""
# By default, Azure Spot instances are restartable after preemption.
# "When creating an Azure Spot Virtual Machine, you can set
# the eviction policy to Deallocate (default) or Delete."
# See: https://learn.microsoft.com/en-us/azure/virtual-machines/spot-vms#eviction-policy # pylint: disable=line-too-long
del resources # Unused.
return False

@classmethod
def get_project_id(cls, dryrun: bool = False) -> str:
if dryrun:
5 changes: 5 additions & 0 deletions sky/clouds/cloud.py
@@ -214,5 +214,10 @@ def accelerator_in_region_or_zone(self,
"""Returns whether the accelerator is valid in the region or zone."""
raise NotImplementedError

def need_cleanup_after_preemption(self,
resource: 'resources.Resources') -> bool:
"""Returns whether a spot resource needs cleanup after preemption."""
raise NotImplementedError

def __repr__(self):
return self._REPR
16 changes: 16 additions & 0 deletions sky/clouds/gcp.py
@@ -468,6 +468,22 @@ def accelerator_in_region_or_zone(self,
return service_catalog.accelerator_in_region_or_zone(
accelerator, acc_count, region, zone, 'gcp')

def need_cleanup_after_preemption(self,
resources: 'resources.Resources') -> bool:
"""Returns whether a spot resource needs cleanup after preemption."""
# By default, GCP Compute VMs are restartable after preemption.
# "If ... not specified, then Compute Engine stops the VM,
# transitioning the VM to a TERMINATED state."
# See: https://cloud.google.com/compute/docs/instances/spot#preemption-process # pylint: disable=line-too-long
# However, Spot TPU VMs are not restartable after preemption.
# "If your Cloud TPU is preempted,
# you must delete it and create a new one ..."
# See: https://cloud.google.com/tpu/docs/preemptible#tpu-vm

# pylint: disable=import-outside-toplevel
from sky.utils import tpu_utils
return tpu_utils.is_tpu_vm(resources)

@classmethod
def get_project_id(cls, dryrun: bool = False) -> str:
# TODO(zhwu): change the project id fetching with the following command
4 changes: 4 additions & 0 deletions sky/resources.py
@@ -260,6 +260,10 @@ def _set_accelerators(
def is_launchable(self) -> bool:
return self.cloud is not None and self._instance_type is not None

def need_cleanup_after_preemption(self) -> bool:
"""Returns whether a spot resource needs cleanup after preemption."""
return self.cloud.need_cleanup_after_preemption(self)

def _set_region_zone(self, region: Optional[str],
zone: Optional[str]) -> None:
if region is None and zone is None:
7 changes: 7 additions & 0 deletions sky/spot/controller.py
@@ -145,6 +145,13 @@ def _run(self):
'cluster is healthy. Try to recover the job '
'(the cluster will not be restarted).')

resources = list(self._task.resources)[0]
if resources.need_cleanup_after_preemption():
# Some spot resource may need to be cleaned up after
# preemption, if the resource is not reusable.
logger.info('Cleaning up the preempted spot cluster...')
self._strategy_executor.terminate_cluster()

# Try to recover the spot jobs, when the cluster is preempted
# or the job status is failed to be fetched.
spot_state.set_recovering(self._job_id)
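Putting the controller change together with the new per-cloud hooks, the decision path is roughly the following. This is a simplified, self-contained sketch with stand-in classes, not the real SkyPilot objects:

    # Sketch of the preemption-cleanup decision introduced by this PR:
    # controller -> Resources -> Cloud -> TPU check. Per the diffs above, GCP
    # Compute spot VMs are restartable after preemption (no cleanup needed),
    # while Spot TPU VMs must be deleted and recreated, so the controller
    # terminates the leftover cluster before recovering.
    class FakeGCP:
        def need_cleanup_after_preemption(self, resources) -> bool:
            return resources.is_tpu_vm  # stands in for tpu_utils.is_tpu_vm(resources)

    class FakeResources:
        def __init__(self, cloud, is_tpu_vm: bool):
            self.cloud = cloud
            self.is_tpu_vm = is_tpu_vm

        def need_cleanup_after_preemption(self) -> bool:
            return self.cloud.need_cleanup_after_preemption(self)

    def on_preemption(resources: FakeResources) -> None:
        if resources.need_cleanup_after_preemption():
            print('Cleaning up the preempted spot cluster...')
        print('Recovering the spot job...')

    on_preemption(FakeResources(FakeGCP(), is_tpu_vm=True))   # cleanup, then recover
    on_preemption(FakeResources(FakeGCP(), is_tpu_vm=False))  # recover directly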
8 changes: 3 additions & 5 deletions sky/spot/recovery_strategy.py
@@ -276,10 +276,6 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
launched_resources.region)
return launch_time

def terminate_cluster(self, max_retry: int = 3) -> None:
super().terminate_cluster(max_retry)
self._launched_cloud_region = None

def recover(self) -> float:
# 1. Cancel the jobs and launch the cluster with the STOPPED status,
# so that it will try on the current region first until timeout.
@@ -313,7 +309,9 @@ def recover(self) -> float:
return launched_time

# Step 2
logger.debug('Terminating unhealthy spot cluster.')
logger.debug('Terminating unhealthy spot cluster and '
'reset cloud region.')
self._launched_cloud_region = None
self.terminate_cluster()

# Step 3
17 changes: 9 additions & 8 deletions sky/utils/tpu_utils.py
@@ -4,28 +4,29 @@
from sky import resources as resources_lib


def is_tpu(resources: resources_lib.Resources) -> bool:
if resources.accelerators is None:
def is_tpu(resources: Optional[resources_lib.Resources]) -> bool:
if resources is None or resources.accelerators is None:
return False
acc, _ = list(resources.accelerators.items())[0]
return acc.startswith('tpu')


def is_tpu_vm(resources: resources_lib.Resources) -> bool:
if resources.accelerator_args is None:
def is_tpu_vm(resources: Optional[resources_lib.Resources]) -> bool:
if resources is None or resources.accelerator_args is None:
return False
return resources.accelerator_args.get('tpu_vm', False)


def is_tpu_vm_pod(resources: resources_lib.Resources) -> bool:
if not is_tpu_vm(resources):
def is_tpu_vm_pod(resources: Optional[resources_lib.Resources]) -> bool:
if resources is None or not is_tpu_vm(resources):
return False
acc, _ = list(resources.accelerators.items())[0]
return acc not in ['tpu-v2-8', 'tpu-v3-8']


def get_num_tpu_devices(resources: resources_lib.Resources) -> Optional[int]:
if not is_tpu(resources):
def get_num_tpu_devices(
resources: Optional[resources_lib.Resources]) -> Optional[int]:
if resources is None or not is_tpu(resources):
return None
acc, _ = list(resources.accelerators.items())[0]
num_tpu_devices = int(int(acc.split('-')[2]) / 8)
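The helpers above now accept Optional[Resources], so a None input short-circuits to False (or None) instead of raising an AttributeError. A small usage sketch (assuming the sky package is importable; the outputs follow directly from the code above):

    from sky.utils import tpu_utils

    # None is now a valid input to all four helpers.
    print(tpu_utils.is_tpu(None))               # False
    print(tpu_utils.is_tpu_vm(None))            # False
    print(tpu_utils.is_tpu_vm_pod(None))        # False
    print(tpu_utils.get_num_tpu_devices(None))  # None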