Clean up preempted resources for TPU #1483

Merged: 9 commits, merged on Dec 6, 2022
Changes from 3 commits
5 changes: 1 addition & 4 deletions sky/backends/backend_utils.py
@@ -1226,11 +1226,8 @@ def _get_tpu_vm_pod_ips(ray_config: Dict[str, Any],

     cluster_name = ray_config['cluster_name']
     zone = ray_config['provider']['availability_zone']
-    # Excluding preempted VMs is safe as they are already terminated and
-    # do not charge.
     query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
-                 f'"(labels.ray-cluster-name={cluster_name} AND '
-                 f'state!=PREEMPTED)" '
+                 f'\\(labels.ray-cluster-name={cluster_name}\\) '
                  f'--zone={zone} --format=value\\(name\\)')
     if not get_internal_ips:
         tpuvm_cmd = (f'gcloud compute tpus tpu-vm describe $({query_cmd})'
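For illustration, a minimal sketch of what the updated query renders to for a hypothetical cluster; the cluster name and zone below are made up, not taken from the PR. The parentheses are escaped for the shell, and the filter no longer excludes PREEMPTED instances, so preempted TPU VMs are now returned by the query as well.

# Hypothetical example values, for illustration only.
cluster_name = 'sky-tpu-example'
zone = 'us-central1-b'
query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
             f'\\(labels.ray-cluster-name={cluster_name}\\) '
             f'--zone={zone} --format=value\\(name\\)')
print(query_cmd)
# gcloud compute tpus tpu-vm list --filter=\(labels.ray-cluster-name=sky-tpu-example\) --zone=us-central1-b --format=value\(name\)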
5 changes: 1 addition & 4 deletions sky/backends/cloud_vm_ray_backend.py
@@ -2680,12 +2680,9 @@ def teardown_no_lock(self,
             # check if gcloud includes TPU VM API
             backend_utils.check_gcp_cli_include_tpu_vm()
 
-            # Excluding preempted VMs is safe as they are already
-            # terminated and do not charge.
             query_cmd = (
                 f'gcloud compute tpus tpu-vm list --filter='
-                f'"(labels.ray-cluster-name={cluster_name} AND '
-                f'state!=PREEMPTED)" '
+                f'\\(labels.ray-cluster-name={cluster_name}\\) '
                 f'--zone={zone} --format=value\\(name\\)')
             terminate_cmd = (
                 f'gcloud compute tpus tpu-vm delete --zone={zone}'
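Continuing the hypothetical cluster_name, zone, and query_cmd from the sketch above, one plausible way the teardown path feeds the query result into the delete call is via shell command substitution. The exact continuation of terminate_cmd is truncated in the diff above, so the trailing flags here are an assumption, not the PR's code.

# Assumed composition, for illustration only; the real continuation of
# terminate_cmd is not shown in the truncated diff above.
terminate_cmd = (f'gcloud compute tpus tpu-vm delete --zone={zone}'
                 f' --quiet $({query_cmd})')
# Since query_cmd no longer filters out state=PREEMPTED, preempted TPU VMs are
# listed by the inner command and therefore deleted here as well.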
7 changes: 7 additions & 0 deletions sky/spot/recovery_strategy.py
@@ -12,6 +12,7 @@
 from sky.spot import spot_utils
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import tpu_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -305,6 +306,12 @@ def recover(self) -> float:
             new_resources = resources.copy(cloud=launched_cloud,
                                            region=launched_region)
             task.set_resources({new_resources})
+
+            # Note: Preempted TPU VM cannot be reused and needs to be
+            # cleaned up. Otherwise, it will occupy the quota.
+            is_tpuvm = tpu_utils.is_tpu_vm(new_resources)
+            if is_tpuvm:
+                self.terminate_cluster()
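For context, a rough, self-contained sketch of the kind of check a helper like tpu_utils.is_tpu_vm presumably performs; this is an assumption for illustration, not the repository's implementation.

from typing import Any, Optional

def is_tpu_vm_sketch(resources: Optional[Any]) -> bool:
    # Assumption: a TPU VM is requested through the resources' accelerator
    # args (e.g. accelerator_args={'tpu_vm': True}); anything else is not a
    # TPU VM and does not need the extra cleanup on preemption.
    if resources is None or getattr(resources, 'accelerator_args', None) is None:
        return False
    return bool(resources.accelerator_args.get('tpu_vm', False))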
@Michaelvll (Collaborator) commented on Dec 1, 2022:
This will only be effective for a managed spot VM. What will happen if sky status -r is called when there is a preempted spot TPU VM in the status table? What status do we show for that cluster?

I am leaning towards having the termination in refresh_cluster_status in backend_utils.py, so that a spot TPU VM launched with --use-spot can be handled correctly as well. @infwinston

The PR author (Member) replied:
Looks like we clean up preempted spot TPU VMs during sky status -r:

# GCP does not clean up preempted TPU VMs. We remove it ourselves.
# TODO(wei-lin): handle multi-node cases.
if use_tpu_vm and len(status_list) == 0:
    backend = backends.CloudVmRayBackend()
    handle = global_user_state.get_handle_from_cluster_name(cluster)
    backend.teardown_no_lock(handle,

do you mean we should just call refresh_cluster_status here instead of self.terminate_cluster()?

Collaborator replied:
Discussed offline. We decided to move the termination of the cluster into refresh_cluster_status.

             # Not using self.launch to avoid the retry until up logic.
             launched_time = self._launch(raise_on_failure=False)
             # Restore the original dag, i.e. reset the region constraint.
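Per the resolution above, the cleanup moves from recover() into the status-refresh path. A rough sketch of that direction follows, reusing the names from the snippet quoted in the thread (cluster, use_tpu_vm, status_list); the terminate=True argument completes the truncated call shown earlier and is an assumption, not the final code merged in this PR.

# Rough sketch only; not the exact implementation merged in this PR.
if use_tpu_vm and len(status_list) == 0:
    # GCP does not clean up preempted TPU VMs; tear the cluster down ourselves
    # so the preempted instance stops occupying TPU quota.
    backend = backends.CloudVmRayBackend()
    handle = global_user_state.get_handle_from_cluster_name(cluster)
    backend.teardown_no_lock(handle, terminate=True)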