From 59cb4e9625e98b06fd293d0dd5cea5deb89ea358 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 6 Jan 2025 14:10:55 -0800 Subject: [PATCH] [k8s] Fix `--purge` not cleaning up cluster in stale k8s context (#4514) * Fix purge not cleaning up stale k8s context cluster * update comment * Apply purge after printing warnings. * lint * Fix comments * clean up condition --- sky/backends/cloud_vm_ray_backend.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index c972928cd7d..2316888b44c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4216,11 +4216,20 @@ def post_teardown_cleanup(self, attempts = 0 while True: logger.debug(f'instance statuses attempt {attempts + 1}') - node_status_dict = provision_lib.query_instances( - repr(cloud), - cluster_name_on_cloud, - config['provider'], - non_terminated_only=False) + try: + node_status_dict = provision_lib.query_instances( + repr(cloud), + cluster_name_on_cloud, + config['provider'], + non_terminated_only=False) + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning( + f'Failed to query instances. Skipping since purge is ' + f'set. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + break + raise unexpected_node_state: Optional[Tuple[str, str]] = None for node_id, node_status in node_status_dict.items(): @@ -4239,8 +4248,13 @@ def post_teardown_cleanup(self, time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) else: (node_id, node_status) = unexpected_node_state - raise RuntimeError(f'Instance {node_id} in unexpected state ' - f'{node_status}.') + if purge: + logger.warning(f'Instance {node_id} in unexpected ' + f'state {node_status}. Skipping since purge ' + 'is set.') + break + raise RuntimeError(f'Instance {node_id} in unexpected ' + f'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate)