From 05013e96e035d326dc8d916aac734d6e2b4e6997 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 11 Feb 2023 20:27:18 -0800 Subject: [PATCH] [Spot] Disallow `sky down` spot controller when in-progress spot jobs exist (#1667) * Expose failure reason for spot jobs * Add failure reason for normal failure * Failure reason hint for sky logs sky-spot-controller * require failure reason for all * Fix the conftest * fix controller name * revert SKYPILOT_USER * Show controller process logs with sky spot logs for better UX * revert usage user ID * do not overwrite failure reason for resource unavailable * format * lint * address comments * fix comment * Update docs/source/examples/spot-jobs.rst Co-authored-by: Zongheng Yang * improve readability and refactoring * address comments * format * Add comment * address comments * format * Add failover history to the error raised by _launch * Add comment * Update sky/spot/recovery_strategy.py Co-authored-by: Zongheng Yang * refactor * Address comment * Update sky/spot/recovery_strategy.py Co-authored-by: Zongheng Yang * format * format * fix exception name * Disallow termination of spot controller when spot job is running * Fix message * Fix message * address comments * Update sky/cli.py Co-authored-by: Zongheng Yang * Fix UX * Update sky/cli.py Co-authored-by: Zongheng Yang * change confirm str to `delete` --------- Co-authored-by: Zongheng Yang --- sky/cli.py | 67 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 11cc38f6195a..bf9262bd6ee5 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2202,7 +2202,7 @@ def down( purge=purge) -def _hint_for_down_spot_controller(controller_name: str): +def _hint_or_raise_for_down_spot_controller(controller_name: str): # spot_jobs will be empty when the spot cluster is not running. 
cluster_status, _ = backend_utils.refresh_cluster_status_handle( controller_name) @@ -2210,22 +2210,34 @@ def _hint_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return + if cluster_status == global_user_state.ClusterStatus.INIT: + with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError( + f'{colorama.Fore.RED}Tearing down the spot controller while ' + 'it is in INIT state is not supported, as we cannot ' + 'guarantee that all the spot jobs are finished. Please wait ' + 'until the spot controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.') msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' f'spot controller ({cluster_status.value}). Please be ' f'aware of the following:{colorama.Style.RESET_ALL}' '\n * All logs and status information of the spot ' 'jobs (output of `sky spot queue`) will be lost.') + click.echo(msg) if cluster_status == global_user_state.ClusterStatus.UP: - try: - spot_jobs = core.spot_queue(refresh=False) - except exceptions.ClusterNotUpError: - # The spot controller cluster status changed during querying - # the spot jobs, use the latest cluster status, so that the - # message for INIT and STOPPED states will be correctly - # added to the message. - cluster_status = backend_utils.refresh_cluster_status_handle( - controller_name) - spot_jobs = [] + with backend_utils.safe_console_status( + '[bold cyan]Checking for in-progress spot jobs[/]'): + try: + spot_jobs = core.spot_queue(refresh=False) + except exceptions.ClusterNotUpError: + # The spot controller cluster status changed during querying + # the spot jobs, use the latest cluster status, so that the + # message for INIT and STOPPED states will be correctly + # added to the message. 
+ cluster_status = backend_utils.refresh_cluster_status_handle( + controller_name) + spot_jobs = [] # Find in-progress spot jobs, and hint users to cancel them. non_terminal_jobs = [ @@ -2233,17 +2245,20 @@ def _hint_for_down_spot_controller(controller_name: str): ] if (cluster_status == global_user_state.ClusterStatus.UP and non_terminal_jobs): - msg += ('\n * In-progress spot jobs found, their resources ' - 'will not be terminated and require manual cleanup:\n') job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) + msg = (f'{colorama.Fore.RED}In-progress spot jobs found. ' + 'To avoid resource leakage, cancel all jobs first: ' + f'{colorama.Style.BRIGHT}sky spot cancel -a' + f'{colorama.Style.RESET_ALL}\n') # Add prefix to each line to align with the bullet point. msg += '\n'.join( [' ' + line for line in job_table.split('\n') if line != '']) - if cluster_status == global_user_state.ClusterStatus.INIT: - msg += ('\n * If there are pending/in-progress spot jobs, those ' - 'resources will not be terminated and require manual cleanup.') - click.echo(msg) + with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError(msg) + else: + click.echo(' * No in-progress spot jobs found. It should be safe ' + 'to terminate (see caveats above).') def _down_or_stop_clusters( @@ -2318,14 +2333,16 @@ def _down_or_stop_clusters( # TODO(zhwu): We can only have one reserved cluster (spot # controller). 
assert len(reserved_clusters) == 1, reserved_clusters - _hint_for_down_spot_controller(reserved_clusters[0]) - - if not no_confirm: - click.confirm('Proceed?', - default=False, - abort=True, - show_default=True) - no_confirm = True + _hint_or_raise_for_down_spot_controller(reserved_clusters[0]) + confirm_str = 'delete' + user_input = click.prompt( + f'To proceed, please check the warning above and type ' + f'{colorama.Style.BRIGHT}{confirm_str!r}' + f'{colorama.Style.RESET_ALL}', + type=str) + if user_input != confirm_str: + raise click.Abort() + no_confirm = True names += reserved_clusters if apply_to_all: