[Spot] Disallow sky down spot controller when in-progress spot jobs exist (skypilot-org#1667)

* Expose failure reason for spot jobs

* Add failure reason for normal failure

* Failure reason hint for sky logs sky-spot-controller

* require failure reason for all

* Fix the conftest

* fix controller name

* revert SKYPILOT_USER

* Show controller process logs with sky spot logs for better UX

* revert usage user ID

* do not overwrite failure reason for resource unavailable

* format

* lint

* address comments

* fix comment

* Update docs/source/examples/spot-jobs.rst

Co-authored-by: Zongheng Yang <[email protected]>

* improve readability and refactoring

* address comments

* format

* Add comment

* address comments

* format

* Add failover history to the error raised by _launch

* Add comment

* Update sky/spot/recovery_strategy.py

Co-authored-by: Zongheng Yang <[email protected]>

* refactor

* Address comment

* Update sky/spot/recovery_strategy.py

Co-authored-by: Zongheng Yang <[email protected]>

* format

* format

* fix exception name

* Disallow termination of spot controller when spot job is running

* Fix message

* Fix message

* address comments

* Update sky/cli.py

Co-authored-by: Zongheng Yang <[email protected]>

* Fix UX

* Update sky/cli.py

Co-authored-by: Zongheng Yang <[email protected]>

* change confirm str to `delete`

---------

Co-authored-by: Zongheng Yang <[email protected]>
2 people authored and Sumanth committed Feb 22, 2023
1 parent 8007519 commit 05013e9
Showing 1 changed file with 42 additions and 25 deletions.
67 changes: 42 additions & 25 deletions sky/cli.py
@@ -2202,48 +2202,63 @@ def down(
                           purge=purge)


-def _hint_for_down_spot_controller(controller_name: str):
+def _hint_or_raise_for_down_spot_controller(controller_name: str):
    # spot_jobs will be empty when the spot cluster is not running.
    cluster_status, _ = backend_utils.refresh_cluster_status_handle(
        controller_name)
    if cluster_status is None:
        click.echo('Managed spot controller has already been torn down.')
        return

+    if cluster_status == global_user_state.ClusterStatus.INIT:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.NotSupportedError(
+                f'{colorama.Fore.RED}Tearing down the spot controller while '
+                'it is in INIT state is not supported, as we cannot '
+                'guarantee that all the spot jobs are finished. Please wait '
+                'until the spot controller is UP or fix it with '
+                f'{colorama.Style.BRIGHT}sky start '
+                f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.')
    msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
           f'spot controller ({cluster_status.value}). Please be '
           f'aware of the following:{colorama.Style.RESET_ALL}'
           '\n * All logs and status information of the spot '
           'jobs (output of `sky spot queue`) will be lost.')
+    click.echo(msg)
    if cluster_status == global_user_state.ClusterStatus.UP:
-        try:
-            spot_jobs = core.spot_queue(refresh=False)
-        except exceptions.ClusterNotUpError:
-            # The spot controller cluster status changed during querying
-            # the spot jobs, use the latest cluster status, so that the
-            # message for INIT and STOPPED states will be correctly
-            # added to the message.
-            cluster_status = backend_utils.refresh_cluster_status_handle(
-                controller_name)
-            spot_jobs = []
+        with backend_utils.safe_console_status(
+                '[bold cyan]Checking for in-progress spot jobs[/]'):
+            try:
+                spot_jobs = core.spot_queue(refresh=False)
+            except exceptions.ClusterNotUpError:
+                # The spot controller cluster status changed during querying
+                # the spot jobs, use the latest cluster status, so that the
+                # message for INIT and STOPPED states will be correctly
+                # added to the message.
+                cluster_status = backend_utils.refresh_cluster_status_handle(
+                    controller_name)
+                spot_jobs = []

    # Find in-progress spot jobs, and hint users to cancel them.
    non_terminal_jobs = [
        job for job in spot_jobs if not job['status'].is_terminal()
    ]
    if (cluster_status == global_user_state.ClusterStatus.UP and
            non_terminal_jobs):
-        msg += ('\n * In-progress spot jobs found, their resources '
-                'will not be terminated and require manual cleanup:\n')
        job_table = spot_lib.format_job_table(non_terminal_jobs,
                                              show_all=False)
+        msg = (f'{colorama.Fore.RED}In-progress spot jobs found. '
+               'To avoid resource leakage, cancel all jobs first: '
+               f'{colorama.Style.BRIGHT}sky spot cancel -a'
+               f'{colorama.Style.RESET_ALL}\n')
        # Add prefix to each line to align with the bullet point.
        msg += '\n'.join(
            [' ' + line for line in job_table.split('\n') if line != ''])
-    if cluster_status == global_user_state.ClusterStatus.INIT:
-        msg += ('\n * If there are pending/in-progress spot jobs, those '
-                'resources will not be terminated and require manual cleanup.')
-    click.echo(msg)
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.NotSupportedError(msg)
+    else:
+        click.echo(' * No in-progress spot jobs found. It should be safe '
+                   'to terminate (see caveats above).')

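The heart of the hunk above is that the helper now raises instead of merely printing a hint: it lists the controller's jobs, keeps only the non-terminal ones, and refuses the teardown if any remain. Below is a minimal, self-contained sketch of that filter-and-raise check, for illustration only; the JobStatus enum, the job dictionaries, and check_no_inprogress_jobs are hypothetical stand-ins, not SkyPilot's actual classes or helpers.

# Hypothetical stand-ins that mirror the filter-and-raise pattern above.
# SkyPilot's real job records and status enum live under sky/spot; these are
# simplified for the sketch.
import enum
from typing import Any, Dict, List


class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'
    CANCELLED = 'CANCELLED'

    def is_terminal(self) -> bool:
        return self in (JobStatus.SUCCEEDED, JobStatus.FAILED,
                        JobStatus.CANCELLED)


class NotSupportedError(Exception):
    """Teardown would leak resources of in-progress jobs."""


def check_no_inprogress_jobs(spot_jobs: List[Dict[str, Any]]) -> None:
    # Keep only jobs whose status is not terminal, as the diff above does.
    non_terminal = [job for job in spot_jobs if not job['status'].is_terminal()]
    if non_terminal:
        names = ', '.join(job['job_name'] for job in non_terminal)
        raise NotSupportedError(
            f'In-progress spot jobs found ({names}). Cancel them first, '
            'e.g. with `sky spot cancel -a`, then retry the teardown.')


jobs = [
    {'job_name': 'train', 'status': JobStatus.RUNNING},
    {'job_name': 'eval', 'status': JobStatus.SUCCEEDED},
]
check_no_inprogress_jobs(jobs)  # raises NotSupportedError because of 'train'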
@@ -2318,14 +2333,16 @@ def _down_or_stop_clusters(
            # TODO(zhwu): We can only have one reserved cluster (spot
            # controller).
            assert len(reserved_clusters) == 1, reserved_clusters
-            _hint_for_down_spot_controller(reserved_clusters[0])
-
-            if not no_confirm:
-                click.confirm('Proceed?',
-                              default=False,
-                              abort=True,
-                              show_default=True)
-                no_confirm = True
+            _hint_or_raise_for_down_spot_controller(reserved_clusters[0])
+            confirm_str = 'delete'
+            user_input = click.prompt(
+                f'To proceed, please check the warning above and type '
+                f'{colorama.Style.BRIGHT}{confirm_str!r}'
+                f'{colorama.Style.RESET_ALL}',
+                type=str)
+            if user_input != confirm_str:
+                raise click.Abort()
+            no_confirm = True
        names += reserved_clusters

    if apply_to_all:
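The second hunk swaps the yes/no `click.confirm('Proceed?')` gate for a prompt that only proceeds when the user types the literal string 'delete'. A standalone sketch of that pattern using click's public API follows; the `down` command and `confirm_teardown` helper here are illustrative, not SkyPilot's actual CLI.

# Typed-confirmation pattern from the hunk above, shown with plain click.
# Function and command names are made up for the example.
import click


def confirm_teardown(confirm_str: str = 'delete') -> None:
    user_input = click.prompt(
        f'To proceed, please check the warning above and type {confirm_str!r}',
        type=str)
    if user_input != confirm_str:
        # click.Abort makes the CLI exit with "Aborted!" and a non-zero code,
        # mirroring the behavior of the aborted click.confirm it replaces.
        raise click.Abort()


@click.command()
def down():
    click.echo('WARNING: this would tear down the controller.')
    confirm_teardown()
    click.echo('Proceeding with teardown...')


if __name__ == '__main__':
    down()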
