From 60cd0d61df5f63b84b2b7af5310a9888bb81dde4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:21:42 -0700 Subject: [PATCH 01/11] Show controller in status and enable tear down. --- sky/cli.py | 35 ++++++++++++++++++++--------- sky/core.py | 8 ++----- sky/utils/cli_utils/status_utils.py | 19 +++++++++++++--- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 40870c37377..f715c639e90 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1268,11 +1268,19 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin '(down)', e.g. '1m (down)', the cluster will be autodowned, rather than autostopped. """ - cluster_records = core.status(all=all, refresh=refresh) + cluster_records = core.status(refresh=refresh) + filtered_cluster_records = [] + reserved_clusters = [] + for cluster_record in cluster_records: + if cluster_record['name'] in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + reserved_clusters.append(cluster_record) + else: + filtered_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) - status_utils.show_status_table(cluster_records, all) + status_utils.show_status_table(filtered_cluster_records, all) status_utils.show_local_status_table(local_clusters) + status_utils.show_status_table(reserved_clusters, all, is_reserved=True) @cli.command() @@ -1894,14 +1902,21 @@ def _down_or_stop_clusters( # Make sure the reserved clusters are explicitly specified without other # normal clusters and purge is True. if len(reserved_clusters) > 0: - if not purge: - msg = (f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} is not supported.') - if down: - msg += ( - '\nPlease specify --purge (-p) to force-terminate the ' - 'reserved cluster(s).') - raise click.UsageError(msg) + if not down: + raise click.UsageError( + f'{operation} reserved cluster(s) ' + f'{reserved_clusters_str} is not supported.') + else: + click.secho( + 'WARNING: Tearing down a SkyPilot reserved cluster. If ' + 'any related job had not finished, resources leakage ' + 'could happen.', + fg='yellow') + click.confirm('Do you want to continue?', + default=False, + abort=True, + show_default=True) + no_confirm = True if len(names) != 0: names_str = ', '.join(map(repr, names)) raise click.UsageError( diff --git a/sky/core.py b/sky/core.py index 15d20fbaf6d..315addea3a3 100644 --- a/sky/core.py +++ b/sky/core.py @@ -30,7 +30,7 @@ @usage_lib.entrypoint -def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: +def status(refresh: bool) -> List[Dict[str, Any]]: """Get the cluster status in dict. Please refer to the sky.cli.status for the document. @@ -52,7 +52,7 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: ] """ - cluster_records = backend_utils.get_clusters(all, refresh) + cluster_records = backend_utils.get_clusters(True, refresh) return cluster_records @@ -167,10 +167,6 @@ def down(cluster_name: str, purge: bool = False): ValueError: cluster does not exist. sky.exceptions.NotSupportedError: the cluster is not supported. 
""" - if (cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES and not purge): - raise exceptions.NotSupportedError( - f'Tearing down sky reserved cluster {cluster_name!r} ' - f'is not supported.') handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: raise ValueError(f'Cluster {cluster_name!r} does not exist.') diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 6931ee875ce..801380cbcdf 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -32,7 +32,9 @@ def calc(self, record): return val -def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): +def show_status_table(cluster_records: List[Dict[str, Any]], + show_all: bool, + is_reserved: bool = False): """Compute cluster table values and display.""" # TODO(zhwu): Update the information for autostop clusters. @@ -68,12 +70,23 @@ def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): pending_autostop += _is_pending_autostop(record) if cluster_records: + name = 'Clusters' + if is_reserved: + click.echo() + name = 'Reserved Clusters' + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{name}: ' + f'{colorama.Style.RESET_ALL}') click.echo(cluster_table) - if pending_autostop: + if is_reserved: + click.echo( + f'{colorama.Style.DIM}Reserved clusters will be autostopped ' + 'when inactive. Refresh statuses with: sky status --refresh' + f'{colorama.Style.RESET_ALL}') + elif pending_autostop: click.echo( '\n' f'You have {pending_autostop} clusters with auto{{stop,down}} ' - 'scheduled. Refresh statuses with: `sky status --refresh`.') + 'scheduled. Refresh statuses with: sky status --refresh') else: click.echo('No existing clusters.') From 102427715ea8f8007a34550b8e072961657f76a7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:21:53 -0700 Subject: [PATCH 02/11] Reduce the autostop for controller to save cost --- sky/spot/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/spot/constants.py b/sky/spot/constants.py index 9accab396dc..33d069327b1 100644 --- a/sky/spot/constants.py +++ b/sky/spot/constants.py @@ -1,6 +1,6 @@ """Constants used for Managed Spot.""" -SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 30 +SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 SPOT_CONTROLLER_TEMPLATE = 'spot-controller.yaml.j2' SPOT_CONTROLLER_YAML_PREFIX = '~/.sky/spot_controller' From f22eba19e15e208e033cd44918790b08c71997b0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:33:39 -0700 Subject: [PATCH 03/11] fix test --- tests/test_spot.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index 549411a4dcc..df0c9e50bab 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -97,11 +97,9 @@ def _mock_cluster_state(self, _mock_db_conn): def test_down_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() - result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME]) - assert result.exit_code == click.UsageError.exit_code - assert ( - f'Terminating reserved cluster(s) \'{spot.SPOT_CONTROLLER_NAME}\' ' - 'is not supported' in result.output) + result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') + assert 'WARNING: Tearing down a SkyPilot reserved cluster.' 
in result.output + assert isinstance(result.exception, SystemExit), result.exception result = cli_runner.invoke(cli.down, ['sky-spot-con*']) assert not result.exception From 799e75369c3f1fc8fb7767280c8c6cb88b20f3d2 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:35:27 -0700 Subject: [PATCH 04/11] format --- tests/test_spot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index df0c9e50bab..98ca744968e 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -97,7 +97,8 @@ def _mock_cluster_state(self, _mock_db_conn): def test_down_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() - result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') + result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], + input='n') assert 'WARNING: Tearing down a SkyPilot reserved cluster.' in result.output assert isinstance(result.exception, SystemExit), result.exception From 69a6ffb88aa6e59408d19655552306f907588602 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 00:03:52 -0700 Subject: [PATCH 05/11] address comments --- sky/backends/backend_utils.py | 4 +- sky/cli.py | 83 ++++++++++++++++++++++------- sky/core.py | 3 +- sky/utils/cli_utils/status_utils.py | 34 ++++++------ 4 files changed, 85 insertions(+), 39 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 2f07393214f..ecfc25825c0 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -101,7 +101,9 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 -SKY_RESERVED_CLUSTER_NAMES = [spot_lib.SPOT_CONTROLLER_NAME] +SKY_RESERVED_CLUSTER_NAMES = { + spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' +} # Filelocks for the cluster status change. CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') diff --git a/sky/cli.py b/sky/cli.py index e05e1e0d783..76caf58cf1d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -27,6 +27,7 @@ listed in "sky --help". Take care to put logically connected commands close to each other. """ +import collections import datetime import functools import getpass @@ -1280,18 +1281,32 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin autostopped. 
""" cluster_records = core.status(refresh=refresh) - filtered_cluster_records = [] - reserved_clusters = [] + nonreserved_cluster_records = [] + reserved_clusters = collections.defaultdict(list) for cluster_record in cluster_records: - if cluster_record['name'] in backend_utils.SKY_RESERVED_CLUSTER_NAMES: - reserved_clusters.append(cluster_record) + cluster_name = cluster_record['name'] + if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[ + cluster_name] + reserved_clusters[cluster_group_name].append(cluster_record) else: - filtered_cluster_records.append(cluster_record) + nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) - status_utils.show_status_table(filtered_cluster_records, all) + + num_pending_autostop = 0 + num_pending_autostop += status_utils.show_status_table( + nonreserved_cluster_records, all) + for cluster_group_name, cluster_records in reserved_clusters.items(): + num_pending_autostop += status_utils.show_status_table( + cluster_records, all, reserved_group_name=cluster_group_name) + if num_pending_autostop > 0: + click.echo( + '\n' + f'{colorama.Style.DIM}You have {num_pending_autostop} clusters ' + 'with auto{stop,down} scheduled. Refresh statuses with: ' + f'sky status --refresh{colorama.Style.RESET_ALL}') status_utils.show_local_status_table(local_clusters) - status_utils.show_status_table(reserved_clusters, all, is_reserved=True) @cli.command() @@ -1913,28 +1928,58 @@ def _down_or_stop_clusters( # Make sure the reserved clusters are explicitly specified without other # normal clusters and purge is True. if len(reserved_clusters) > 0: + if len(names) != 0: + names_str = ', '.join(map(repr, names)) + raise click.UsageError( + f'{operation} reserved cluster(s) ' + f'{reserved_clusters_str} with multiple other cluster(s) ' + f'{names_str} is not supported.\n' + f'Please omit the reserved cluster(s) {reserved_clusters}.') if not down: raise click.UsageError( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is not supported.') else: - click.secho( - 'WARNING: Tearing down a SkyPilot reserved cluster. If ' - 'any related job had not finished, resources leakage ' - 'could happen.', - fg='yellow') + # TODO(zhwu): We can only have one reserved cluster (spot + # controller). + assert len(reserved_clusters) == 1, reserved_clusters + # spot_jobs will be empty when the spot cluster is not running. + cluster_name = reserved_clusters[0] + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + cluster_name) + if cluster_status is None: + click.echo( + 'Managed spot controller has already been torn down.') + return + msg = ( + f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' + f'spot controller ({status.value}). 
Please be aware of the ' + 'following:' + '\n\t- All logs and status information of the spot jobs ' + 'will be lost.') + if cluster_status == global_user_state.ClusterStatus.INIT: + msg += ( + '\n\t- Resource leakage may happen caused by spot jobs ' + 'being submitted, and unfinished spot jobs.') + elif cluster_status == global_user_state.ClusterStatus.UP: + spot_jobs = core.spot_status(refresh=False) + non_terminal_jobs = [ + job for job in spot_jobs + if not job['status'].is_terminal() + ] + if non_terminal_jobs: + msg += ( + '\n\t- Resource leakage may happen caused by the ' + 'following unfinished spot jobs:') + spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + click.echo(msg) + click.confirm('Do you want to continue?', default=False, abort=True, show_default=True) no_confirm = True - if len(names) != 0: - names_str = ', '.join(map(repr, names)) - raise click.UsageError( - f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} with multiple other cluster(s) ' - f'{names_str} is not supported.\n' - f'Please omit the reserved cluster(s) {reserved_clusters}.') names += reserved_clusters if apply_to_all: diff --git a/sky/core.py b/sky/core.py index 315addea3a3..414c3e7db41 100644 --- a/sky/core.py +++ b/sky/core.py @@ -52,7 +52,8 @@ def status(refresh: bool) -> List[Dict[str, Any]]: ] """ - cluster_records = backend_utils.get_clusters(True, refresh) + cluster_records = backend_utils.get_clusters(include_reserved=True, + refresh=refresh) return cluster_records diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 801380cbcdf..79f3fbe0031 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,5 +1,5 @@ """Utilities for sky status.""" -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional import click import colorama @@ -34,8 +34,12 @@ def calc(self, record): def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool, - is_reserved: bool = False): - """Compute cluster table values and display.""" + reserved_group_name: Optional[str] = None) -> int: + """Compute cluster table values and display. + + Returns: + Number of pending autostop clusters. + """ # TODO(zhwu): Update the information for autostop clusters. status_columns = [ @@ -70,25 +74,19 @@ def show_status_table(cluster_records: List[Dict[str, Any]], pending_autostop += _is_pending_autostop(record) if cluster_records: - name = 'Clusters' - if is_reserved: - click.echo() - name = 'Reserved Clusters' - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{name}: ' - f'{colorama.Style.RESET_ALL}') - click.echo(cluster_table) - if is_reserved: + if reserved_group_name is not None: click.echo( - f'{colorama.Style.DIM}Reserved clusters will be autostopped ' - 'when inactive. Refresh statuses with: sky status --refresh' + f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'{reserved_group_name}: {colorama.Style.RESET_ALL}' + f'{colorama.Style.DIM}(will be autostopped when inactive)' f'{colorama.Style.RESET_ALL}') - elif pending_autostop: - click.echo( - '\n' - f'You have {pending_autostop} clusters with auto{{stop,down}} ' - 'scheduled. 
Refresh statuses with: sky status --refresh') + else: + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters: ' + f'{colorama.Style.RESET_ALL}') + click.echo(cluster_table) else: click.echo('No existing clusters.') + return pending_autostop def show_local_status_table(local_clusters: List[str]): From 166ca7ea6a64ba943198da8606180eef123e1510 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 00:30:45 -0700 Subject: [PATCH 06/11] fix down output --- sky/cli.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 76caf58cf1d..e4a39ac37c3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1951,16 +1951,20 @@ def _down_or_stop_clusters( click.echo( 'Managed spot controller has already been torn down.') return + + cnt = 1 msg = ( f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - f'spot controller ({status.value}). Please be aware of the ' - 'following:' - '\n\t- All logs and status information of the spot jobs ' - 'will be lost.') + f'spot controller ({cluster_status.value}). Please be ' + f'aware of the following:{colorama.Style.RESET_ALL}' + f'\n {cnt}. All logs and status information of the spot ' + 'jobs will be lost.') + cnt += 1 if cluster_status == global_user_state.ClusterStatus.INIT: msg += ( - '\n\t- Resource leakage may happen caused by spot jobs ' - 'being submitted, and unfinished spot jobs.') + f'\n {cnt}. Resource leakage may happen caused by ' + 'spot jobs being submitted, and in-progress spot jobs.') + cnt += 1 elif cluster_status == global_user_state.ClusterStatus.UP: spot_jobs = core.spot_status(refresh=False) non_terminal_jobs = [ @@ -1969,10 +1973,15 @@ def _down_or_stop_clusters( ] if non_terminal_jobs: msg += ( - '\n\t- Resource leakage may happen caused by the ' - 'following unfinished spot jobs:') - spot_lib.format_job_table(non_terminal_jobs, - show_all=False) + f'\n {cnt}. Resource leakage may happen caused by ' + 'the following in-progress spot jobs:\n') + job_table = spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + msg += '\n'.join([ + ' ' + line + for line in job_table.split('\n') + if line != '' + ]) click.echo(msg) click.confirm('Do you want to continue?', From 64fc667b383c055c48c631a6384c9a01371482da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 01:12:10 -0700 Subject: [PATCH 07/11] fix test --- tests/test_spot.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index 98ca744968e..f80f47ea866 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -94,13 +94,29 @@ def _mock_cluster_state(self, _mock_db_conn): ready=True) @pytest.mark.timeout(60) - def test_down_spot_controller(self, _mock_cluster_state): - cli_runner = cli_testing.CliRunner() + def test_down_spot_controller(self, _mock_cluster_state, monkeypatch): + + def mock_cluster_refresh_up( + cluster_name: str, + *, + force_refresh: bool = False, + acquire_per_cluster_status_lock: bool = True, + ): + record = global_user_state.get_cluster_from_name(cluster_name) + return record['status'], record['handle'] + + monkeypatch.setattr( + 'sky.backends.backend_utils.refresh_cluster_status_handle', + mock_cluster_refresh_up) + monkeypatch.setattr('sky.core.spot_status', lambda refresh: []) + + cli_runner = cli_testing.CliRunner() result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') - assert 'WARNING: Tearing down a SkyPilot reserved cluster.' 
in result.output - assert isinstance(result.exception, SystemExit), result.exception + assert 'WARNING: Tearing down the managed spot controller (UP).' in result.output + assert isinstance(result.exception, + SystemExit), (result.exception, result.output) result = cli_runner.invoke(cli.down, ['sky-spot-con*']) assert not result.exception From 39090a450e2d096b3bff4e390f1de41103815a8b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 11:42:18 -0700 Subject: [PATCH 08/11] address comments --- sky/backends/backend_utils.py | 12 ++-- sky/cli.py | 101 +++++++++++++++------------- sky/utils/cli_utils/status_utils.py | 4 +- 3 files changed, 61 insertions(+), 56 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ecfc25825c0..fe7a613d507 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -101,6 +101,8 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 +# Mapping from reserved cluster names to the corresponding group name (logging purpose). +# NOTE: each group can only have one reserved cluster name for now. SKY_RESERVED_CLUSTER_NAMES = { spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' } @@ -1929,13 +1931,11 @@ def check_cluster_name_not_reserved( If the cluster name is reserved, return the error message. Otherwise, return None. """ - usage = 'internal use' - if cluster_name == spot_lib.SPOT_CONTROLLER_NAME: - usage = 'spot controller' - msg = f'Cluster {cluster_name!r} is reserved for {usage}.' - if operation_str is not None: - msg += f' {operation_str} is not allowed.' if cluster_name in SKY_RESERVED_CLUSTER_NAMES: + msg = (f'Cluster {cluster_name!r} is reserved for ' + f'{SKY_RESERVED_CLUSTER_NAMES[cluster_name].lower()}.') + if operation_str is not None: + msg += f' {operation_str} is not allowed.' with ux_utils.print_exception_no_traceback(): raise ValueError(msg) diff --git a/sky/cli.py b/sky/cli.py index e4a39ac37c3..7061948ca3a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -27,7 +27,6 @@ listed in "sky --help". Take care to put logically connected commands close to each other. 
""" -import collections import datetime import functools import getpass @@ -1282,13 +1281,13 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin """ cluster_records = core.status(refresh=refresh) nonreserved_cluster_records = [] - reserved_clusters = collections.defaultdict(list) + reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[ cluster_name] - reserved_clusters[cluster_group_name].append(cluster_record) + reserved_clusters[cluster_group_name] = cluster_record else: nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( @@ -1297,9 +1296,9 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( nonreserved_cluster_records, all) - for cluster_group_name, cluster_records in reserved_clusters.items(): + for cluster_group_name, cluster_record in reserved_clusters.items(): num_pending_autostop += status_utils.show_status_table( - cluster_records, all, reserved_group_name=cluster_group_name) + [cluster_record], all, reserved_group_name=cluster_group_name) if num_pending_autostop > 0: click.echo( '\n' @@ -1876,6 +1875,51 @@ def down( purge=purge) +def _hints_for_down_spot_controller(controller_name: str): + # spot_jobs will be empty when the spot cluster is not running. + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + controller_name) + if cluster_status is None: + click.echo('Managed spot controller has already been torn down.') + return + + msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' + f'spot controller ({cluster_status.value}). Please be ' + f'aware of the following:{colorama.Style.RESET_ALL}' + '\n * All logs and status information of the spot ' + 'jobs will be lost.') + if cluster_status == global_user_state.ClusterStatus.UP: + try: + spot_jobs = core.spot_status(refresh=False) + except exceptions.ClusterNotUpError: + # The spot controller cluster status changed during querying + # the spot jobs, use the latest cluster status, so that the + # message for INIT and STOPPED states will be correctly + # added to the message. + cluster_status = backend_utils.refresh_cluster_status_handle( + controller_name) + spot_jobs = [] + + # Find in-progress spot jobs, and hint users to cancel them. + non_terminal_jobs = [ + job for job in spot_jobs if not job['status'].is_terminal() + ] + if (cluster_status == global_user_state.ClusterStatus.UP and + non_terminal_jobs): + msg += ('\n * In-progress spot jobs found, their resources ' + 'will not be terminated and require manual cleanup' + '(sky spot cancel --all):\n') + job_table = spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + # Add prefix to each line to align with the bullet point. + msg += '\n'.join( + [' ' + line for line in job_table.split('\n') if line != '']) + if cluster_status == global_user_state.ClusterStatus.INIT: + msg += ('\n * If there are pending/in-progress spot jobs, those ' + 'resources will not be terminated and require manual cleanup.') + click.echo(msg) + + def _down_or_stop_clusters( names: Tuple[str], apply_to_all: Optional[bool], @@ -1926,7 +1970,7 @@ def _down_or_stop_clusters( '`sky stop/autostop`.')) ] # Make sure the reserved clusters are explicitly specified without other - # normal clusters and purge is True. 
+ # normal clusters. if len(reserved_clusters) > 0: if len(names) != 0: names_str = ', '.join(map(repr, names)) @@ -1943,48 +1987,9 @@ def _down_or_stop_clusters( # TODO(zhwu): We can only have one reserved cluster (spot # controller). assert len(reserved_clusters) == 1, reserved_clusters - # spot_jobs will be empty when the spot cluster is not running. - cluster_name = reserved_clusters[0] - cluster_status, _ = backend_utils.refresh_cluster_status_handle( - cluster_name) - if cluster_status is None: - click.echo( - 'Managed spot controller has already been torn down.') - return - - cnt = 1 - msg = ( - f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - f'spot controller ({cluster_status.value}). Please be ' - f'aware of the following:{colorama.Style.RESET_ALL}' - f'\n {cnt}. All logs and status information of the spot ' - 'jobs will be lost.') - cnt += 1 - if cluster_status == global_user_state.ClusterStatus.INIT: - msg += ( - f'\n {cnt}. Resource leakage may happen caused by ' - 'spot jobs being submitted, and in-progress spot jobs.') - cnt += 1 - elif cluster_status == global_user_state.ClusterStatus.UP: - spot_jobs = core.spot_status(refresh=False) - non_terminal_jobs = [ - job for job in spot_jobs - if not job['status'].is_terminal() - ] - if non_terminal_jobs: - msg += ( - f'\n {cnt}. Resource leakage may happen caused by ' - 'the following in-progress spot jobs:\n') - job_table = spot_lib.format_job_table(non_terminal_jobs, - show_all=False) - msg += '\n'.join([ - ' ' + line - for line in job_table.split('\n') - if line != '' - ]) - click.echo(msg) - - click.confirm('Do you want to continue?', + _hints_for_down_spot_controller(reserved_clusters[0]) + + click.confirm('Proceed?', default=False, abort=True, show_default=True) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 79f3fbe0031..d2c268dadbc 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -38,7 +38,7 @@ def show_status_table(cluster_records: List[Dict[str, Any]], """Compute cluster table values and display. Returns: - Number of pending autostop clusters. + Number of pending auto{stop,down} clusters. """ # TODO(zhwu): Update the information for autostop clusters. @@ -78,7 +78,7 @@ def show_status_table(cluster_records: List[Dict[str, Any]], click.echo( f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{reserved_group_name}: {colorama.Style.RESET_ALL}' - f'{colorama.Style.DIM}(will be autostopped when inactive)' + f'{colorama.Style.DIM}(will be autostopped if idle for 30min)' f'{colorama.Style.RESET_ALL}') else: click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters: ' From d95107dbd25e65b8dc1ed216d5e658c8d6d6b173 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 11:48:34 -0700 Subject: [PATCH 09/11] Fix the docs for spot job --- docs/source/examples/spot-jobs.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst index 17656f924e0..04c0b270ab0 100644 --- a/docs/source/examples/spot-jobs.rst +++ b/docs/source/examples/spot-jobs.rst @@ -206,13 +206,13 @@ Spot controller (Advanced) ------------------------------- There will be a single spot controller VM (a small on-demand CPU VM) running in the background to manage all the spot jobs. -It will be autostopped after all spot jobs finished and no new spot job is submitted for 30 minutes. Typically **no user intervention** is needed. 
-You can find the controller with :code:`sky status -a`, and refresh the status with :code:`sky status -ar`. +It will be autostopped after all spot jobs finished and no new spot job is submitted for 10 minutes. Typically **no user intervention** is needed. +You can find the controller with :code:`sky status`, and refresh the status with :code:`sky status -r`. Although, the cost of the spot controller is negligible (~$0.4/hour when running and less than $0.004/hour when stopped), you can still tear it down manually with -:code:`sky down -p sky-spot-controller-`, where the ```` can be found in the output of :code:`sky status -a`. +:code:`sky down `, where the ```` can be found in the output of :code:`sky status`. .. note:: - Tearing down the spot controller when there are still spot jobs running will cause resource leakage of those spot VMs. + Tearing down the spot controller will lose all logs and status information for the spot jobs and can cause resource leakage when there are still in-progress spot jobs. From 92bda4c464e4b73600648beda4fdde625bc27563 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 12:08:11 -0700 Subject: [PATCH 10/11] fix space --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 7061948ca3a..50d7c2707c1 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1907,7 +1907,7 @@ def _hints_for_down_spot_controller(controller_name: str): if (cluster_status == global_user_state.ClusterStatus.UP and non_terminal_jobs): msg += ('\n * In-progress spot jobs found, their resources ' - 'will not be terminated and require manual cleanup' + 'will not be terminated and require manual cleanup ' '(sky spot cancel --all):\n') job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) From 20903c7ad818ec14c4e3a713ba7de6a62e255792 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 23:32:31 -0700 Subject: [PATCH 11/11] address comments --- sky/cli.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 50d7c2707c1..313f7e61d05 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1875,7 +1875,7 @@ def down( purge=purge) -def _hints_for_down_spot_controller(controller_name: str): +def _hint_for_down_spot_controller(controller_name: str): # spot_jobs will be empty when the spot cluster is not running. cluster_status, _ = backend_utils.refresh_cluster_status_handle( controller_name) @@ -1887,7 +1887,7 @@ def _hints_for_down_spot_controller(controller_name: str): f'spot controller ({cluster_status.value}). Please be ' f'aware of the following:{colorama.Style.RESET_ALL}' '\n * All logs and status information of the spot ' - 'jobs will be lost.') + 'jobs (output of sky spot status) will be lost.') if cluster_status == global_user_state.ClusterStatus.UP: try: spot_jobs = core.spot_status(refresh=False) @@ -1907,8 +1907,7 @@ def _hints_for_down_spot_controller(controller_name: str): if (cluster_status == global_user_state.ClusterStatus.UP and non_terminal_jobs): msg += ('\n * In-progress spot jobs found, their resources ' - 'will not be terminated and require manual cleanup ' - '(sky spot cancel --all):\n') + 'will not be terminated and require manual cleanup:\n') job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) # Add prefix to each line to align with the bullet point. @@ -1987,7 +1986,7 @@ def _down_or_stop_clusters( # TODO(zhwu): We can only have one reserved cluster (spot # controller). 
assert len(reserved_clusters) == 1, reserved_clusters - _hints_for_down_spot_controller(reserved_clusters[0]) + _hint_for_down_spot_controller(reserved_clusters[0]) click.confirm('Proceed?', default=False,
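The sketches below are simplified, self-contained stand-ins for the main behaviors these patches introduce; they are not code from the SkyPilot tree, and the names in them (cluster names, record fields, helper functions) are placeholders.

PATCH 01/05 teach sky status to pull reserved clusters (currently only the managed spot controller) out of the main table, print each one under the group heading taken from SKY_RESERVED_CLUSTER_NAMES, and have show_status_table return the pending auto{stop,down} count so the CLI prints a single hint at the end. A minimal sketch of that grouping:

    from typing import Any, Dict, List, Tuple

    # Stand-in for backend_utils.SKY_RESERVED_CLUSTER_NAMES; the real mapping
    # currently has a single entry, the managed spot controller.
    RESERVED_CLUSTER_NAMES = {
        'sky-spot-controller-1234': 'Managed spot controller'
    }


    def split_records(
        records: List[Dict[str, Any]],
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]]]:
        """Separate normal clusters from reserved ones, keyed by group name."""
        normal: List[Dict[str, Any]] = []
        reserved: Dict[str, Dict[str, Any]] = {}
        for record in records:
            group = RESERVED_CLUSTER_NAMES.get(record['name'])
            if group is None:
                normal.append(record)
            else:
                # Each reserved group holds at most one cluster.
                reserved[group] = record
        return normal, reserved


    def show_status(records: List[Dict[str, Any]]) -> None:
        normal, reserved = split_records(records)
        print('Clusters:')
        for record in normal:
            print(f"  {record['name']}\t{record['status']}")
        for group, record in reserved.items():
            print(f'\n{group}: (will be autostopped when idle)')
            print(f"  {record['name']}\t{record['status']}")
        pending = sum(1 for r in records if r.get('autostop', -1) >= 0)
        if pending:
            print(f'\nYou have {pending} clusters with auto{{stop,down}} '
                  'scheduled. Refresh statuses with: sky status --refresh')


    if __name__ == '__main__':
        show_status([
            {'name': 'dev', 'status': 'UP', 'autostop': 10},
            {'name': 'sky-spot-controller-1234', 'status': 'UP', 'autostop': 10},
        ])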
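PATCH 01/05/08 also rework _down_or_stop_clusters: stopping a reserved cluster is rejected outright, naming it together with other clusters is rejected, and sky down on the spot controller prints a warning and asks for explicit confirmation before proceeding (so the later per-cluster prompt is skipped). A sketch of that control flow, with guard_reserved and RESERVED as made-up names:

    from typing import List

    import click

    # Placeholder reserved-cluster registry (see the previous sketch).
    RESERVED = {'sky-spot-controller-1234': 'Managed spot controller'}


    def guard_reserved(names: List[str], down: bool, no_confirm: bool) -> bool:
        """Validate reserved-cluster usage; return the updated no_confirm flag."""
        reserved = [name for name in names if name in RESERVED]
        others = [name for name in names if name not in RESERVED]
        if not reserved:
            return no_confirm
        if others:
            raise click.UsageError(
                f'Tearing down reserved cluster(s) {reserved} together with '
                f'other cluster(s) {others} is not supported.')
        if not down:
            raise click.UsageError(
                f'Stopping reserved cluster(s) {reserved} is not supported.')
        # Warn that job logs/status will be lost and that in-progress spot
        # jobs may leak resources, then ask once up front.
        click.echo('WARNING: Tearing down the managed spot controller. All '
                   'spot job logs and status information will be lost.')
        click.confirm('Proceed?', default=False, abort=True, show_default=True)
        return True


    if __name__ == '__main__':
        try:
            guard_reserved(['sky-spot-controller-1234'], down=False,
                           no_confirm=False)
        except click.UsageError as e:
            print(f'Rejected as expected: {e.message}')

Declining the prompt raises click.Abort, which Click's standalone mode converts into exit code 1; that exit path is what the updated test relies on.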
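The warning assembled in _hint_for_down_spot_controller lists only spot jobs that are still in a non-terminal state and indents the rendered job table so it aligns under the bullet (PATCH 06/08). A rough sketch, where format_job_table and the status strings are invented stand-ins for sky.spot.format_job_table and the real job states:

    from typing import Any, Dict, List


    def format_job_table(jobs: List[Dict[str, Any]]) -> str:
        """Toy renderer standing in for sky.spot.format_job_table."""
        lines = ['ID  NAME        STATUS']
        lines += [
            f"{job['id']:<3} {job['name']:<11} {job['status']}" for job in jobs
        ]
        return '\n'.join(lines)


    def in_progress_hint(jobs: List[Dict[str, Any]]) -> str:
        # Keep only jobs that have not reached a terminal state.
        terminal = {'SUCCEEDED', 'FAILED', 'CANCELLED'}
        active = [job for job in jobs if job['status'] not in terminal]
        if not active:
            return ''
        table = format_job_table(active)
        # Indent every table line so it lines up under the warning bullet.
        body = '\n'.join('   ' + line for line in table.split('\n') if line)
        return ('\n * In-progress spot jobs found, their resources will not '
                'be terminated and require manual cleanup:\n' + body)


    if __name__ == '__main__':
        print(in_progress_hint([
            {'id': 1, 'name': 'train', 'status': 'RUNNING'},
            {'id': 2, 'name': 'eval', 'status': 'SUCCEEDED'},
        ]))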
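The updated test_down_spot_controller drives the confirmation prompt through Click's CliRunner with input='n' and asserts both the warning text and that the aborted run surfaces as SystemExit. The same pattern against a toy command (the down command below is a stand-in, not sky.cli.down):

    import click
    from click.testing import CliRunner


    @click.command()
    @click.argument('name')
    def down(name):
        """Toy command that mimics the confirmation flow under test."""
        click.echo('WARNING: Tearing down the managed spot controller (UP).')
        click.confirm('Proceed?', default=False, abort=True, show_default=True)
        click.echo(f'Terminating {name}...')


    def test_down_prompt_declined():
        runner = CliRunner()
        # Feed 'n' to the prompt; abort=True makes Click exit with code 1.
        result = runner.invoke(down, ['sky-spot-controller-1234'], input='n')
        assert 'WARNING: Tearing down the managed spot controller' in result.output
        assert isinstance(result.exception, SystemExit), result.exception


    if __name__ == '__main__':
        test_down_prompt_declined()
        print('ok')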