From 60cd0d61df5f63b84b2b7af5310a9888bb81dde4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:21:42 -0700 Subject: [PATCH 01/11] Show controller in status and enable tear down. --- sky/cli.py | 35 ++++++++++++++++++++--------- sky/core.py | 8 ++----- sky/utils/cli_utils/status_utils.py | 19 +++++++++++++--- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 40870c37377..f715c639e90 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1268,11 +1268,19 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin '(down)', e.g. '1m (down)', the cluster will be autodowned, rather than autostopped. """ - cluster_records = core.status(all=all, refresh=refresh) + cluster_records = core.status(refresh=refresh) + filtered_cluster_records = [] + reserved_clusters = [] + for cluster_record in cluster_records: + if cluster_record['name'] in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + reserved_clusters.append(cluster_record) + else: + filtered_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) - status_utils.show_status_table(cluster_records, all) + status_utils.show_status_table(filtered_cluster_records, all) status_utils.show_local_status_table(local_clusters) + status_utils.show_status_table(reserved_clusters, all, is_reserved=True) @cli.command() @@ -1894,14 +1902,21 @@ def _down_or_stop_clusters( # Make sure the reserved clusters are explicitly specified without other # normal clusters and purge is True. if len(reserved_clusters) > 0: - if not purge: - msg = (f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} is not supported.') - if down: - msg += ( - '\nPlease specify --purge (-p) to force-terminate the ' - 'reserved cluster(s).') - raise click.UsageError(msg) + if not down: + raise click.UsageError( + f'{operation} reserved cluster(s) ' + f'{reserved_clusters_str} is not supported.') + else: + click.secho( + 'WARNING: Tearing down a SkyPilot reserved cluster. If ' + 'any related job had not finished, resources leakage ' + 'could happen.', + fg='yellow') + click.confirm('Do you want to continue?', + default=False, + abort=True, + show_default=True) + no_confirm = True if len(names) != 0: names_str = ', '.join(map(repr, names)) raise click.UsageError( diff --git a/sky/core.py b/sky/core.py index 15d20fbaf6d..315addea3a3 100644 --- a/sky/core.py +++ b/sky/core.py @@ -30,7 +30,7 @@ @usage_lib.entrypoint -def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: +def status(refresh: bool) -> List[Dict[str, Any]]: """Get the cluster status in dict. Please refer to the sky.cli.status for the document. @@ -52,7 +52,7 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: ] """ - cluster_records = backend_utils.get_clusters(all, refresh) + cluster_records = backend_utils.get_clusters(True, refresh) return cluster_records @@ -167,10 +167,6 @@ def down(cluster_name: str, purge: bool = False): ValueError: cluster does not exist. sky.exceptions.NotSupportedError: the cluster is not supported. 
""" - if (cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES and not purge): - raise exceptions.NotSupportedError( - f'Tearing down sky reserved cluster {cluster_name!r} ' - f'is not supported.') handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: raise ValueError(f'Cluster {cluster_name!r} does not exist.') diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 6931ee875ce..801380cbcdf 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -32,7 +32,9 @@ def calc(self, record): return val -def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): +def show_status_table(cluster_records: List[Dict[str, Any]], + show_all: bool, + is_reserved: bool = False): """Compute cluster table values and display.""" # TODO(zhwu): Update the information for autostop clusters. @@ -68,12 +70,23 @@ def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): pending_autostop += _is_pending_autostop(record) if cluster_records: + name = 'Clusters' + if is_reserved: + click.echo() + name = 'Reserved Clusters' + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{name}: ' + f'{colorama.Style.RESET_ALL}') click.echo(cluster_table) - if pending_autostop: + if is_reserved: + click.echo( + f'{colorama.Style.DIM}Reserved clusters will be autostopped ' + 'when inactive. Refresh statuses with: sky status --refresh' + f'{colorama.Style.RESET_ALL}') + elif pending_autostop: click.echo( '\n' f'You have {pending_autostop} clusters with auto{{stop,down}} ' - 'scheduled. Refresh statuses with: `sky status --refresh`.') + 'scheduled. Refresh statuses with: sky status --refresh') else: click.echo('No existing clusters.') From 102427715ea8f8007a34550b8e072961657f76a7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:21:53 -0700 Subject: [PATCH 02/11] Reduce the autostop for controller to save cost --- sky/spot/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/spot/constants.py b/sky/spot/constants.py index 9accab396dc..33d069327b1 100644 --- a/sky/spot/constants.py +++ b/sky/spot/constants.py @@ -1,6 +1,6 @@ """Constants used for Managed Spot.""" -SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 30 +SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 SPOT_CONTROLLER_TEMPLATE = 'spot-controller.yaml.j2' SPOT_CONTROLLER_YAML_PREFIX = '~/.sky/spot_controller' From f22eba19e15e208e033cd44918790b08c71997b0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:33:39 -0700 Subject: [PATCH 03/11] fix test --- tests/test_spot.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index 549411a4dcc..df0c9e50bab 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -97,11 +97,9 @@ def _mock_cluster_state(self, _mock_db_conn): def test_down_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() - result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME]) - assert result.exit_code == click.UsageError.exit_code - assert ( - f'Terminating reserved cluster(s) \'{spot.SPOT_CONTROLLER_NAME}\' ' - 'is not supported' in result.output) + result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') + assert 'WARNING: Tearing down a SkyPilot reserved cluster.' 
in result.output + assert isinstance(result.exception, SystemExit), result.exception result = cli_runner.invoke(cli.down, ['sky-spot-con*']) assert not result.exception From 799e75369c3f1fc8fb7767280c8c6cb88b20f3d2 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 18 Oct 2022 00:35:27 -0700 Subject: [PATCH 04/11] format --- tests/test_spot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index df0c9e50bab..98ca744968e 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -97,7 +97,8 @@ def _mock_cluster_state(self, _mock_db_conn): def test_down_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() - result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') + result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], + input='n') assert 'WARNING: Tearing down a SkyPilot reserved cluster.' in result.output assert isinstance(result.exception, SystemExit), result.exception From 69a6ffb88aa6e59408d19655552306f907588602 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 00:03:52 -0700 Subject: [PATCH 05/11] address comments --- sky/backends/backend_utils.py | 4 +- sky/cli.py | 83 ++++++++++++++++++++++------- sky/core.py | 3 +- sky/utils/cli_utils/status_utils.py | 34 ++++++------ 4 files changed, 85 insertions(+), 39 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 2f07393214f..ecfc25825c0 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -101,7 +101,9 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 -SKY_RESERVED_CLUSTER_NAMES = [spot_lib.SPOT_CONTROLLER_NAME] +SKY_RESERVED_CLUSTER_NAMES = { + spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' +} # Filelocks for the cluster status change. CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') diff --git a/sky/cli.py b/sky/cli.py index e05e1e0d783..76caf58cf1d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -27,6 +27,7 @@ listed in "sky --help". Take care to put logically connected commands close to each other. """ +import collections import datetime import functools import getpass @@ -1280,18 +1281,32 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin autostopped. 
""" cluster_records = core.status(refresh=refresh) - filtered_cluster_records = [] - reserved_clusters = [] + nonreserved_cluster_records = [] + reserved_clusters = collections.defaultdict(list) for cluster_record in cluster_records: - if cluster_record['name'] in backend_utils.SKY_RESERVED_CLUSTER_NAMES: - reserved_clusters.append(cluster_record) + cluster_name = cluster_record['name'] + if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[ + cluster_name] + reserved_clusters[cluster_group_name].append(cluster_record) else: - filtered_cluster_records.append(cluster_record) + nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) - status_utils.show_status_table(filtered_cluster_records, all) + + num_pending_autostop = 0 + num_pending_autostop += status_utils.show_status_table( + nonreserved_cluster_records, all) + for cluster_group_name, cluster_records in reserved_clusters.items(): + num_pending_autostop += status_utils.show_status_table( + cluster_records, all, reserved_group_name=cluster_group_name) + if num_pending_autostop > 0: + click.echo( + '\n' + f'{colorama.Style.DIM}You have {num_pending_autostop} clusters ' + 'with auto{stop,down} scheduled. Refresh statuses with: ' + f'sky status --refresh{colorama.Style.RESET_ALL}') status_utils.show_local_status_table(local_clusters) - status_utils.show_status_table(reserved_clusters, all, is_reserved=True) @cli.command() @@ -1913,28 +1928,58 @@ def _down_or_stop_clusters( # Make sure the reserved clusters are explicitly specified without other # normal clusters and purge is True. if len(reserved_clusters) > 0: + if len(names) != 0: + names_str = ', '.join(map(repr, names)) + raise click.UsageError( + f'{operation} reserved cluster(s) ' + f'{reserved_clusters_str} with multiple other cluster(s) ' + f'{names_str} is not supported.\n' + f'Please omit the reserved cluster(s) {reserved_clusters}.') if not down: raise click.UsageError( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is not supported.') else: - click.secho( - 'WARNING: Tearing down a SkyPilot reserved cluster. If ' - 'any related job had not finished, resources leakage ' - 'could happen.', - fg='yellow') + # TODO(zhwu): We can only have one reserved cluster (spot + # controller). + assert len(reserved_clusters) == 1, reserved_clusters + # spot_jobs will be empty when the spot cluster is not running. + cluster_name = reserved_clusters[0] + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + cluster_name) + if cluster_status is None: + click.echo( + 'Managed spot controller has already been torn down.') + return + msg = ( + f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' + f'spot controller ({status.value}). 
Please be aware of the ' + 'following:' + '\n\t- All logs and status information of the spot jobs ' + 'will be lost.') + if cluster_status == global_user_state.ClusterStatus.INIT: + msg += ( + '\n\t- Resource leakage may happen caused by spot jobs ' + 'being submitted, and unfinished spot jobs.') + elif cluster_status == global_user_state.ClusterStatus.UP: + spot_jobs = core.spot_status(refresh=False) + non_terminal_jobs = [ + job for job in spot_jobs + if not job['status'].is_terminal() + ] + if non_terminal_jobs: + msg += ( + '\n\t- Resource leakage may happen caused by the ' + 'following unfinished spot jobs:') + spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + click.echo(msg) + click.confirm('Do you want to continue?', default=False, abort=True, show_default=True) no_confirm = True - if len(names) != 0: - names_str = ', '.join(map(repr, names)) - raise click.UsageError( - f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} with multiple other cluster(s) ' - f'{names_str} is not supported.\n' - f'Please omit the reserved cluster(s) {reserved_clusters}.') names += reserved_clusters if apply_to_all: diff --git a/sky/core.py b/sky/core.py index 315addea3a3..414c3e7db41 100644 --- a/sky/core.py +++ b/sky/core.py @@ -52,7 +52,8 @@ def status(refresh: bool) -> List[Dict[str, Any]]: ] """ - cluster_records = backend_utils.get_clusters(True, refresh) + cluster_records = backend_utils.get_clusters(include_reserved=True, + refresh=refresh) return cluster_records diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 801380cbcdf..79f3fbe0031 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,5 +1,5 @@ """Utilities for sky status.""" -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional import click import colorama @@ -34,8 +34,12 @@ def calc(self, record): def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool, - is_reserved: bool = False): - """Compute cluster table values and display.""" + reserved_group_name: Optional[str] = None) -> int: + """Compute cluster table values and display. + + Returns: + Number of pending autostop clusters. + """ # TODO(zhwu): Update the information for autostop clusters. status_columns = [ @@ -70,25 +74,19 @@ def show_status_table(cluster_records: List[Dict[str, Any]], pending_autostop += _is_pending_autostop(record) if cluster_records: - name = 'Clusters' - if is_reserved: - click.echo() - name = 'Reserved Clusters' - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{name}: ' - f'{colorama.Style.RESET_ALL}') - click.echo(cluster_table) - if is_reserved: + if reserved_group_name is not None: click.echo( - f'{colorama.Style.DIM}Reserved clusters will be autostopped ' - 'when inactive. Refresh statuses with: sky status --refresh' + f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'{reserved_group_name}: {colorama.Style.RESET_ALL}' + f'{colorama.Style.DIM}(will be autostopped when inactive)' f'{colorama.Style.RESET_ALL}') - elif pending_autostop: - click.echo( - '\n' - f'You have {pending_autostop} clusters with auto{{stop,down}} ' - 'scheduled. 
Refresh statuses with: sky status --refresh') + else: + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters: ' + f'{colorama.Style.RESET_ALL}') + click.echo(cluster_table) else: click.echo('No existing clusters.') + return pending_autostop def show_local_status_table(local_clusters: List[str]): From 166ca7ea6a64ba943198da8606180eef123e1510 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 00:30:45 -0700 Subject: [PATCH 06/11] fix down output --- sky/cli.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 76caf58cf1d..e4a39ac37c3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1951,16 +1951,20 @@ def _down_or_stop_clusters( click.echo( 'Managed spot controller has already been torn down.') return + + cnt = 1 msg = ( f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - f'spot controller ({status.value}). Please be aware of the ' - 'following:' - '\n\t- All logs and status information of the spot jobs ' - 'will be lost.') + f'spot controller ({cluster_status.value}). Please be ' + f'aware of the following:{colorama.Style.RESET_ALL}' + f'\n {cnt}. All logs and status information of the spot ' + 'jobs will be lost.') + cnt += 1 if cluster_status == global_user_state.ClusterStatus.INIT: msg += ( - '\n\t- Resource leakage may happen caused by spot jobs ' - 'being submitted, and unfinished spot jobs.') + f'\n {cnt}. Resource leakage may happen caused by ' + 'spot jobs being submitted, and in-progress spot jobs.') + cnt += 1 elif cluster_status == global_user_state.ClusterStatus.UP: spot_jobs = core.spot_status(refresh=False) non_terminal_jobs = [ @@ -1969,10 +1973,15 @@ def _down_or_stop_clusters( ] if non_terminal_jobs: msg += ( - '\n\t- Resource leakage may happen caused by the ' - 'following unfinished spot jobs:') - spot_lib.format_job_table(non_terminal_jobs, - show_all=False) + f'\n {cnt}. Resource leakage may happen caused by ' + 'the following in-progress spot jobs:\n') + job_table = spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + msg += '\n'.join([ + ' ' + line + for line in job_table.split('\n') + if line != '' + ]) click.echo(msg) click.confirm('Do you want to continue?', From 64fc667b383c055c48c631a6384c9a01371482da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 01:12:10 -0700 Subject: [PATCH 07/11] fix test --- tests/test_spot.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index 98ca744968e..f80f47ea866 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -94,13 +94,29 @@ def _mock_cluster_state(self, _mock_db_conn): ready=True) @pytest.mark.timeout(60) - def test_down_spot_controller(self, _mock_cluster_state): - cli_runner = cli_testing.CliRunner() + def test_down_spot_controller(self, _mock_cluster_state, monkeypatch): + + def mock_cluster_refresh_up( + cluster_name: str, + *, + force_refresh: bool = False, + acquire_per_cluster_status_lock: bool = True, + ): + record = global_user_state.get_cluster_from_name(cluster_name) + return record['status'], record['handle'] + + monkeypatch.setattr( + 'sky.backends.backend_utils.refresh_cluster_status_handle', + mock_cluster_refresh_up) + monkeypatch.setattr('sky.core.spot_status', lambda refresh: []) + + cli_runner = cli_testing.CliRunner() result = cli_runner.invoke(cli.down, [spot.SPOT_CONTROLLER_NAME], input='n') - assert 'WARNING: Tearing down a SkyPilot reserved cluster.' 
in result.output - assert isinstance(result.exception, SystemExit), result.exception + assert 'WARNING: Tearing down the managed spot controller (UP).' in result.output + assert isinstance(result.exception, + SystemExit), (result.exception, result.output) result = cli_runner.invoke(cli.down, ['sky-spot-con*']) assert not result.exception From 39090a450e2d096b3bff4e390f1de41103815a8b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 11:42:18 -0700 Subject: [PATCH 08/11] address comments --- sky/backends/backend_utils.py | 12 ++-- sky/cli.py | 101 +++++++++++++++------------- sky/utils/cli_utils/status_utils.py | 4 +- 3 files changed, 61 insertions(+), 56 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ecfc25825c0..fe7a613d507 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -101,6 +101,8 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 +# Mapping from reserved cluster names to the corresponding group name (logging purpose). +# NOTE: each group can only have one reserved cluster name for now. SKY_RESERVED_CLUSTER_NAMES = { spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' } @@ -1929,13 +1931,11 @@ def check_cluster_name_not_reserved( If the cluster name is reserved, return the error message. Otherwise, return None. """ - usage = 'internal use' - if cluster_name == spot_lib.SPOT_CONTROLLER_NAME: - usage = 'spot controller' - msg = f'Cluster {cluster_name!r} is reserved for {usage}.' - if operation_str is not None: - msg += f' {operation_str} is not allowed.' if cluster_name in SKY_RESERVED_CLUSTER_NAMES: + msg = (f'Cluster {cluster_name!r} is reserved for ' + f'{SKY_RESERVED_CLUSTER_NAMES[cluster_name].lower()}.') + if operation_str is not None: + msg += f' {operation_str} is not allowed.' with ux_utils.print_exception_no_traceback(): raise ValueError(msg) diff --git a/sky/cli.py b/sky/cli.py index e4a39ac37c3..7061948ca3a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -27,7 +27,6 @@ listed in "sky --help". Take care to put logically connected commands close to each other. 
""" -import collections import datetime import functools import getpass @@ -1282,13 +1281,13 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin """ cluster_records = core.status(refresh=refresh) nonreserved_cluster_records = [] - reserved_clusters = collections.defaultdict(list) + reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[ cluster_name] - reserved_clusters[cluster_group_name].append(cluster_record) + reserved_clusters[cluster_group_name] = cluster_record else: nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( @@ -1297,9 +1296,9 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( nonreserved_cluster_records, all) - for cluster_group_name, cluster_records in reserved_clusters.items(): + for cluster_group_name, cluster_record in reserved_clusters.items(): num_pending_autostop += status_utils.show_status_table( - cluster_records, all, reserved_group_name=cluster_group_name) + [cluster_record], all, reserved_group_name=cluster_group_name) if num_pending_autostop > 0: click.echo( '\n' @@ -1876,6 +1875,51 @@ def down( purge=purge) +def _hints_for_down_spot_controller(controller_name: str): + # spot_jobs will be empty when the spot cluster is not running. + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + controller_name) + if cluster_status is None: + click.echo('Managed spot controller has already been torn down.') + return + + msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' + f'spot controller ({cluster_status.value}). Please be ' + f'aware of the following:{colorama.Style.RESET_ALL}' + '\n * All logs and status information of the spot ' + 'jobs will be lost.') + if cluster_status == global_user_state.ClusterStatus.UP: + try: + spot_jobs = core.spot_status(refresh=False) + except exceptions.ClusterNotUpError: + # The spot controller cluster status changed during querying + # the spot jobs, use the latest cluster status, so that the + # message for INIT and STOPPED states will be correctly + # added to the message. + cluster_status = backend_utils.refresh_cluster_status_handle( + controller_name) + spot_jobs = [] + + # Find in-progress spot jobs, and hint users to cancel them. + non_terminal_jobs = [ + job for job in spot_jobs if not job['status'].is_terminal() + ] + if (cluster_status == global_user_state.ClusterStatus.UP and + non_terminal_jobs): + msg += ('\n * In-progress spot jobs found, their resources ' + 'will not be terminated and require manual cleanup' + '(sky spot cancel --all):\n') + job_table = spot_lib.format_job_table(non_terminal_jobs, + show_all=False) + # Add prefix to each line to align with the bullet point. + msg += '\n'.join( + [' ' + line for line in job_table.split('\n') if line != '']) + if cluster_status == global_user_state.ClusterStatus.INIT: + msg += ('\n * If there are pending/in-progress spot jobs, those ' + 'resources will not be terminated and require manual cleanup.') + click.echo(msg) + + def _down_or_stop_clusters( names: Tuple[str], apply_to_all: Optional[bool], @@ -1926,7 +1970,7 @@ def _down_or_stop_clusters( '`sky stop/autostop`.')) ] # Make sure the reserved clusters are explicitly specified without other - # normal clusters and purge is True. 
+ # normal clusters. if len(reserved_clusters) > 0: if len(names) != 0: names_str = ', '.join(map(repr, names)) @@ -1943,48 +1987,9 @@ def _down_or_stop_clusters( # TODO(zhwu): We can only have one reserved cluster (spot # controller). assert len(reserved_clusters) == 1, reserved_clusters - # spot_jobs will be empty when the spot cluster is not running. - cluster_name = reserved_clusters[0] - cluster_status, _ = backend_utils.refresh_cluster_status_handle( - cluster_name) - if cluster_status is None: - click.echo( - 'Managed spot controller has already been torn down.') - return - - cnt = 1 - msg = ( - f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - f'spot controller ({cluster_status.value}). Please be ' - f'aware of the following:{colorama.Style.RESET_ALL}' - f'\n {cnt}. All logs and status information of the spot ' - 'jobs will be lost.') - cnt += 1 - if cluster_status == global_user_state.ClusterStatus.INIT: - msg += ( - f'\n {cnt}. Resource leakage may happen caused by ' - 'spot jobs being submitted, and in-progress spot jobs.') - cnt += 1 - elif cluster_status == global_user_state.ClusterStatus.UP: - spot_jobs = core.spot_status(refresh=False) - non_terminal_jobs = [ - job for job in spot_jobs - if not job['status'].is_terminal() - ] - if non_terminal_jobs: - msg += ( - f'\n {cnt}. Resource leakage may happen caused by ' - 'the following in-progress spot jobs:\n') - job_table = spot_lib.format_job_table(non_terminal_jobs, - show_all=False) - msg += '\n'.join([ - ' ' + line - for line in job_table.split('\n') - if line != '' - ]) - click.echo(msg) - - click.confirm('Do you want to continue?', + _hints_for_down_spot_controller(reserved_clusters[0]) + + click.confirm('Proceed?', default=False, abort=True, show_default=True) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 79f3fbe0031..d2c268dadbc 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -38,7 +38,7 @@ def show_status_table(cluster_records: List[Dict[str, Any]], """Compute cluster table values and display. Returns: - Number of pending autostop clusters. + Number of pending auto{stop,down} clusters. """ # TODO(zhwu): Update the information for autostop clusters. @@ -78,7 +78,7 @@ def show_status_table(cluster_records: List[Dict[str, Any]], click.echo( f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{reserved_group_name}: {colorama.Style.RESET_ALL}' - f'{colorama.Style.DIM}(will be autostopped when inactive)' + f'{colorama.Style.DIM}(will be autostopped if idle for 30min)' f'{colorama.Style.RESET_ALL}') else: click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters: ' From d95107dbd25e65b8dc1ed216d5e658c8d6d6b173 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 11:48:34 -0700 Subject: [PATCH 09/11] Fix the docs for spot job --- docs/source/examples/spot-jobs.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst index 17656f924e0..04c0b270ab0 100644 --- a/docs/source/examples/spot-jobs.rst +++ b/docs/source/examples/spot-jobs.rst @@ -206,13 +206,13 @@ Spot controller (Advanced) ------------------------------- There will be a single spot controller VM (a small on-demand CPU VM) running in the background to manage all the spot jobs. -It will be autostopped after all spot jobs finished and no new spot job is submitted for 30 minutes. Typically **no user intervention** is needed. 
-You can find the controller with :code:`sky status -a`, and refresh the status with :code:`sky status -ar`. +It will be autostopped after all spot jobs finished and no new spot job is submitted for 10 minutes. Typically **no user intervention** is needed. +You can find the controller with :code:`sky status`, and refresh the status with :code:`sky status -r`. Although, the cost of the spot controller is negligible (~$0.4/hour when running and less than $0.004/hour when stopped), you can still tear it down manually with -:code:`sky down -p sky-spot-controller-`, where the ```` can be found in the output of :code:`sky status -a`. +:code:`sky down `, where the ```` can be found in the output of :code:`sky status`. .. note:: - Tearing down the spot controller when there are still spot jobs running will cause resource leakage of those spot VMs. + Tearing down the spot controller will lose all logs and status information for the spot jobs and can cause resource leakage when there are still in-progress spot jobs. From 92bda4c464e4b73600648beda4fdde625bc27563 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 12:08:11 -0700 Subject: [PATCH 10/11] fix space --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 7061948ca3a..50d7c2707c1 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1907,7 +1907,7 @@ def _hints_for_down_spot_controller(controller_name: str): if (cluster_status == global_user_state.ClusterStatus.UP and non_terminal_jobs): msg += ('\n * In-progress spot jobs found, their resources ' - 'will not be terminated and require manual cleanup' + 'will not be terminated and require manual cleanup ' '(sky spot cancel --all):\n') job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) From 20903c7ad818ec14c4e3a713ba7de6a62e255792 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 20 Oct 2022 23:32:31 -0700 Subject: [PATCH 11/11] address comments --- sky/cli.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 50d7c2707c1..313f7e61d05 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1875,7 +1875,7 @@ def down( purge=purge) -def _hints_for_down_spot_controller(controller_name: str): +def _hint_for_down_spot_controller(controller_name: str): # spot_jobs will be empty when the spot cluster is not running. cluster_status, _ = backend_utils.refresh_cluster_status_handle( controller_name) @@ -1887,7 +1887,7 @@ def _hints_for_down_spot_controller(controller_name: str): f'spot controller ({cluster_status.value}). Please be ' f'aware of the following:{colorama.Style.RESET_ALL}' '\n * All logs and status information of the spot ' - 'jobs will be lost.') + 'jobs (output of sky spot status) will be lost.') if cluster_status == global_user_state.ClusterStatus.UP: try: spot_jobs = core.spot_status(refresh=False) @@ -1907,8 +1907,7 @@ def _hints_for_down_spot_controller(controller_name: str): if (cluster_status == global_user_state.ClusterStatus.UP and non_terminal_jobs): msg += ('\n * In-progress spot jobs found, their resources ' - 'will not be terminated and require manual cleanup ' - '(sky spot cancel --all):\n') + 'will not be terminated and require manual cleanup:\n') job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) # Add prefix to each line to align with the bullet point. @@ -1987,7 +1986,7 @@ def _down_or_stop_clusters( # TODO(zhwu): We can only have one reserved cluster (spot # controller). 
assert len(reserved_clusters) == 1, reserved_clusters - _hints_for_down_spot_controller(reserved_clusters[0]) + _hint_for_down_spot_controller(reserved_clusters[0]) click.confirm('Proceed?', default=False,
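The sketches below are simplified, self-contained stand-ins for the main behaviors these patches introduce; they are not code from the SkyPilot tree, and the names in them (cluster names, record fields, helper functions) are placeholders.

PATCH 01/05 teach sky status to pull reserved clusters (currently only the managed spot controller) out of the main table, print each one under the group heading taken from SKY_RESERVED_CLUSTER_NAMES, and have show_status_table return the pending auto{stop,down} count so the CLI prints a single hint at the end. A minimal sketch of that grouping:

    from typing import Any, Dict, List, Tuple

    # Stand-in for backend_utils.SKY_RESERVED_CLUSTER_NAMES; the real mapping
    # currently has a single entry, the managed spot controller.
    RESERVED_CLUSTER_NAMES = {
        'sky-spot-controller-1234': 'Managed spot controller'
    }


    def split_records(
        records: List[Dict[str, Any]],
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]]]:
        """Separate normal clusters from reserved ones, keyed by group name."""
        normal: List[Dict[str, Any]] = []
        reserved: Dict[str, Dict[str, Any]] = {}
        for record in records:
            group = RESERVED_CLUSTER_NAMES.get(record['name'])
            if group is None:
                normal.append(record)
            else:
                # Each reserved group holds at most one cluster.
                reserved[group] = record
        return normal, reserved


    def show_status(records: List[Dict[str, Any]]) -> None:
        normal, reserved = split_records(records)
        print('Clusters:')
        for record in normal:
            print(f"  {record['name']}\t{record['status']}")
        for group, record in reserved.items():
            print(f'\n{group}: (will be autostopped when idle)')
            print(f"  {record['name']}\t{record['status']}")
        pending = sum(1 for r in records if r.get('autostop', -1) >= 0)
        if pending:
            print(f'\nYou have {pending} clusters with auto{{stop,down}} '
                  'scheduled. Refresh statuses with: sky status --refresh')


    if __name__ == '__main__':
        show_status([
            {'name': 'dev', 'status': 'UP', 'autostop': 10},
            {'name': 'sky-spot-controller-1234', 'status': 'UP', 'autostop': 10},
        ])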
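PATCH 01/05/08 also rework _down_or_stop_clusters: stopping a reserved cluster is rejected outright, naming it together with other clusters is rejected, and sky down on the spot controller prints a warning and asks for explicit confirmation before proceeding (so the later per-cluster prompt is skipped). A sketch of that control flow, with guard_reserved and RESERVED as made-up names:

    from typing import List

    import click

    # Placeholder reserved-cluster registry (see the previous sketch).
    RESERVED = {'sky-spot-controller-1234': 'Managed spot controller'}


    def guard_reserved(names: List[str], down: bool, no_confirm: bool) -> bool:
        """Validate reserved-cluster usage; return the updated no_confirm flag."""
        reserved = [name for name in names if name in RESERVED]
        others = [name for name in names if name not in RESERVED]
        if not reserved:
            return no_confirm
        if others:
            raise click.UsageError(
                f'Tearing down reserved cluster(s) {reserved} together with '
                f'other cluster(s) {others} is not supported.')
        if not down:
            raise click.UsageError(
                f'Stopping reserved cluster(s) {reserved} is not supported.')
        # Warn that job logs/status will be lost and that in-progress spot
        # jobs may leak resources, then ask once up front.
        click.echo('WARNING: Tearing down the managed spot controller. All '
                   'spot job logs and status information will be lost.')
        click.confirm('Proceed?', default=False, abort=True, show_default=True)
        return True


    if __name__ == '__main__':
        try:
            guard_reserved(['sky-spot-controller-1234'], down=False,
                           no_confirm=False)
        except click.UsageError as e:
            print(f'Rejected as expected: {e.message}')

Declining the prompt raises click.Abort, which Click's standalone mode converts into exit code 1; that exit path is what the updated test relies on.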
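The warning assembled in _hint_for_down_spot_controller lists only spot jobs that are still in a non-terminal state and indents the rendered job table so it aligns under the bullet (PATCH 06/08). A rough sketch, where format_job_table and the status strings are invented stand-ins for sky.spot.format_job_table and the real job states:

    from typing import Any, Dict, List


    def format_job_table(jobs: List[Dict[str, Any]]) -> str:
        """Toy renderer standing in for sky.spot.format_job_table."""
        lines = ['ID  NAME        STATUS']
        lines += [
            f"{job['id']:<3} {job['name']:<11} {job['status']}" for job in jobs
        ]
        return '\n'.join(lines)


    def in_progress_hint(jobs: List[Dict[str, Any]]) -> str:
        # Keep only jobs that have not reached a terminal state.
        terminal = {'SUCCEEDED', 'FAILED', 'CANCELLED'}
        active = [job for job in jobs if job['status'] not in terminal]
        if not active:
            return ''
        table = format_job_table(active)
        # Indent every table line so it lines up under the warning bullet.
        body = '\n'.join('   ' + line for line in table.split('\n') if line)
        return ('\n * In-progress spot jobs found, their resources will not '
                'be terminated and require manual cleanup:\n' + body)


    if __name__ == '__main__':
        print(in_progress_hint([
            {'id': 1, 'name': 'train', 'status': 'RUNNING'},
            {'id': 2, 'name': 'eval', 'status': 'SUCCEEDED'},
        ]))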
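The updated test_down_spot_controller drives the confirmation prompt through Click's CliRunner with input='n' and asserts both the warning text and that the aborted run surfaces as SystemExit. The same pattern against a toy command (the down command below is a stand-in, not sky.cli.down):

    import click
    from click.testing import CliRunner


    @click.command()
    @click.argument('name')
    def down(name):
        """Toy command that mimics the confirmation flow under test."""
        click.echo('WARNING: Tearing down the managed spot controller (UP).')
        click.confirm('Proceed?', default=False, abort=True, show_default=True)
        click.echo(f'Terminating {name}...')


    def test_down_prompt_declined():
        runner = CliRunner()
        # Feed 'n' to the prompt; abort=True makes Click exit with code 1.
        result = runner.invoke(down, ['sky-spot-controller-1234'], input='n')
        assert 'WARNING: Tearing down the managed spot controller' in result.output
        assert isinstance(result.exception, SystemExit), result.exception


    if __name__ == '__main__':
        test_down_prompt_declined()
        print('ok')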