[Spot] Show spot controller in sky status and simplify tearing down #1270
Changes from 8 commits
@@ -27,6 +27,7 @@
listed in "sky --help". Take care to put logically connected commands close to
each other.
"""
import collections
import datetime
import functools
import getpass
@@ -1279,10 +1280,32 @@ def status(all: bool, refresh: bool):  # pylint: disable=redefined-builtin
    '(down)', e.g. '1m (down)', the cluster will be autodowned, rather than
    autostopped.
    """
    cluster_records = core.status(all=all, refresh=refresh)
    cluster_records = core.status(refresh=refresh)
    nonreserved_cluster_records = []
    reserved_clusters = collections.defaultdict(list)
    for cluster_record in cluster_records:
        cluster_name = cluster_record['name']
        if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES:
            cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[
                cluster_name]
            reserved_clusters[cluster_group_name].append(cluster_record)
        else:
            nonreserved_cluster_records.append(cluster_record)
    local_clusters = onprem_utils.check_and_get_local_clusters(
        suppress_error=True)
    status_utils.show_status_table(cluster_records, all)

    num_pending_autostop = 0
    num_pending_autostop += status_utils.show_status_table(
        nonreserved_cluster_records, all)
    for cluster_group_name, cluster_records in reserved_clusters.items():
        num_pending_autostop += status_utils.show_status_table(
            cluster_records, all, reserved_group_name=cluster_group_name)
    if num_pending_autostop > 0:
        click.echo(
            '\n'
            f'{colorama.Style.DIM}You have {num_pending_autostop} clusters '
            'with auto{stop,down} scheduled. Refresh statuses with: '
            f'sky status --refresh{colorama.Style.RESET_ALL}')
    status_utils.show_local_status_table(local_clusters)
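The reorganized status path above splits the cluster records into reserved groups (currently just the managed spot controller) and regular clusters, renders one table per group, and sums the pending-autostop counts each table reports. Below is a minimal, self-contained sketch of that flow; the record dicts, the RESERVED_CLUSTER_NAMES mapping, and show_table are made-up stand-ins for backend_utils.SKY_RESERVED_CLUSTER_NAMES and status_utils.show_status_table, not the real implementations.

# Sketch only: stand-in data and helpers, not SkyPilot's real API.
import collections

# Hypothetical stand-in for backend_utils.SKY_RESERVED_CLUSTER_NAMES:
# maps a reserved cluster name to the heading its table is shown under.
RESERVED_CLUSTER_NAMES = {'sky-spot-controller': 'Managed spot controller'}

cluster_records = [
    {'name': 'my-dev-box', 'autostop': 10},
    {'name': 'sky-spot-controller', 'autostop': -1},
]

nonreserved, reserved = [], collections.defaultdict(list)
for record in cluster_records:
    group = RESERVED_CLUSTER_NAMES.get(record['name'])
    if group is not None:
        reserved[group].append(record)
    else:
        nonreserved.append(record)

def show_table(records, heading=None):
    """Pretend table renderer; returns how many records have autostop set."""
    if heading is not None:
        print(f'\n{heading}')
    for record in records:
        print(f"  {record['name']:25s} autostop={record['autostop']}")
    return sum(1 for record in records if record['autostop'] >= 0)

num_pending_autostop = show_table(nonreserved)
for group, records in reserved.items():
    num_pending_autostop += show_table(records, heading=group)
if num_pending_autostop > 0:
    print(f'\nYou have {num_pending_autostop} clusters with '
          'auto{stop,down} scheduled. Refresh statuses with: sky status --refresh')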
@@ -1905,21 +1928,67 @@ def _down_or_stop_clusters(
    # Make sure the reserved clusters are explicitly specified without other
    # normal clusters and purge is True.
    if len(reserved_clusters) > 0:
        if not purge:
            msg = (f'{operation} reserved cluster(s) '
                   f'{reserved_clusters_str} is not supported.')
            if down:
                msg += (
                    '\nPlease specify --purge (-p) to force-terminate the '
                    'reserved cluster(s).')
            raise click.UsageError(msg)
        if len(names) != 0:
            names_str = ', '.join(map(repr, names))
            raise click.UsageError(
                f'{operation} reserved cluster(s) '
                f'{reserved_clusters_str} with multiple other cluster(s) '
                f'{names_str} is not supported.\n'
                f'Please omit the reserved cluster(s) {reserved_clusters}.')
        if not down:
            raise click.UsageError(
                f'{operation} reserved cluster(s) '
                f'{reserved_clusters_str} is not supported.')
        else:
            # TODO(zhwu): We can only have one reserved cluster (spot
            # controller).
            assert len(reserved_clusters) == 1, reserved_clusters
            # spot_jobs will be empty when the spot cluster is not running.
            cluster_name = reserved_clusters[0]
            cluster_status, _ = backend_utils.refresh_cluster_status_handle(
                cluster_name)
            if cluster_status is None:
                click.echo(
                    'Managed spot controller has already been torn down.')
                return

            cnt = 1
            msg = (
                f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
                f'spot controller ({cluster_status.value}). Please be '
                f'aware of the following:{colorama.Style.RESET_ALL}'
                f'\n {cnt}. All logs and status information of the spot '
                'jobs will be lost.')
            cnt += 1
            if cluster_status == global_user_state.ClusterStatus.INIT:
                msg += (
                    f'\n {cnt}. Resource leakage may happen caused by '
                    'spot jobs being submitted, and in-progress spot jobs.')
Review comment: nit: If there are pending/in-progress spot jobs, those resources will not be terminated and require manual cleanup. Actually, why do we show this when the controller is INIT?
Reply: Because it is possible that another …
                cnt += 1
            elif cluster_status == global_user_state.ClusterStatus.UP:
                spot_jobs = core.spot_status(refresh=False)
Review comment: What if before this line, a concurrent …
Reply: Good point! Added error handling for it. PTAL.
                non_terminal_jobs = [
                    job for job in spot_jobs
                    if not job['status'].is_terminal()
                ]
                if non_terminal_jobs:
                    msg += (
                        f'\n {cnt}. Resource leakage may happen caused by '
                        'the following in-progress spot jobs:\n')
                    job_table = spot_lib.format_job_table(non_terminal_jobs,
                                                          show_all=False)
                    msg += '\n'.join([
                        ' ' + line
                        for line in job_table.split('\n')
                        if line != ''
                    ])
            click.echo(msg)

            click.confirm('Do you want to continue?',
                          default=False,
                          abort=True,
                          show_default=True)
            no_confirm = True
        names += reserved_clusters

    if apply_to_all:
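The confirmation step above uses click.confirm with abort=True, which makes click raise click.Abort (and exit the command) when the user declines, so the teardown only proceeds on an explicit yes. A standalone sketch of that pattern, with a hypothetical command name that is not part of this diff:

import click

@click.command()
def teardown_demo():
    """Hypothetical command illustrating the confirm-or-abort pattern."""
    click.echo('WARNING: this will tear down the managed spot controller.')
    # abort=True raises click.Abort when the answer is "no", so execution
    # only reaches the next line after an explicit "yes".
    click.confirm('Do you want to continue?',
                  default=False,
                  abort=True,
                  show_default=True)
    click.echo('Tearing down...')

if __name__ == '__main__':
    teardown_demo()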
Review comment: Is it possible to simplify this group-based logic? Maybe specializing to just 1 spot controller?
Reply: Sounds good. Changed each group to only have 1 cluster.
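For context on that resolution: the diff treats backend_utils.SKY_RESERVED_CLUSTER_NAMES as a mapping from a reserved cluster name to its display-group name, so with a single spot controller every reserved group in the status output holds exactly one cluster, and the teardown path can assume a single reserved cluster, as the TODO and assert above note. A toy illustration with a hypothetical one-entry mapping (the real names live in backend_utils):

import collections

# Hypothetical one-entry mapping; the real one lives in backend_utils.
SKY_RESERVED_CLUSTER_NAMES = {'sky-spot-controller': 'Managed spot controller'}

records = [{'name': 'sky-spot-controller'}, {'name': 'my-dev-box'}]
groups = collections.defaultdict(list)
for record in records:
    group = SKY_RESERVED_CLUSTER_NAMES.get(record['name'])
    if group is not None:
        groups[group].append(record)

# One reserved cluster per group name means each reserved group is a singleton.
assert all(len(group_records) == 1 for group_records in groups.values())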