Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make sky start on the spot controller aware of autostop. #1453

Merged
merged 5 commits into from
Dec 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1956,7 +1956,7 @@ def check_cluster_name_not_reserved(
return None.
"""
if cluster_name in SKY_RESERVED_CLUSTER_NAMES:
msg = (f'Cluster {cluster_name!r} is reserved for '
msg = (f'Cluster {cluster_name!r} is reserved for the '
f'{SKY_RESERVED_CLUSTER_NAMES[cluster_name].lower()}.')
if operation_str is not None:
msg += f' {operation_str} is not allowed.'
Expand Down
34 changes: 30 additions & 4 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,7 @@ def launch(
and they undergo job queue scheduling.
"""
backend_utils.check_cluster_name_not_reserved(
cluster, operation_str='Launching task on it')
cluster, operation_str='Launching tasks on it')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does it refer to?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

» sky launch -c sky-spot-controller-8a3968f2 echo hi
ValueError: Cluster 'sky-spot-controller-8a3968f2' is reserved for managed spot controller. Launching task on it is not allowed.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of passing in operation_str (adding is not allowed may not be general for all reserved clusters), pass in error_str with error_str= Launching task on {cluster} is not allowed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks fine to me:

» ag -w check_cluster_name_not_reserved -A1
sky/execution.py
359:    backend_utils.check_cluster_name_not_reserved(cluster_name,
360-                                                  operation_str='sky.launch')
--
437:    backend_utils.check_cluster_name_not_reserved(cluster_name,
438-                                                  operation_str='sky.exec')

sky/backends/backend_utils.py
1950:def check_cluster_name_not_reserved(
1951-        cluster_name: Optional[str],

sky/core.py
442:    backend_utils.check_cluster_name_not_reserved(
443-        cluster_name, operation_str='Cancelling jobs')

sky/cli.py
1176:    backend_utils.check_cluster_name_not_reserved(
1177-        cluster, operation_str='Launching tasks on it')
--
1314:    backend_utils.check_cluster_name_not_reserved(
1315-        cluster, operation_str='Executing task on it')

as it doesn't repeat "is not allowed". cc @Michaelvll

Copy link
Collaborator

@Michaelvll Michaelvll Nov 30, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like it would be better to show more detailed information as the operation_str does, instead of a general is not allowed. It will provide more information for the user as well as the developer for debugging.

Sorry, I miss understood the proposal. I think avoiding the repeat might be fine for now? We can change the behavior, when we meet other situations that require different strings for the last part?

if backend_name is None:
backend_name = backends.CloudVmRayBackend.NAME

Expand Down Expand Up @@ -1917,6 +1917,32 @@ def start(
if not to_start:
return

# Checks for reserved clusters (spot controller).
reserved, non_reserved = [], []
for name in to_start:
if name in backend_utils.SKY_RESERVED_CLUSTER_NAMES:
reserved.append(name)
else:
non_reserved.append(name)
if reserved and non_reserved:
michaelzhiluo marked this conversation as resolved.
Show resolved Hide resolved
assert len(reserved) == 1, reserved
# Keep this behavior the same as _down_or_stop_clusters().
raise click.UsageError(
'Starting the spot controller with other cluster(s) '
'is currently not supported.\n'
'Please start the former independently.')
if reserved:
assert len(reserved) == 1, reserved
bold = backend_utils.BOLD
reset_bold = backend_utils.RESET_BOLD
if idle_minutes_to_autostop is not None:
raise click.UsageError(
'Autostop options are currently not allowed when starting the '
'spot controller. Use the default autostop settings by directly'
f' calling: {bold}sky start {reserved[0]}{reset_bold}')
idle_minutes_to_autostop = (
spot_lib.SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP)

if not yes:
cluster_str = 'clusters' if len(to_start) > 1 else 'cluster'
cluster_list = ', '.join(to_start)
Expand Down Expand Up @@ -2107,13 +2133,13 @@ def _down_or_stop_clusters(
names_str = ', '.join(map(repr, names))
raise click.UsageError(
f'{operation} reserved cluster(s) '
f'{reserved_clusters_str} with multiple other cluster(s) '
f'{names_str} is not supported.\n'
f'{reserved_clusters_str} with other cluster(s) '
f'{names_str} is currently not supported.\n'
f'Please omit the reserved cluster(s) {reserved_clusters}.')
if not down:
raise click.UsageError(
f'{operation} reserved cluster(s) '
f'{reserved_clusters_str} is not supported.')
f'{reserved_clusters_str} is currently not supported.')
else:
# TODO(zhwu): We can only have one reserved cluster (spot
# controller).
Expand Down
27 changes: 21 additions & 6 deletions sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,19 @@ def _start(
f'Starting cluster {cluster_name!r} with backend {backend.NAME} '
'is not supported.')

if cluster_name == spot.SPOT_CONTROLLER_NAME:
if down:
raise ValueError('Using autodown (rather than autostop) is not '
'supported for the spot controller. Pass '
'`down=False` or omit it instead.')
if idle_minutes_to_autostop is not None:
raise ValueError(
'Passing a custom autostop setting is currently not '
'supported when starting the spot controller. To '
'fix: omit the `idle_minutes_to_autostop` argument to use the '
'default autostop settings.')
idle_minutes_to_autostop = spot.SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP

# NOTE: if spot_queue() calls _start() and hits here, that entrypoint
# would have a cluster name (the controller) filled in.
usage_lib.record_cluster_name_for_current_operation(cluster_name)
Expand Down Expand Up @@ -148,8 +161,12 @@ def start(
Useful for upgrading SkyPilot runtime.

Raises:
ValueError: the specified cluster does not exist; or if ``down`` is set
to True but ``idle_minutes_to_autostop`` is None.
ValueError: argument values are invalid: (1) the specified cluster does
not exist; (2) if ``down`` is set to True but
``idle_minutes_to_autostop`` is None; (3) if the specified cluster is
the managed spot controller, and either ``idle_minutes_to_autostop``
is not None or ``down`` is True (omit them to use the default
autostop settings).
sky.exceptions.NotSupportedError: if the cluster to restart was
launched using a non-default backend that does not support this
operation.
Expand Down Expand Up @@ -612,9 +629,7 @@ def spot_queue(refresh: bool) -> List[Dict[str, Any]]:
'Restarting controller for latest status...'
f'{colorama.Style.RESET_ALL}')

handle = _start(spot.SPOT_CONTROLLER_NAME,
idle_minutes_to_autostop=spot.
SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP)
handle = _start(spot.SPOT_CONTROLLER_NAME)

if handle is None or handle.head_ip is None:
raise exceptions.ClusterNotUpError('Spot controller is not up.')
Expand Down Expand Up @@ -709,7 +724,7 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int],
# TODO(zhwu): Automatically restart the spot controller
controller_status, handle = _is_spot_controller_up(
'Please restart the spot controller with '
f'`sky start {spot.SPOT_CONTROLLER_NAME} -i 5`.')
f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')
if handle is None or handle.head_ip is None:
msg = 'All jobs finished.'
if controller_status == global_user_state.ClusterStatus.INIT:
Expand Down
11 changes: 10 additions & 1 deletion tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,8 @@ def test_use_spot():
def test_spot():
"""Test the spot yaml."""
name = _get_cluster_name()
cancel_command = (
f'sky spot cancel -y -n {name}-1; sky spot cancel -y -n {name}-2')
test = Test(
'managed-spot',
[
Expand All @@ -770,8 +772,15 @@ def test_spot():
'sleep 200',
f's=$(sky spot queue); printf "$s"; echo; echo; printf "$s" | grep {name}-1 | head -n1 | grep CANCELLED',
f's=$(sky spot queue); printf "$s"; echo; echo; printf "$s" | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"',
# Test autostop. This assumes no regular spot jobs are running.
cancel_command,
'sleep 720', # Sleep for a bit more than the default 10m.
'sky status --refresh | grep sky-spot-controller- | grep STOPPED',
'sky start "sky-spot-controller-*" -y',
# Ensures it's up and the autostop setting is restored.
'sky status | grep sky-spot-controller- | grep UP | grep 10m',
],
f'sky spot cancel -y -n {name}-1; sky spot cancel -y -n {name}-2',
cancel_command,
)
run_one_test(test)

Expand Down
4 changes: 2 additions & 2 deletions tests/test_spot.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def test_stop_spot_controller(self, _mock_cluster_state):
assert result.exit_code == click.UsageError.exit_code
assert (
f'Stopping reserved cluster(s) \'{spot.SPOT_CONTROLLER_NAME}\' is '
'not supported' in result.output)
'currently not supported' in result.output)

result = cli_runner.invoke(cli.stop, ['sky-spot-con*'])
assert not result.exception
Expand All @@ -157,7 +157,7 @@ def test_autostop_spot_controller(self, _mock_cluster_state):
result = cli_runner.invoke(cli.autostop, [spot.SPOT_CONTROLLER_NAME])
assert result.exit_code == click.UsageError.exit_code
assert ('Scheduling autostop on reserved cluster(s) '
f'\'{spot.SPOT_CONTROLLER_NAME}\' is not supported'
f'\'{spot.SPOT_CONTROLLER_NAME}\' is currently not supported'
in result.output)

result = cli_runner.invoke(cli.autostop, ['sky-spot-con*'])
Expand Down