Skip to content

Commit

Permalink
[Autodown] Support for autodown (skypilot-org#1217)
Browse files Browse the repository at this point in the history
* Support for autodown

* Change API to terminate

* fix flag

* fix autostop

* fix comment

* address comment

* address comment

* format

* Rename terminate to down

* add smoke test

* fix autodown for multi-node

* format

* fix syntax

* use gcp for autodown test

* fix smoke test

* fix smoke test

* address comments

* Add comment

* Switch back to terminate

* fix comments

* Change back to tear down

* Change to tear down

* fix comment

* change the logic of --down to use auto-down by default

* Use autodown for --down and address comments

* fix comment

* fix ux

* Add test for cancel

* fix UX

* fix test_smoke

* address comments

* fix

* fix logging and comment

* fix environment variable overwrite

* fix smoke test

* print info
  • Loading branch information
Michaelvll authored and ewzeng committed Oct 24, 2022
1 parent 99e4ae8 commit 9253e7f
Show file tree
Hide file tree
Showing 11 changed files with 334 additions and 127 deletions.
4 changes: 3 additions & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1650,7 +1650,9 @@ def _update_cluster_status_no_lock(
backend.set_autostop(handle, -1, stream_logs=False)
except (Exception, SystemExit): # pylint: disable=broad-except
logger.debug('Failed to reset autostop.')
global_user_state.set_cluster_autostop_value(handle.cluster_name, -1)
global_user_state.set_cluster_autostop_value(handle.cluster_name,
-1,
to_down=False)

# If the user starts part of a STOPPED cluster, we still need a status to
# represent the abnormal status. For spot cluster, it can also represent
Expand Down
22 changes: 19 additions & 3 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,7 @@ def ray_up():
# different order from directly running in the console. The
# `--log-style` and `--log-color` flags do not work. To reproduce,
# `ray up --log-style pretty --log-color true | tee tmp.out`.

returncode, stdout, stderr = log_lib.run_with_log(
# NOTE: --no-restart solves the following bug. Without it, if
# 'ray up' (sky launch) twice on a cluster with >1 node, the
Expand All @@ -1135,7 +1136,15 @@ def ray_up():
line_processor=log_utils.RayUpLineProcessor(),
# Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
# time during 'ray up' if insufficient capacity occurs.
env=dict(os.environ, BOTO_MAX_RETRIES='5'),
env=dict(
os.environ,
BOTO_MAX_RETRIES='5',
# Use environment variables to disable the ray usage stats
# (to avoid the 10 second wait for usage collection
# confirmation), as the ray version on the user's machine
# may be a lower version that does not support the
# `--disable-usage-stats` flag.
RAY_USAGE_STATS_ENABLED='0'),
require_outputs=True,
# Disable stdin to avoid ray outputs messing up the terminal with
# misaligned output when multithreading/multiprocessing are used
Expand Down Expand Up @@ -1335,10 +1344,16 @@ def _ensure_cluster_ray_started(self,
'of the local cluster. Check if ray[default]==1.13.0 '
'is installed or running correctly.')
backend.run_on_head(handle, 'ray stop', use_cached_head_ip=False)

log_lib.run_with_log(
['ray', 'up', '-y', '--restart-only', handle.cluster_yaml],
log_abs_path,
stream_logs=False,
# Use environment variables to disable the ray usage collection
# (avoid the 10 second wait for usage collection confirmation),
# as the ray version on the user's machine may be a lower version
# that does not support the `--disable-usage-stats` flag.
env=dict(os.environ, RAY_USAGE_STATS_ENABLED='0'),
# Disable stdin to avoid ray outputs messing up the terminal with
# misaligned output when multithreading/multiprocessing is used.
# Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
Expand Down Expand Up @@ -2608,10 +2623,11 @@ def post_teardown_cleanup(self,
def set_autostop(self,
handle: ResourceHandle,
idle_minutes_to_autostop: Optional[int],
down: bool = False,
stream_logs: bool = True) -> None:
if idle_minutes_to_autostop is not None:
code = autostop_lib.AutostopCodeGen.set_autostop(
idle_minutes_to_autostop, self.NAME)
idle_minutes_to_autostop, self.NAME, down)
returncode, _, stderr = self.run_on_head(handle,
code,
require_outputs=True,
Expand All @@ -2622,7 +2638,7 @@ def set_autostop(self,
stderr=stderr,
stream_logs=stream_logs)
global_user_state.set_cluster_autostop_value(
handle.cluster_name, idle_minutes_to_autostop)
handle.cluster_name, idle_minutes_to_autostop, down)

# TODO(zhwu): Refactor this to a CommandRunner class, so different backends
# can support its own command runner.
Expand Down
Loading

0 comments on commit 9253e7f

Please sign in to comment.