Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Core: Fix AWS/GCP autostop with new provisioner. #2719

Merged
merged 4 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions sky/skylet/attempt_skylet.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,24 @@ def restart_skylet():
running = (proc.returncode == 0)

version_match = False
found_version = None
if os.path.exists(VERSION_FILE):
with open(VERSION_FILE) as f:
if f.read().strip() == constants.SKYLET_VERSION:
found_version = f.read().strip()
if found_version == constants.SKYLET_VERSION:
version_match = True

version_string = (f' (found version {found_version}, new version '
f'{constants.SKYLET_VERSION})')
if not running:
print('Skylet is not running. Starting...')
print('Skylet is not running. Starting (version '
f'{constants.SKYLET_VERSION})...')
elif not version_match:
print('Skylet is staled. Restarting...')
print(f'Skylet is stale{version_string}. Restarting...')
else:
print('Skylet is running with the latest version.')
print(
f'Skylet is running with the latest version {constants.SKYLET_VERSION}.'
)

if not running or not version_match:
restart_skylet()
8 changes: 5 additions & 3 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@
# lifetime of the job.
TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'

# The version of skylet. We should bump this version whenever we need the skylet
# to be restarted on existing clusters updated with the new version of SkyPilot,
# The version of skylet. MUST bump this version whenever we need the skylet to
# be restarted on existing clusters updated with the new version of SkyPilot,
# e.g., when we add new events to skylet, or we fix a bug in skylet.
SKYLET_VERSION = '3'
#
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
SKYLET_VERSION = '4'
SKYLET_VERSION_FILE = '~/.sky/skylet_version'

# `sky spot dashboard`-related
Expand Down
11 changes: 8 additions & 3 deletions sky/skylet/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,20 @@ def _stop_cluster(self, autostop_config):
config = common_utils.read_yaml(self._ray_yaml_path)

provider_module = config['provider']['module']
provider_search = re.search(r'(?:providers|provision)\.(.*)(\.)?',
# Examples:
# 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
# 'sky.provision.aws' -> 'aws'
provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
provider_module)
assert provider_search is not None, config
provider_name = provider_search.group(1).lower()

if provider_name in ['aws', 'gcp']:
if provider_name in ('aws', 'gcp'):
logger.info('Using new provisioner to stop the cluster.')
self._stop_cluster_with_new_provisioner(autostop_config, config,
provider_name)
return
logger.info('Not using new provisioner to stop the cluster. '
f'Cloud of this cluster: {provider_name}')

is_cluster_multinode = config['max_workers'] > 0

Expand Down