Skip to content

Commit

Permalink
Merge branch 'master' into fluidstack-provisioner
Browse files Browse the repository at this point in the history
  • Loading branch information
mjibril authored Feb 13, 2024
2 parents bf0283c + 2f0432b commit 366c380
Show file tree
Hide file tree
Showing 45 changed files with 1,694 additions and 282 deletions.
59 changes: 59 additions & 0 deletions docs/source/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,43 @@ Available fields and semantics:
# permission to create a security group.
security_group_name: my-security-group
# Identity to use for all AWS instances (optional).
#
# LOCAL_CREDENTIALS: The user's local credential files will be uploaded to
# AWS instances created by SkyPilot. They are used for accessing cloud
# resources (e.g., private buckets) or launching new instances (e.g., for
# spot/serve controllers).
#
# SERVICE_ACCOUNT: Local credential files are not uploaded to AWS
# instances. SkyPilot will auto-create and reuse a service account (IAM
# role) for AWS instances.
#
# Two caveats of SERVICE_ACCOUNT for multicloud users:
#
# - This only affects AWS instances. Local AWS credentials will still be
# uploaded to non-AWS instances (since those instances may need to access
# AWS resources).
# - If the SkyPilot spot/serve controller is on AWS, this setting will make
# non-AWS managed spot jobs / non-AWS service replicas fail to access any
# resources on AWS (since the controllers don't have AWS credential
# files to assign to these non-AWS instances).
#
# Default: 'LOCAL_CREDENTIALS'.
remote_identity: LOCAL_CREDENTIALS
# Advanced GCP configurations (optional).
# Apply to all new instances but not existing ones.
gcp:
# Labels to assign to all instances launched by SkyPilot (optional).
#
# Example use case: cost tracking by user/team/project.
#
# Users should guarantee that these key-values are valid GCP labels, otherwise
# errors from the cloud provider will be surfaced.
instance_tags:
Owner: user-unique-name
my-tag: my-value
# VPC to use (optional).
#
# Default: null, which implies the following behavior. First, all existing
Expand Down Expand Up @@ -165,6 +199,31 @@ Available fields and semantics:
- projects/my-project/reservations/my-reservation1
- projects/my-project/reservations/my-reservation2
# Identity to use for all GCP instances (optional).
#
# LOCAL_CREDENTIALS: The user's local credential files will be uploaded to
# GCP instances created by SkyPilot. They are used for accessing cloud
# resources (e.g., private buckets) or launching new instances (e.g., for
# spot/serve controllers).
#
# SERVICE_ACCOUNT: Local credential files are not uploaded to GCP
# instances. SkyPilot will auto-create and reuse a service account for GCP
# instances.
#
# Two caveats of SERVICE_ACCOUNT for multicloud users:
#
# - This only affects GCP instances. Local GCP credentials will still be
# uploaded to non-GCP instances (since those instances may need to access
# GCP resources).
# - If the SkyPilot spot/serve controller is on GCP, this setting will make
# non-GCP managed spot jobs / non-GCP service replicas fail to access any
# resources on GCP (since the controllers don't have GCP credential
# files to assign to these non-GCP instances).
#
# Default: 'LOCAL_CREDENTIALS'.
remote_identity: LOCAL_CREDENTIALS
# Advanced Kubernetes configurations (optional).
kubernetes:
# The networking mode for accessing SSH jump pod (optional).
Expand Down
1 change: 1 addition & 0 deletions examples/using_file_mounts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

resources:
cloud: aws
cpus: 2+

workdir: .

Expand Down
2 changes: 2 additions & 0 deletions sky/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def get_git_commit():
IBM = clouds.IBM
AWS = clouds.AWS
Azure = clouds.Azure
Cudo = clouds.Cudo
GCP = clouds.GCP
Lambda = clouds.Lambda
SCP = clouds.SCP
Expand All @@ -91,6 +92,7 @@ def get_git_commit():
'__version__',
'AWS',
'Azure',
'Cudo',
'GCP',
'IBM',
'Kubernetes',
Expand Down
29 changes: 29 additions & 0 deletions sky/adaptors/cudo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Cudo Compute cloud adaptor."""

import functools

# Module-level cache for the lazily imported `cudo_compute` package.
# Stays None until a function decorated with `import_package` is first called.
_cudo_sdk = None


def import_package(func):
    """Decorator that lazily imports the Cudo Compute SDK before calling func.

    The `cudo_compute` package is imported the first time any decorated
    function is invoked, and the module object is cached in the module-level
    `_cudo_sdk` so later calls skip the import.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same metadata as `func` (via `functools.wraps`).

    Raises:
        ImportError: When the wrapper is called and `cudo_compute` cannot be
            imported. The original traceback is suppressed (`from None`) so
            the user only sees the installation hint.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        global _cudo_sdk
        if _cudo_sdk is None:
            try:
                import cudo_compute as _cudo  # pylint: disable=import-outside-toplevel
            except ImportError:
                # NOTE: the trailing space after the period matters; the two
                # adjacent literals are concatenated into a single message.
                raise ImportError(
                    'Fail to import dependencies for Cudo Compute. '
                    'Try pip install "skypilot[cudo]"') from None
            _cudo_sdk = _cudo
        return func(*args, **kwargs)

    return wrapper


@import_package
def cudo():
    """Return the Cudo Compute package.

    The `import_package` decorator runs before this body and fills the
    module-level `_cudo_sdk` cache (or raises ImportError), so the cached
    module object is simply handed back here.
    """
    sdk_module = _cudo_sdk
    return sdk_module
28 changes: 22 additions & 6 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,10 @@ def write_cluster_config(
not appear in the catalog, or an ssh_proxy_command is specified but
not for the given region, or GPUs are requested in a Kubernetes
cluster but the cluster does not have nodes labeled with GPU types.
exceptions.InvalidCloudConfigs: if the user specifies a config for the
cloud that is not valid, e.g. remote_identity: SERVICE_ACCOUNT
for a cloud that does not support it. The caller should skip the
cloud in this case.
"""
# task.best_resources may not be equal to to_provision if the user
# is running a job with less resources than the cluster has.
Expand Down Expand Up @@ -786,7 +790,18 @@ def write_cluster_config(
(str(to_provision.cloud).lower(), 'specific_reservations'), set()))

assert cluster_name is not None
credentials = sky_check.get_cloud_credential_file_mounts()
excluded_clouds = []
remote_identity = skypilot_config.get_nested(
(str(cloud).lower(), 'remote_identity'), 'LOCAL_CREDENTIALS')
if remote_identity == 'SERVICE_ACCOUNT':
if not cloud.supports_service_account_on_remote():
raise exceptions.InvalidCloudConfigs(
'remote_identity: SERVICE_ACCOUNT is specified in '
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
'is not supported by this cloud. Remove the config or set: '
'`remote_identity: LOCAL_CREDENTIALS`.')
excluded_clouds = [cloud]
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)

ip_list = None
auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
Expand Down Expand Up @@ -828,10 +843,9 @@ def write_cluster_config(
instance_tags = {}
instance_tags = skypilot_config.get_nested(
(str(cloud).lower(), 'instance_tags'), {})
if not isinstance(instance_tags, dict):
with ux_utils.print_exception_no_traceback():
raise ValueError('Custom instance_tags in config.yaml should '
f'be a dict, but received {type(instance_tags)}.')
# instance_tags is a dict, which is guaranteed by the type check in
# schemas.py
assert isinstance(instance_tags, dict), instance_tags

# Dump the Ray ports to a file for Ray job submission
dump_port_command = (
Expand Down Expand Up @@ -963,7 +977,9 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
"""
config = common_utils.read_yaml(cluster_config_file)
# Check the availability of the cloud type.
if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere)):
if isinstance(
cloud,
(clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere, clouds.Cudo)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
config = auth.setup_gcp_authentication(config)
Expand Down
19 changes: 14 additions & 5 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def _get_cluster_config_template(cloud):
cloud_to_template = {
clouds.AWS: 'aws-ray.yml.j2',
clouds.Azure: 'azure-ray.yml.j2',
clouds.Cudo: 'cudo-ray.yml.j2',
clouds.GCP: 'gcp-ray.yml.j2',
clouds.Lambda: 'lambda-ray.yml.j2',
clouds.IBM: 'ibm-ray.yml.j2',
Expand Down Expand Up @@ -1507,6 +1508,17 @@ def _retry_zones(
# does not have nodes labeled with GPU types.
logger.info(f'{e}')
continue
except exceptions.InvalidCloudConfigs as e:
# Failed due to invalid user configs in ~/.sky/config.yaml.
logger.warning(f'{common_utils.format_exception(e)}')
# We should block the entire cloud if the user config is
# invalid.
_add_to_blocked_resources(
self._blocked_resources,
to_provision.copy(region=None, zone=None))
raise exceptions.ResourcesUnavailableError(
f'Failed to provision on cloud {to_provision.cloud} due to '
f'invalid cloud config: {common_utils.format_exception(e)}')
if dryrun:
return config_dict
cluster_config_file = config_dict['ray']
Expand Down Expand Up @@ -2044,7 +2056,7 @@ def provision_with_retries(
try:
# Recheck cluster name as the 'except:' block below may
# change the cloud assignment.
to_provision.cloud.check_cluster_name_is_valid(cluster_name)
common_utils.check_cluster_name_is_valid(cluster_name)
if dryrun:
cloud_user = None
else:
Expand Down Expand Up @@ -4380,10 +4392,7 @@ def _check_existing_cluster(
usage_lib.messages.usage.set_new_cluster()
# Use the task_cloud, because the cloud in `to_provision` can be changed
# later during the retry.
for resources in task.resources:
task_cloud = (resources.cloud
if resources.cloud is not None else clouds.Cloud)
task_cloud.check_cluster_name_is_valid(cluster_name)
common_utils.check_cluster_name_is_valid(cluster_name)

if to_provision is None:
# The cluster is recently terminated either by autostop or manually
Expand Down
8 changes: 6 additions & 2 deletions sky/check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Credential checks: check cloud credentials and enable clouds."""
from typing import Dict
from typing import Dict, Iterable, Optional

import click

Expand Down Expand Up @@ -72,7 +72,8 @@ def check(quiet: bool = False, verbose: bool = False) -> None:
global_user_state.set_enabled_clouds(enabled_clouds)


def get_cloud_credential_file_mounts() -> Dict[str, str]:
def get_cloud_credential_file_mounts(
excluded_clouds: Optional[Iterable[clouds.Cloud]]) -> Dict[str, str]:
"""Returns the files necessary to access all enabled clouds.
Returns a dictionary that will be added to a task's file mounts
Expand All @@ -81,6 +82,9 @@ def get_cloud_credential_file_mounts() -> Dict[str, str]:
enabled_clouds = global_user_state.get_enabled_clouds()
file_mounts = {}
for cloud in enabled_clouds:
if (excluded_clouds is not None and
clouds.cloud_in_list(cloud, excluded_clouds)):
continue
cloud_file_mounts = cloud.get_credential_file_mounts()
file_mounts.update(cloud_file_mounts)
# Currently, get_enabled_clouds() does not support r2
Expand Down
11 changes: 1 addition & 10 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4076,16 +4076,7 @@ def spot_launch(
if prompt is not None:
click.confirm(prompt, default=True, abort=True, show_default=True)

for task in dag.tasks:
# We try our best to validate the cluster name before we launch the
# task. If the cloud is not specified, this will only validate the
# cluster name against the regex, and the cloud-specific validation will
# be done by the spot controller when actually launching the spot
# cluster.
for resources in task.resources:
task_cloud = (resources.cloud
if resources.cloud is not None else clouds.Cloud)
task_cloud.check_cluster_name_is_valid(name)
common_utils.check_cluster_name_is_valid(name)

sky.spot_launch(dag,
name,
Expand Down
5 changes: 5 additions & 0 deletions sky/clouds/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Clouds in Sky."""
from sky.clouds.cloud import Cloud
from sky.clouds.cloud import cloud_in_list
from sky.clouds.cloud import CloudImplementationFeatures
from sky.clouds.cloud import ProvisionerVersion
from sky.clouds.cloud import Region
Expand All @@ -12,6 +13,7 @@
from sky.clouds.aws import AWS
from sky.clouds.azure import Azure
from sky.clouds.fluidstack import Fluidstack
from sky.clouds.cudo import Cudo
from sky.clouds.gcp import GCP
from sky.clouds.ibm import IBM
from sky.clouds.kubernetes import Kubernetes
Expand All @@ -27,6 +29,7 @@
'AWS',
'Azure',
'Cloud',
'Cudo',
'GCP',
'Lambda',
'Local',
Expand All @@ -42,4 +45,6 @@
'ProvisionerVersion',
'StatusVersion',
'Fluidstack',
# Utility functions
'cloud_in_list',
]
2 changes: 2 additions & 0 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ class AWS(clouds.Cloud):
# Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html # pylint: disable=line-too-long
_MAX_CLUSTER_NAME_LEN_LIMIT = 248

_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True

_regions: List[clouds.Region] = []

_INDENT_PREFIX = ' '
Expand Down
Loading

0 comments on commit 366c380

Please sign in to comment.