Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AWS] Failover Optimization from Quota #1953

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2d1bf15
initial commit
Apr 18, 2023
d674726
more quick changes
Apr 19, 2023
f91a3f1
try push
Apr 21, 2023
164b490
remove test lines
Apr 21, 2023
a647263
quick file add
Apr 21, 2023
8053c9f
things finally working
Apr 21, 2023
c29da94
Merge branch 'master' into aws-quota-optimization
shethhriday29 Apr 21, 2023
63fd297
refactoring
Apr 26, 2023
bc77487
quick string fix
Apr 26, 2023
b858a94
removed test cases
May 12, 2023
6bc102a
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 17, 2023
959dae4
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 17, 2023
fe2c0ff
formatting changes, comments, and docstring
Jun 17, 2023
45dd0ff
added test for aws zero quota failover
Jun 19, 2023
f472bf4
Update sky/clouds/cloud.py
shethhriday29 Jun 19, 2023
8d04c00
merge with master
Jun 19, 2023
4e843b8
edits based on romil review
Jun 19, 2023
2ffa694
ran formatter
Jun 19, 2023
669af4c
more romil comments
Jun 20, 2023
f6b8b69
formatting
Jun 20, 2023
1ee1d89
zhanghao edits
Jun 22, 2023
91f22f8
Update tests/test_smoke.py
shethhriday29 Jun 22, 2023
1b8bcc2
Update sky/backends/cloud_vm_ray_backend.py
shethhriday29 Jun 22, 2023
8aa8722
formatting
Jun 22, 2023
93d5f1d
Update tests/test_smoke.py
shethhriday29 Jun 23, 2023
4ae2214
Update sky/clouds/aws.py
shethhriday29 Jun 23, 2023
8943ef1
Update sky/clouds/aws.py
shethhriday29 Jun 23, 2023
e2db07b
took out imports
Jun 23, 2023
268255e
format
Jun 23, 2023
b8b371c
better logging
Jun 24, 2023
f471738
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 24, 2023
ac54096
formatting
Jun 24, 2023
b5706eb
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
shethhriday29 Jun 26, 2023
0b0a76d
zongheng comments
shethhriday29 Jun 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,36 @@ def _retry_zones(
to_provision, 'region should have been set by the optimizer.')
region = clouds.Region(to_provision.region)

# Optimization - check if user has non-zero quota for
# the instance type in the target region. If not, fail early
# instead of trying to provision and failing later.
try:
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
need_provision = to_provision.cloud.check_quota_available(
to_provision.region, to_provision.instance_type,
to_provision.use_spot)

except Exception as e: # pylint: disable=broad-except
need_provision = True
logger.info(f'Error occurred when trying to check quota. '
f'Proceeding assuming quotas are available. Error: '
f'{common_utils.format_exception(e, use_bracket=True)}')

if not need_provision:
# if quota is found to be zero, raise exception and skip to
# the next region
if to_provision.use_spot:
instance_descriptor = 'spot'
else:
instance_descriptor = 'on-demand'
raise exceptions.ResourcesUnavailableError(
f'{colorama.Fore.YELLOW}Found no quota for '
f'{to_provision.instance_type} {instance_descriptor} '
f'instances in region {to_provision.region}. '
f'{colorama.Style.RESET_ALL}'
f'To request quotas, check the instruction: '
f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.' # pylint: disable=line-too-long
)

for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
prev_cluster_status):
# Filter out zones that are blocked, if any.
Expand Down
50 changes: 42 additions & 8 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,13 +416,6 @@ def _make(instance_list):
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
"""Checks if the user has access credentials to this cloud."""
try:
# pylint: disable=import-outside-toplevel,unused-import
import boto3
import botocore
except ImportError:
raise ImportError('Fail to import dependencies for AWS.'
'Try pip install "skypilot[aws]"') from None

# Checks if the AWS CLI is installed properly
proc = subprocess.run('aws --version',
Expand Down Expand Up @@ -478,7 +471,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
cls._STATIC_CREDENTIAL_HELP_STR)

# Fetch the AWS catalogs
from sky.clouds.service_catalog import aws_catalog # pylint: disable=import-outside-toplevel,unused-import
from sky.clouds.service_catalog import aws_catalog # pylint: disable=import-outside-toplevel
# Trigger the fetch of the availability zones mapping.
aws_catalog.get_default_instance_type()
return True, hints
Expand Down Expand Up @@ -695,6 +688,47 @@ def _get_disk_specs(cls, disk_tier: Optional[str]) -> Dict[str, Any]:
'custom_disk_perf': tier != 'low',
}

@classmethod
def check_quota_available(cls,
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""Check if AWS quota is available for `instance_type` in `region`.

AWS-specific implementation of check_quota_available. The function works by
matching the instance_type to the corresponding AWS quota code, and then using
the boto3 Python API to query the region for the specific quota code.

Returns:
False if the quota is found to be zero, and True otherwise.
Raises:
ImportError: if the dependencies for AWS are not able to be installed.
botocore.exceptions.ClientError: error in Boto3 client request.
"""

from sky.clouds.service_catalog import aws_catalog # pylint: disable=import-outside-toplevel,unused-import
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

quota_code = aws_catalog.get_quota_code(instance_type, use_spot)

if quota_code is None:
# Quota code not found in the catalog for the chosen instance_type, try provisioning anyway
return True
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

client = aws.client('service-quotas', region_name=region)
try:
response = client.get_service_quota(ServiceCode='ec2',
QuotaCode=quota_code)
except aws.botocore_exceptions().ClientError:
# Botocore client connection not established, try provisioning anyways
return True

if response['Quota']['Value'] == 0:
# Quota found to be zero, do not try provisioning
return False

# Quota found to be greater than zero, try provisioning
return True

@classmethod
def query_status(cls, name: str, tag_filters: Dict[str, str],
region: Optional[str], zone: Optional[str],
Expand Down
52 changes: 52 additions & 0 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,58 @@ def check_disk_tier_enabled(cls, instance_type: str,
"""
raise NotImplementedError

@classmethod
# pylint: disable=unused-argument
def check_quota_available(cls,
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""Check if quota is available for `instance_type` in `region`.

(Currently, check_quota_available is only implemented for AWS.)

The _retry_zones function in cloud_vm_ray_backend goes through different
candidate regions and attempts to provision the requested instance_type
accelerators in the region, until a successful provisioning happens
or all regions with the requested accelerator have been looked at.
Previously, SkyPilot would attempt to provision resources in all of
these regions. However, many regions would have a zero quota or
inadequate quota, meaning these attempted provisions were destined
to fail from the get-go.

Checking the quota is substantially faster than attempting a failed
provision (~1 second vs 30+ seconds) so this function attempts to
check the resource quota and return False if it is found to be zero,
or True otherwise. If False is returned, _retry_zones will not attempt
a provision in the region, saving time.

We are only checking for a nonzero quota, instead of also factoring in
quota utilization because many cloud providers' APIs don't have a
built-in command for checking the real-time utilization. Checking
real-time utilization is a more difficult endeavor that involves
monitoring etc., so we are holding off on that for now.

If for at any point the function fails, whether it's because we can't
import the necessary dependencies or a query using a cloud provider's
API fails, we will return True, because we cannot conclusively say the
relevant quota is zero in these cases, and we don't want to
preemptively exclude regions from an attempted provision if they may
have an adequate quota.

Design choice: We chose a just-in-time approach where
check_quota_available is called immediately before a potential
attempted provision, rather than checking all region quotas
beforehand, storing them, and using those values on-demand. This is
because, for example, _retry_zones may only need to go through one or
a few regions before a successful region, and running a query to check
*every* region's quota beforehand would cause an unnecessary delay.

Returns:
False if the quota is found to be zero, and true otherwise.
"""

return True
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def query_status(cls, name: str, tag_filters: Dict[str, str],
region: Optional[str], zone: Optional[str],
Expand Down
21 changes: 21 additions & 0 deletions sky/clouds/service_catalog/aws_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@
_image_df = common.read_catalog('aws/images.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)

_quotas_df = common.read_catalog('aws/instance_quota_mapping.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)


def _fetch_and_apply_az_mapping(df: pd.DataFrame) -> pd.DataFrame:
"""Maps zone IDs (use1-az1) to zone names (us-east-1x).
Expand Down Expand Up @@ -148,6 +151,24 @@ def _get_df() -> pd.DataFrame:
return _user_df


def get_quota_code(instance_type: str, use_spot: bool) -> Optional[str]:
# Get the quota code from the accelerator instance type
# This will be used in the botocore command to check for
# a non-zero quota

if use_spot:
spot_header = 'SpotInstanceCode'
else:
spot_header = 'OnDemandInstanceCode'
try:
quota_code = _quotas_df.loc[_quotas_df['InstanceType'] == instance_type,
spot_header].values[0]
return quota_code

except IndexError:
return None


def instance_type_exists(instance_type: str) -> bool:
return common.instance_type_exists_impl(_get_df(), instance_type)

Expand Down
36 changes: 36 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,22 @@ def run_one_test(test: Test) -> Tuple[int, str, str]:
raise Exception(f'test failed: less {log_file.name}')


def get_aws_region_for_quota_failover() -> Optional[str]:

candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge',
accelerators=None,
use_spot=True,
region=None,
zone=None)

for region in candidate_regions:
if not AWS.check_quota_available(
region=region.name, instance_type='p3.16xlarge', use_spot=True):
return region.name

return None


# ---------- Dry run: 2 Tasks in a chain. ----------
def test_example_app():
test = Test(
Expand Down Expand Up @@ -2199,6 +2215,26 @@ def test_azure_disk_tier():
run_one_test(test)


# ------ Testing Zero Quota Failover ------
@pytest.mark.aws
def test_aws_zero_quota_failover():

name = _get_cluster_name()
region = get_aws_region_for_quota_failover()

if not region:
return

test = Test(
'aws-zero-quota-failover',
[
f'sky launch -y -c {name} --cloud aws --region {region} --gpus V100:8 --use-spot | grep "Found no quota"',
],
f'sky down -y {name}',
)
run_one_test(test)


# ------- Testing user ray cluster --------
def test_user_ray_cluster():
name = _get_cluster_name()
Expand Down