Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AWS] Failover Optimization from Quota #1953

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2d1bf15
initial commit
Apr 18, 2023
d674726
more quick changes
Apr 19, 2023
f91a3f1
try push
Apr 21, 2023
164b490
remove test lines
Apr 21, 2023
a647263
quick file add
Apr 21, 2023
8053c9f
things finally working
Apr 21, 2023
c29da94
Merge branch 'master' into aws-quota-optimization
shethhriday29 Apr 21, 2023
63fd297
refactoring
Apr 26, 2023
bc77487
quick string fix
Apr 26, 2023
b858a94
removed test cases
May 12, 2023
6bc102a
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 17, 2023
959dae4
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 17, 2023
fe2c0ff
formatting changes, comments, and docstring
Jun 17, 2023
45dd0ff
added test for aws zero quota failover
Jun 19, 2023
f472bf4
Update sky/clouds/cloud.py
shethhriday29 Jun 19, 2023
8d04c00
merge with master
Jun 19, 2023
4e843b8
edits based on romil review
Jun 19, 2023
2ffa694
ran formatter
Jun 19, 2023
669af4c
more romil comments
Jun 20, 2023
f6b8b69
formatting
Jun 20, 2023
1ee1d89
zhanghao edits
Jun 22, 2023
91f22f8
Update tests/test_smoke.py
shethhriday29 Jun 22, 2023
1b8bcc2
Update sky/backends/cloud_vm_ray_backend.py
shethhriday29 Jun 22, 2023
8aa8722
formatting
Jun 22, 2023
93d5f1d
Update tests/test_smoke.py
shethhriday29 Jun 23, 2023
4ae2214
Update sky/clouds/aws.py
shethhriday29 Jun 23, 2023
8943ef1
Update sky/clouds/aws.py
shethhriday29 Jun 23, 2023
e2db07b
took out imports
Jun 23, 2023
268255e
format
Jun 23, 2023
b8b371c
better logging
Jun 24, 2023
f471738
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
Jun 24, 2023
ac54096
formatting
Jun 24, 2023
b5706eb
Merge branch 'master' of github.com:shethhriday29/skypilot into aws-q…
shethhriday29 Jun 26, 2023
0b0a76d
zongheng comments
shethhriday29 Jun 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,6 +1331,16 @@ def _retry_zones(
to_provision, 'region should have been set by the optimizer.')
region = clouds.Region(to_provision.region)

if (not to_provision.cloud.check_quota_not_zero(
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
to_provision.region, to_provision.instance_type,
to_provision.use_spot)):
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

# if quota is found to be zero, raise exception and skip to
# the next region
raise exceptions.ResourcesUnavailableError(
'Zero quota in the region with '
'attempted provisioning')
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
prev_cluster_status):
# Filter out zones that are blocked, if any.
Expand Down
54 changes: 54 additions & 0 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,3 +687,57 @@ def _get_disk_specs(cls, disk_tier: Optional[str]) -> Dict[str, Any]:
'disk_throughput': tier2iops[tier] // 16,
'custom_disk_perf': tier != 'low',
}

@classmethod
def check_quota_not_zero(cls,
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nit: Having a single line description of this function in the first line.

Suggested change
"""
"""Check if the quota is available for the requested instance_type

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Google styleguide: First line should be on the same line as the triple quotes. https://google.github.io/styleguide/pyguide.html#383-functions-and-methods

AWS-specific implmentation of check_quota_not_zero. The function works by
matching the instance_type to the corresponding AWS quota code, and then using
the boto3 Python API to query the region for the specific quota code.

Returns:
False if the quota is found to be zero, and True otherwise.
Raises:
ImportError: if the dependencies for AWS are not able to be installed.
botocore.exceptions.ClientError: error in Boto3 client request.
"""

try:
# pylint: disable=import-outside-toplevel
import boto3
import botocore
except ImportError:
raise ImportError('Fail to import dependencies for AWS.'
'Try pip install "skypilot[aws]"') from None
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

from sky.clouds.service_catalog import aws_catalog # pylint: disable=import-outside-toplevel,unused-import
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

spot_header = ''
if use_spot:
spot_header = 'SpotInstanceCode'
else:
spot_header = 'OnDemandInstanceCode'

quota_code = aws_catalog.get_quota_code(instance_type, spot_header)
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

if quota_code is None:
# Quota code not found in the catalog for the chosen instance_type, try provisioning anyway
return True
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

client = boto3.client('service-quotas', region_name=region)
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
try:
response = client.get_service_quota(ServiceCode='ec2',
QuotaCode=quota_code)
except botocore.exceptions.ClientError:
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
# Botocore client connection not established, try provisioning anyways
return True

if response['Quota']['Value'] == 0:
# Quota found to be zero, do not try provisioning
return False

# Quota found to be greater than zero, try provisioning
return True
53 changes: 53 additions & 0 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,5 +467,58 @@ def check_disk_tier_enabled(cls, instance_type: str,
"""
raise NotImplementedError

@classmethod
def check_quota_not_zero(cls,
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""
Checks to ensure that a particular accelerator has a nonzero quota
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto (styleguide)

in a region.

(As of right now, check_quota_not_zero is only implemented for AWS.)
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

The _retry_zones funtion in cloud_vm_ray_backend goes through different
candidate regions and attempts to provision the requested instance_type
accelerators in the region, until a successful provisioning happens
or all regions with the requested accelerator have been looked at.
Previously, SkyPilot would attempt to provision resources in all of
these regions. However, many regions would have a zero quota or
inadequate quota, meaning these attempted provisionings were destined
to fail from the get-go.

Checking the quota is substantially faster than attempting a failed
provision (~1 second vs 30+ seconds) so this function attempts to
check the resource quota and return False if it is found to be zero,
or True otherwise. If False is returned, _retry_zones will not attempt
a provision in the region, saving time.

We are only checking for a nonzero quota, instead of also factoring in
quota utilization because many cloud providers' APIs don't have a
built-in command for checking the real-time utilization. Checking
real-time utilization is a more difficult endeavor that involves
monitoring etc., so we are holding off on that for now.

If for at any point the function fails, whether it's because we can't
import the necessary dependencies or a query using a cloud provider's
API fails, we will return True, because we cannot conclusively say the
relevant quota is zero in these cases, and we don't want to
preemptively exclude regions from an attempted provision if they may
have an adequate quota.

Design choice: We chose a just-in-time approach where
check_quota_not_zero is called immediately before a potential
attempted provision, rather than checking all region quotas
beforehand, storing them, and using those values on-demand. This is
because, for example, _retry_zones may only need to go through one or
a few regions before a successful region, and running a query to check
*every* region's quota beforehand would cause an unnecessary delay.

Returns:
False if the quota is found to be zero, and true otherwise.
"""

return True
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved

def __repr__(self):
return self._REPR
16 changes: 16 additions & 0 deletions sky/clouds/service_catalog/aws_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@
_image_df = common.read_catalog('aws/images.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)

_quotas_df = common.read_catalog('aws/instance_quota_mapping.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)


def _fetch_and_apply_az_mapping(df: pd.DataFrame) -> pd.DataFrame:
"""Maps zone IDs (use1-az1) to zone names (us-east-1x).
Expand Down Expand Up @@ -148,6 +151,19 @@ def _get_df() -> pd.DataFrame:
return _user_df


def get_quota_code(instance_type: str, spot_header: str) -> Optional[str]:
# Get the quota code from the accelerator instance type
# This will be used in the botocore command to check for
# a non-zero quota
try:
quota_code = _quotas_df.loc[_quotas_df['InstanceType'] == instance_type,
spot_header].values[0]
return quota_code

except IndexError:
return None


def instance_type_exists(instance_type: str) -> bool:
return common.instance_type_exists_impl(_get_df(), instance_type)

Expand Down
3 changes: 2 additions & 1 deletion sky/clouds/service_catalog/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Constants used for service catalog."""
import os

HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
# TODO (Hriday Sheth) change URL once catalog branch is merged in
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/shethhriday29/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
shethhriday29 marked this conversation as resolved.
Show resolved Hide resolved
CATALOG_SCHEMA_VERSION = 'v5'
LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/')