Skip to content

Commit

Permalink
[show-gpus] Optimize show-gpus for GCP and reorganize TPU (#3113)
Browse files Browse the repository at this point in the history
* optimize show-gpus

* readability

* further optimization for GPU counts only

* filter optimization

* Add comments

* Add comment

* fix comment

* Group and sort TPU

* format

* fix list accelerator test

* fix

* address comments
  • Loading branch information
Michaelvll authored Feb 12, 2024
1 parent 9a77adf commit 3f0ad21
Show file tree
Hide file tree
Showing 13 changed files with 202 additions and 128 deletions.
14 changes: 10 additions & 4 deletions sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def list_accelerators(
clouds: CloudFilter = None,
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True,
) -> 'Dict[str, List[common.InstanceTypeInfo]]':
"""List the names of all accelerators offered by Sky.
Expand All @@ -73,7 +74,7 @@ def list_accelerators(
"""
results = _map_clouds_catalog(clouds, 'list_accelerators', gpus_only,
name_filter, region_filter, quantity_filter,
case_sensitive, all_regions)
case_sensitive, all_regions, require_price)
if not isinstance(results, list):
results = [results]
ret: Dict[str,
Expand All @@ -96,9 +97,14 @@ def list_accelerator_counts(
Returns: A dictionary of canonical accelerator names mapped to a list
of available counts. See usage in cli.py.
"""
results = _map_clouds_catalog(clouds, 'list_accelerators', gpus_only,
name_filter, region_filter, quantity_filter,
False)
results = _map_clouds_catalog(clouds,
'list_accelerators',
gpus_only,
name_filter,
region_filter,
quantity_filter,
all_regions=False,
require_price=False)
if not isinstance(results, list):
results = [results]
accelerator_counts: Dict[str, Set[int]] = collections.defaultdict(set)
Expand Down
4 changes: 3 additions & 1 deletion sky/clouds/service_catalog/aws_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,10 @@ def list_accelerators(
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False) -> Dict[str, List[common.InstanceTypeInfo]]:
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in AWS offering accelerators."""
del require_price # Unused.
return common.list_accelerators_impl('AWS', _get_df(), gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
Expand Down
4 changes: 3 additions & 1 deletion sky/clouds/service_catalog/azure_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ def list_accelerators(
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False) -> Dict[str, List[common.InstanceTypeInfo]]:
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in Azure offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('Azure', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
19 changes: 10 additions & 9 deletions sky/clouds/service_catalog/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,15 +445,15 @@ def get_instance_type_for_accelerator_impl(


def list_accelerators_impl(
cloud: str,
df: pd.DataFrame,
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[InstanceTypeInfo]]:
cloud: str,
df: pd.DataFrame,
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[InstanceTypeInfo]]:
"""Lists accelerators offered in a cloud service catalog.
`name_filter` is a regular expression used to filter accelerator names
Expand All @@ -462,6 +462,7 @@ def list_accelerators_impl(
Returns a mapping from the canonical names of accelerators to a list of
instance types offered by this cloud.
"""
del require_price # Unused.
if gpus_only:
df = df[~df['GpuInfo'].isna()]
df = df.copy() # avoid column assignment warning
Expand Down
179 changes: 118 additions & 61 deletions sky/clouds/service_catalog/gcp_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
For now this service catalog is manually coded. In the future it should be
queried from GCP API.
"""
from collections import defaultdict
import typing
from typing import Dict, List, Optional, Tuple

Expand Down Expand Up @@ -364,68 +363,126 @@ def list_accelerators(
quantity_filter: Optional[int] = None,
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True,
) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in GCP offering GPUs."""
results = common.list_accelerators_impl('GCP', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)

# Remove GPUs that are unsupported by SkyPilot.
new_results = {}
for acc_name, acc_info in results.items():
if (acc_name.startswith('tpu') or
acc_name in _NUM_ACC_TO_MAX_CPU_AND_MEMORY or
acc_name in _ACC_INSTANCE_TYPE_DICTS):
new_results[acc_name] = acc_info
results = new_results

# Figure out which instance type to use.
infos_with_instance_type: List[common.InstanceTypeInfo] = sum(
results.values(), [])

new_infos = defaultdict(list)
for info in infos_with_instance_type:
assert pd.isna(info.instance_type) and pd.isna(info.memory), info
# We show TPU-VM prices here, so no instance type is needed.
# Set it as `TPU-VM` to be shown in the table.
if info.accelerator_name.startswith('tpu'):
new_infos[info.accelerator_name].append(
info._replace(instance_type='TPU-VM'))
continue
vm_types, _ = get_instance_type_for_accelerator(info.accelerator_name,
info.accelerator_count,
region=region_filter)
# The acc name & count in `info` are retrieved from the table so we
# could definitely find a column in the original table
# Additionally the way get_instance_type_for_accelerator works
# we will always get either a specialized instance type
# or a default instance type. So we can safely assume that
# vm_types is not None.
assert vm_types is not None
for vm_type in vm_types:
df = _df[_df['InstanceType'] == vm_type]
cpu_count = df['vCPUs'].iloc[0]
memory = df['MemoryGiB'].iloc[0]
vm_price = common.get_hourly_cost_impl(_df,
vm_type,
use_spot=False,
region=region_filter,
zone=None)
vm_spot_price = common.get_hourly_cost_impl(_df,
vm_type,
use_spot=True,
region=region_filter,
zone=None)
new_infos[info.accelerator_name].append(
info._replace(
instance_type=vm_type,
cpu_count=cpu_count,
memory=memory,
# total cost = VM instance + GPU.
price=info.price + vm_price,
spot_price=info.spot_price + vm_spot_price,
))
return new_infos

# This is a simplified version of get_instance_type_for_accelerator, as it
# has to be applied to all the entries in acc_host_df, and the input is more
# restricted.
def _get_host_instance_type(acc_name: str, acc_count: int, region: str,
zone: str) -> Optional[str]:
df = _df[(_df['Region'].str.lower() == region.lower()) &
(_df['AvailabilityZone'].str.lower() == zone.lower()) &
(_df['InstanceType'].notna())]
if acc_name in _ACC_INSTANCE_TYPE_DICTS:
instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count]
df = df[df['InstanceType'].isin(instance_types)]
else:
if acc_name not in _NUM_ACC_TO_NUM_CPU:
acc_name = 'DEFAULT'

cpus = _NUM_ACC_TO_NUM_CPU[acc_name][acc_count]
# The memory size should be at least 4x the requested number of
# vCPUs to be consistent with all other clouds.
memory = cpus * _DEFAULT_GPU_MEMORY_CPU_RATIO
df = df[df['InstanceType'].str.startswith(_DEFAULT_HOST_VM_FAMILY)]
df = df[(df['vCPUs'] >= cpus) & (df['MemoryGiB'] >= memory)]
df = df.dropna(subset=['Price'])
# Some regions may not have the host VM available, although the GPU is
# offered, e.g., a2-megagpu-16g in asia-northeast1.
if df.empty:
return None
row = df.loc[df['Price'].idxmin()]
return row['InstanceType']

acc_host_df = _df
tpu_df = None
# If price information is not required, we do not need to fetch the host VM
# information.
if require_price:
acc_host_df = _df[_df['AcceleratorName'].notna()]
# Filter the acc_host_df first before fetching the host VM information.
# This is to reduce the rows to be processed for optimization.
# TODO: keep it sync with `list_accelerators_impl`
if gpus_only:
acc_host_df = acc_host_df[~acc_host_df['GpuInfo'].isna()]
if name_filter is not None:
acc_host_df = acc_host_df[
acc_host_df['AcceleratorName'].str.contains(name_filter,
case=case_sensitive,
regex=True)]
if region_filter is not None:
acc_host_df = acc_host_df[acc_host_df['Region'].str.contains(
region_filter, case=case_sensitive, regex=True)]
acc_host_df['AcceleratorCount'] = acc_host_df[
'AcceleratorCount'].astype(int)
if quantity_filter is not None:
acc_host_df = acc_host_df[acc_host_df['AcceleratorCount'] ==
quantity_filter]
# Split the TPU and GPU information, as we should not combine the price
# of the TPU and the host VM.
# TODO: Fix the price for TPU VMs.
is_tpu = acc_host_df['AcceleratorName'].str.startswith('tpu-')
tpu_df = acc_host_df[is_tpu]
acc_host_df = acc_host_df[~is_tpu]
if not acc_host_df.empty:
assert acc_host_df['InstanceType'].isna().all(), acc_host_df
acc_host_df = acc_host_df.drop(
columns=['InstanceType', 'vCPUs', 'MemoryGiB'])
# Fetch the host VM information for each accelerator in its region
# and zone.
acc_host_df['InstanceType'] = acc_host_df.apply(
lambda x: _get_host_instance_type(x['AcceleratorName'],
x['AcceleratorCount'],
region=x['Region'],
zone=x['AvailabilityZone']),
axis=1)
acc_host_df.dropna(subset=['InstanceType', 'Price'], inplace=True)
# Combine the price of the host VM and the GPU.
acc_host_df = pd.merge(
acc_host_df,
_df[[
'InstanceType', 'vCPUs', 'MemoryGiB', 'Region',
'AvailabilityZone', 'Price', 'SpotPrice'
]],
on=['InstanceType', 'Region', 'AvailabilityZone'],
how='inner',
suffixes=('', '_host'))
# Combine the price of the host instance type and the GPU.
acc_host_df['Price'] = (acc_host_df['Price'] +
acc_host_df['Price_host'])
acc_host_df['SpotPrice'] = (acc_host_df['SpotPrice'] +
acc_host_df['SpotPrice_host'])
acc_host_df.rename(columns={
'vCPUs_host': 'vCPUs',
'MemoryGiB_host': 'MemoryGiB'
},
inplace=True)
acc_host_df = pd.concat([acc_host_df, tpu_df])
results = common.list_accelerators_impl('GCP', acc_host_df, gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
all_regions)
if tpu_df is not None and not tpu_df.empty:
# Combine the entries for TPUs with the same version.
# TODO: The current TPU prices are the price of the TPU itself, not the
# TPU VM. We should fix this in the future.
for acc_name in list(results.keys()):
if acc_name.startswith('tpu-'):
version = acc_name.split('-')[1]
acc_info = results.pop(acc_name)
tpu_group = f'tpu-{version}'
if tpu_group not in results:
results[tpu_group] = []
results[tpu_group].extend(acc_info)
for acc_group in list(results.keys()):
if acc_group.startswith('tpu-'):
results[acc_group] = sorted(
results[acc_group],
key=lambda x: (x.price, x.spot_price, x.region),
)
return results


def get_region_zones_for_accelerators(
Expand Down
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/ibm_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in IBM offering accelerators."""
del require_price # Unused.
return common.list_accelerators_impl('IBM', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
Expand Down
16 changes: 8 additions & 8 deletions sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
del all_regions, require_price # Unused.
k8s_cloud = Kubernetes()
del all_regions # unused
if not any(
map(k8s_cloud.is_same_cloud, global_user_state.get_enabled_clouds())
) or not kubernetes_utils.check_credentials()[0]:
Expand Down
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/lambda_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in Lambda offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('Lambda', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/oci_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in OCI offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('OCI', _get_df(), gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
Expand Down
Loading

0 comments on commit 3f0ad21

Please sign in to comment.