Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[show-gpus] Optimize show-gpus for GCP and reorganize TPU #3113

Merged
merged 14 commits into from
Feb 12, 2024
14 changes: 10 additions & 4 deletions sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def list_accelerators(
clouds: CloudFilter = None,
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True,
) -> 'Dict[str, List[common.InstanceTypeInfo]]':
"""List the names of all accelerators offered by Sky.

Expand All @@ -73,7 +74,7 @@ def list_accelerators(
"""
results = _map_clouds_catalog(clouds, 'list_accelerators', gpus_only,
name_filter, region_filter, quantity_filter,
case_sensitive, all_regions)
case_sensitive, all_regions, require_price)
if not isinstance(results, list):
results = [results]
ret: Dict[str,
Expand All @@ -96,9 +97,14 @@ def list_accelerator_counts(
Returns: A dictionary of canonical accelerator names mapped to a list
of available counts. See usage in cli.py.
"""
results = _map_clouds_catalog(clouds, 'list_accelerators', gpus_only,
name_filter, region_filter, quantity_filter,
False)
results = _map_clouds_catalog(clouds,
'list_accelerators',
gpus_only,
name_filter,
region_filter,
quantity_filter,
all_regions=False,
require_price=False)
if not isinstance(results, list):
results = [results]
accelerator_counts: Dict[str, Set[int]] = collections.defaultdict(set)
Expand Down
4 changes: 3 additions & 1 deletion sky/clouds/service_catalog/aws_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,10 @@ def list_accelerators(
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False) -> Dict[str, List[common.InstanceTypeInfo]]:
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in AWS offering accelerators."""
del require_price # Unused.
return common.list_accelerators_impl('AWS', _get_df(), gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
Expand Down
4 changes: 3 additions & 1 deletion sky/clouds/service_catalog/azure_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ def list_accelerators(
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False) -> Dict[str, List[common.InstanceTypeInfo]]:
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in Azure offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('Azure', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
19 changes: 10 additions & 9 deletions sky/clouds/service_catalog/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,15 +445,15 @@ def get_instance_type_for_accelerator_impl(


def list_accelerators_impl(
cloud: str,
df: pd.DataFrame,
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[InstanceTypeInfo]]:
cloud: str,
df: pd.DataFrame,
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[InstanceTypeInfo]]:
"""Lists accelerators offered in a cloud service catalog.

`name_filter` is a regular expression used to filter accelerator names
Expand All @@ -462,6 +462,7 @@ def list_accelerators_impl(
Returns a mapping from the canonical names of accelerators to a list of
instance types offered by this cloud.
"""
del require_price # Unused.
if gpus_only:
df = df[~df['GpuInfo'].isna()]
df = df.copy() # avoid column assignment warning
Expand Down
172 changes: 111 additions & 61 deletions sky/clouds/service_catalog/gcp_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
For now this service catalog is manually coded. In the future it should be
queried from GCP API.
"""
from collections import defaultdict
import typing
from typing import Dict, List, Optional, Tuple

Expand Down Expand Up @@ -364,68 +363,119 @@ def list_accelerators(
quantity_filter: Optional[int] = None,
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True,
) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in GCP offering GPUs."""
results = common.list_accelerators_impl('GCP', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)

# Remove GPUs that are unsupported by SkyPilot.
new_results = {}
for acc_name, acc_info in results.items():
if (acc_name.startswith('tpu') or
acc_name in _NUM_ACC_TO_MAX_CPU_AND_MEMORY or
acc_name in _ACC_INSTANCE_TYPE_DICTS):
new_results[acc_name] = acc_info
results = new_results

# Figure out which instance type to use.
infos_with_instance_type: List[common.InstanceTypeInfo] = sum(
results.values(), [])

new_infos = defaultdict(list)
for info in infos_with_instance_type:
assert pd.isna(info.instance_type) and pd.isna(info.memory), info
# We show TPU-VM prices here, so no instance type is needed.
# Set it as `TPU-VM` to be shown in the table.
if info.accelerator_name.startswith('tpu'):
new_infos[info.accelerator_name].append(
info._replace(instance_type='TPU-VM'))
continue
vm_types, _ = get_instance_type_for_accelerator(info.accelerator_name,
info.accelerator_count,
region=region_filter)
# The acc name & count in `info` are retrieved from the table so we
# could definitely find a column in the original table
# Additionally the way get_instance_type_for_accelerator works
# we will always get either a specialized instance type
# or a default instance type. So we can safely assume that
# vm_types is not None.
assert vm_types is not None
for vm_type in vm_types:
df = _df[_df['InstanceType'] == vm_type]
cpu_count = df['vCPUs'].iloc[0]
memory = df['MemoryGiB'].iloc[0]
vm_price = common.get_hourly_cost_impl(_df,
vm_type,
use_spot=False,
region=region_filter,
zone=None)
vm_spot_price = common.get_hourly_cost_impl(_df,
vm_type,
use_spot=True,
region=region_filter,
zone=None)
new_infos[info.accelerator_name].append(
info._replace(
instance_type=vm_type,
cpu_count=cpu_count,
memory=memory,
# total cost = VM instance + GPU.
price=info.price + vm_price,
spot_price=info.spot_price + vm_spot_price,
))
return new_infos

# This is a simplified version of get_instance_type_for_accelerator, as it
# has to be applied to all the entries in acc_host_df, and the input is more
# restricted.
def _get_host_vm(acc_name: str, acc_count: int, region: str,
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
zone: str) -> Optional[str]:
df = _df[(_df['Region'].str.lower() == region.lower()) &
(_df['AvailabilityZone'].str.lower() == zone.lower()) &
(_df['InstanceType'].notna())]
if acc_name in _ACC_INSTANCE_TYPE_DICTS:
instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count]
df = df[df['InstanceType'].isin(instance_types)]
else:
if acc_name not in _NUM_ACC_TO_NUM_CPU:
acc_name = 'DEFAULT'

cpus = _NUM_ACC_TO_NUM_CPU[acc_name][acc_count]
# The memory size should be at least 4x the requested number of
# vCPUs to be consistent with all other clouds.
memory = cpus * _DEFAULT_GPU_MEMORY_CPU_RATIO
df = df[df['InstanceType'].str.startswith(_DEFAULT_HOST_VM_FAMILY)]
df = df[(df['vCPUs'] >= cpus) & (df['MemoryGiB'] >= memory)]
df = df.dropna(subset=['Price'])
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
# Some regions may not have the host VM available, although the GPU is
# offered, e.g., a2-megagpu-16g in asia-northeast1.
if df.empty:
return None
row = df.loc[df['Price'].idxmin()]
return row['InstanceType']

acc_host_df = _df
tpu_df = None
# If price information is not required, we do not need to fetch the host VM
# information.
if require_price:
acc_host_df = _df[_df['AcceleratorName'].notna()]
# Filter the acc_host_df first before fetching the host VM information.
# This is to reduce the rows to be processed for optimization.
# TODO: keep it sync with `list_accelerators_impl`
if gpus_only:
acc_host_df = acc_host_df[~acc_host_df['GpuInfo'].isna()]
if name_filter is not None:
acc_host_df = acc_host_df[
acc_host_df['AcceleratorName'].str.contains(
name_filter, case=case_sensitive)]
if region_filter is not None:
acc_host_df = acc_host_df[acc_host_df['Region'] == region_filter]
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
if quantity_filter is not None:
acc_host_df = acc_host_df[acc_host_df['AcceleratorCount'] ==
quantity_filter]
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
# Split the TPU and GPU information, as we should not combine the price
# of the TPU and the host VM.
# TODO: Fix the price for TPU VMs.
is_tpu = acc_host_df['AcceleratorName'].str.startswith('tpu-')
tpu_df = acc_host_df[is_tpu]
acc_host_df = acc_host_df[~is_tpu]
if not acc_host_df.empty:
assert acc_host_df['InstanceType'].isna().all(), acc_host_df
acc_host_df = acc_host_df.drop(
columns=['InstanceType', 'vCPUs', 'MemoryGiB'])
# Fetch the host VM information for each accelerator in its region
# and zone.
acc_host_df['InstanceType'] = acc_host_df.apply(
lambda x: _get_host_vm(x['AcceleratorName'],
x['AcceleratorCount'],
region=x['Region'],
zone=x['AvailabilityZone']),
axis=1)
acc_host_df.dropna(subset=['InstanceType', 'Price'], inplace=True)
# Combine the price of the host VM and the GPU.
acc_host_df = pd.merge(
acc_host_df,
_df[[
'InstanceType', 'vCPUs', 'MemoryGiB', 'Region',
'AvailabilityZone', 'Price', 'SpotPrice'
]],
on=['InstanceType', 'Region', 'AvailabilityZone'],
how='inner',
suffixes=('', '_host'))
acc_host_df['Price'] = (acc_host_df['Price'] +
acc_host_df['Price_host'])
acc_host_df['SpotPrice'] = (acc_host_df['SpotPrice'] +
acc_host_df['SpotPrice_host'])
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
acc_host_df.rename(columns={
'vCPUs_host': 'vCPUs',
'MemoryGiB_host': 'MemoryGiB'
},
inplace=True)
acc_host_df = pd.concat([acc_host_df, tpu_df])
results = common.list_accelerators_impl('GCP', acc_host_df, gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
all_regions)
if tpu_df is not None and not tpu_df.empty:
# Combine the entries for TPUs.
for acc_name in list(results.keys()):
if acc_name.startswith('tpu-'):
version = acc_name.split('-')[1]
acc_info = results.pop(acc_name)
tpu_group = f'tpu-{version}'
if tpu_group not in results:
results[tpu_group] = []
results[tpu_group].extend(acc_info)
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
for acc_group in list(results.keys()):
if acc_group.startswith('tpu-'):
results[acc_group] = sorted(
results[acc_group],
key=lambda x: (x.price, x.spot_price, x.region),
)
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
return results


def get_region_zones_for_accelerators(
Expand Down
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/ibm_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in IBM offering accelerators."""
del require_price # Unused.
return common.list_accelerators_impl('IBM', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
Expand Down
16 changes: 8 additions & 8 deletions sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
del all_regions, require_price # Unused.
k8s_cloud = Kubernetes()
del all_regions # unused
if not any(
map(k8s_cloud.is_same_cloud, global_user_state.get_enabled_clouds())
) or not kubernetes_utils.check_credentials()[0]:
Expand Down
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/lambda_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in Lambda offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('Lambda', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/oci_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in OCI offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('OCI', _get_df(), gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
Expand Down
15 changes: 8 additions & 7 deletions sky/clouds/service_catalog/runpod_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,15 @@ def get_region_zones_for_instance_type(instance_type: str,


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
) -> Dict[str, List[common.InstanceTypeInfo]]:
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in RunPod offering GPUs."""
del require_price # Unused.
return common.list_accelerators_impl('RunPod', _df, gpus_only, name_filter,
region_filter, quantity_filter,
case_sensitive, all_regions)
Loading
Loading