
[GCP] Add L4 support #2212

Merged (17 commits) on Jul 18, 2023
7 changes: 5 additions & 2 deletions sky/clouds/gcp.py
@@ -358,8 +358,8 @@ def make_deploy_resources_variables(
         else:
             # Convert to GCP names:
             # https://cloud.google.com/compute/docs/gpus
-            if acc == 'A100-80GB':
-                # A100-80GB has a different name pattern.
+            if acc in ('A100-80GB', 'L4'):
+                # A100-80GB and L4 have a different name pattern.
                 resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower())
             else:
                 resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
@@ -370,6 +370,9 @@
                 # versions of CUDA as noted below.
                 # CUDA driver version 470.57.02, CUDA Library 11.4
                 image_id = 'skypilot:k80-debian-10'
+            elif acc == 'L4':
+                # CUDA driver version 525.105.17, CUDA Library 11.8
+                image_id = 'skypilot:cuda118-debian-11'
             else:
                 # Though the image is called cu113, it actually has later
                 # versions of CUDA as noted below.
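
A minimal sketch of the naming rule the hunk above implements (illustration only, not part of the PR; the helper name to_gcp_accelerator_name is made up): most GPUs map to the legacy 'nvidia-tesla-<name>' accelerator names on GCP, while A100-80GB and L4 use the plain 'nvidia-<name>' pattern.

# Hedged sketch, not code from the PR; the helper name is invented for illustration.
def to_gcp_accelerator_name(acc: str) -> str:
    if acc in ('A100-80GB', 'L4'):
        # These GPUs use the plain 'nvidia-<name>' pattern on GCP.
        return 'nvidia-{}'.format(acc.lower())
    # Other GPUs (K80, P100, T4, V100, A100, ...) use 'nvidia-tesla-<name>'.
    return 'nvidia-tesla-{}'.format(acc.lower())

assert to_gcp_accelerator_name('L4') == 'nvidia-l4'
assert to_gcp_accelerator_name('T4') == 'nvidia-tesla-t4'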
1 change: 1 addition & 0 deletions sky/clouds/service_catalog/__init__.py
@@ -282,6 +282,7 @@ def get_common_gpus() -> List[str]:
         'A100',
         'A100-80GB',
         'K80',
+        'L4',
         'M60',
         'P100',
         'T4',
4 changes: 2 additions & 2 deletions sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -59,8 +59,7 @@

 # TODO(woosuk): Make this more robust.
 # Refer to: https://github.com/skypilot-org/skypilot/issues/1006
-# G2 series has L4 GPU, which is not supported by SkyPilot yet
-# Unsupported Series: 'f1', 'm2', 'g2'
+# Unsupported Series: 'f1', 'm2'
 SERIES_TO_DISCRIPTION = {
     'a2': 'A2 Instance',
     'c2': 'Compute optimized',
@@ -69,6 +68,7 @@
     'e2': 'E2 Instance',
     'f1': 'Micro Instance with burstable CPU',
     'g1': 'Small Instance with 1 VCPU',
+    'g2': 'G2 Instance',
     'm1': 'Memory-optimized Instance',
     # FIXME(woosuk): Support M2 series.
     'm3': 'M3 Memory-optimized Instance',
157 changes: 87 additions & 70 deletions sky/clouds/service_catalog/gcp_catalog.py
@@ -58,21 +58,37 @@

 # TODO(zongheng): fix A100 info directly in catalog.
 # https://cloud.google.com/blog/products/compute/a2-vms-with-nvidia-a100-gpus-are-ga
-# count -> vm type
-_A100_INSTANCE_TYPE_DICTS = {
+
+# If A100 is used, host VM type must be A2; if L4 is used, VM type must be G2.
+# Conversely, A2 can only be used with A100, and G2 only with L4.
+# https://cloud.google.com/compute/docs/gpus
+# acc_type -> count -> vm types
+_ACC_INSTANCE_TYPE_DICTS = {
     'A100': {
-        1: 'a2-highgpu-1g',
-        2: 'a2-highgpu-2g',
-        4: 'a2-highgpu-4g',
-        8: 'a2-highgpu-8g',
-        16: 'a2-megagpu-16g',
+        1: ['a2-highgpu-1g'],
+        2: ['a2-highgpu-2g'],
+        4: ['a2-highgpu-4g'],
+        8: ['a2-highgpu-8g'],
+        16: ['a2-megagpu-16g'],
     },
     'A100-80GB': {
-        1: 'a2-ultragpu-1g',
-        2: 'a2-ultragpu-2g',
-        4: 'a2-ultragpu-4g',
-        8: 'a2-ultragpu-8g',
-    }
+        1: ['a2-ultragpu-1g'],
+        2: ['a2-ultragpu-2g'],
+        4: ['a2-ultragpu-4g'],
+        8: ['a2-ultragpu-8g'],
+    },
+    'L4': {
+        1: [
+            'g2-standard-4',
+            'g2-standard-8',
+            'g2-standard-12',
+            'g2-standard-16',
+            'g2-standard-32',
+        ],
+        2: ['g2-standard-24'],
+        4: ['g2-standard-48'],
+        8: ['g2-standard-96'],
+    },
 }

 # Number of CPU cores per GPU based on the AWS setting.
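
As a hedged illustration of how the new mapping above is meant to be consumed (the snippet below is a sketch with a trimmed, renamed copy of the dict, not code from the PR): the outer key is the accelerator name, the inner key is the GPU count, and the value is now a list of candidate host VM types rather than a single type.

# Sketch only: acc_to_hosts is a trimmed copy of _ACC_INSTANCE_TYPE_DICTS above.
acc_to_hosts = {
    'A100': {8: ['a2-highgpu-8g']},
    'L4': {
        1: ['g2-standard-4', 'g2-standard-8', 'g2-standard-12',
            'g2-standard-16', 'g2-standard-32'],
    },
}

assert acc_to_hosts['A100'][8] == ['a2-highgpu-8g']  # exactly one A2 shape fits 8x A100
assert 'g2-standard-4' in acc_to_hosts['L4'][1]      # several G2 shapes can host a single L4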
@@ -231,13 +247,10 @@ def get_instance_type_for_accelerator(
     if instance_list is None:
         return None, fuzzy_candidate_list

-    if acc_name in _A100_INSTANCE_TYPE_DICTS:
-        # If A100 is used, host VM type must be A2.
-        # https://cloud.google.com/compute/docs/gpus#a100-gpus
-
+    if acc_name in _ACC_INSTANCE_TYPE_DICTS:
         df = _df[_df['InstanceType'].notna()]
-        instance_type = _A100_INSTANCE_TYPE_DICTS[acc_name][acc_count]
-        df = df[df['InstanceType'] == instance_type]
+        instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count]
+        df = df[df['InstanceType'].isin(instance_types)]

         # Check the cpus and memory specified by the user.
         instance_type = common.get_instance_type_for_cpus_mem_impl(
@@ -345,45 +358,48 @@ def list_accelerators(
     for acc_name, acc_info in results.items():
         if (acc_name.startswith('tpu') or
                 acc_name in _NUM_ACC_TO_MAX_CPU_AND_MEMORY or
-                acc_name in _A100_INSTANCE_TYPE_DICTS):
-            new_results[acc_name] = acc_info
+                acc_name in _ACC_INSTANCE_TYPE_DICTS):
+            new_results[acc_name] = acc_info
     results = new_results

-    a100_infos = results.get('A100', []) + results.get('A100-80GB', [])
-    if not a100_infos:
-        return results
-
     # Unlike other GPUs that can be attached to different sizes of N1 VMs,
-    # A100 GPUs can only be attached to fixed-size A2 VMs.
+    # A100 GPUs can only be attached to fixed-size A2 VMs,
+    # and L4 GPUs can only be attached to G2 VMs.
     # Thus, we can show their exact cost including the host VM prices.

+    acc_infos: List[common.InstanceTypeInfo] = sum(
+        [results.get(a, []) for a in _ACC_INSTANCE_TYPE_DICTS], [])
+    if not acc_infos:
+        return results
+
     new_infos = defaultdict(list)
-    for info in a100_infos:
-        assert pd.isna(info.instance_type) and pd.isna(info.memory), a100_infos
-        a100_host_vm_type = _A100_INSTANCE_TYPE_DICTS[info.accelerator_name][
+    for info in acc_infos:
+        assert pd.isna(info.instance_type) and pd.isna(info.memory), acc_infos
+        vm_types = _ACC_INSTANCE_TYPE_DICTS[info.accelerator_name][
             info.accelerator_count]
-        df = _df[_df['InstanceType'] == a100_host_vm_type]
-        cpu_count = df['vCPUs'].iloc[0]
-        memory = df['MemoryGiB'].iloc[0]
-        vm_price = common.get_hourly_cost_impl(_df,
-                                               a100_host_vm_type,
-                                               use_spot=False,
-                                               region=None,
-                                               zone=None)
-        vm_spot_price = common.get_hourly_cost_impl(_df,
-                                                    a100_host_vm_type,
-                                                    use_spot=True,
-                                                    region=None,
-                                                    zone=None)
-        new_infos[info.accelerator_name].append(
-            info._replace(
-                instance_type=a100_host_vm_type,
-                cpu_count=cpu_count,
-                memory=memory,
-                # total cost = VM instance + GPU.
-                price=info.price + vm_price,
-                spot_price=info.spot_price + vm_spot_price,
-            ))
+        for vm_type in vm_types:
+            df = _df[_df['InstanceType'] == vm_type]
+            cpu_count = df['vCPUs'].iloc[0]
+            memory = df['MemoryGiB'].iloc[0]
+            vm_price = common.get_hourly_cost_impl(_df,
+                                                   vm_type,
+                                                   use_spot=False,
+                                                   region=None,
+                                                   zone=None)
+            vm_spot_price = common.get_hourly_cost_impl(_df,
+                                                        vm_type,
+                                                        use_spot=True,
+                                                        region=None,
+                                                        zone=None)
+            new_infos[info.accelerator_name].append(
+                info._replace(
+                    instance_type=vm_type,
+                    cpu_count=cpu_count,
+                    memory=memory,
+                    # total cost = VM instance + GPU.
+                    price=info.price + vm_price,
+                    spot_price=info.spot_price + vm_spot_price,
+                ))
     results.update(new_infos)
     return results
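
A rough way to picture the effect of the list_accelerators change above: each GPU entry now fans out into one catalog row per candidate host VM, and each row's price is the GPU price plus that host's price. The sketch below is illustration only; the hourly prices are invented, while the real numbers come from the catalog dataframe.

# Hedged sketch, not the PR's code: hypothetical hourly prices.
l4_gpu_price = 0.60          # hypothetical price of one L4 GPU
g2_host_prices = {           # hypothetical prices of candidate hosts for L4:1
    'g2-standard-4': 0.35,
    'g2-standard-8': 0.45,
}

rows = [(vm, round(l4_gpu_price + vm_price, 2))
        for vm, vm_price in g2_host_prices.items()]
assert rows == [('g2-standard-4', 0.95), ('g2-standard-8', 1.05)]  # total = VM + GPU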

@@ -411,14 +427,15 @@ def check_accelerator_attachable_to_host(instance_type: str,
            attached to the host.
     """
     if accelerators is None:
-        if instance_type.startswith('a2-'):
-            # NOTE: While it is allowed to use A2 machines as CPU-only nodes,
-            # we exclude this case as it is uncommon and undesirable.
-            with ux_utils.print_exception_no_traceback():
-                raise exceptions.ResourcesMismatchError(
-                    'A2 instance types should be used with A100 GPUs. '
-                    'Either use other instance types or specify the '
-                    'accelerators as A100.')
+        for acc_name, val in _ACC_INSTANCE_TYPE_DICTS.items():
+            if instance_type in sum(val.values(), []):
+                # NOTE: While it is allowed to use A2/G2 VMs as CPU-only nodes,
+                # we exclude this case as it is uncommon and undesirable.
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.ResourcesMismatchError(
+                        f'{instance_type} instance types should be used with '
+                        f'{acc_name} GPUs. Either use other instance types or '
+                        f'specify the accelerators as {acc_name}.')
         return

     acc = list(accelerators.items())
@@ -441,15 +458,16 @@
                 'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines')  # pylint: disable=line-too-long
         return

-    # Treat A100 as a special case.
-    if acc_name in _A100_INSTANCE_TYPE_DICTS:
-        a100_instance_type = _A100_INSTANCE_TYPE_DICTS[acc_name][acc_count]
-        if instance_type != a100_instance_type:
+    if acc_name in _ACC_INSTANCE_TYPE_DICTS:
+        matching_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count]
+        if instance_type not in matching_types:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.ResourcesMismatchError(
-                    f'A100:{acc_count} cannot be attached to {instance_type}. '
-                    f'Use {a100_instance_type} instead. Please refer to '
-                    'https://cloud.google.com/compute/docs/gpus#a100-gpus')
+                    f'{acc_name} GPUs cannot be attached to {instance_type}. '
+                    f'Use one of {matching_types} instead. Please refer to '
+                    'https://cloud.google.com/compute/docs/gpus')
+        return
infwinston marked this conversation as resolved.
Review comment (Member):
we need to remove this return right? otherwise later code won't be run?


     elif not instance_type.startswith('n1-'):
         # Other GPUs must be attached to N1 machines.
         # Refer to: https://cloud.google.com/compute/docs/machine-types#gpus
@@ -459,8 +477,8 @@ def check_accelerator_attachable_to_host(instance_type: str,
                 'Use N1 instance types instead. Please refer to: '
                 'https://cloud.google.com/compute/docs/machine-types#gpus')

-    if acc_name in _A100_INSTANCE_TYPE_DICTS:
-        valid_counts = list(_A100_INSTANCE_TYPE_DICTS[acc_name].keys())
+    if acc_name in _ACC_INSTANCE_TYPE_DICTS:
+        valid_counts = list(_ACC_INSTANCE_TYPE_DICTS[acc_name].keys())
     else:
         assert acc_name in _NUM_ACC_TO_MAX_CPU_AND_MEMORY, acc_name
         valid_counts = list(_NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name].keys())
@@ -471,9 +489,8 @@
                 f'The valid {acc_name} counts are {valid_counts}.')

     # Check maximum vCPUs and memory.
-    if acc_name in _A100_INSTANCE_TYPE_DICTS:
-        max_cpus, max_memory = get_vcpus_mem_from_instance_type(
-            a100_instance_type)
+    if acc_name in _ACC_INSTANCE_TYPE_DICTS:
+        max_cpus, max_memory = get_vcpus_mem_from_instance_type(instance_type)
     else:
         max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][
             acc_count]
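
Finally, a small self-contained sketch (not from the PR) of the membership test used in check_accelerator_attachable_to_host above: sum(val.values(), []) concatenates the per-count host lists into one flat list, so a plain `in` check tells whether a VM type belongs to an accelerator's dedicated family. The mapping below is a trimmed, hypothetical copy for illustration.

# Sketch mirroring the sum(..., []) idiom used in the diff above.
l4_hosts = {1: ['g2-standard-4', 'g2-standard-8'], 2: ['g2-standard-24']}

def is_dedicated_l4_host(instance_type: str) -> bool:
    # sum(list_of_lists, []) flattens [[...], [...]] into a single list.
    return instance_type in sum(l4_hosts.values(), [])

assert is_dedicated_l4_host('g2-standard-24')
assert not is_dedicated_l4_host('n1-standard-8')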