From fd3030f92929e6e40bb67f4ddf1964affc0307a0 Mon Sep 17 00:00:00 2001 From: Yika Luo Date: Tue, 15 Oct 2024 16:21:23 -0700 Subject: [PATCH] [Performance] Use new AWS custom images --- sky/clouds/aws.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 2207a977f25..a0962b17cac 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -32,6 +32,14 @@ logger = sky_logging.init_logger(__name__) +# Image ID tags +_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu' +# For GPU-related package version, +# see sky/clouds/service_catalog/images/provisioners/cuda.sh +_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu' +_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004' +_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204' + # This local file (under ~/.aws/) will be uploaded to remote nodes (any # cloud), if all of the following conditions hold: # - the current user identity is not using AWS SSO @@ -217,17 +225,20 @@ def zones_provision_loop( @classmethod def _get_default_ami(cls, region_name: str, instance_type: str) -> str: acc = cls.get_accelerators_from_instance_type(instance_type) - image_id = service_catalog.get_image_id_from_tag( - 'skypilot:gpu-ubuntu-2004', region_name, clouds='aws') + image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID, + region_name, + clouds='aws') if acc is not None: + image_id = service_catalog.get_image_id_from_tag( + _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws') assert len(acc) == 1, acc acc_name = list(acc.keys())[0] if acc_name == 'K80': image_id = service_catalog.get_image_id_from_tag( - 'skypilot:k80-ubuntu-2004', region_name, clouds='aws') + _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws') if acc_name in ['Trainium', 'Inferentia']: image_id = service_catalog.get_image_id_from_tag( - 'skypilot:neuron-ubuntu-2204', region_name, clouds='aws') + _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws') if image_id is not None: return image_id # Raise ResourcesUnavailableError to make sure the failover in