From 13d85ee3e21807d1d348346e7b808098aee2802d Mon Sep 17 00:00:00 2001 From: hemildesai Date: Mon, 2 Oct 2023 00:17:58 -0700 Subject: [PATCH 01/11] Add sky show-gpus support for Kubernetes --- sky/cli.py | 3 - .../service_catalog/kubernetes_catalog.py | 67 ++++++++++++++++++- sky/utils/kubernetes_utils.py | 17 +++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 362fa0edcf4..50e71bf8783 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3231,9 +3231,6 @@ def show_gpus( type is the lowest across all regions for both on-demand and spot instances. There may be multiple regions with the same lowest price. """ - # validation for the --cloud kubernetes - if cloud == 'kubernetes': - raise click.UsageError('Kubernetes does not have a service catalog.') # validation for the --region flag if region is not None and cloud is None: raise click.UsageError( diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 86033dab94e..10174b01e47 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -3,10 +3,13 @@ Kubernetes does not require a catalog of instances, but we need an image catalog mapping SkyPilot image tags to corresponding container image tags. """ +from typing import Dict, List, Optional, Set, Tuple -from typing import Optional +import pandas as pd +from sky.clouds.service_catalog import CloudFilter from sky.clouds.service_catalog import common +from sky.utils import kubernetes_utils _PULL_FREQUENCY_HOURS = 7 @@ -26,3 +29,65 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]: def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: """Returns whether the image tag is valid.""" return common.is_image_tag_valid_impl(_image_df, tag, region) + + +def list_accelerators( + gpus_only: bool, + name_filter: Optional[str], + region_filter: Optional[str], + quantity_filter: Optional[int], + case_sensitive: bool = True +) -> Dict[str, List[common.InstanceTypeInfo]]: + has_gpu = kubernetes_utils.detect_gpu_resource() + if not has_gpu: + return {} + + label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() + if not label_formatter: + return {} + + accelerators: Set[Tuple[str, int]] = set() + key = label_formatter.get_label_key() + nodes = kubernetes_utils.get_kubernetes_nodes() + for node in nodes: + if key in node.metadata.labels: + accelerator_name = label_formatter.get_accelerator_from_label_value( + node.metadata.labels.get(key)) + accelerator_count = int( + node.status.allocatable.get('nvidia.com/gpu', 0)) + + if accelerator_name and accelerator_count > 0: + accelerators.add((accelerator_name, accelerator_count)) + + result = [] + for accelerator_name, accelerator_count in accelerators: + result.append( + common.InstanceTypeInfo(cloud="Kubernetes", + instance_type=None, + accelerator_name=accelerator_name, + accelerator_count=accelerator_count, + cpu_count=None, + device_memory=None, + memory=None, + price=0.0, + spot_price=0.0, + region='')) + + df = pd.DataFrame(result, + columns=[ + 'Cloud', 'InstanceType', 'AcceleratorName', + 'AcceleratorCount', 'vCPUs', 'DeviceMemoryGiB', + 'MemoryGiB', 'Price', 'SpotPrice', 'Region' + ]) + df['GpuInfo'] = True + + return common.list_accelerators_impl('Kubernetes', df, gpus_only, + name_filter, region_filter, + quantity_filter, case_sensitive) + + +def validate_region_zone( + region_name: Optional[str], + zone_name: Optional[str], + clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]: + return (region_name, zone_name) diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index 7d46024e961..fa1cfe63137 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -79,6 +79,11 @@ def get_label_value(cls, accelerator: str) -> str: """Given a GPU type, returns the label value to be used""" raise NotImplementedError + @classmethod + def get_accelerator_from_label_value(cls, value: str) -> str: + """Given a label value, returns the GPU type""" + raise NotImplementedError + def get_gke_accelerator_name(accelerator: str) -> str: """Returns the accelerator name for GKE clusters @@ -112,6 +117,10 @@ def get_label_value(cls, accelerator: str) -> str: # See sky.utils.kubernetes.gpu_labeler. return accelerator.lower() + @classmethod + def get_accelerator_from_label_value(cls, value: str) -> str: + return value.upper() + class CoreWeaveLabelFormatter(GPULabelFormatter): """CoreWeave label formatter @@ -130,6 +139,10 @@ def get_label_key(cls) -> str: def get_label_value(cls, accelerator: str) -> str: return accelerator.upper() + @classmethod + def get_accelerator_from_label_value(cls, value: str) -> str: + return value + class GKELabelFormatter(GPULabelFormatter): """GKE label formatter @@ -148,6 +161,10 @@ def get_label_key(cls) -> str: def get_label_value(cls, accelerator: str) -> str: return get_gke_accelerator_name(accelerator) + @classmethod + def get_accelerator_from_label_value(cls, value: str) -> str: + return value.split('-')[-1].upper() + # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to # discover the accelerator type from. The order of the list is important, as From 0b07ca667e29237b939cd2c8a3af5fcd49fde761 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 2 Oct 2023 16:38:20 -0700 Subject: [PATCH 02/11] Update sky/clouds/service_catalog/kubernetes_catalog.py Co-authored-by: Romil Bhardwaj --- sky/clouds/service_catalog/kubernetes_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 10174b01e47..d7c64a1683e 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -71,7 +71,7 @@ def list_accelerators( memory=None, price=0.0, spot_price=0.0, - region='')) + region='kubernetes')) df = pd.DataFrame(result, columns=[ From caeb3ab79c099949dcabd4268391b7d751c74f42 Mon Sep 17 00:00:00 2001 From: hemildesai Date: Mon, 2 Oct 2023 16:49:32 -0700 Subject: [PATCH 03/11] PR feedback --- sky/clouds/service_catalog/__init__.py | 3 +++ sky/clouds/service_catalog/kubernetes_catalog.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 03d62144103..f06ad4fb770 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -20,6 +20,9 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): if clouds is None: clouds = list(_ALL_CLOUDS) + if method_name == "list_accelerators": + clouds.append("kubernetes") + single = isinstance(clouds, str) if single: clouds = [clouds] # type: ignore diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index d7c64a1683e..e7b10a38cc9 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -57,7 +57,8 @@ def list_accelerators( node.status.allocatable.get('nvidia.com/gpu', 0)) if accelerator_name and accelerator_count > 0: - accelerators.add((accelerator_name, accelerator_count)) + for count in range(1, accelerator_count + 1): + accelerators.add((accelerator_name, count)) result = [] for accelerator_name, accelerator_count in accelerators: From 839ea038ae310006f7270ce3f21f1febaea7cc90 Mon Sep 17 00:00:00 2001 From: hemildesai Date: Tue, 3 Oct 2023 21:22:22 -0700 Subject: [PATCH 04/11] PR feedback part 2 --- sky/clouds/service_catalog/__init__.py | 7 +++++-- sky/clouds/service_catalog/kubernetes_catalog.py | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index f06ad4fb770..1656fcb46d9 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -20,8 +20,11 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): if clouds is None: clouds = list(_ALL_CLOUDS) - if method_name == "list_accelerators": - clouds.append("kubernetes") + + # TODO(hemil): Remove this once the common service catalog functions are refactored from clouds/kubernetes.py to kubernetes_catalog.py, + # and add kubernetes + if method_name == 'list_accelerators': + clouds.append('kubernetes') single = isinstance(clouds, str) if single: diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index e7b10a38cc9..eef1039f338 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -6,7 +6,9 @@ from typing import Dict, List, Optional, Set, Tuple import pandas as pd +from sky import global_user_state +from sky.clouds import Kubernetes from sky.clouds.service_catalog import CloudFilter from sky.clouds.service_catalog import common from sky.utils import kubernetes_utils @@ -38,6 +40,9 @@ def list_accelerators( quantity_filter: Optional[int], case_sensitive: bool = True ) -> Dict[str, List[common.InstanceTypeInfo]]: + if Kubernetes not in global_user_state.get_enabled_clouds() or not kubernetes_utils.check_credentials()[0]: + return {} + has_gpu = kubernetes_utils.detect_gpu_resource() if not has_gpu: return {} @@ -63,7 +68,7 @@ def list_accelerators( result = [] for accelerator_name, accelerator_count in accelerators: result.append( - common.InstanceTypeInfo(cloud="Kubernetes", + common.InstanceTypeInfo(cloud='Kubernetes', instance_type=None, accelerator_name=accelerator_name, accelerator_count=accelerator_count, From 58b9f3ef6a44dd98e3dd647521f67a0397199133 Mon Sep 17 00:00:00 2001 From: hemildesai Date: Tue, 3 Oct 2023 21:31:07 -0700 Subject: [PATCH 05/11] Format fix --- sky/clouds/service_catalog/kubernetes_catalog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index eef1039f338..fd41e7eabd8 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -6,8 +6,8 @@ from typing import Dict, List, Optional, Set, Tuple import pandas as pd -from sky import global_user_state +from sky import global_user_state from sky.clouds import Kubernetes from sky.clouds.service_catalog import CloudFilter from sky.clouds.service_catalog import common @@ -40,7 +40,8 @@ def list_accelerators( quantity_filter: Optional[int], case_sensitive: bool = True ) -> Dict[str, List[common.InstanceTypeInfo]]: - if Kubernetes not in global_user_state.get_enabled_clouds() or not kubernetes_utils.check_credentials()[0]: + if Kubernetes not in global_user_state.get_enabled_clouds( + ) or not kubernetes_utils.check_credentials()[0]: return {} has_gpu = kubernetes_utils.detect_gpu_resource() From fa0df6ac83a2cdb8b50e6e6c78f472d3e2d70a4f Mon Sep 17 00:00:00 2001 From: hemildesai Date: Thu, 5 Oct 2023 00:20:45 -0700 Subject: [PATCH 06/11] PR feedback part 3 --- sky/cli.py | 9 +++++++++ sky/utils/kubernetes_utils.py | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 50e71bf8783..4c69a9012bf 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3261,6 +3261,11 @@ def _output(): clouds=cloud, region_filter=region, ) + + if len(result) == 0 and cloud == 'kubernetes': + yield kubernetes_utils.NO_GPU_ERROR_MESSAGE + return + # "Common" GPUs for gpu in service_catalog.get_common_gpus(): if gpu in result: @@ -3317,6 +3322,10 @@ def _output(): case_sensitive=False) if len(result) == 0: + if cloud == 'kubernetes': + yield kubernetes_utils.NO_GPU_ERROR_MESSAGE + return + quantity_str = (f' with requested quantity {quantity}' if quantity else '') yield f'Resources \'{name}\'{quantity_str} not found. ' diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index fa1cfe63137..7048430746e 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -30,6 +30,7 @@ 'T': 2**40, 'P': 2**50, } +NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs (e.g., skypilot.co/accelerators) are setup correctly. To further debug, run: sky check.' logger = sky_logging.init_logger(__name__) @@ -163,7 +164,13 @@ def get_label_value(cls, accelerator: str) -> str: @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: - return value.split('-')[-1].upper() + if value.startswith('nvidia-tesla-'): + return value.replace('nvidia-tesla-', '').upper() + elif value.startswith('nvidia-'): + return value.replace('nvidia-', '').upper() + else: + raise ValueError( + f'Invalid accelerator name in GKE cluster: {value}') # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to From 2134873e47e43b5912c43d2fcf8b6812ac0d938b Mon Sep 17 00:00:00 2001 From: hemildesai Date: Thu, 5 Oct 2023 19:24:52 -0700 Subject: [PATCH 07/11] Fix bug with checking enabled clouds in k8s list_accelerators --- sky/clouds/service_catalog/kubernetes_catalog.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index fd41e7eabd8..95dc7161821 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -40,7 +40,10 @@ def list_accelerators( quantity_filter: Optional[int], case_sensitive: bool = True ) -> Dict[str, List[common.InstanceTypeInfo]]: - if Kubernetes not in global_user_state.get_enabled_clouds( + k8s_cloud = Kubernetes() + if not any( + map(lambda cloud: k8s_cloud.is_same_cloud(cloud), + global_user_state.get_enabled_clouds()) ) or not kubernetes_utils.check_credentials()[0]: return {} From 817b7c33d453eff049dc8831f72f1864c4bb1beb Mon Sep 17 00:00:00 2001 From: hemildesai Date: Fri, 6 Oct 2023 17:31:33 -0700 Subject: [PATCH 08/11] Pylint fixes --- sky/clouds/service_catalog/__init__.py | 4 ++-- sky/clouds/service_catalog/kubernetes_catalog.py | 3 +-- sky/utils/kubernetes_utils.py | 5 ++++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 1656fcb46d9..b1034d11204 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -21,8 +21,8 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): if clouds is None: clouds = list(_ALL_CLOUDS) - # TODO(hemil): Remove this once the common service catalog functions are refactored from clouds/kubernetes.py to kubernetes_catalog.py, - # and add kubernetes + # TODO(hemil): Remove this once the common service catalog functions are refactored + # from clouds/kubernetes.py to kubernetes_catalog.py and add kubernetes to _ALL_CLOUDS if method_name == 'list_accelerators': clouds.append('kubernetes') diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 95dc7161821..63495123e92 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -42,8 +42,7 @@ def list_accelerators( ) -> Dict[str, List[common.InstanceTypeInfo]]: k8s_cloud = Kubernetes() if not any( - map(lambda cloud: k8s_cloud.is_same_cloud(cloud), - global_user_state.get_enabled_clouds()) + map(k8s_cloud.is_same_cloud, global_user_state.get_enabled_clouds()) ) or not kubernetes_utils.check_credentials()[0]: return {} diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index 7048430746e..c7c52b89cc7 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -30,7 +30,10 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs (e.g., skypilot.co/accelerators) are setup correctly. To further debug, run: sky check.' +NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. \ +If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ +(e.g., skypilot.co/accelerators) are setup correctly. \ +To further debug, run: sky check.' logger = sky_logging.init_logger(__name__) From ddb7c886b2099571ba8dc614d91e4b16f0476aba Mon Sep 17 00:00:00 2001 From: hemildesai Date: Fri, 6 Oct 2023 17:40:13 -0700 Subject: [PATCH 09/11] Pylint fixes part 2 --- sky/clouds/service_catalog/__init__.py | 5 +++-- sky/clouds/service_catalog/kubernetes_catalog.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index b1034d11204..446cfc36936 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -21,8 +21,9 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): if clouds is None: clouds = list(_ALL_CLOUDS) - # TODO(hemil): Remove this once the common service catalog functions are refactored - # from clouds/kubernetes.py to kubernetes_catalog.py and add kubernetes to _ALL_CLOUDS + # TODO(hemil): Remove this once the common service catalog + # functions are refactored from clouds/kubernetes.py to kubernetes_catalog.py + # and add kubernetes to _ALL_CLOUDS if method_name == 'list_accelerators': clouds.append('kubernetes') diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 63495123e92..0d7d0cdb480 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -98,5 +98,5 @@ def list_accelerators( def validate_region_zone( region_name: Optional[str], zone_name: Optional[str], - clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]: + clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]: # type: ignore return (region_name, zone_name) From e8c1f9b8a1d63736476e88de6b2485731d5c625f Mon Sep 17 00:00:00 2001 From: hemildesai Date: Fri, 6 Oct 2023 17:43:41 -0700 Subject: [PATCH 10/11] Pylint fixes part 3 --- sky/clouds/service_catalog/kubernetes_catalog.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 0d7d0cdb480..f0b623a7662 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -96,7 +96,8 @@ def list_accelerators( def validate_region_zone( - region_name: Optional[str], - zone_name: Optional[str], - clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]: # type: ignore + region_name: Optional[str], + zone_name: Optional[str], + clouds: CloudFilter = None # type: ignore +) -> Tuple[Optional[str], Optional[str]]: return (region_name, zone_name) From 6b0b40d2ad0c4fba5b4614554026612eb51b5099 Mon Sep 17 00:00:00 2001 From: hemildesai Date: Fri, 6 Oct 2023 17:52:05 -0700 Subject: [PATCH 11/11] Pylint fixes part 4 --- sky/clouds/service_catalog/__init__.py | 4 ++-- sky/clouds/service_catalog/kubernetes_catalog.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 446cfc36936..61b3e44919b 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -22,8 +22,8 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): clouds = list(_ALL_CLOUDS) # TODO(hemil): Remove this once the common service catalog - # functions are refactored from clouds/kubernetes.py to kubernetes_catalog.py - # and add kubernetes to _ALL_CLOUDS + # functions are refactored from clouds/kubernetes.py to + # kubernetes_catalog.py and add kubernetes to _ALL_CLOUDS if method_name == 'list_accelerators': clouds.append('kubernetes') diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index f0b623a7662..2127bb5e37f 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -98,6 +98,6 @@ def list_accelerators( def validate_region_zone( region_name: Optional[str], zone_name: Optional[str], - clouds: CloudFilter = None # type: ignore + clouds: CloudFilter = None # pylint: disable=unused-argument ) -> Tuple[Optional[str], Optional[str]]: return (region_name, zone_name)