Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sky show-gpus support for Kubernetes #2638

Merged
merged 11 commits into from
Oct 7, 2023
12 changes: 9 additions & 3 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3231,9 +3231,6 @@ def show_gpus(
type is the lowest across all regions for both on-demand and spot
instances. There may be multiple regions with the same lowest price.
"""
# validation for the --cloud kubernetes
if cloud == 'kubernetes':
raise click.UsageError('Kubernetes does not have a service catalog.')
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
# validation for the --region flag
if region is not None and cloud is None:
raise click.UsageError(
Expand Down Expand Up @@ -3264,6 +3261,11 @@ def _output():
clouds=cloud,
region_filter=region,
)

if len(result) == 0 and cloud == 'kubernetes':
yield kubernetes_utils.NO_GPU_ERROR_MESSAGE
return

# "Common" GPUs
for gpu in service_catalog.get_common_gpus():
if gpu in result:
Expand Down Expand Up @@ -3320,6 +3322,10 @@ def _output():
case_sensitive=False)

if len(result) == 0:
if cloud == 'kubernetes':
yield kubernetes_utils.NO_GPU_ERROR_MESSAGE
return

quantity_str = (f' with requested quantity {quantity}'
if quantity else '')
yield f'Resources \'{name}\'{quantity_str} not found. '
Expand Down
6 changes: 6 additions & 0 deletions sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
if clouds is None:
clouds = list(_ALL_CLOUDS)

# TODO(hemil): Remove this special case once the common service catalog
# functions are refactored from clouds/kubernetes.py to kubernetes_catalog.py,
# and add 'kubernetes' to _ALL_CLOUDS instead of appending it here.
if method_name == 'list_accelerators':
clouds.append('kubernetes')

single = isinstance(clouds, str)
if single:
clouds = [clouds] # type: ignore
Expand Down
74 changes: 73 additions & 1 deletion sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
Kubernetes does not require a catalog of instances, but we need an image catalog
mapping SkyPilot image tags to corresponding container image tags.
"""
from typing import Dict, List, Optional, Set, Tuple

from typing import Optional
import pandas as pd

from sky import global_user_state
from sky.clouds import Kubernetes
from sky.clouds.service_catalog import CloudFilter
from sky.clouds.service_catalog import common
from sky.utils import kubernetes_utils

_PULL_FREQUENCY_HOURS = 7

Expand All @@ -26,3 +31,70 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
"""Returns whether the image tag is valid."""
return common.is_image_tag_valid_impl(_image_df, tag, region)


def list_accelerators(
gpus_only: bool,
name_filter: Optional[str],
region_filter: Optional[str],
quantity_filter: Optional[int],
case_sensitive: bool = True
) -> Dict[str, List[common.InstanceTypeInfo]]:
if Kubernetes not in global_user_state.get_enabled_clouds(
) or not kubernetes_utils.check_credentials()[0]:
return {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, there seems to be a bug - my GKE cluster has V100 yet this shows the error message:

# sky show-gpus does not find GPUs:
(base) ➜  ~ sky show-gpus --cloud kubernetes
No GPUs found in Kubernetes cluster. If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs (e.g., skypilot.co/accelerators) are setup correctly. To further debug, run: sky check.


# Confirming sky launch detects GPUs:
(base) ➜  ~ sky launch -c test --gpus V100:1
I 10-05 14:31:23 optimizer.py:682] == Optimizer ==
I 10-05 14:31:23 optimizer.py:693] Target: minimizing cost
I 10-05 14:31:23 optimizer.py:705] Estimated cost: $0.0 / hour
I 10-05 14:31:23 optimizer.py:705]
I 10-05 14:31:23 optimizer.py:777] Considered resources (1 node):
I 10-05 14:31:23 optimizer.py:826] ------------------------------------------------------------------------------------------------------
I 10-05 14:31:23 optimizer.py:826]  CLOUD        INSTANCE           vCPUs   Mem(GB)   ACCELERATORS   REGION/ZONE     COST ($)   CHOSEN
I 10-05 14:31:23 optimizer.py:826] ------------------------------------------------------------------------------------------------------
I 10-05 14:31:23 optimizer.py:826]  Kubernetes   2CPU--8GB--1V100   2       8         V100:1         kubernetes      0.00          ✔
I 10-05 14:31:23 optimizer.py:826]  IBM          gx2-8x64x1v100     8       64        V100:1         us-east         2.50
I 10-05 14:31:23 optimizer.py:826]  GCP          n1-highmem-8       8       52        V100:1         us-central1-a   2.95
I 10-05 14:31:23 optimizer.py:826]  AWS          p3.2xlarge         8       61        V100:1         us-east-1       3.06
I 10-05 14:31:23 optimizer.py:826]  Azure        Standard_NC6s_v3   6       112       V100:1         eastus          3.06
I 10-05 14:31:23 optimizer.py:826] ------------------------------------------------------------------------------------------------------
I 10-05 14:31:23 optimizer.py:826]
Launching a new cluster 'test'. Proceed? [Y/n]: ^CAborted!


# Confirming the underlying code can detect GPUs:
(base) ➜  ~ SKYPILOT_DEBUG=1 python -c "import sky;print(sky.utils.kubernetes_utils.get_gpu_label_key_value('v100'))"
D 10-05 14:32:58 skypilot_config.py:157] Using config path: /Users/romilb/.sky/config.yaml
D 10-05 14:32:58 skypilot_config.py:160] Config loaded:
D 10-05 14:32:58 skypilot_config.py:160] None
D 10-05 14:32:58 skypilot_config.py:166] Config syntax check passed.
('cloud.google.com/gke-accelerator', 'nvidia-tesla-v100')

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comparison is failing - Kubernetes not in global_user_state.get_enabled_clouds(). You may need to use Kubernetes() and the is_same_cloud comparator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 2134873.
Checked both clusters with/without GPUs.


has_gpu = kubernetes_utils.detect_gpu_resource()
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
if not has_gpu:
return {}

label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter()
if not label_formatter:
return {}

accelerators: Set[Tuple[str, int]] = set()
key = label_formatter.get_label_key()
nodes = kubernetes_utils.get_kubernetes_nodes()
for node in nodes:
if key in node.metadata.labels:
accelerator_name = label_formatter.get_accelerator_from_label_value(
node.metadata.labels.get(key))
accelerator_count = int(
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
node.status.allocatable.get('nvidia.com/gpu', 0))

if accelerator_name and accelerator_count > 0:
for count in range(1, accelerator_count + 1):
accelerators.add((accelerator_name, count))

result = []
for accelerator_name, accelerator_count in accelerators:
result.append(
common.InstanceTypeInfo(cloud='Kubernetes',
instance_type=None,
accelerator_name=accelerator_name,
accelerator_count=accelerator_count,
cpu_count=None,
device_memory=None,
memory=None,
price=0.0,
spot_price=0.0,
region='kubernetes'))

df = pd.DataFrame(result,
columns=[
'Cloud', 'InstanceType', 'AcceleratorName',
'AcceleratorCount', 'vCPUs', 'DeviceMemoryGiB',
'MemoryGiB', 'Price', 'SpotPrice', 'Region'
])
df['GpuInfo'] = True

return common.list_accelerators_impl('Kubernetes', df, gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive)


def validate_region_zone(
        region_name: Optional[str],
        zone_name: Optional[str],
        clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]:
    """Return the requested region and zone exactly as given.

    No validation is performed in this function: both names are passed
    through untouched, and ``clouds`` is accepted but not consulted.

    Args:
        region_name: Requested region name, or None.
        zone_name: Requested zone name, or None.
        clouds: Cloud filter; unused by this implementation.

    Returns:
        The ``(region_name, zone_name)`` pair, unchanged.
    """
    passthrough = (region_name, zone_name)
    return passthrough
24 changes: 24 additions & 0 deletions sky/utils/kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
'T': 2**40,
'P': 2**50,
}
NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs (e.g., skypilot.co/accelerators) are setup correctly. To further debug, run: sky check.'

logger = sky_logging.init_logger(__name__)

Expand Down Expand Up @@ -79,6 +80,11 @@ def get_label_value(cls, accelerator: str) -> str:
"""Given a GPU type, returns the label value to be used"""
raise NotImplementedError

@classmethod
def get_accelerator_from_label_value(cls, value: str) -> str:
    """Given a GPU label value, return the GPU type it stands for.

    This implementation is a placeholder that always raises; the label
    formatters defined further down in this file supply the actual
    conversion.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError


def get_gke_accelerator_name(accelerator: str) -> str:
"""Returns the accelerator name for GKE clusters
Expand Down Expand Up @@ -112,6 +118,10 @@ def get_label_value(cls, accelerator: str) -> str:
# See sky.utils.kubernetes.gpu_labeler.
return accelerator.lower()

@classmethod
def get_accelerator_from_label_value(cls, value: str) -> str:
    """Given a label value, return the GPU type by upper-casing it.

    The sibling ``get_label_value`` lower-cases the accelerator name to
    build the label, so upper-casing is the reverse mapping.
    """
    accelerator_name = value.upper()
    return accelerator_name


class CoreWeaveLabelFormatter(GPULabelFormatter):
"""CoreWeave label formatter
Expand All @@ -130,6 +140,10 @@ def get_label_key(cls) -> str:
def get_label_value(cls, accelerator: str) -> str:
return accelerator.upper()

@classmethod
def get_accelerator_from_label_value(cls, value: str) -> str:
    """Given a label value, return the GPU type.

    The label value is used as the accelerator name verbatim; no case
    conversion or other normalization is applied.
    """
    accelerator_name = value
    return accelerator_name


class GKELabelFormatter(GPULabelFormatter):
"""GKE label formatter
Expand All @@ -148,6 +162,16 @@ def get_label_key(cls) -> str:
def get_label_value(cls, accelerator: str) -> str:
return get_gke_accelerator_name(accelerator)

@classmethod
def get_accelerator_from_label_value(cls, value: str) -> str:
    """Given a GKE accelerator label value, return the GPU type.

    The leading ``nvidia-tesla-`` or ``nvidia-`` prefix is stripped and the
    remainder is upper-cased, e.g. ``nvidia-tesla-v100`` -> ``V100``.

    Args:
        value: Label value read from the node.

    Returns:
        The upper-cased accelerator name without the vendor prefix.

    Raises:
        ValueError: If ``value`` starts with neither known prefix.
    """
    # The longer prefix must be tried first: every 'nvidia-tesla-...' value
    # also starts with 'nvidia-'.
    for prefix in ('nvidia-tesla-', 'nvidia-'):
        if value.startswith(prefix):
            # Slice off only the leading prefix. str.replace() would also
            # delete any later occurrence of the same text inside the name.
            return value[len(prefix):].upper()
    raise ValueError(f'Invalid accelerator name in GKE cluster: {value}')


# LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to
# discover the accelerator type from. The order of the list is important, as
Expand Down
Loading