Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[k8s] Add label validation during sky check #2653

Merged
merged 9 commits into from
Nov 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions docs/source/reference/kubernetes/kubernetes-setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,10 @@ Please follow their respective guides to deploy your Kubernetes cluster.

Setting up GPU support
~~~~~~~~~~~~~~~~~~~~~~
If your Kubernetes cluster has Nvidia GPUs, make sure you have the Nvidia
device plugin installed (i.e., ``nvidia.com/gpu`` resource is available on each node).
Additionally, you will need to label each node in your cluster with the GPU type.
For example, a node with v100 GPUs must have a label :code:`skypilot.co/accelerators: v100`.
If your Kubernetes cluster has Nvidia GPUs, ensure that:

1. The Nvidia device plugin is installed (i.e., ``nvidia.com/gpu`` resource is available on each node).
2. Each node in your cluster is labelled with the GPU type. This labelling can be done by adding a label of the format ``skypilot.co/accelerators: <gpu_name>``, where the ``<gpu_name>`` is the lowercase name of the GPU. For example, a node with V100 GPUs must have a label :code:`skypilot.co/accelerators: v100`.

We provide a convenience script that automatically detects GPU types and labels each node. You can run it with:

Expand All @@ -185,6 +185,9 @@ We provide a convenience script that automatically detects GPU types and labels
You can check if nodes have been labeled by running `kubectl describe nodes` and looking for labels of the format `skypilot.co/accelerators: <gpu_name>`.


.. note::
GPU labels are case-sensitive. Ensure that the GPU name is lowercase if you are using the ``skypilot.co/accelerators`` label.

.. note::
GPU labelling is not required on GKE clusters - SkyPilot will automatically use GKE provided labels. However, you will still need to install `drivers <https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers>`_.

Expand Down
73 changes: 56 additions & 17 deletions sky/utils/kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,20 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
"""Given a label value, returns the GPU type"""
raise NotImplementedError

@classmethod
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
"""Validates if the specified label value is correct.

Used to check if the labelling on the cluster is correct and
preemptively raise an error if it is not.

Returns:
bool: True if the label value is valid, False otherwise.
str: Error message if the label value is invalid, None otherwise.
"""
del value
return True, ''


def get_gke_accelerator_name(accelerator: str) -> str:
"""Returns the accelerator name for GKE clusters
Expand Down Expand Up @@ -97,6 +111,14 @@ def get_label_value(cls, accelerator: str) -> str:
def get_accelerator_from_label_value(cls, value: str) -> str:
return value.upper()

@classmethod
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
"""Values must be all lowercase for the SkyPilot formatter."""
is_valid = value == value.lower()
return is_valid, (f'Label value {value!r} must be lowercase if using '
f'the {cls.get_label_key()} label.'
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
if not is_valid else '')


class CoreWeaveLabelFormatter(GPULabelFormatter):
"""CoreWeave label formatter
Expand Down Expand Up @@ -157,31 +179,32 @@ def get_accelerator_from_label_value(cls, value: str) -> str:


def detect_gpu_label_formatter(
) -> Tuple[Optional[GPULabelFormatter], List[Tuple[str, str]]]:
) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
"""Detects the GPU label formatter for the Kubernetes cluster

Returns:
GPULabelFormatter: The GPU label formatter for the cluster, if found.
List[Tuple[str, str]]: The set of labels and values across all nodes.
Dict[str, List[Tuple[str, str]]]: A mapping of nodes and the list of
labels on each node. E.g., {'node1': [('label1', 'value1')]}
"""
# Get all labels across all nodes
node_labels: List[Tuple[str, str]] = []
node_labels: Dict[str, List[Tuple[str, str]]] = {}
nodes = get_kubernetes_nodes()
for node in nodes:
node_labels.extend(node.metadata.labels.items())
node_labels[node.metadata.name] = []
for label, value in node.metadata.labels.items():
node_labels[node.metadata.name].append((label, value))

label_formatter = None

# Check if the node labels contain any of the GPU label prefixes
for lf in LABEL_FORMATTER_REGISTRY:
label_key = lf.get_label_key()
for label, _ in node_labels:
if label.startswith(label_key):
label_formatter = lf()
break

if label_formatter is not None:
break
for _, label_list in node_labels.items():
for label, _ in label_list:
if label.startswith(label_key):
label_formatter = lf()
return label_formatter, node_labels

return label_formatter, node_labels

Expand Down Expand Up @@ -343,6 +366,17 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
'the documentation on how to set up node labels.'
f'{suffix}')
if label_formatter is not None:
# Validate the label value on all nodes labels to ensure they are
# correctly setup and will behave as expected.
for node_name, label_list in node_labels.items():
for label, value in label_list:
if label == label_formatter.get_label_key():
is_valid, reason = label_formatter.validate_label_value(
value)
if not is_valid:
raise exceptions.ResourcesUnavailableError(
f'Node {node_name!r} in Kubernetes cluster has '
f'invalid GPU label: {label}={value}. {reason}')
if check_mode:
# If check mode is enabled and we reached so far, we can
# conclude that the cluster is setup correctly and return.
Expand All @@ -355,17 +389,22 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
# node. It does not (and should not) check if the resource
# quantity is available since that is dynamic and can change
# during scheduling.
for label, value in node_labels:
if label == k8s_acc_label_key and value == k8s_acc_label_value:
# If a node is found, we can break out of the loop
# and proceed to deploy.
return k8s_acc_label_key, k8s_acc_label_value
for node_name, label_list in node_labels.items():
for label, value in label_list:
if (label == k8s_acc_label_key and
value == k8s_acc_label_value):
# If a node is found, we can break out of the loop
# and proceed to deploy.
return k8s_acc_label_key, k8s_acc_label_value
# If no node is found with the requested acc_type, raise error
with ux_utils.print_exception_no_traceback():
suffix = ''
if env_options.Options.SHOW_DEBUG_INFO.get():
all_labels = []
for node_name, label_list in node_labels.items():
all_labels.extend(label_list)
gpus_available = set(
v for k, v in node_labels if k == k8s_acc_label_key)
v for k, v in all_labels if k == k8s_acc_label_key)
suffix = f' Available GPUs on the cluster: {gpus_available}'
raise exceptions.ResourcesUnavailableError(
'Could not find any node in the Kubernetes cluster '
Expand Down
2 changes: 1 addition & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _get_az_mappings(_):
# the cluster to detect available cluster resources.
monkeypatch.setattr(
'sky.utils.kubernetes_utils.detect_gpu_label_formatter',
lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, []])
lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}])
monkeypatch.setattr('sky.utils.kubernetes_utils.detect_gpu_resource',
lambda *_args, **_kwargs: [True, []])
monkeypatch.setattr('sky.utils.kubernetes_utils.check_instance_fits',
Expand Down