diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst index 975ebeba177..355a06f7a8d 100644 --- a/docs/source/reference/kubernetes/kubernetes-setup.rst +++ b/docs/source/reference/kubernetes/kubernetes-setup.rst @@ -167,10 +167,10 @@ Please follow their respective guides to deploy your Kubernetes cluster. Setting up GPU support ~~~~~~~~~~~~~~~~~~~~~~ -If your Kubernetes cluster has Nvidia GPUs, make sure you have the Nvidia -device plugin installed (i.e., ``nvidia.com/gpu`` resource is available on each node). -Additionally, you will need to label each node in your cluster with the GPU type. -For example, a node with v100 GPUs must have a label :code:`skypilot.co/accelerators: v100`. +If your Kubernetes cluster has Nvidia GPUs, ensure that: + +1. The Nvidia device plugin is installed (i.e., ``nvidia.com/gpu`` resource is available on each node). +2. Each node in your cluster is labelled with the GPU type. This labelling can be done by adding a label of the format ``skypilot.co/accelerators: ``, where the ```` is the lowercase name of the GPU. For example, a node with V100 GPUs must have a label :code:`skypilot.co/accelerators: v100`. We provide a convenience script that automatically detects GPU types and labels each node. You can run it with: @@ -185,6 +185,9 @@ We provide a convenience script that automatically detects GPU types and labels You can check if nodes have been labeled by running `kubectl describe nodes` and looking for labels of the format `skypilot.co/accelerators: `. +.. note:: + GPU labels are case-sensitive. Ensure that the GPU name is lowercase if you are using the ``skypilot.co/accelerators`` label. + .. note:: GPU labelling is not required on GKE clusters - SkyPilot will automatically use GKE provided labels. However, you will still need to install `drivers `_. diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index b958e3f3477..e4a5d27be7a 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -60,6 +60,20 @@ def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" raise NotImplementedError + @classmethod + def validate_label_value(cls, value: str) -> Tuple[bool, str]: + """Validates if the specified label value is correct. + + Used to check if the labelling on the cluster is correct and + preemptively raise an error if it is not. + + Returns: + bool: True if the label value is valid, False otherwise. + str: Error message if the label value is invalid, None otherwise. + """ + del value + return True, '' + def get_gke_accelerator_name(accelerator: str) -> str: """Returns the accelerator name for GKE clusters @@ -97,6 +111,14 @@ def get_label_value(cls, accelerator: str) -> str: def get_accelerator_from_label_value(cls, value: str) -> str: return value.upper() + @classmethod + def validate_label_value(cls, value: str) -> Tuple[bool, str]: + """Values must be all lowercase for the SkyPilot formatter.""" + is_valid = value == value.lower() + return is_valid, (f'Label value {value!r} must be lowercase if using ' + f'the {cls.get_label_key()} label.' + if not is_valid else '') + class CoreWeaveLabelFormatter(GPULabelFormatter): """CoreWeave label formatter @@ -157,31 +179,32 @@ def get_accelerator_from_label_value(cls, value: str) -> str: def detect_gpu_label_formatter( -) -> Tuple[Optional[GPULabelFormatter], List[Tuple[str, str]]]: +) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]: """Detects the GPU label formatter for the Kubernetes cluster Returns: GPULabelFormatter: The GPU label formatter for the cluster, if found. - List[Tuple[str, str]]: The set of labels and values across all nodes. + Dict[str, List[Tuple[str, str]]]: A mapping of nodes and the list of + labels on each node. E.g., {'node1': [('label1', 'value1')]} """ # Get all labels across all nodes - node_labels: List[Tuple[str, str]] = [] + node_labels: Dict[str, List[Tuple[str, str]]] = {} nodes = get_kubernetes_nodes() for node in nodes: - node_labels.extend(node.metadata.labels.items()) + node_labels[node.metadata.name] = [] + for label, value in node.metadata.labels.items(): + node_labels[node.metadata.name].append((label, value)) label_formatter = None # Check if the node labels contain any of the GPU label prefixes for lf in LABEL_FORMATTER_REGISTRY: label_key = lf.get_label_key() - for label, _ in node_labels: - if label.startswith(label_key): - label_formatter = lf() - break - - if label_formatter is not None: - break + for _, label_list in node_labels.items(): + for label, _ in label_list: + if label.startswith(label_key): + label_formatter = lf() + return label_formatter, node_labels return label_formatter, node_labels @@ -343,6 +366,17 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: 'the documentation on how to set up node labels.' f'{suffix}') if label_formatter is not None: + # Validate the label value on all nodes labels to ensure they are + # correctly setup and will behave as expected. + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if label == label_formatter.get_label_key(): + is_valid, reason = label_formatter.validate_label_value( + value) + if not is_valid: + raise exceptions.ResourcesUnavailableError( + f'Node {node_name!r} in Kubernetes cluster has ' + f'invalid GPU label: {label}={value}. {reason}') if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. @@ -355,17 +389,22 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # node. It does not (and should not) check if the resource # quantity is available since that is dynamic and can change # during scheduling. - for label, value in node_labels: - if label == k8s_acc_label_key and value == k8s_acc_label_value: - # If a node is found, we can break out of the loop - # and proceed to deploy. - return k8s_acc_label_key, k8s_acc_label_value + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if (label == k8s_acc_label_key and + value == k8s_acc_label_value): + # If a node is found, we can break out of the loop + # and proceed to deploy. + return k8s_acc_label_key, k8s_acc_label_value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): + all_labels = [] + for node_name, label_list in node_labels.items(): + all_labels.extend(label_list) gpus_available = set( - v for k, v in node_labels if k == k8s_acc_label_key) + v for k, v in all_labels if k == k8s_acc_label_key) suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' diff --git a/tests/common.py b/tests/common.py index 835f53ef251..4844def6bb9 100644 --- a/tests/common.py +++ b/tests/common.py @@ -55,7 +55,7 @@ def _get_az_mappings(_): # the cluster to detect available cluster resources. monkeypatch.setattr( 'sky.utils.kubernetes_utils.detect_gpu_label_formatter', - lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, []]) + lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}]) monkeypatch.setattr('sky.utils.kubernetes_utils.detect_gpu_resource', lambda *_args, **_kwargs: [True, []]) monkeypatch.setattr('sky.utils.kubernetes_utils.check_instance_fits',