From 3fbde3925cab66491107bee672d87c00be267776 Mon Sep 17 00:00:00 2001 From: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:28:44 -0800 Subject: [PATCH] [k8s] Ensure Jump Pod is in "Running" Status Before Proceeding (#2589) * wait for jump pod to be running * testing images * update ssh-jump-pod-name * nit * re-read pod status from _rase_pod_scheduling_error * Update == 'Running' to != 'Pending' * comment * fix to remove pending jump pod when terminating * update comment --- .../providers/kubernetes/node_provider.py | 33 +++++++++++++------ sky/utils/kubernetes_utils.py | 13 ++++---- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 8963225cc3f..81230ff4ef8 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -164,8 +164,16 @@ def _set_node_tags(self, node_id, tags): def _raise_pod_scheduling_errors(self, new_nodes): for new_node in new_nodes: - pod_status = new_node.status.phase - pod_name = new_node._metadata._name + pod = kubernetes.core_api().read_namespaced_pod( + new_node.metadata.name, self.namespace) + pod_status = pod.status.phase + # When there are multiple pods involved while launching instance, + # there may be a single pod causing issue while others are + # scheduled. In this case, we make sure to not surface the error + # message from the pod that is already scheduled. + if pod_status != 'Pending': + continue + pod_name = pod._metadata._name events = kubernetes.core_api().list_namespaced_event( self.namespace, field_selector=(f'involvedObject.name={pod_name},' @@ -173,8 +181,8 @@ def _raise_pod_scheduling_errors(self, new_nodes): # Events created in the past hours are kept by # Kubernetes python client and we want to surface # the latest event message - events_desc_by_time = \ - sorted(events.items, + events_desc_by_time = sorted( + events.items, key=lambda e: e.metadata.creation_timestamp, reverse=True) for event in events_desc_by_time: @@ -200,8 +208,8 @@ def _raise_pod_scheduling_errors(self, new_nodes): lf.get_label_key() for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY ] - if new_node.spec.node_selector: - for label_key in new_node.spec.node_selector.keys(): + if pod.spec.node_selector: + for label_key in pod.spec.node_selector.keys(): if label_key in gpu_lf_keys: # TODO(romilb): We may have additional node # affinity selectors in the future - in that @@ -210,7 +218,7 @@ def _raise_pod_scheduling_errors(self, new_nodes): 'didn\'t match Pod\'s node affinity/selector' in event_message: raise config.KubernetesError( f'{lack_resource_msg.format(resource="GPU")} ' - f'Verify if {new_node.spec.node_selector[label_key]}' + f'Verify if {pod.spec.node_selector[label_key]}' ' is available in the cluster.') raise config.KubernetesError(f'{timeout_err_msg} ' f'Pod status: {pod_status}' @@ -257,9 +265,14 @@ def create_node(self, node_config, tags, count): self.namespace, service_spec) new_svcs.append(svc) - # Wait for all pods to be ready, and if it exceeds the timeout, raise an - # exception. If pod's container is ContainerCreating, then we can assume - # that resources have been allocated and we can exit. + # Wait for all pods including jump pod to be ready, and if it + # exceeds the timeout, raise an exception. If pod's container + # is ContainerCreating, then we can assume that resources have been + # allocated and we can exit. + ssh_jump_pod_name = conf['metadata']['labels']['skypilot-ssh-jump'] + jump_pod = kubernetes.core_api().read_namespaced_pod( + ssh_jump_pod_name, self.namespace) + new_nodes.append(jump_pod) start = time.time() while True: if time.time() - start > self.timeout: diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index 382ae7c23bf..b958e3f3477 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -923,12 +923,13 @@ def find(l, predicate): ssh_jump_name, namespace) cont_ready_cond = find(ssh_jump_pod.status.conditions, lambda c: c.type == 'ContainersReady') - if cont_ready_cond and \ - cont_ready_cond.status == 'False': - # The main container is not ready. To be on the safe side - # and prevent a dangling ssh jump pod, lets remove it and - # the service. Otherwise main container is ready and its lifecycle - # management script takes care of the cleaning. + if (cont_ready_cond and cont_ready_cond.status + == 'False') or ssh_jump_pod.status.phase == 'Pending': + # Either the main container is not ready or the pod failed + # to schedule. To be on the safe side and prevent a dangling + # ssh jump pod, lets remove it and the service. Otherwise, main + # container is ready and its lifecycle management script takes + # care of the cleaning. kubernetes.core_api().delete_namespaced_pod(ssh_jump_name, namespace) kubernetes.core_api().delete_namespaced_service(