[k8s] Ensure Jump Pod is in "Running" Status Before Proceeding (#2589)
* wait for jump pod to be running

* testing images

* update ssh-jump-pod-name

* nit

* re-read pod status in _raise_pod_scheduling_errors

* Update == 'Running' to != 'Pending'

* comment

* fix to remove pending jump pod when terminating

* update comment
landscapepainter authored Nov 7, 2023
1 parent 5a35ab6 commit 3fbde39
Showing 2 changed files with 30 additions and 16 deletions.
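
Before the diff, a minimal standalone sketch of the behavior this commit adds: poll the SSH jump pod until it leaves the 'Pending' phase and give up after a timeout. This is illustrative only, not the SkyPilot code path; it assumes the official kubernetes Python client and a local kubeconfig, and the pod name, namespace, and timeout below are hypothetical.

# Illustrative sketch, not SkyPilot's implementation: wait until the SSH jump
# pod has been scheduled (i.e. its phase is no longer 'Pending').
import time

from kubernetes import client, config


def wait_for_jump_pod(pod_name: str, namespace: str, timeout: int = 60) -> None:
    config.load_kube_config()  # assumes a local kubeconfig is available
    core = client.CoreV1Api()
    start = time.time()
    while True:
        pod = core.read_namespaced_pod(pod_name, namespace)
        if pod.status.phase != 'Pending':
            # Scheduled (Running, or containers being created) -- proceed.
            return
        if time.time() - start > timeout:
            raise TimeoutError(
                f'Jump pod {pod_name} is still Pending after {timeout}s.')
        time.sleep(2)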
33 changes: 23 additions & 10 deletions sky/skylet/providers/kubernetes/node_provider.py
@@ -164,17 +164,25 @@ def _set_node_tags(self, node_id, tags):
 
     def _raise_pod_scheduling_errors(self, new_nodes):
         for new_node in new_nodes:
-            pod_status = new_node.status.phase
-            pod_name = new_node._metadata._name
+            pod = kubernetes.core_api().read_namespaced_pod(
+                new_node.metadata.name, self.namespace)
+            pod_status = pod.status.phase
+            # When there are multiple pods involved while launching instance,
+            # there may be a single pod causing issue while others are
+            # scheduled. In this case, we make sure to not surface the error
+            # message from the pod that is already scheduled.
+            if pod_status != 'Pending':
+                continue
+            pod_name = pod._metadata._name
             events = kubernetes.core_api().list_namespaced_event(
                 self.namespace,
                 field_selector=(f'involvedObject.name={pod_name},'
                                 'involvedObject.kind=Pod'))
             # Events created in the past hours are kept by
             # Kubernetes python client and we want to surface
             # the latest event message
-            events_desc_by_time = \
-                sorted(events.items,
+            events_desc_by_time = sorted(
+                events.items,
                 key=lambda e: e.metadata.creation_timestamp,
                 reverse=True)
             for event in events_desc_by_time:
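
For reference, a standalone sketch (not part of this diff) of the event lookup the code above performs: list the events attached to a pod and surface the most recent message, e.g. the scheduler's 'Insufficient cpu' reason. The pod name and namespace are placeholders.

# Standalone sketch of looking up the latest event for a pod; placeholders only.
from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()
events = core.list_namespaced_event(
    'default',
    field_selector='involvedObject.name=my-pod,involvedObject.kind=Pod')
events_desc_by_time = sorted(events.items,
                             key=lambda e: e.metadata.creation_timestamp,
                             reverse=True)
if events_desc_by_time:
    print(events_desc_by_time[0].message)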
@@ -200,8 +208,8 @@ def _raise_pod_scheduling_errors(self, new_nodes):
                         lf.get_label_key()
                         for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     ]
-                    if new_node.spec.node_selector:
-                        for label_key in new_node.spec.node_selector.keys():
+                    if pod.spec.node_selector:
+                        for label_key in pod.spec.node_selector.keys():
                             if label_key in gpu_lf_keys:
                                 # TODO(romilb): We may have additional node
                                 # affinity selectors in the future - in that
@@ -210,7 +218,7 @@ def _raise_pod_scheduling_errors(self, new_nodes):
                                     'didn\'t match Pod\'s node affinity/selector' in event_message:
                                     raise config.KubernetesError(
                                         f'{lack_resource_msg.format(resource="GPU")} '
-                                        f'Verify if {new_node.spec.node_selector[label_key]}'
+                                        f'Verify if {pod.spec.node_selector[label_key]}'
                                         ' is available in the cluster.')
             raise config.KubernetesError(f'{timeout_err_msg} '
                                          f'Pod status: {pod_status}'
@@ -257,9 +265,14 @@ def create_node(self, node_config, tags, count):
                 self.namespace, service_spec)
             new_svcs.append(svc)
 
-        # Wait for all pods to be ready, and if it exceeds the timeout, raise an
-        # exception. If pod's container is ContainerCreating, then we can assume
-        # that resources have been allocated and we can exit.
+        # Wait for all pods including jump pod to be ready, and if it
+        # exceeds the timeout, raise an exception. If pod's container
+        # is ContainerCreating, then we can assume that resources have been
+        # allocated and we can exit.
+        ssh_jump_pod_name = conf['metadata']['labels']['skypilot-ssh-jump']
+        jump_pod = kubernetes.core_api().read_namespaced_pod(
+            ssh_jump_pod_name, self.namespace)
+        new_nodes.append(jump_pod)
         start = time.time()
         while True:
             if time.time() - start > self.timeout:
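
The comment in the hunk above relies on the fact that a pod whose containers are in the ContainerCreating state has already been scheduled onto a node. Below is a hedged sketch of that check with the kubernetes Python client; the actual wait loop in node_provider.py is not shown in this hunk and may differ.

# Sketch only: a pod counts as 'resources allocated' once its containers are
# being created, even though the pod phase may still read 'Pending'.
from kubernetes import client, config


def containers_are_creating(pod_name: str, namespace: str) -> bool:
    config.load_kube_config()
    core = client.CoreV1Api()
    pod = core.read_namespaced_pod(pod_name, namespace)
    statuses = pod.status.container_statuses or []
    return any(s.state.waiting is not None and
               s.state.waiting.reason == 'ContainerCreating'
               for s in statuses)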
13 changes: 7 additions & 6 deletions sky/utils/kubernetes_utils.py
@@ -923,12 +923,13 @@ def find(l, predicate):
             ssh_jump_name, namespace)
         cont_ready_cond = find(ssh_jump_pod.status.conditions,
                                lambda c: c.type == 'ContainersReady')
-        if cont_ready_cond and \
-            cont_ready_cond.status == 'False':
-            # The main container is not ready. To be on the safe side
-            # and prevent a dangling ssh jump pod, lets remove it and
-            # the service. Otherwise main container is ready and its lifecycle
-            # management script takes care of the cleaning.
+        if (cont_ready_cond and cont_ready_cond.status
+                == 'False') or ssh_jump_pod.status.phase == 'Pending':
+            # Either the main container is not ready or the pod failed
+            # to schedule. To be on the safe side and prevent a dangling
+            # ssh jump pod, lets remove it and the service. Otherwise, main
+            # container is ready and its lifecycle management script takes
+            # care of the cleaning.
             kubernetes.core_api().delete_namespaced_pod(ssh_jump_name,
                                                         namespace)
             kubernetes.core_api().delete_namespaced_service(
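
A minimal sketch of the cleanup branch added above: if the jump pod never left 'Pending' or its containers are not ready, delete both the pod and its service so nothing dangles. Names are placeholders and error handling is omitted; the real cleanup lives in kubernetes_utils.py.

# Sketch of removing a dangling SSH jump pod and its service; placeholders only.
from kubernetes import client, config


def cleanup_dangling_jump_pod(name: str, namespace: str) -> None:
    config.load_kube_config()
    core = client.CoreV1Api()
    pod = core.read_namespaced_pod(name, namespace)
    ready = next((c for c in (pod.status.conditions or [])
                  if c.type == 'ContainersReady'), None)
    if (ready and ready.status == 'False') or pod.status.phase == 'Pending':
        core.delete_namespaced_pod(name, namespace)
        core.delete_namespaced_service(name, namespace)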
