fix wait for terminated pods or for minion nodes in the just-restarted cluster #11106

Closed
wants to merge 14 commits
12 changes: 11 additions & 1 deletion pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go

@@ -20,6 +20,8 @@ package kverify
import (
"context"
"fmt"
"regexp"
"strings"
"time"

core "k8s.io/api/core/v1"
@@ -43,6 +45,7 @@ func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration)
return fmt.Errorf("error listing pods in %q namespace: %w", meta.NamespaceSystem, err)
}

unstartedMinion := regexp.MustCompile(`node "(.*-m[0-9]+)" has status "Ready":"Unknown"`)
for _, pod := range pods.Items {
if time.Since(start) > timeout {
return fmt.Errorf("timed out waiting %v for all system-critical and pods with labels %v to be %q", timeout, labels, core.NodeReady)
@@ -63,7 +66,14 @@ func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration)
}
if match || pod.Spec.PriorityClassName == "system-cluster-critical" || pod.Spec.PriorityClassName == "system-node-critical" {
if err := waitPodCondition(cs, pod.Name, pod.Namespace, core.PodReady, timeout); err != nil {
Member:

How about, instead of waiting till it fails and then checking the error message to see if it is a "Terminated pod", at the beginning only ranging through pods which are not Terminated?

	if err != nil {
		return fmt.Errorf("error listing pods in %q namespace: %w", meta.NamespaceSystem, err)
	}

Member:

or, as @ilya-zuyev noted, the for _, pod := range pods.Items { loop might have a condition we could check (see the sketch below)
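
A minimal sketch of that idea (not code from this PR), assuming a hypothetical shouldWait helper and using the same k8s.io/api/core/v1 types the file already imports; the pod names are illustrative:

package main

import (
	"fmt"

	core "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// shouldWait reports whether WaitExtra would need to wait on a pod:
// Succeeded and Failed pods have already terminated, so there is
// nothing left to wait for.
func shouldWait(pod core.Pod) bool {
	return pod.Status.Phase != core.PodSucceeded && pod.Status.Phase != core.PodFailed
}

func main() {
	pods := []core.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "kube-proxy-frz7b"}, Status: core.PodStatus{Phase: core.PodRunning}},
		{ObjectMeta: metav1.ObjectMeta{Name: "completed-job-pod"}, Status: core.PodStatus{Phase: core.PodSucceeded}},
	}
	for _, pod := range pods {
		// range only over pods that have not terminated
		fmt.Printf("%s: wait=%v\n", pod.Name, shouldWait(pod))
	}
}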

Contributor Author:

I think something like that (i.e., 'skipping!') was my original intention:

if pod.Status.Phase != core.PodRunning && pod.Status.Phase != core.PodPending {
return core.ConditionUnknown, fmt.Sprintf("pod %q in %q namespace has status phase %q (skipping!): %+v", pod.Name, pod.Namespace, pod.Status.Phase, pod.Status)
}

but it somehow got lost along the way (when I also added host checking)

I'll have a look...

Contributor Author:

@medyagh @ilya-zuyev thanks for catching this!
It shouldn't have waited on non-running and non-pending pods because of:

if pod.Status.Phase != core.PodRunning && pod.Status.Phase != core.PodPending {
return core.ConditionUnknown, fmt.Sprintf("pod %q in %q namespace has status phase %q (skipping!): %+v", pod.Name, pod.Namespace, pod.Status.Phase, pod.Status)
}

and
if status == core.ConditionUnknown {
klog.Info(reason)
return false, fmt.Errorf(reason)
}

then
if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkCondition); err != nil {
return fmt.Errorf("waitPodCondition: %w", err)
}

would exit immediately (due to the error returned from checkCondition()), but I'd missed that specific case in:
terminatedPod := strings.HasSuffix(err.Error(), "not found")
if !unstartedMinion.MatchString(err.Error()) && !terminatedPod {

so I've corrected that in 6779c72 in #11209 to test it there first.
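
For context on the immediate exit: wait.PollImmediate (from k8s.io/apimachinery/pkg/util/wait) stops polling as soon as its condition func returns a non-nil error, rather than retrying until timeout. A standalone sketch (the error text here is illustrative):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	calls := 0
	// mimics checkCondition returning an error when the pod status
	// phase is neither Running nor Pending (ConditionUnknown)
	err := wait.PollImmediate(100*time.Millisecond, 5*time.Second, func() (done bool, err error) {
		calls++
		return false, fmt.Errorf("pod has status phase %q (skipping!)", "Succeeded")
	})
	// the poll stops after a single call and surfaces that error
	fmt.Println(calls, err) // 1 pod has status phase "Succeeded" (skipping!)
}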

Contributor Author:

@ilya-zuyev @medyagh actually, this was OK - no need to change anything here, as wait works as intended: it will skip if the pod is gone or if the just-restarted minion node is not yet ready (the detailed explanation above still holds).
I'll close this one in favour of #11209.
/close
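
To see the corrected classification in action, here is a self-contained sketch that runs the PR's unstartedMinion regexp and "not found" suffix check against the two example errors quoted in the code comment below:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	unstartedMinion := regexp.MustCompile(`node "(.*-m[0-9]+)" has status "Ready":"Unknown"`)

	errs := []string{
		`waitPodCondition: node "docker-containerd-multinode-wait-m02" hosting pod "kube-proxy-frz7b" in "kube-system" namespace is currently not "Ready": node "docker-containerd-multinode-wait-m02" has status "Ready":"Unknown"`,
		`waitPodCondition: error getting pod "coredns-74ff55c5b-57fhw" in "kube-system" namespace: pods "coredns-74ff55c5b-57fhw" not found`,
	}
	for _, e := range errs {
		terminatedPod := strings.HasSuffix(e, "not found")
		ignore := unstartedMinion.MatchString(e) || terminatedPod
		// both example errors should be ignored rather than logged
		fmt.Println(ignore) // true, true
	}
}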

klog.Errorf("WaitExtra: %v", err)
// ignore unstarted minion nodes and terminated pods, eg:
// error: `waitPodCondition: node "docker-containerd-multinode-wait-m02" hosting pod "kube-proxy-frz7b" in "kube-system" namespace is currently not "Ready": node "docker-containerd-multinode-wait-m02" has status "Ready":"Unknown"`, or
// error: `waitPodCondition: error getting pod "coredns-74ff55c5b-57fhw" in "kube-system" namespace: pods "coredns-74ff55c5b-57fhw" not found`
terminatedPod := strings.HasSuffix(err.Error(), "not found")
if !unstartedMinion.MatchString(err.Error()) && !terminatedPod {
klog.Errorf("WaitExtra: %v", err)
}
continue
}
break
}
47 changes: 24 additions & 23 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go

@@ -681,6 +681,30 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrap(err, "runtime")
}

// We must ensure that the apiserver is healthy before proceeding
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}

if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver health")
}

// because reboots clear /etc/cni
if err := k.applyCNI(cfg); err != nil {
return errors.Wrap(err, "apply cni")
}

if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}

// must be called after applyCNI
if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears to be immediately Ready, as do all kube-system pods (last observed state),
@@ -709,29 +733,6 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrap(err, "runtime")
}

// We must ensure that the apiserver is healthy before proceeding
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}

if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver health")
}

// because reboots clear /etc/cni
if err := k.applyCNI(cfg); err != nil {
return errors.Wrap(err, "apply cni")
}

if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}

if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
}