fix status-check to return success only on exact success criteria match (#6010)

* fix status-check to return success only on exact success criteria match

* improve coverage

* address PR review comments

* add UT
gsquared94 authored Jun 15, 2021
1 parent 7f754f6 commit f76873c
Showing 3 changed files with 217 additions and 35 deletions.
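What changed, in short: the old getPodStatus in pkg/diag/validator/pod.go only inspected PodScheduled conditions and fell through to STATUSCHECK_SUCCESS when none of them matched, so a pod whose PodReady condition was still False could be reported as deployed successfully. After this commit, success requires an explicit PodReady condition with status True; otherwise the scheduling failure or the container statuses are examined, and an undeterminable state is reported as unknown. A minimal sketch of that criterion, not taken from the commit (the real helper is the unexported isPodReady added below; podReady here is a hypothetical stand-in):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// podReady restates the success criterion introduced by this commit:
// a pod only counts as successful when a PodReady condition with status True is present.
func podReady(pod *v1.Pod) bool {
    for _, c := range pod.Status.Conditions {
        if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
            return true
        }
    }
    return false
}

func main() {
    // A running pod whose PodReady condition is still False. Before the fix, getPodStatus
    // only looked at PodScheduled conditions and fell through to success for this pod;
    // now it is not an exact success match, so its container statuses are checked instead.
    pod := &v1.Pod{Status: v1.PodStatus{
        Phase:      v1.PodRunning,
        Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionFalse}},
    }}
    fmt.Println(podReady(pod)) // false
}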
104 changes: 77 additions & 27 deletions pkg/diag/validator/pod.go
@@ -120,37 +120,87 @@ func (p *PodValidator) getPodStatus(pod *v1.Pod) *podStatus {

func getPodStatus(pod *v1.Pod) (proto.StatusCode, []string, error) {
// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions
- for _, c := range pod.Status.Conditions {
- if c.Type == v1.PodScheduled {
- switch c.Status {
- case v1.ConditionFalse:
- logrus.Debugf("Pod %q not scheduled: checking tolerations", pod.Name)
- sc, err := getUntoleratedTaints(c.Reason, c.Message)
- return sc, nil, err
- case v1.ConditionTrue:
- logrus.Debugf("Pod %q scheduled: checking container statuses", pod.Name)
- // TODO(dgageot): Add EphemeralContainerStatuses
- cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
- // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
- statusCode, logs, err := getContainerStatus(pod, cs)
- if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
- // Determine if an init container is still running and fetch the init logs.
- for _, c := range pod.Status.InitContainerStatuses {
- if c.State.Waiting != nil {
- return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
- } else if c.State.Running != nil {
- return statusCode, getPodLogs(pod, c.Name), fmt.Errorf("waiting for init container %s to complete", c.Name)
- }
- }
- }
- return statusCode, logs, err
- case v1.ConditionUnknown:
- logrus.Debugf("Pod %q scheduling condition is unknown", pod.Name)
- return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
- }
- }
- }
- return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
+ // If the event type PodReady with status True is found then we return success immediately
+ if isPodReady(pod) {
+ return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
+ }
+ // If the event type PodScheduled with status False is found then we check if it is due to taints and tolerations.
+ if c, ok := isPodNotScheduled(pod); ok {
+ logrus.Debugf("Pod %q not scheduled: checking tolerations", pod.Name)
+ sc, err := getUntoleratedTaints(c.Reason, c.Message)
+ return sc, nil, err
+ }
+ // we can check the container status if the pod has been scheduled successfully. This can be determined by having the event
+ // PodScheduled with status True, or a ContainerReady or PodReady event with status False.
+ if isPodScheduledButNotReady(pod) {
+ logrus.Debugf("Pod %q scheduled but not ready: checking container statuses", pod.Name)
+ // TODO(dgageot): Add EphemeralContainerStatuses
+ cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
+ // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
+ statusCode, logs, err := getContainerStatus(pod, cs)
+ if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
+ // Determine if an init container is still running and fetch the init logs.
+ for _, c := range pod.Status.InitContainerStatuses {
+ if c.State.Waiting != nil {
+ return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
+ } else if c.State.Running != nil {
+ return statusCode, getPodLogs(pod, c.Name), fmt.Errorf("waiting for init container %s to complete", c.Name)
+ }
+ }
+ }
+ return statusCode, logs, err
+ }
+ if c, ok := isPodStatusUnknown(pod); ok {
+ logrus.Debugf("Pod %q condition status of type %s is unknown", pod.Name, c.Type)
+ return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
+ }
+ logrus.Debugf("Unable to determine current service state of pod %q", pod.Name)
+ return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf("unable to determine current service state of pod %q", pod.Name)
}

+ func isPodReady(pod *v1.Pod) bool {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
+ return true
+ }
+ }
+ return false
+ }

+ func isPodNotScheduled(pod *v1.Pod) (v1.PodCondition, bool) {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodScheduled && c.Status == v1.ConditionFalse {
+ return c, true
+ }
+ }
+ return v1.PodCondition{}, false
+ }

+ func isPodScheduledButNotReady(pod *v1.Pod) bool {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodScheduled && c.Status == v1.ConditionTrue {
+ return true
+ }
+ if c.Type == v1.ContainersReady && c.Status == v1.ConditionFalse {
+ return true
+ }
+ if c.Type == v1.PodReady && c.Status == v1.ConditionFalse {
+ return true
+ }
+ }
+ return false
+ }

+ func isPodStatusUnknown(pod *v1.Pod) (v1.PodCondition, bool) {
+ for _, c := range pod.Status.Conditions {
+ if c.Status == v1.ConditionUnknown {
+ return c, true
+ }
+ }
+ return v1.PodCondition{}, false
+ }

func getContainerStatus(po *v1.Pod, cs []v1.ContainerStatus) (proto.StatusCode, []string, error) {
140 changes: 134 additions & 6 deletions pkg/diag/validator/pod_test.go
@@ -190,6 +190,25 @@ func TestRun(t *testing.T) {
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "all pod containers are ready",
pods: []*v1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: "test",
},
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
Status: v1.PodStatus{
Phase: v1.PodSucceeded,
Conditions: []v1.PodCondition{{Type: v1.ContainersReady, Status: v1.ConditionTrue}},
},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Succeeded",
proto.ActionableErr{
Message: "",
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "One of the pod containers is in Terminated State",
pods: []*v1.Pod{{
@@ -218,6 +237,36 @@ func TestRun(t *testing.T) {
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "one of the pod containers is in Terminated State with non zero exit code but pod condition is Ready",
pods: []*v1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: "test",
},
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
Status: v1.PodStatus{
Phase: v1.PodRunning,
Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionFalse}},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "foo-container",
Image: "foo-image",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1, Message: "panic caused"},
},
},
}},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Running",
proto.ActionableErr{
Message: "container foo-container terminated with exit code 1",
ErrCode: proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED,
Suggestions: []*proto.Suggestion{
{SuggestionCode: proto.SuggestionCode_CHECK_CONTAINER_LOGS,
Action: "Try checking container logs"},
}}, []string{})},
},
{
description: "one of the pod containers is in Terminated State with non zero exit code",
pods: []*v1.Pod{{
@@ -307,17 +356,26 @@ func TestRun(t *testing.T) {
Status: v1.PodStatus{
Phase: v1.PodPending,
Conditions: []v1.PodCondition{{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: v1.PodReasonUnschedulable,
- Message: "0/2 nodes are available: 1 node(s) had taint {node.kubernetes.io/disk-pressure: }, that the pod didn't tolerate, 1 node(s) had taint {node.kubernetes.io/unreachable: }, that the pod didn't tolerate",
+ Message: "0/7 nodes are available: " +
+ "1 node(s) had taint {node.kubernetes.io/memory-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/disk-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/pid-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/not-ready: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/unreachable: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/unschedulable: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/network-unavailable: }, that the pod didn't tolerate, ",
}},
},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Pending",
proto.ActionableErr{
- Message: "Unschedulable: 0/2 nodes available: 1 node has disk pressure, 1 node is unreachable",
- ErrCode: proto.StatusCode_STATUSCHECK_NODE_DISK_PRESSURE,
+ Message: "Unschedulable: 0/7 nodes available: 1 node has memory pressure, " +
+ "1 node has disk pressure, 1 node has PID pressure, 1 node is not ready, " +
+ "1 node is unreachable, 1 node is unschedulable, 1 node's network not available",
+ ErrCode: proto.StatusCode_STATUSCHECK_NODE_PID_PRESSURE,
}, nil)},
},
{
@@ -675,3 +733,73 @@ func testPodValidator(k kubernetes.Interface, _ map[string]string) *PodValidator
rs := []Recommender{recommender.ContainerError{}}
return &PodValidator{k: k, recos: rs}
}

func TestPodConditionChecks(t *testing.T) {
tests := []struct {
description string
conditions []v1.PodCondition
expected result
}{
{
description: "pod is ready",
conditions: []v1.PodCondition{
{Type: v1.PodReady, Status: v1.ConditionTrue},
{Type: v1.ContainersReady, Status: v1.ConditionTrue},
},
expected: result{isReady: true},
},
{
description: "pod scheduling failed",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionFalse},
},
expected: result{isNotScheduled: true},
},
{
description: "pod scheduled with no ready event",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionTrue},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod is scheduled, with failed containers ready event",
conditions: []v1.PodCondition{
{Type: v1.ContainersReady, Status: v1.ConditionFalse},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod is scheduled, with failed pod ready event",
conditions: []v1.PodCondition{
{Type: v1.PodReady, Status: v1.ConditionFalse},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod status is unknown",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionUnknown},
},
expected: result{isUnknown: true},
},
}
for _, test := range tests {
testutil.Run(t, test.description, func(t *testutil.T) {
pod := v1.Pod{Status: v1.PodStatus{Conditions: test.conditions}}
r := result{}
r.isReady = isPodReady(&pod)
_, r.isNotScheduled = isPodNotScheduled(&pod)
r.iScheduledNotReady = isPodScheduledButNotReady(&pod)
_, r.isUnknown = isPodStatusUnknown(&pod)
t.CheckDeepEqual(test.expected, r, cmp.AllowUnexported(result{}))
})
}
}

type result struct {
isReady bool
isNotScheduled bool
iScheduledNotReady bool
isUnknown bool
}
8 changes: 6 additions & 2 deletions pkg/skaffold/deploy/status/status_check.go
@@ -184,6 +184,8 @@ func getDeployments(ctx context.Context, client kubernetes.Interface, ns string,

func pollDeploymentStatus(ctx context.Context, cfg pkgkubectl.Config, r *resource.Deployment) {
pollDuration := time.Duration(defaultPollPeriodInMilliseconds) * time.Millisecond
ticker := time.NewTicker(pollDuration)
defer ticker.Stop()
// Add poll duration to account for one last attempt after progressDeadlineSeconds.
timeoutContext, cancel := context.WithTimeout(ctx, r.Deadline()+pollDuration)
logrus.Debugf("checking status %s", r)
@@ -204,7 +206,7 @@ func pollDeploymentStatus(ctx context.Context, cfg pkgkubectl.Config, r *resourc
})
}
return
- case <-time.After(pollDuration):
+ case <-ticker.C:
r.CheckStatus(timeoutContext, cfg)
if r.IsStatusCheckCompleteOrCancelled() {
return
@@ -270,12 +272,14 @@ func (s statusChecker) printStatusCheckSummary(out io.Writer, r *resource.Deploy

// printDeploymentStatus prints resource statuses until all status check are completed or context is cancelled.
func (s statusChecker) printDeploymentStatus(ctx context.Context, out io.Writer, deployments []*resource.Deployment) {
ticker := time.NewTicker(reportStatusTime)
defer ticker.Stop()
for {
var allDone bool
select {
case <-ctx.Done():
return
- case <-time.After(reportStatusTime):
+ case <-ticker.C:
allDone = s.printStatus(deployments, out)
}
if allDone {
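A side note on the status_check.go change (the rationale is not spelled out in the commit message): each call to time.After inside a polling loop creates a new timer which, per the time package documentation, is not reclaimed by the garbage collector until it fires, whereas a single time.Ticker created before the loop and stopped with defer is reused across iterations. A small self-contained sketch of that pattern follows; the names poll and pollOnce are illustrative, not from the Skaffold code:

package main

import (
    "context"
    "fmt"
    "time"
)

// poll runs pollOnce on a fixed interval until it reports completion or ctx is done.
func poll(ctx context.Context, interval time.Duration, pollOnce func() bool) {
    ticker := time.NewTicker(interval)
    defer ticker.Stop() // release the ticker when polling ends

    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            if pollOnce() {
                return
            }
        }
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()

    attempts := 0
    poll(ctx, 100*time.Millisecond, func() bool {
        attempts++
        fmt.Println("poll attempt", attempts)
        return attempts >= 3 // pretend the status check completes on the third attempt
    })
}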
