fix status-check to return success only on exact success criteria match (#6010)

* fix status-check to return success only on exact success criteria match

* improve coverage

* address PR review comments

* add UT
gsquared94 authored Jun 15, 2021
1 parent 7f754f6 commit f76873c
Showing 3 changed files with 217 additions and 35 deletions.
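What changed, in short: the old getPodStatus in pkg/diag/validator/pod.go only inspected PodScheduled conditions and fell through to STATUSCHECK_SUCCESS when none of them matched, so a pod whose PodReady condition was still False could be reported as deployed successfully. After this commit, success requires an explicit PodReady condition with status True; otherwise the scheduling failure or the container statuses are examined, and an undeterminable state is reported as unknown. A minimal sketch of that criterion, not taken from the commit (the real helper is the unexported isPodReady added below; podReady here is a hypothetical stand-in):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// podReady restates the success criterion introduced by this commit:
// a pod only counts as successful when a PodReady condition with status True is present.
func podReady(pod *v1.Pod) bool {
    for _, c := range pod.Status.Conditions {
        if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
            return true
        }
    }
    return false
}

func main() {
    // A running pod whose PodReady condition is still False. Before the fix, getPodStatus
    // only looked at PodScheduled conditions and fell through to success for this pod;
    // now it is not an exact success match, so its container statuses are checked instead.
    pod := &v1.Pod{Status: v1.PodStatus{
        Phase:      v1.PodRunning,
        Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionFalse}},
    }}
    fmt.Println(podReady(pod)) // false
}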
104 changes: 77 additions & 27 deletions pkg/diag/validator/pod.go
@@ -120,37 +120,87 @@ func (p *PodValidator) getPodStatus(pod *v1.Pod) *podStatus {

func getPodStatus(pod *v1.Pod) (proto.StatusCode, []string, error) {
// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions
- for _, c := range pod.Status.Conditions {
- if c.Type == v1.PodScheduled {
- switch c.Status {
- case v1.ConditionFalse:
- logrus.Debugf("Pod %q not scheduled: checking tolerations", pod.Name)
- sc, err := getUntoleratedTaints(c.Reason, c.Message)
- return sc, nil, err
- case v1.ConditionTrue:
- logrus.Debugf("Pod %q scheduled: checking container statuses", pod.Name)
- // TODO(dgageot): Add EphemeralContainerStatuses
- cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
- // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
- statusCode, logs, err := getContainerStatus(pod, cs)
- if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
- // Determine if an init container is still running and fetch the init logs.
- for _, c := range pod.Status.InitContainerStatuses {
- if c.State.Waiting != nil {
- return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
- } else if c.State.Running != nil {
- return statusCode, getPodLogs(pod, c.Name), fmt.Errorf("waiting for init container %s to complete", c.Name)
- }
- }
- }
- return statusCode, logs, err
- case v1.ConditionUnknown:
- logrus.Debugf("Pod %q scheduling condition is unknown", pod.Name)
- return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
- }
- }
- }
- return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
+ // If the event type PodReady with status True is found then we return success immediately
+ if isPodReady(pod) {
+ return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
+ }
+ // If the event type PodScheduled with status False is found then we check if it is due to taints and tolerations.
+ if c, ok := isPodNotScheduled(pod); ok {
+ logrus.Debugf("Pod %q not scheduled: checking tolerations", pod.Name)
+ sc, err := getUntoleratedTaints(c.Reason, c.Message)
+ return sc, nil, err
+ }
+ // we can check the container status if the pod has been scheduled successfully. This can be determined by having the event
+ // PodScheduled with status True, or a ContainerReady or PodReady event with status False.
+ if isPodScheduledButNotReady(pod) {
+ logrus.Debugf("Pod %q scheduled but not ready: checking container statuses", pod.Name)
+ // TODO(dgageot): Add EphemeralContainerStatuses
+ cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
+ // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
+ statusCode, logs, err := getContainerStatus(pod, cs)
+ if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
+ // Determine if an init container is still running and fetch the init logs.
+ for _, c := range pod.Status.InitContainerStatuses {
+ if c.State.Waiting != nil {
+ return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
+ } else if c.State.Running != nil {
+ return statusCode, getPodLogs(pod, c.Name), fmt.Errorf("waiting for init container %s to complete", c.Name)
+ }
+ }
+ }
+ return statusCode, logs, err
+ }
+ if c, ok := isPodStatusUnknown(pod); ok {
+ logrus.Debugf("Pod %q condition status of type %s is unknown", pod.Name, c.Type)
+ return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
+ }
+ logrus.Debugf("Unable to determine current service state of pod %q", pod.Name)
+ return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf("unable to determine current service state of pod %q", pod.Name)
}

+ func isPodReady(pod *v1.Pod) bool {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
+ return true
+ }
+ }
+ return false
+ }

+ func isPodNotScheduled(pod *v1.Pod) (v1.PodCondition, bool) {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodScheduled && c.Status == v1.ConditionFalse {
+ return c, true
+ }
+ }
+ return v1.PodCondition{}, false
+ }

+ func isPodScheduledButNotReady(pod *v1.Pod) bool {
+ for _, c := range pod.Status.Conditions {
+ if c.Type == v1.PodScheduled && c.Status == v1.ConditionTrue {
+ return true
+ }
+ if c.Type == v1.ContainersReady && c.Status == v1.ConditionFalse {
+ return true
+ }
+ if c.Type == v1.PodReady && c.Status == v1.ConditionFalse {
+ return true
+ }
+ }
+ return false
+ }

+ func isPodStatusUnknown(pod *v1.Pod) (v1.PodCondition, bool) {
+ for _, c := range pod.Status.Conditions {
+ if c.Status == v1.ConditionUnknown {
+ return c, true
+ }
+ }
+ return v1.PodCondition{}, false
+ }

func getContainerStatus(po *v1.Pod, cs []v1.ContainerStatus) (proto.StatusCode, []string, error) {
140 changes: 134 additions & 6 deletions pkg/diag/validator/pod_test.go
@@ -190,6 +190,25 @@ func TestRun(t *testing.T) {
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "all pod containers are ready",
pods: []*v1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: "test",
},
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
Status: v1.PodStatus{
Phase: v1.PodSucceeded,
Conditions: []v1.PodCondition{{Type: v1.ContainersReady, Status: v1.ConditionTrue}},
},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Succeeded",
proto.ActionableErr{
Message: "",
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "One of the pod containers is in Terminated State",
pods: []*v1.Pod{{
@@ -218,6 +237,36 @@ func TestRun(t *testing.T) {
ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
}, nil)},
},
{
description: "one of the pod containers is in Terminated State with non zero exit code but pod condition is Ready",
pods: []*v1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: "test",
},
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
Status: v1.PodStatus{
Phase: v1.PodRunning,
Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionFalse}},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "foo-container",
Image: "foo-image",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1, Message: "panic caused"},
},
},
}},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Running",
proto.ActionableErr{
Message: "container foo-container terminated with exit code 1",
ErrCode: proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED,
Suggestions: []*proto.Suggestion{
{SuggestionCode: proto.SuggestionCode_CHECK_CONTAINER_LOGS,
Action: "Try checking container logs"},
}}, []string{})},
},
{
description: "one of the pod containers is in Terminated State with non zero exit code",
pods: []*v1.Pod{{
@@ -307,17 +356,26 @@ func TestRun(t *testing.T) {
Status: v1.PodStatus{
Phase: v1.PodPending,
Conditions: []v1.PodCondition{{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: v1.PodReasonUnschedulable,
- Message: "0/2 nodes are available: 1 node(s) had taint {node.kubernetes.io/disk-pressure: }, that the pod didn't tolerate, 1 node(s) had taint {node.kubernetes.io/unreachable: }, that the pod didn't tolerate",
+ Message: "0/7 nodes are available: " +
+ "1 node(s) had taint {node.kubernetes.io/memory-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/disk-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/pid-pressure: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/not-ready: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/unreachable: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/unschedulable: }, that the pod didn't tolerate, " +
+ "1 node(s) had taint {node.kubernetes.io/network-unavailable: }, that the pod didn't tolerate, ",
}},
},
}},
expected: []Resource{NewResource("test", "Pod", "foo", "Pending",
proto.ActionableErr{
- Message: "Unschedulable: 0/2 nodes available: 1 node has disk pressure, 1 node is unreachable",
- ErrCode: proto.StatusCode_STATUSCHECK_NODE_DISK_PRESSURE,
+ Message: "Unschedulable: 0/7 nodes available: 1 node has memory pressure, " +
+ "1 node has disk pressure, 1 node has PID pressure, 1 node is not ready, " +
+ "1 node is unreachable, 1 node is unschedulable, 1 node's network not available",
+ ErrCode: proto.StatusCode_STATUSCHECK_NODE_PID_PRESSURE,
}, nil)},
},
{
@@ -675,3 +733,73 @@ func testPodValidator(k kubernetes.Interface, _ map[string]string) *PodValidator
rs := []Recommender{recommender.ContainerError{}}
return &PodValidator{k: k, recos: rs}
}

func TestPodConditionChecks(t *testing.T) {
tests := []struct {
description string
conditions []v1.PodCondition
expected result
}{
{
description: "pod is ready",
conditions: []v1.PodCondition{
{Type: v1.PodReady, Status: v1.ConditionTrue},
{Type: v1.ContainersReady, Status: v1.ConditionTrue},
},
expected: result{isReady: true},
},
{
description: "pod scheduling failed",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionFalse},
},
expected: result{isNotScheduled: true},
},
{
description: "pod scheduled with no ready event",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionTrue},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod is scheduled, with failed containers ready event",
conditions: []v1.PodCondition{
{Type: v1.ContainersReady, Status: v1.ConditionFalse},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod is scheduled, with failed pod ready event",
conditions: []v1.PodCondition{
{Type: v1.PodReady, Status: v1.ConditionFalse},
},
expected: result{iScheduledNotReady: true},
},
{
description: "pod status is unknown",
conditions: []v1.PodCondition{
{Type: v1.PodScheduled, Status: v1.ConditionUnknown},
},
expected: result{isUnknown: true},
},
}
for _, test := range tests {
testutil.Run(t, test.description, func(t *testutil.T) {
pod := v1.Pod{Status: v1.PodStatus{Conditions: test.conditions}}
r := result{}
r.isReady = isPodReady(&pod)
_, r.isNotScheduled = isPodNotScheduled(&pod)
r.iScheduledNotReady = isPodScheduledButNotReady(&pod)
_, r.isUnknown = isPodStatusUnknown(&pod)
t.CheckDeepEqual(test.expected, r, cmp.AllowUnexported(result{}))
})
}
}

type result struct {
isReady bool
isNotScheduled bool
iScheduledNotReady bool
isUnknown bool
}
8 changes: 6 additions & 2 deletions pkg/skaffold/deploy/status/status_check.go
@@ -184,6 +184,8 @@ func getDeployments(ctx context.Context, client kubernetes.Interface, ns string,

func pollDeploymentStatus(ctx context.Context, cfg pkgkubectl.Config, r *resource.Deployment) {
pollDuration := time.Duration(defaultPollPeriodInMilliseconds) * time.Millisecond
ticker := time.NewTicker(pollDuration)
defer ticker.Stop()
// Add poll duration to account for one last attempt after progressDeadlineSeconds.
timeoutContext, cancel := context.WithTimeout(ctx, r.Deadline()+pollDuration)
logrus.Debugf("checking status %s", r)
@@ -204,7 +206,7 @@ func pollDeploymentStatus(ctx context.Context, cfg pkgkubectl.Config, r *resourc
})
}
return
- case <-time.After(pollDuration):
+ case <-ticker.C:
r.CheckStatus(timeoutContext, cfg)
if r.IsStatusCheckCompleteOrCancelled() {
return
@@ -270,12 +272,14 @@ func (s statusChecker) printStatusCheckSummary(out io.Writer, r *resource.Deploy

// printDeploymentStatus prints resource statuses until all status check are completed or context is cancelled.
func (s statusChecker) printDeploymentStatus(ctx context.Context, out io.Writer, deployments []*resource.Deployment) {
ticker := time.NewTicker(reportStatusTime)
defer ticker.Stop()
for {
var allDone bool
select {
case <-ctx.Done():
return
- case <-time.After(reportStatusTime):
+ case <-ticker.C:
allDone = s.printStatus(deployments, out)
}
if allDone {
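A side note on the status_check.go change (the rationale is not spelled out in the commit message): each call to time.After inside a polling loop creates a new timer which, per the time package documentation, is not reclaimed by the garbage collector until it fires, whereas a single time.Ticker created before the loop and stopped with defer is reused across iterations. A small self-contained sketch of that pattern follows; the names poll and pollOnce are illustrative, not from the Skaffold code:

package main

import (
    "context"
    "fmt"
    "time"
)

// poll runs pollOnce on a fixed interval until it reports completion or ctx is done.
func poll(ctx context.Context, interval time.Duration, pollOnce func() bool) {
    ticker := time.NewTicker(interval)
    defer ticker.Stop() // release the ticker when polling ends

    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            if pollOnce() {
                return
            }
        }
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()

    attempts := 0
    poll(ctx, 100*time.Millisecond, func() bool {
        attempts++
        fmt.Println("poll attempt", attempts)
        return attempts >= 3 // pretend the status check completes on the third attempt
    })
}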
