Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-1.12] Don't drop traffic when upgrading a deployment fails #14840

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 32 additions & 19 deletions pkg/apis/serving/k8s_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,42 @@ func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status {
// The absence of this condition means no failure has occurred. If we find it
// below, we'll overwrite this.
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "")

for _, cond := range ds.Conditions {
// TODO(jonjohnsonjr): Should we care about appsv1.DeploymentAvailable here?
switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
conds := []appsv1.DeploymentConditionType{
appsv1.DeploymentProgressing,
appsv1.DeploymentReplicaFailure,
}

for _, wantType := range conds {
for _, cond := range ds.Conditions {
if wantType != cond.Type {
continue
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)

switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
}
}

}
}

return s
}
41 changes: 38 additions & 3 deletions pkg/apis/serving/k8s_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ func TestTransformDeploymentStatus(t *testing.T) {
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionTrue,
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}},
},
}, {
Expand Down Expand Up @@ -147,7 +149,7 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Expand All @@ -158,12 +160,45 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
}, {
name: "replica failure has priority over progressing",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate where priority is defined, I see that DeploymentConditionProgressing is the same before and after, so no change there.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'priority' here means that the replicafailure message is the last one applied so it is surfaced to the deployment's Ready condition.

ds: &appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading this I was kind of confused seeing that:

DeploymentConditionProgressing apis.ConditionType = "Progressing"
DeploymentProgressing DeploymentConditionType = "Progressing"

condition types defined in knative.dev/pkg are just two:

	// ConditionReady specifies that the resource is ready.
	// For long-running resources.
	ConditionReady ConditionType = "Ready"
	// ConditionSucceeded specifies that the resource has finished.
	// For resource which run to completion.
	ConditionSucceeded ConditionType = "Succeeded"

Also going from deployment conditions to duckv1 conditions and back
seems a bit complex, eventually we have:

func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status {
	s := &duckv1.Status{}

	depCondSet.Manage(s).InitializeConditions()
	// The absence of this condition means no failure has occurred. If we find it
	// below, we'll overwrite this.
	depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
	depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "")
....
func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentStatus) {
	ds := serving.TransformDeploymentStatus(original)
	cond := ds.GetCondition(serving.DeploymentConditionReady)
...

I am wondering if mapping deployment conditions directly to revision conditions would be more readable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if mapping deployment conditions directly to revision conditions would be more readable.

I'm open folks cleaning this up in a follow up PR.

The thing with the deployment conditions is their polarity is weird - ReplicaCreateFailure=False is actually good

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing with the deployment conditions is their polarity is weird - ReplicaCreateFailure=False is actually good

Yes, had to read that three times to get :D

Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}},
},
}}
Expand Down
7 changes: 0 additions & 7 deletions pkg/apis/serving/v1/revision_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package v1
import (
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
net "knative.dev/networking/pkg/apis/networking"
"knative.dev/pkg/kmeta"
Expand Down Expand Up @@ -144,9 +143,3 @@ func (rs *RevisionStatus) IsActivationRequired() bool {
c := revisionCondSet.Manage(rs).GetCondition(RevisionConditionActive)
return c != nil && c.Status != corev1.ConditionTrue
}

// IsReplicaSetFailure returns true if the deployment replicaset failed to create
func (rs *RevisionStatus) IsReplicaSetFailure(deploymentStatus *appsv1.DeploymentStatus) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do we cover this part?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here https://github.com/knative/serving/pull/14840/files#diff-f23ba125b15b4c6e491938d04e4d412d0b550197c51078c7de359ba7abc0da17R71

We always propagate the status now - and this is surfaced as a deployment condition

ds := serving.TransformDeploymentStatus(deploymentStatus)
return ds != nil && ds.GetCondition(serving.DeploymentConditionReplicaSetReady).IsFalse()
}
43 changes: 0 additions & 43 deletions pkg/apis/serving/v1/revision_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"testing"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"

Expand Down Expand Up @@ -268,45 +267,3 @@ func TestSetRoutingState(t *testing.T) {
t.Error("Expected default value for unparsable annotationm but got:", got)
}
}

func TestIsReplicaSetFailure(t *testing.T) {
revisionStatus := RevisionStatus{}
cases := []struct {
name string
status appsv1.DeploymentStatus
IsReplicaSetFailure bool
}{{
name: "empty deployment status should not be a failure",
status: appsv1.DeploymentStatus{},
}, {
name: "Ready deployment status should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue,
}},
},
}, {
name: "ReplicasetFailure true should be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue,
}},
},
IsReplicaSetFailure: true,
}, {
name: "ReplicasetFailure false should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionFalse,
}},
},
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got, want := revisionStatus.IsReplicaSetFailure(&tc.status), tc.IsReplicaSetFailure; got != want {
t.Errorf("IsReplicaSetFailure = %v, want: %v", got, want)
}
})
}
}
23 changes: 13 additions & 10 deletions pkg/apis/serving/v1/revision_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS

// PropagateAutoscalerStatus propagates autoscaler's status to the revision's status.
func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) {
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()

// Reflect the PA status in our own.
cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady)
rs.ActualReplicas = nil
Expand All @@ -183,20 +185,29 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
}

if cond == nil {
rs.MarkActiveUnknown("Deploying", "")
rs.MarkActiveUnknown(ReasonDeploying, "")

if !resUnavailable {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}
return
}

// Don't mark the resources available, if deployment status already determined
// it isn't so.
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()
if ps.IsScaleTargetInitialized() && !resUnavailable {
// Precondition for PA being initialized is SKS being active and
// that implies that |service.endpoints| > 0.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}

// Mark resource unavailable if we don't have a Service Name and the deployment is ready
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we somehow combine this with the above statements (from https://github.com/knative/serving/pull/14840/files#diff-831a9383e7db7880978acf31f7dfec777beb08b900b1d0e1c55a5aed42e602cbR173 down)?
It feels like both parts work on RevisionConditionResourcesAvailable and PodAutoscalerConditionReady and set rs.MarkResourcesAvailableUnknown

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or in other words, the full function is a bit hard to grasp.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you want to combine it? My hope here is to keep the conditionals straight forward. Keeping them separate helps with that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm I'd need more time to fiddle around with the current code. But maybe better to keep it here and do it on main afterwards (if even).

// This can happen when we have initial scale set to 0
if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}

switch cond.Status {
case corev1.ConditionUnknown:
rs.MarkActiveUnknown(cond.Reason, cond.Message)
Expand All @@ -222,14 +233,6 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
rs.MarkActiveFalse(cond.Reason, cond.Message)
case corev1.ConditionTrue:
rs.MarkActiveTrue()

// Precondition for PA being active is SKS being active and
// that implies that |service.endpoints| > 0.
//
// Note: This is needed for backwards compatibility as we're adding the new
// ScaleTargetInitialized condition to gate readiness.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}
}

Expand Down
68 changes: 43 additions & 25 deletions pkg/apis/serving/v1/revision_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package v1

import (
"fmt"
"sort"
"testing"

Expand Down Expand Up @@ -536,6 +537,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becomes ready, making us active.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -551,6 +553,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler flipping back to Unknown causes Active become ongoing immediately.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -566,6 +569,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becoming unready makes Active false, but doesn't affect readiness.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -582,32 +586,45 @@ func TestPropagateAutoscalerStatus(t *testing.T) {
apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t)
}

func TestPAResAvailableNoOverride(t *testing.T) {
r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) {
// Cases to test Ready condition
// we fix ScaleTargetInitialized to True
cases := []corev1.ConditionStatus{
corev1.ConditionTrue,
corev1.ConditionFalse,
corev1.ConditionUnknown,
}

// PodAutoscaler achieved initial scale.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: corev1.ConditionUnknown,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})
// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)
cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PA Status propagation overrode the ResourcesAvailable status")
for _, tc := range cases {
t.Run(fmt.Sprintf("ready is %v", tc), func(t *testing.T) {

r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: tc,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})

// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)

cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PodAutoscalerStatus propagation overrode the ResourcesAvailable status")
}
})
}
}

Expand Down Expand Up @@ -671,6 +688,7 @@ func TestPropagateAutoscalerStatusRace(t *testing.T) {

// The PodAutoscaler might have been ready but it's scaled down already.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand Down
Loading
Loading