Skip to content

Commit

Permalink
[RELEASE-1.11] Don't drop traffic when upgrading a deployment fails (k…
Browse files Browse the repository at this point in the history
…native#14864) (knative#655)

* Surface Replica failures over Progressing failures

When transforming the deployment status to the revision
we want to bubble up the more severe condition to Ready.

Since Replica failures will include a more actionable error
message this condition is preferred

* Stop always marking the revision healthy when the PA is Ready

This isn't accurate when the Revision has failed to rollout
an update to it's deployment

* Various updates to the revision reconciler

1. PA Reachability now depends on the status of the Deployment

If we have available replicas we don't mark the revision as
unreachable. This allows ongoing requests to be handled

2. Always propagate the K8s Deployment Status to the Revision.

We don't need to gate this depending on whether the Revision
required activation. Since the only two conditions we propagate
from the Deployment is Progressing and ReplicaSetFailure=False

3. Mark Revision as Deploying if the PA's service name isn't set

* test deployment failures don't drop traffic on upgrade

* fix boilerplate check

---------

Co-authored-by: Knative Prow Robot <[email protected]>
Co-authored-by: dprotaso <[email protected]>
Co-authored-by: John Doe <johndoe@localhost>
  • Loading branch information
4 people authored and ReToCode committed Mar 27, 2024
1 parent fcf9166 commit 0280701
Show file tree
Hide file tree
Showing 14 changed files with 343 additions and 100 deletions.
51 changes: 32 additions & 19 deletions pkg/apis/serving/k8s_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,42 @@ func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status {
// The absence of this condition means no failure has occurred. If we find it
// below, we'll overwrite this.
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "")

for _, cond := range ds.Conditions {
// TODO(jonjohnsonjr): Should we care about appsv1.DeploymentAvailable here?
switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
conds := []appsv1.DeploymentConditionType{
appsv1.DeploymentProgressing,
appsv1.DeploymentReplicaFailure,
}

for _, wantType := range conds {
for _, cond := range ds.Conditions {
if wantType != cond.Type {
continue
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)

switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
}
}

}
}

return s
}
41 changes: 38 additions & 3 deletions pkg/apis/serving/k8s_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ func TestTransformDeploymentStatus(t *testing.T) {
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionTrue,
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}},
},
}, {
Expand Down Expand Up @@ -147,7 +149,7 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Expand All @@ -158,12 +160,45 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
}, {
name: "replica failure has priority over progressing",
ds: &appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}},
},
}}
Expand Down
23 changes: 13 additions & 10 deletions pkg/apis/serving/v1/revision_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS

// PropagateAutoscalerStatus propagates autoscaler's status to the revision's status.
func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) {
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()

// Reflect the PA status in our own.
cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady)

Expand All @@ -183,20 +185,29 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
}

if cond == nil {
rs.MarkActiveUnknown("Deploying", "")
rs.MarkActiveUnknown(ReasonDeploying, "")

if !resUnavailable {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}
return
}

// Don't mark the resources available, if deployment status already determined
// it isn't so.
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()
if ps.IsScaleTargetInitialized() && !resUnavailable {
// Precondition for PA being initialized is SKS being active and
// that implies that |service.endpoints| > 0.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}

// Mark resource unavailable if we don't have a Service Name and the deployment is ready
// This can happen when we have initial scale set to 0
if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}

switch cond.Status {
case corev1.ConditionUnknown:
rs.MarkActiveUnknown(cond.Reason, cond.Message)
Expand All @@ -222,14 +233,6 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
rs.MarkActiveFalse(cond.Reason, cond.Message)
case corev1.ConditionTrue:
rs.MarkActiveTrue()

// Precondition for PA being active is SKS being active and
// that implies that |service.endpoints| > 0.
//
// Note: This is needed for backwards compatibility as we're adding the new
// ScaleTargetInitialized condition to gate readiness.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}
}

Expand Down
68 changes: 43 additions & 25 deletions pkg/apis/serving/v1/revision_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package v1

import (
"fmt"
"sort"
"testing"

Expand Down Expand Up @@ -536,6 +537,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becomes ready, making us active.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -551,6 +553,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler flipping back to Unknown causes Active become ongoing immediately.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -566,6 +569,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becoming unready makes Active false, but doesn't affect readiness.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -582,32 +586,45 @@ func TestPropagateAutoscalerStatus(t *testing.T) {
apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t)
}

func TestPAResAvailableNoOverride(t *testing.T) {
r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) {
// Cases to test Ready condition
// we fix ScaleTargetInitialized to True
cases := []corev1.ConditionStatus{
corev1.ConditionTrue,
corev1.ConditionFalse,
corev1.ConditionUnknown,
}

// PodAutoscaler achieved initial scale.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: corev1.ConditionUnknown,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})
// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)
cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PA Status propagation overrode the ResourcesAvailable status")
for _, tc := range cases {
t.Run(fmt.Sprintf("ready is %v", tc), func(t *testing.T) {

r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: tc,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})

// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)

cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PodAutoscalerStatus propagation overrode the ResourcesAvailable status")
}
})
}
}

Expand Down Expand Up @@ -671,6 +688,7 @@ func TestPropagateAutoscalerStatusRace(t *testing.T) {

// The PodAutoscaler might have been ready but it's scaled down already.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand Down
16 changes: 8 additions & 8 deletions pkg/reconciler/autoscaling/kpa/kpa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) {

func kpa(ns, n string, opts ...PodAutoscalerOption) *autoscalingv1alpha1.PodAutoscaler {
rev := newTestRevision(ns, n)
kpa := revisionresources.MakePA(rev)
kpa := revisionresources.MakePA(rev, nil)
kpa.Generation = 1
kpa.Annotations[autoscaling.ClassAnnotationKey] = "kpa.autoscaling.knative.dev"
kpa.Annotations[autoscaling.MetricAnnotationKey] = "concurrency"
Expand Down Expand Up @@ -1303,7 +1303,7 @@ func TestGlobalResyncOnUpdateAutoscalerConfigMap(t *testing.T) {
rev := newTestRevision(testNamespace, testRevision)
newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3)

kpa := revisionresources.MakePA(rev)
kpa := revisionresources.MakePA(rev, nil)
sks := aresources.MakeSKS(kpa, nv1a1.SKSOperationModeServe, minActivators)
sks.Status.PrivateServiceName = "bogus"
sks.Status.InitializeConditions()
Expand Down Expand Up @@ -1372,7 +1372,7 @@ func TestReconcileDeciderCreatesAndDeletes(t *testing.T) {

newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3)

kpa := revisionresources.MakePA(rev)
kpa := revisionresources.MakePA(rev, nil)
sks := sks(testNamespace, testRevision, WithDeployRef(kpa.Spec.ScaleTargetRef.Name), WithSKSReady)
fakenetworkingclient.Get(ctx).NetworkingV1alpha1().ServerlessServices(testNamespace).Create(ctx, sks, metav1.CreateOptions{})
fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{})
Expand Down Expand Up @@ -1446,7 +1446,7 @@ func TestUpdate(t *testing.T) {
fakekubeclient.Get(ctx).CoreV1().Pods(testNamespace).Create(ctx, pod, metav1.CreateOptions{})
fakefilteredpodsinformer.Get(ctx, serving.RevisionUID).Informer().GetIndexer().Add(pod)

kpa := revisionresources.MakePA(rev)
kpa := revisionresources.MakePA(rev, nil)
kpa.SetDefaults(context.Background())
fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{})
fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa)
Expand Down Expand Up @@ -1525,7 +1525,7 @@ func TestControllerCreateError(t *testing.T) {
createErr: want,
})

kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision))
kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil)
fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{})
fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa)

Expand Down Expand Up @@ -1568,7 +1568,7 @@ func TestControllerUpdateError(t *testing.T) {
createErr: want,
})

kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision))
kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil)
fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{})
fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa)

Expand Down Expand Up @@ -1610,7 +1610,7 @@ func TestControllerGetError(t *testing.T) {
getErr: want,
})

kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision))
kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil)
fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{})
fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa)

Expand Down Expand Up @@ -1649,7 +1649,7 @@ func TestScaleFailure(t *testing.T) {

// Only put the KPA in the lister, which will prompt failures scaling it.
rev := newTestRevision(testNamespace, testRevision)
kpa := revisionresources.MakePA(rev)
kpa := revisionresources.MakePA(rev, nil)
fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa)

newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3)
Expand Down
2 changes: 1 addition & 1 deletion pkg/reconciler/autoscaling/kpa/scaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ func TestDisableScaleToZero(t *testing.T) {

func newKPA(ctx context.Context, t *testing.T, servingClient clientset.Interface, revision *v1.Revision) *autoscalingv1alpha1.PodAutoscaler {
t.Helper()
pa := revisionresources.MakePA(revision)
pa := revisionresources.MakePA(revision, nil)
pa.Status.InitializeConditions()
_, err := servingClient.AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, pa, metav1.CreateOptions{})
if err != nil {
Expand Down
8 changes: 6 additions & 2 deletions pkg/reconciler/revision/cruds.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ func (c *Reconciler) createImageCache(ctx context.Context, rev *v1.Revision, con
return c.cachingclient.CachingV1alpha1().Images(image.Namespace).Create(ctx, image, metav1.CreateOptions{})
}

func (c *Reconciler) createPA(ctx context.Context, rev *v1.Revision) (*autoscalingv1alpha1.PodAutoscaler, error) {
pa := resources.MakePA(rev)
func (c *Reconciler) createPA(
ctx context.Context,
rev *v1.Revision,
deployment *appsv1.Deployment,
) (*autoscalingv1alpha1.PodAutoscaler, error) {
pa := resources.MakePA(rev, deployment)
return c.client.AutoscalingV1alpha1().PodAutoscalers(pa.Namespace).Create(ctx, pa, metav1.CreateOptions{})
}
Loading

0 comments on commit 0280701

Please sign in to comment.