diff --git a/pkg/apis/serving/k8s_lifecycle.go b/pkg/apis/serving/k8s_lifecycle.go index 9703eb8054d2..f63060c8b521 100644 --- a/pkg/apis/serving/k8s_lifecycle.go +++ b/pkg/apis/serving/k8s_lifecycle.go @@ -50,29 +50,42 @@ func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status { // The absence of this condition means no failure has occurred. If we find it // below, we'll overwrite this. depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady) + depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "") - for _, cond := range ds.Conditions { - // TODO(jonjohnsonjr): Should we care about appsv1.DeploymentAvailable here? - switch cond.Type { - case appsv1.DeploymentProgressing: - switch cond.Status { - case corev1.ConditionUnknown: - depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message) - case corev1.ConditionTrue: - depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing) - case corev1.ConditionFalse: - depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message) + conds := []appsv1.DeploymentConditionType{ + appsv1.DeploymentProgressing, + appsv1.DeploymentReplicaFailure, + } + + for _, wantType := range conds { + for _, cond := range ds.Conditions { + if wantType != cond.Type { + continue } - case appsv1.DeploymentReplicaFailure: - switch cond.Status { - case corev1.ConditionUnknown: - depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) - case corev1.ConditionTrue: - depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) - case corev1.ConditionFalse: - depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady) + + switch cond.Type { + case appsv1.DeploymentProgressing: + switch cond.Status { + case corev1.ConditionUnknown: + depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message) + case corev1.ConditionTrue: + depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing) + case corev1.ConditionFalse: + depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message) + } + case appsv1.DeploymentReplicaFailure: + switch cond.Status { + case corev1.ConditionUnknown: + depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) + case corev1.ConditionTrue: + depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) + case corev1.ConditionFalse: + depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady) + } } + } } + return s } diff --git a/pkg/apis/serving/k8s_lifecycle_test.go b/pkg/apis/serving/k8s_lifecycle_test.go index fcb76f327095..344e95d9482a 100644 --- a/pkg/apis/serving/k8s_lifecycle_test.go +++ b/pkg/apis/serving/k8s_lifecycle_test.go @@ -40,12 +40,14 @@ func TestTransformDeploymentStatus(t *testing.T) { Conditions: []apis.Condition{{ Type: DeploymentConditionProgressing, Status: corev1.ConditionUnknown, + Reason: "Deploying", }, { Type: DeploymentConditionReplicaSetReady, Status: corev1.ConditionTrue, }, { Type: DeploymentConditionReady, Status: corev1.ConditionUnknown, + Reason: "Deploying", }}, }, }, { @@ -147,7 +149,7 @@ func TestTransformDeploymentStatus(t *testing.T) { Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", }}, }, want: &duckv1.Status{ @@ -158,12 +160,45 @@ func TestTransformDeploymentStatus(t *testing.T) { Type: DeploymentConditionReplicaSetReady, Status: corev1.ConditionFalse, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", }, { Type: DeploymentConditionReady, Status: corev1.ConditionFalse, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", + }}, + }, + }, { + name: "replica failure has priority over progressing", + ds: &appsv1.DeploymentStatus{ + Conditions: []appsv1.DeploymentCondition{{ + Type: appsv1.DeploymentReplicaFailure, + Status: corev1.ConditionTrue, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", + }, { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressingReason", + Message: "Something bad happened", + }}, + }, + want: &duckv1.Status{ + Conditions: []apis.Condition{{ + Type: DeploymentConditionProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressingReason", + Message: "Something bad happened", + }, { + Type: DeploymentConditionReplicaSetReady, + Status: corev1.ConditionFalse, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", + }, { + Type: DeploymentConditionReady, + Status: corev1.ConditionFalse, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", }}, }, }} diff --git a/pkg/apis/serving/v1/revision_lifecycle.go b/pkg/apis/serving/v1/revision_lifecycle.go index 48d2cbb1fb02..da3d16ce6114 100644 --- a/pkg/apis/serving/v1/revision_lifecycle.go +++ b/pkg/apis/serving/v1/revision_lifecycle.go @@ -169,6 +169,8 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS // PropagateAutoscalerStatus propagates autoscaler's status to the revision's status. func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) { + resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse() + // Reflect the PA status in our own. cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady) @@ -183,13 +185,16 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA } if cond == nil { - rs.MarkActiveUnknown("Deploying", "") + rs.MarkActiveUnknown(ReasonDeploying, "") + + if !resUnavailable { + rs.MarkResourcesAvailableUnknown(ReasonDeploying, "") + } return } // Don't mark the resources available, if deployment status already determined // it isn't so. - resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse() if ps.IsScaleTargetInitialized() && !resUnavailable { // Precondition for PA being initialized is SKS being active and // that implies that |service.endpoints| > 0. @@ -197,6 +202,12 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA rs.MarkContainerHealthyTrue() } + // Mark resource unavailable if we don't have a Service Name and the deployment is ready + // This can happen when we have initial scale set to 0 + if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" { + rs.MarkResourcesAvailableUnknown(ReasonDeploying, "") + } + switch cond.Status { case corev1.ConditionUnknown: rs.MarkActiveUnknown(cond.Reason, cond.Message) @@ -222,14 +233,6 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA rs.MarkActiveFalse(cond.Reason, cond.Message) case corev1.ConditionTrue: rs.MarkActiveTrue() - - // Precondition for PA being active is SKS being active and - // that implies that |service.endpoints| > 0. - // - // Note: This is needed for backwards compatibility as we're adding the new - // ScaleTargetInitialized condition to gate readiness. - rs.MarkResourcesAvailableTrue() - rs.MarkContainerHealthyTrue() } } diff --git a/pkg/apis/serving/v1/revision_lifecycle_test.go b/pkg/apis/serving/v1/revision_lifecycle_test.go index e613cf5a36df..7032c94ade52 100644 --- a/pkg/apis/serving/v1/revision_lifecycle_test.go +++ b/pkg/apis/serving/v1/revision_lifecycle_test.go @@ -17,6 +17,7 @@ limitations under the License. package v1 import ( + "fmt" "sort" "testing" @@ -536,6 +537,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler becomes ready, making us active. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -551,6 +553,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler flipping back to Unknown causes Active become ongoing immediately. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -566,6 +569,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler becoming unready makes Active false, but doesn't affect readiness. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -582,32 +586,45 @@ func TestPropagateAutoscalerStatus(t *testing.T) { apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t) } -func TestPAResAvailableNoOverride(t *testing.T) { - r := &RevisionStatus{} - r.InitializeConditions() - apistest.CheckConditionOngoing(r, RevisionConditionReady, t) - - // Deployment determined that something's wrong, e.g. the only pod - // has crashed. - r.MarkResourcesAvailableFalse("somehow", "somewhere") +func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) { + // Cases to test Ready condition + // we fix ScaleTargetInitialized to True + cases := []corev1.ConditionStatus{ + corev1.ConditionTrue, + corev1.ConditionFalse, + corev1.ConditionUnknown, + } - // PodAutoscaler achieved initial scale. - r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ - Status: duckv1.Status{ - Conditions: duckv1.Conditions{{ - Type: autoscalingv1alpha1.PodAutoscalerConditionReady, - Status: corev1.ConditionUnknown, - }, { - Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized, - Status: corev1.ConditionTrue, - }}, - }, - }) - // Verify we did not override this. - apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t) - cond := r.GetCondition(RevisionConditionResourcesAvailable) - if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant { - t.Error("PA Status propagation overrode the ResourcesAvailable status") + for _, tc := range cases { + t.Run(fmt.Sprintf("ready is %v", tc), func(t *testing.T) { + + r := &RevisionStatus{} + r.InitializeConditions() + apistest.CheckConditionOngoing(r, RevisionConditionReady, t) + + // Deployment determined that something's wrong, e.g. the only pod + // has crashed. + r.MarkResourcesAvailableFalse("somehow", "somewhere") + r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: autoscalingv1alpha1.PodAutoscalerConditionReady, + Status: tc, + }, { + Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized, + Status: corev1.ConditionTrue, + }}, + }, + }) + + // Verify we did not override this. + apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t) + + cond := r.GetCondition(RevisionConditionResourcesAvailable) + if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant { + t.Error("PodAutoscalerStatus propagation overrode the ResourcesAvailable status") + } + }) } } @@ -671,6 +688,7 @@ func TestPropagateAutoscalerStatusRace(t *testing.T) { // The PodAutoscaler might have been ready but it's scaled down already. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index b12a56cb3a6b..a14f82c3ca27 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -221,7 +221,7 @@ func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) { func kpa(ns, n string, opts ...PodAutoscalerOption) *autoscalingv1alpha1.PodAutoscaler { rev := newTestRevision(ns, n) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) kpa.Generation = 1 kpa.Annotations[autoscaling.ClassAnnotationKey] = "kpa.autoscaling.knative.dev" kpa.Annotations[autoscaling.MetricAnnotationKey] = "concurrency" @@ -1303,7 +1303,7 @@ func TestGlobalResyncOnUpdateAutoscalerConfigMap(t *testing.T) { rev := newTestRevision(testNamespace, testRevision) newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) sks := aresources.MakeSKS(kpa, nv1a1.SKSOperationModeServe, minActivators) sks.Status.PrivateServiceName = "bogus" sks.Status.InitializeConditions() @@ -1372,7 +1372,7 @@ func TestReconcileDeciderCreatesAndDeletes(t *testing.T) { newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) sks := sks(testNamespace, testRevision, WithDeployRef(kpa.Spec.ScaleTargetRef.Name), WithSKSReady) fakenetworkingclient.Get(ctx).NetworkingV1alpha1().ServerlessServices(testNamespace).Create(ctx, sks, metav1.CreateOptions{}) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) @@ -1446,7 +1446,7 @@ func TestUpdate(t *testing.T) { fakekubeclient.Get(ctx).CoreV1().Pods(testNamespace).Create(ctx, pod, metav1.CreateOptions{}) fakefilteredpodsinformer.Get(ctx, serving.RevisionUID).Informer().GetIndexer().Add(pod) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) kpa.SetDefaults(context.Background()) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1525,7 +1525,7 @@ func TestControllerCreateError(t *testing.T) { createErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1568,7 +1568,7 @@ func TestControllerUpdateError(t *testing.T) { createErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1610,7 +1610,7 @@ func TestControllerGetError(t *testing.T) { getErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1649,7 +1649,7 @@ func TestScaleFailure(t *testing.T) { // Only put the KPA in the lister, which will prompt failures scaling it. rev := newTestRevision(testNamespace, testRevision) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) diff --git a/pkg/reconciler/autoscaling/kpa/scaler_test.go b/pkg/reconciler/autoscaling/kpa/scaler_test.go index b9538514c35e..6b9b8e1f917a 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler_test.go +++ b/pkg/reconciler/autoscaling/kpa/scaler_test.go @@ -640,7 +640,7 @@ func TestDisableScaleToZero(t *testing.T) { func newKPA(ctx context.Context, t *testing.T, servingClient clientset.Interface, revision *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { t.Helper() - pa := revisionresources.MakePA(revision) + pa := revisionresources.MakePA(revision, nil) pa.Status.InitializeConditions() _, err := servingClient.AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, pa, metav1.CreateOptions{}) if err != nil { diff --git a/pkg/reconciler/revision/cruds.go b/pkg/reconciler/revision/cruds.go index bd8b3436fd30..447cd35e86a4 100644 --- a/pkg/reconciler/revision/cruds.go +++ b/pkg/reconciler/revision/cruds.go @@ -116,7 +116,11 @@ func (c *Reconciler) createImageCache(ctx context.Context, rev *v1.Revision, con return c.cachingclient.CachingV1alpha1().Images(image.Namespace).Create(ctx, image, metav1.CreateOptions{}) } -func (c *Reconciler) createPA(ctx context.Context, rev *v1.Revision) (*autoscalingv1alpha1.PodAutoscaler, error) { - pa := resources.MakePA(rev) +func (c *Reconciler) createPA( + ctx context.Context, + rev *v1.Revision, + deployment *appsv1.Deployment, +) (*autoscalingv1alpha1.PodAutoscaler, error) { + pa := resources.MakePA(rev, deployment) return c.client.AutoscalingV1alpha1().PodAutoscalers(pa.Namespace).Create(ctx, pa, metav1.CreateOptions{}) } diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go index 4818c35b9c10..33ce2ba03abe 100644 --- a/pkg/reconciler/revision/reconcile_resources.go +++ b/pkg/reconciler/revision/reconcile_resources.go @@ -49,36 +49,27 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision) // Deployment does not exist. Create it. rev.Status.MarkResourcesAvailableUnknown(v1.ReasonDeploying, "") rev.Status.MarkContainerHealthyUnknown(v1.ReasonDeploying, "") - deployment, err = c.createDeployment(ctx, rev) - if err != nil { + if _, err = c.createDeployment(ctx, rev); err != nil { return fmt.Errorf("failed to create deployment %q: %w", deploymentName, err) } logger.Infof("Created deployment %q", deploymentName) + return nil } else if err != nil { return fmt.Errorf("failed to get deployment %q: %w", deploymentName, err) } else if !metav1.IsControlledBy(deployment, rev) { // Surface an error in the revision's status, and return an error. rev.Status.MarkResourcesAvailableFalse(v1.ReasonNotOwned, v1.ResourceNotOwnedMessage("Deployment", deploymentName)) return fmt.Errorf("revision: %q does not own Deployment: %q", rev.Name, deploymentName) - } else { - // The deployment exists, but make sure that it has the shape that we expect. - deployment, err = c.checkAndUpdateDeployment(ctx, rev, deployment) - if err != nil { - return fmt.Errorf("failed to update deployment %q: %w", deploymentName, err) - } + } - // Now that we have a Deployment, determine whether there is any relevant - // status to surface in the Revision. - // - // TODO(jonjohnsonjr): Should we check Generation != ObservedGeneration? - // The autoscaler mutates the deployment pretty often, which would cause us - // to flip back and forth between Ready and Unknown every time we scale up - // or down. - if !rev.Status.IsActivationRequired() { - rev.Status.PropagateDeploymentStatus(&deployment.Status) - } + // The deployment exists, but make sure that it has the shape that we expect. + deployment, err = c.checkAndUpdateDeployment(ctx, rev, deployment) + if err != nil { + return fmt.Errorf("failed to update deployment %q: %w", deploymentName, err) } + rev.Status.PropagateDeploymentStatus(&deployment.Status) + // If a container keeps crashing (no active pods in the deployment although we want some) if *deployment.Spec.Replicas > 0 && deployment.Status.AvailableReplicas == 0 { pods, err := c.kubeclient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(deployment.Spec.Selector)}) @@ -145,6 +136,13 @@ func (c *Reconciler) reconcileImageCache(ctx context.Context, rev *v1.Revision) func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { ns := rev.Namespace + + deploymentName := resourcenames.Deployment(rev) + deployment, err := c.deploymentLister.Deployments(ns).Get(deploymentName) + if err != nil { + return err + } + paName := resourcenames.PA(rev) logger := logging.FromContext(ctx) logger.Info("Reconciling PA: ", paName) @@ -152,7 +150,7 @@ func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { pa, err := c.podAutoscalerLister.PodAutoscalers(ns).Get(paName) if apierrs.IsNotFound(err) { // PA does not exist. Create it. - pa, err = c.createPA(ctx, rev) + pa, err = c.createPA(ctx, rev, deployment) if err != nil { return fmt.Errorf("failed to create PA %q: %w", paName, err) } @@ -167,7 +165,7 @@ func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { // Perhaps tha PA spec changed underneath ourselves? // We no longer require immutability, so need to reconcile PA each time. - tmpl := resources.MakePA(rev) + tmpl := resources.MakePA(rev, deployment) logger.Debugf("Desired PASpec: %#v", tmpl.Spec) if !equality.Semantic.DeepEqual(tmpl.Spec, pa.Spec) { diff, _ := kmp.SafeDiff(tmpl.Spec, pa.Spec) // Can't realistically fail on PASpec. @@ -203,7 +201,7 @@ func hasDeploymentTimedOut(deployment *appsv1.Deployment) bool { func (c *Reconciler) reconcileSecret(ctx context.Context, rev *v1.Revision) error { ns := rev.Namespace logger := logging.FromContext(ctx) - logger.Info("Reconciling Secret: ", networking.ServingCertName, " at namespace: ", ns) + logger.Info("Reconciling Secret for system-internal-tls: ", networking.ServingCertName, " at namespace: ", ns) secret, err := c.kubeclient.CoreV1().Secrets(ns).Get(ctx, networking.ServingCertName, metav1.GetOptions{}) if apierrs.IsNotFound(err) { diff --git a/pkg/reconciler/revision/resources/pa.go b/pkg/reconciler/revision/resources/pa.go index e51f4e8553f7..32a2cff3cbb1 100644 --- a/pkg/reconciler/revision/resources/pa.go +++ b/pkg/reconciler/revision/resources/pa.go @@ -17,6 +17,7 @@ limitations under the License. package resources import ( + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -28,7 +29,7 @@ import ( ) // MakePA makes a Knative Pod Autoscaler resource from a revision. -func MakePA(rev *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { +func MakePA(rev *v1.Revision, deployment *appsv1.Deployment) *autoscalingv1alpha1.PodAutoscaler { return &autoscalingv1alpha1.PodAutoscaler{ ObjectMeta: metav1.ObjectMeta{ Name: names.PA(rev), @@ -45,20 +46,27 @@ func MakePA(rev *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { Name: names.Deployment(rev), }, ProtocolType: rev.GetProtocol(), - Reachability: reachability(rev), + Reachability: reachability(rev, deployment), }, } } -func reachability(rev *v1.Revision) autoscalingv1alpha1.ReachabilityType { +func reachability(rev *v1.Revision, deployment *appsv1.Deployment) autoscalingv1alpha1.ReachabilityType { // check infra failures - conds := []apis.ConditionType{ + infraFailure := false + for _, cond := range []apis.ConditionType{ v1.RevisionConditionResourcesAvailable, v1.RevisionConditionContainerHealthy, + } { + if c := rev.Status.GetCondition(cond); c != nil && c.IsFalse() { + infraFailure = true + break + } } - for _, cond := range conds { - if c := rev.Status.GetCondition(cond); c != nil && c.IsFalse() { + if infraFailure && deployment != nil && deployment.Spec.Replicas != nil { + // If we have an infra failure and no ready replicas - then this revision is unreachable + if *deployment.Spec.Replicas > 0 && deployment.Status.ReadyReplicas == 0 { return autoscalingv1alpha1.ReachabilityUnreachable } } diff --git a/pkg/reconciler/revision/resources/pa_test.go b/pkg/reconciler/revision/resources/pa_test.go index dbc1dc7e2dc5..cb4e101acbb1 100644 --- a/pkg/reconciler/revision/resources/pa_test.go +++ b/pkg/reconciler/revision/resources/pa_test.go @@ -20,6 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" duckv1 "knative.dev/pkg/apis/duck/v1" @@ -35,6 +36,7 @@ func TestMakePA(t *testing.T) { tests := []struct { name string rev *v1.Revision + dep *appsv1.Deployment want *autoscalingv1alpha1.PodAutoscaler }{{ name: "name is bar (Concurrency=1, Reachable=true)", @@ -243,6 +245,14 @@ func TestMakePA(t *testing.T) { }}, }, { name: "failed deployment", + dep: &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.Int32(1), + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 0, + }, + }, rev: &v1.Revision{ ObjectMeta: metav1.ObjectMeta{ Namespace: "blah", @@ -305,6 +315,14 @@ func TestMakePA(t *testing.T) { }, { // Crashlooping container that never starts name: "failed container", + dep: &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.Int32(1), + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 0, + }, + }, rev: &v1.Revision{ ObjectMeta: metav1.ObjectMeta{ Namespace: "blah", @@ -368,7 +386,7 @@ func TestMakePA(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got := MakePA(test.rev) + got := MakePA(test.rev, test.dep) if !cmp.Equal(got, test.want) { t.Error("MakePA (-want, +got) =", cmp.Diff(test.want, got)) } diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go index a31ffe122087..920f88541e6b 100644 --- a/pkg/reconciler/revision/table_test.go +++ b/pkg/reconciler/revision/table_test.go @@ -244,7 +244,9 @@ func TestReconcile(t *testing.T) { WithRoutingState(v1.RoutingStateReserve, fc), withDefaultContainerStatuses(), WithRevisionObservedGeneration(1)), pa("foo", "stable-deactivation", - WithNoTraffic("NoTraffic", "This thing is inactive."), WithReachabilityUnreachable, + WithPAStatusService("stable-deactivation"), + WithNoTraffic("NoTraffic", "This thing is inactive."), + WithReachabilityUnreachable, WithScaleTargetInitialized), deploy(t, "foo", "stable-deactivation"), image("foo", "stable-deactivation"), @@ -306,6 +308,7 @@ func TestReconcile(t *testing.T) { WithRoutingState(v1.RoutingStateReserve, fc), MarkRevisionReady, WithRevisionObservedGeneration(1)), pa("foo", "pa-inactive", + WithPAStatusService("pa-inactive"), WithNoTraffic("NoTraffic", "This thing is inactive."), WithScaleTargetInitialized, WithReachabilityUnreachable), @@ -350,6 +353,7 @@ func TestReconcile(t *testing.T) { Objects: []runtime.Object{ Revision("foo", "pa-inactive", allUnknownConditions, WithLogURL, + MarkDeploying(v1.ReasonDeploying), WithRevisionObservedGeneration(1)), pa("foo", "pa-inactive", WithNoTraffic("NoTraffic", "This thing is inactive."), WithPAStatusService("")), @@ -361,6 +365,7 @@ func TestReconcile(t *testing.T) { Object: Revision("foo", "pa-inactive", WithLogURL, withDefaultContainerStatuses(), allUnknownConditions, MarkInactive("NoTraffic", "This thing is inactive."), + MarkDeploying(v1.ReasonDeploying), WithRevisionObservedGeneration(1)), }}, Key: "foo/pa-inactive", @@ -398,6 +403,7 @@ func TestReconcile(t *testing.T) { // Protocol type is the only thing that can be changed on PA Objects: []runtime.Object{ Revision("foo", "fix-mutated-pa", + allUnknownConditions, WithLogURL, MarkRevisionReady, WithRoutingState(v1.RoutingStateActive, fc)), pa("foo", "fix-mutated-pa", WithProtocolType(networking.ProtocolH2C), @@ -452,6 +458,7 @@ func TestReconcile(t *testing.T) { // status of the Revision. Objects: []runtime.Object{ Revision("foo", "deploy-timeout", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, MarkActive), pa("foo", "deploy-timeout", WithReachabilityReachable), @@ -479,6 +486,7 @@ func TestReconcile(t *testing.T) { // It then verifies that Reconcile propagates this into the status of the Revision. Objects: []runtime.Object{ Revision("foo", "deploy-replica-failure", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, MarkActive), pa("foo", "deploy-replica-failure", WithReachabilityReachable), @@ -503,7 +511,8 @@ func TestReconcile(t *testing.T) { // Test the propagation of ImagePullBackoff from user container. Objects: []runtime.Object{ Revision("foo", "pull-backoff", - WithLogURL, MarkActivating("Deploying", ""), + WithLogURL, + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), ), pa("foo", "pull-backoff", WithReachabilityReachable), // pa can't be ready since deployment times out. @@ -529,6 +538,7 @@ func TestReconcile(t *testing.T) { // that Reconcile propagates this into the status of the Revision. Objects: []runtime.Object{ Revision("foo", "pod-error", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, allUnknownConditions, MarkActive), pa("foo", "pod-error", WithReachabilityReachable), // PA can't be ready, since no traffic. @@ -807,7 +817,7 @@ func withInitContainerStatuses() RevisionOption { // TODO(mattmoor): Come up with a better name for this. func allUnknownConditions(r *v1.Revision) { WithInitRevConditions(r) - MarkDeploying("")(r) + MarkDeploying("Deploying")(r) MarkActivating("Deploying", "")(r) } @@ -865,7 +875,7 @@ func imageInit(namespace, name string, co ...configOption) []runtime.Object { func pa(namespace, name string, ko ...PodAutoscalerOption) *autoscalingv1alpha1.PodAutoscaler { rev := Revision(namespace, name) - k := resources.MakePA(rev) + k := resources.MakePA(rev, nil) for _, opt := range ko { opt(k) diff --git a/test/upgrade/deployment_failure.go b/test/upgrade/deployment_failure.go new file mode 100644 index 000000000000..324c149ca909 --- /dev/null +++ b/test/upgrade/deployment_failure.go @@ -0,0 +1,134 @@ +/* +Copyright 2024 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package upgrade + +import ( + "context" + + admissionv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "knative.dev/pkg/ptr" + "knative.dev/pkg/test/helpers" + pkgupgrade "knative.dev/pkg/test/upgrade" + "knative.dev/serving/pkg/apis/autoscaling" + v1 "knative.dev/serving/pkg/apis/serving/v1" + "knative.dev/serving/test" + "knative.dev/serving/test/e2e" + v1test "knative.dev/serving/test/v1" +) + +func DeploymentFailurePostUpgrade() pkgupgrade.Operation { + return pkgupgrade.NewOperation("DeploymentFailurePostUpgrade", func(c pkgupgrade.Context) { + clients := e2e.Setup(c.T) + + names := test.ResourceNames{ + Service: "deployment-upgrade-failure", + Image: test.HelloWorld, + } + + service, err := clients.ServingClient.Services.Get(context.Background(), names.Service, metav1.GetOptions{}) + if err != nil { + c.T.Fatal("Failed to get Service: ", err) + } + + // Deployment failures should surface to the Service Ready Condition + if err := v1test.WaitForServiceState(clients.ServingClient, names.Service, v1test.IsServiceFailed, "ServiceIsNotReady"); err != nil { + c.T.Fatal("Service did not transition to Ready=False", err) + } + + // Traffic should still work since the deployment has an active replicaset + url := service.Status.URL.URL() + assertServiceResourcesUpdated(c.T, clients, names, url, test.HelloWorldText) + }) +} + +func DeploymentFailurePreUpgrade() pkgupgrade.Operation { + return pkgupgrade.NewOperation("DeploymentFailurePreUpgrade", func(c pkgupgrade.Context) { + c.T.Log("Creating Service") + ctx := context.Background() + + clients := e2e.Setup(c.T) + names := &test.ResourceNames{ + Service: "deployment-upgrade-failure", + Image: test.HelloWorld, + } + + resources, err := v1test.CreateServiceReady(c.T, clients, names, func(s *v1.Service) { + s.Spec.Template.Annotations = map[string]string{ + autoscaling.MinScaleAnnotation.Key(): "1", + autoscaling.MaxScaleAnnotation.Key(): "1", + } + }) + + if err != nil { + c.T.Fatal("Failed to create Service:", err) + } + + url := resources.Service.Status.URL.URL() + // This polls until we get a 200 with the right body. + assertServiceResourcesUpdated(c.T, clients, *names, url, test.HelloWorldText) + + // Setup webhook that fails when deployment is updated + // Failing to update the Deployment shouldn't cause a traffic drop + // note: the deployment is only updated if the controllers change the spec + // and this happens when the queue proxy image is changed when upgrading + c.T.Log("Creating Failing Webhook") + noSideEffects := admissionv1.SideEffectClassNone + + selector := &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "serving.knative.dev/service": names.Service, + }, + } + + // Create a broken webhook that breaks scheduling Pods + _, err = clients.KubeClient.AdmissionregistrationV1().MutatingWebhookConfigurations().Create( + ctx, + &admissionv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: "broken-webhook", + }, + Webhooks: []admissionv1.MutatingWebhook{{ + AdmissionReviewVersions: []string{"v1"}, + Name: "webhook.non-existing.dev", + ClientConfig: admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Name: helpers.AppendRandomString("non-existing"), + Namespace: helpers.AppendRandomString("non-existing"), + }, + }, + ObjectSelector: selector, + TimeoutSeconds: ptr.Int32(5), + SideEffects: &noSideEffects, + Rules: []admissionv1.RuleWithOperations{{ + Operations: []admissionv1.OperationType{"CREATE"}, + Rule: admissionv1.Rule{ + APIGroups: []string{""}, // core + APIVersions: []string{"v1"}, + Resources: []string{"pods"}, + }, + }}, + }}, + }, + metav1.CreateOptions{}, + ) + + if err != nil { + c.T.Fatal("Failed to create bad webhook:", err) + } + }) +} diff --git a/test/upgrade/postupgrade.go b/test/upgrade/postupgrade.go index 299e9a4c2370..61f5b1121600 100644 --- a/test/upgrade/postupgrade.go +++ b/test/upgrade/postupgrade.go @@ -42,6 +42,7 @@ func ServingPostUpgradeTests() []pkgupgrade.Operation { CreateNewServicePostUpgradeTest(), InitialScalePostUpgradeTest(), CRDStoredVersionPostUpgradeTest(), + DeploymentFailurePostUpgrade(), } } diff --git a/test/upgrade/preupgrade.go b/test/upgrade/preupgrade.go index 3edcdde8266e..d00f5ef86e87 100644 --- a/test/upgrade/preupgrade.go +++ b/test/upgrade/preupgrade.go @@ -35,6 +35,7 @@ func ServingPreUpgradeTests() []pkgupgrade.Operation { ServicePreUpgradeAndScaleToZeroTest(), BYORevisionPreUpgradeTest(), InitialScalePreUpgradeTest(), + DeploymentFailurePreUpgrade(), } }