From d2613132aa4b3e0915ca9f1ff3252c93e76e334e Mon Sep 17 00:00:00 2001 From: myname4423 <57184070+myname4423@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:29:58 +0800 Subject: [PATCH] improve finalising logic for canary release (#229) improve finalising logic for canary release-2 Signed-off-by: yunbo Co-authored-by: yunbo --- api/v1alpha1/rollout_types.go | 2 - api/v1beta1/rollout_types.go | 24 +-- .../bases/rollouts.kruise.io_rollouts.yaml | 5 +- pkg/controller/rollout/rollout_canary.go | 142 +++++++++++----- pkg/controller/rollout/rollout_progressing.go | 12 +- .../rollout/rollout_progressing_test.go | 157 +++++++++++++++++- pkg/controller/rollout/rollout_status.go | 4 +- pkg/trafficrouting/manager.go | 11 +- test/e2e/rollout_v1beta1_test.go | 2 +- 9 files changed, 287 insertions(+), 72 deletions(-) diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index d67a5388..526c110e 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -171,8 +171,6 @@ type RolloutStatus struct { // Conditions a list of conditions a rollout can have. // +optional Conditions []RolloutCondition `json:"conditions,omitempty"` - // +optional - //BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"` // Phase is the rollout phase. Phase RolloutPhase `json:"phase,omitempty"` // Message provides details on why the rollout is in its current phase diff --git a/api/v1beta1/rollout_types.go b/api/v1beta1/rollout_types.go index c0b862e7..d1bd53a4 100644 --- a/api/v1beta1/rollout_types.go +++ b/api/v1beta1/rollout_types.go @@ -445,7 +445,7 @@ type CanaryStatus struct { // BlueGreenStatus status fields that only pertain to the blueGreen rollout type BlueGreenStatus struct { CommonStatus `json:",inline"` - // CanaryRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses + // UpdatedRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses // It may be different from rs podTemplateHash in different k8s versions, so it cannot be used as service selector label UpdatedRevision string `json:"updatedRevision"` // UpdatedReplicas the numbers of updated pods @@ -558,29 +558,29 @@ const ( type FinalisingStepType string const ( - // some work that should be done before pod scaling down. - // For BlueGreenStrategy: - // we rout all traffic to stable or new version based on FinaliseReason - // For CanaryStrategy: - // we remove the selector of stable service - FinalisingStepTypePreparing FinalisingStepType = "Preparing" + // Route all traffic to stable or new version based on FinaliseReason (for bluegreen) + FinalisingStepTypeRouteAllTraffic FinalisingStepType = "RouteAllTraffic" + // Restore the stable Service, i.e. remove corresponding selector + FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService" + // Remove the canary Service + FinalisingStepTypeRemoveCanaryService FinalisingStepType = "RemoveCanaryService" + // Patch Batch Release to scale down (exception: the canary Deployment will be // scaled down in FinalisingStepTypeDeleteBR step) // For Both BlueGreenStrategy and CanaryStrategy: // set workload.pause=false, set workload.partition=0 FinalisingStepTypeBatchRelease FinalisingStepType = "PatchBatchRelease" - //TODO - Currently, the next three steps are in the same function, FinalisingTrafficRouting - // we should try to separate the FinalisingStepTypeGateway and FinalisingStepTypeCanaryService - // with graceful time to prevent some potential issues - // Restore the stable Service (i.e. remove corresponding selector) - FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService" + // Execute the FinalisingTrafficRouting function + FinalisingStepTypeTrafficRouting FinalisingStepType = "FinalisingTrafficRouting" // Restore the GatewayAPI/Ingress/Istio FinalisingStepTypeGateway FinalisingStepType = "RestoreGateway" // Delete Canary Service FinalisingStepTypeDeleteCanaryService FinalisingStepType = "DeleteCanaryService" // Delete Batch Release FinalisingStepTypeDeleteBR FinalisingStepType = "DeleteBatchRelease" + // All needed work done + FinalisingStepTypeEnd FinalisingStepType = "END" ) // +genclient diff --git a/config/crd/bases/rollouts.kruise.io_rollouts.yaml b/config/crd/bases/rollouts.kruise.io_rollouts.yaml index 32571da3..00b5b43f 100644 --- a/config/crd/bases/rollouts.kruise.io_rollouts.yaml +++ b/config/crd/bases/rollouts.kruise.io_rollouts.yaml @@ -534,8 +534,7 @@ spec: format: int64 type: integer phase: - description: BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"` - Phase is the rollout phase. + description: Phase is the rollout phase. type: string type: object type: object @@ -1475,7 +1474,7 @@ spec: format: int32 type: integer updatedRevision: - description: CanaryRevision is calculated by rollout based on + description: UpdatedRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses It may be different from rs podTemplateHash in different k8s versions, so it cannot be used as service selector label diff --git a/pkg/controller/rollout/rollout_canary.go b/pkg/controller/rollout/rollout_canary.go index c56b550c..63c01703 100644 --- a/pkg/controller/rollout/rollout_canary.go +++ b/pkg/controller/rollout/rollout_canary.go @@ -363,43 +363,66 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) { // cleanup after rollout is completed or finished func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) { + canaryStatus := c.NewStatus.CanaryStatus // when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup - if c.NewStatus.CanaryStatus == nil { + if canaryStatus == nil { return true, nil } - // 1. rollout progressing complete, remove rollout progressing annotation in workload + // rollout progressing complete, remove rollout progressing annotation in workload err := m.removeRolloutProgressingAnnotation(c) if err != nil { return false, err } tr := newTrafficRoutingContext(c) - // 2. remove stable service the pod revision selector, so stable service will be selector all version pods. - done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr) - c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime - if err != nil || !done { - return done, err - } - // 3. set workload.pause=false; set workload.partition=0 - done, err = m.finalizingBatchRelease(c) - if err != nil || !done { - return done, err - } - // 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods. - done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr) - c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime - if err != nil || !done { - return done, err - } - // 5. delete batchRelease crd - done, err = m.removeBatchRelease(c) - if err != nil { - klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) - return false, err - } else if !done { + // execute steps based on the predefined order for each reason + nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep) + // if current step is empty, set it with the first step + // if current step is end, we just return + if len(canaryStatus.FinalisingStep) == 0 { + canaryStatus.FinalisingStep = nextStep + canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + } else if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd { + klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name) + return true, nil + } + klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep) + + var retry bool + // the order of steps is maitained by calculating thenextStep + switch canaryStatus.FinalisingStep { + // set workload.pause=false; set workload.partition=0 + case v1beta1.FinalisingStepTypeBatchRelease: + retry, err = m.finalizingBatchRelease(c) + // delete batchRelease + case v1beta1.FinalisingStepTypeDeleteBR: + retry, err = m.removeBatchRelease(c) + // restore the gateway resources (ingress/gatewayAPI/Istio), that means + // only stable Service will accept the traffic + case v1beta1.FinalisingStepTypeGateway: + retry, err = m.trafficRoutingManager.RestoreGateway(tr) + // restore the stable service + case v1beta1.FinalisingStepTypeStableService: + retry, err = m.trafficRoutingManager.RestoreStableService(tr) + // remove canary service + case v1beta1.FinalisingStepTypeRemoveCanaryService: + retry, err = m.trafficRoutingManager.RemoveCanaryService(tr) + + default: + nextStep = nextTask(c.FinalizeReason, "") + klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", canaryStatus.FinalisingStep, nextStep) + canaryStatus.FinalisingStep = nextStep return false, nil } - klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name) - return true, nil + if err != nil || retry { + return false, err + } + // current step is done, run the next step + canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + canaryStatus.FinalisingStep = nextStep + if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd { + return true, nil + } + return false, nil } func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error { @@ -508,37 +531,39 @@ func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, return br } +// bool means if we need retry; if error is not nil, always retry func (m *canaryReleaseManager) removeBatchRelease(c *RolloutContext) (bool, error) { batch := &v1beta1.BatchRelease{} err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch) if err != nil && errors.IsNotFound(err) { - return true, nil + return false, nil } else if err != nil { klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name) - return false, err + return true, err } if !batch.DeletionTimestamp.IsZero() { klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return false, nil + return true, nil } //delete batchRelease err = m.Delete(context.TODO(), batch) if err != nil { klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) - return false, err + return true, err } klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return false, nil + return true, nil } +// bool means if we need retry; if error is not nil, always retry func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, error) { br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name) if err != nil { if errors.IsNotFound(err) { - return true, nil + return false, nil } - return false, err + return true, err } waitReady := c.WaitReady // The Completed phase means batchRelease controller has processed all it @@ -546,7 +571,7 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, if br.Spec.ReleasePlan.BatchPartition == nil && br.Status.Phase == v1beta1.RolloutPhaseCompleted { klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) - return true, nil + return false, nil } // If BatchPartition is nil, BatchRelease will directly resume workload via: @@ -559,11 +584,11 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, switch br.Spec.ReleasePlan.FinalizingPolicy { case v1beta1.WaitResumeFinalizingPolicyType: if waitReady { // no need to patch again - return false, nil + return true, nil } default: if !waitReady { // no need to patch again - return false, nil + return true, nil } } } @@ -577,10 +602,10 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, // Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here. body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy) if err = m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { - return false, err + return true, err } klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body) - return false, nil + return true, nil } // syncBatchRelease sync status of br to canaryStatus, and sync rollout-id of canaryStatus to br. @@ -601,3 +626,42 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary } return nil } + +// calculate next task +func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { + var taskSequence []v1beta1.FinalisingStepType + //REVIEW - should we consider more complex scenarios? + // like, user rollbacks the workload and disables the Rollout at the same time? + switch reason { + // in rollback, we cannot remove selector of the stable service in the first step, + // which may cause some traffic to problematic new version, so we should route all traffic to stable service + // in the first step + case v1beta1.FinaliseReasonRollback: // rollback + taskSequence = []v1beta1.FinalisingStepType{ + v1beta1.FinalisingStepTypeGateway, // route all traffic to stable version + v1beta1.FinalisingStepTypeBatchRelease, // scale up old, scale down new + v1beta1.FinalisingStepTypeDeleteBR, + v1beta1.FinalisingStepTypeStableService, + v1beta1.FinalisingStepTypeRemoveCanaryService, + } + default: // others: success/disabled/deleting rollout + taskSequence = []v1beta1.FinalisingStepType{ + v1beta1.FinalisingStepTypeStableService, + v1beta1.FinalisingStepTypeGateway, + v1beta1.FinalisingStepTypeRemoveCanaryService, + v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old + v1beta1.FinalisingStepTypeDeleteBR, + } + } + // if currentTask is empty, return first task + if len(currentTask) == 0 { + return taskSequence[0] + } + // find next task + for i := range taskSequence { + if currentTask == taskSequence[i] && i < len(taskSequence)-1 { + return taskSequence[i+1] + } + } + return v1beta1.FinalisingStepTypeEnd +} diff --git a/pkg/controller/rollout/rollout_progressing.go b/pkg/controller/rollout/rollout_progressing.go index 7cfabf8d..a1214cbf 100644 --- a/pkg/controller/rollout/rollout_progressing.go +++ b/pkg/controller/rollout/rollout_progressing.go @@ -45,6 +45,8 @@ type RolloutContext struct { RecheckTime *time.Time // wait stable workload pods ready WaitReady bool + // finalising reason + FinalizeReason string } // parameter1 retryReconcile, parameter2 error @@ -116,6 +118,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason) var done bool rolloutContext.WaitReady = true + rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess done, err = r.doFinalising(rolloutContext) if err != nil { return nil, err @@ -140,6 +143,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout case v1alpha1.ProgressingReasonCancelling: klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason) var done bool + rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback done, err = r.doFinalising(rolloutContext) if err != nil { return nil, err @@ -416,11 +420,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) } // if no trafficRouting exists, simply remove batchRelease if !c.Rollout.Spec.Strategy.HasTrafficRoutings() { - done, err := releaseManager.removeBatchRelease(c) + retry, err := releaseManager.removeBatchRelease(c) if err != nil { klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err - } else if !done { + } else if retry { return false, nil } return true, nil @@ -454,11 +458,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) // canary deployment, for other release, the v2 pods won't be deleted immediately // in both cases, only the stable pods (v1) accept the traffic case v1beta1.FinalisingStepTypeDeleteBR: - done, err := releaseManager.removeBatchRelease(c) + retry, err := releaseManager.removeBatchRelease(c) if err != nil { klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err - } else if !done { + } else if retry { return false, nil } klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep) diff --git a/pkg/controller/rollout/rollout_progressing_test.go b/pkg/controller/rollout/rollout_progressing_test.go index 15098490..67ad5435 100644 --- a/pkg/controller/rollout/rollout_progressing_test.go +++ b/pkg/controller/rollout/rollout_progressing_test.go @@ -175,7 +175,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { }, }, { - name: "ReconcileRolloutProgressing rolling -> finalizing", + name: "ReconcileRolloutProgressing rolling -> finalizing", // call FinalisingTrafficRouting to restore stable Service getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { dep1 := deploymentDemo.DeepCopy() dep2 := deploymentDemo.DeepCopy() @@ -235,7 +235,77 @@ func TestReconcileRolloutProgressing(t *testing.T) { }, }, { - name: "ReconcileRolloutProgressing finalizing1", + name: "ReconcileRolloutProgressing finalizing1", // call FinalisingTrafficRouting to remove canary service + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + delete(dep1.Annotations, util.InRolloutProgressingAnnotation) + dep1.Status = apps.DeploymentStatus{ + ObservedGeneration: 2, + Replicas: 10, + UpdatedReplicas: 5, + ReadyReplicas: 10, + AvailableReplicas: 10, + } + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + s1 := demoService.DeepCopy() + s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1" + s2 := demoService.DeepCopy() + s2.Name = s1.Name + "-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) { + obj := rolloutDemo.DeepCopy() + obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo" + obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2 + obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1" + obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547" + obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromInt(1), + }, + } + tr := demoTR.DeepCopy() + tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)} + return obj, br, tr + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemo.Status.DeepCopy() + s.CanaryStatus.ObservedWorkloadGeneration = 2 + s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.CanaryStatus.StableRevision = "pod-template-hash-v1" + s.CanaryStatus.CanaryRevision = "6f8cc56547" + s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + s.CanaryStatus.CurrentStepIndex = 4 + s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService + s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(s, *cond) + s.CurrentStepIndex = s.CanaryStatus.CurrentStepIndex + s.CurrentStepState = s.CanaryStatus.CurrentStepState + return s + }, + expectTr: func() *v1alpha1.TrafficRouting { + tr := demoTR.DeepCopy() + return tr + }, + }, + { + name: "ReconcileRolloutProgressing finalizing2", // go to the next finalising step getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { dep1 := deploymentDemo.DeepCopy() delete(dep1.Annotations, util.InRolloutProgressingAnnotation) @@ -262,6 +332,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepIndex = 4 obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // given that the selector of stable Service is removed + // we will go on it the next step, i.e. patch restoreGateway + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -285,6 +358,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising @@ -300,7 +374,76 @@ func TestReconcileRolloutProgressing(t *testing.T) { }, }, { - name: "ReconcileRolloutProgressing finalizing2", + name: "ReconcileRolloutProgressing finalizing3", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + delete(dep1.Annotations, util.InRolloutProgressingAnnotation) + dep1.Status = apps.DeploymentStatus{ + ObservedGeneration: 2, + Replicas: 10, + UpdatedReplicas: 5, + ReadyReplicas: 10, + AvailableReplicas: 10, + } + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) { + obj := rolloutDemo.DeepCopy() + obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo" + obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2 + obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1" + obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547" + obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // because the batchRelease hasn't completed (ie. br.Status.Phase is not completed), + // it will take more than one reconciles to go on to the next step + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromInt(1), + }, + } + tr := demoTR.DeepCopy() + tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)} + return obj, br, tr + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemo.Status.DeepCopy() + s.CanaryStatus.ObservedWorkloadGeneration = 2 + s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.CanaryStatus.StableRevision = "pod-template-hash-v1" + s.CanaryStatus.CanaryRevision = "6f8cc56547" + s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + s.CanaryStatus.CurrentStepIndex = 4 + s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease + s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(s, *cond) + s.CurrentStepIndex = s.CanaryStatus.CurrentStepIndex + s.CurrentStepState = s.CanaryStatus.CurrentStepState + return s + }, + expectTr: func() *v1alpha1.TrafficRouting { + tr := demoTR.DeepCopy() + return tr + }, + }, + { + name: "ReconcileRolloutProgressing finalizing4", getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { dep1 := deploymentDemo.DeepCopy() delete(dep1.Annotations, util.InRolloutProgressingAnnotation) @@ -326,6 +469,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepIndex = 4 obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // the batchRelease has completed (ie. br.Status.Phase is completed), + // we expect the finalizing step to be next step, i.e. deleteBatchRelease + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -348,6 +494,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond2.Reason = v1alpha1.ProgressingReasonFinalising @@ -385,6 +532,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepIndex = 4 obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // deleteBatchRelease is the last step, and it won't wait a grace time + // after this step, this release should be succeeded + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -401,6 +551,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeEnd cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond2.Reason = v1alpha1.ProgressingReasonCompleted cond2.Status = corev1.ConditionFalse diff --git a/pkg/controller/rollout/rollout_status.go b/pkg/controller/rollout/rollout_status.go index 69a13d07..840a1a46 100755 --- a/pkg/controller/rollout/rollout_status.go +++ b/pkg/controller/rollout/rollout_status.go @@ -238,7 +238,7 @@ func (r *RolloutReconciler) reconcileRolloutTerminating(rollout *v1beta1.Rollout klog.Errorf("rollout(%s/%s) get workload failed: %s", rollout.Namespace, rollout.Name, err.Error()) return nil, err } - c := &RolloutContext{Rollout: rollout, NewStatus: newStatus, Workload: workload} + c := &RolloutContext{Rollout: rollout, NewStatus: newStatus, Workload: workload, FinalizeReason: v1beta1.FinaliseReasonDelete} done, err := r.doFinalising(c) if err != nil { return nil, err @@ -262,7 +262,7 @@ func (r *RolloutReconciler) reconcileRolloutDisabling(rollout *v1beta1.Rollout, klog.Errorf("rollout(%s/%s) get workload failed: %s", rollout.Namespace, rollout.Name, err.Error()) return nil, err } - c := &RolloutContext{Rollout: rollout, NewStatus: newStatus, Workload: workload} + c := &RolloutContext{Rollout: rollout, NewStatus: newStatus, Workload: workload, FinalizeReason: v1beta1.FinaliseReasonDisalbed} done, err := r.doFinalising(c) if err != nil { return nil, err diff --git a/pkg/trafficrouting/manager.go b/pkg/trafficrouting/manager.go index a1fe2813..b7a3edb6 100644 --- a/pkg/trafficrouting/manager.go +++ b/pkg/trafficrouting/manager.go @@ -204,26 +204,25 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext) (bool, erro if len(c.ObjectRef) == 0 { return true, nil } - klog.Infof("%s start finalising traffic routing", c.Key) - // remove stable service the pod revision selector, so stable service will be selector all version pods. + klog.InfoS("start finalising traffic routing", "rollout", c.Key) + // remove stable service the pod revision selector, so stable service will select pods of all versions. if retry, err := m.RestoreStableService(c); err != nil || retry { return false, err } - klog.Infof("%s restore stable service success", c.Key) + klog.InfoS("restore stable service success", "rollout", c.Key) // modify network(ingress & gateway api) configuration, route all traffic to stable service if retry, err := m.RestoreGateway(c); err != nil || retry { return false, err } - klog.Infof("%s restore gateway success", c.Key) + klog.InfoS("restore gateway success", "rollout", c.Key) // remove canary service if retry, err := m.RemoveCanaryService(c); err != nil || retry { return false, err } - klog.Infof("%s remove canary service success, finalising traffic routing is done", c.Key) + klog.InfoS("remove canary service success, finalising traffic routing is done", "rollout", c.Key) return true, nil } -// RestoreGateway restore gateway resources without graceful time // returns: // - if error is not nil, usually we need to retry later. Only if error is nil, we consider the bool. // - The bool value indicates whether retry is needed. If true, it usually means diff --git a/test/e2e/rollout_v1beta1_test.go b/test/e2e/rollout_v1beta1_test.go index 799b62e0..0ee70658 100644 --- a/test/e2e/rollout_v1beta1_test.go +++ b/test/e2e/rollout_v1beta1_test.go @@ -213,7 +213,7 @@ var _ = SIGDescribe("Rollout v1beta1", func() { body := fmt.Sprintf(`{"status":{"canaryStatus":{"currentStepState":"%s"}}}`, v1beta1.CanaryStepStateReady) Expect(k8sClient.Status().Patch(context.TODO(), clone, client.RawPatch(types.MergePatchType, []byte(body)))).NotTo(HaveOccurred()) return false - }, 10*time.Second, time.Second).Should(BeTrue()) + }, 10*time.Second, time.Millisecond*500).Should(BeTrue()) } RolloutJumpCanaryStep := func(name string, target int) {