diff --git a/pkg/controller/rollout/rollout_bluegreen.go b/pkg/controller/rollout/rollout_bluegreen.go new file mode 100644 index 00000000..b1a1f815 --- /dev/null +++ b/pkg/controller/rollout/rollout_bluegreen.go @@ -0,0 +1,487 @@ +/* +Copyright 2022 The Kruise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rollout + +import ( + "context" + "fmt" + "reflect" + "time" + + "github.com/openkruise/rollouts/api/v1alpha1" + "github.com/openkruise/rollouts/api/v1beta1" + "github.com/openkruise/rollouts/pkg/trafficrouting" + "github.com/openkruise/rollouts/pkg/util" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + utilpointer "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type blueGreenReleaseManager struct { + client.Client + trafficRoutingManager *trafficrouting.Manager + recorder record.EventRecorder +} + +func (m *blueGreenReleaseManager) runCanary(c *RolloutContext) error { + blueGreenStatus := c.NewStatus.BlueGreenStatus + if br, err := fetchBatchRelease(m.Client, c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) { + klog.Errorf("rollout(%s/%s) fetch batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) + return err + } else if err == nil { + // This line will do something important: + // - sync status from br to Rollout: to better observability; + // - sync rollout-id from Rollout to br: to make BatchRelease + // relabels pods in the scene where only rollout-id is changed. + if err = m.syncBatchRelease(br, blueGreenStatus); err != nil { + klog.Errorf("rollout(%s/%s) sync batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) + return err + } + } + if blueGreenStatus.PodTemplateHash == "" { + blueGreenStatus.PodTemplateHash = c.Workload.PodTemplateHash + } + + if m.doCanaryJump(c) { + klog.Infof("rollout(%s/%s) canary step jumped", c.Rollout.Namespace, c.Rollout.Name) + return nil + } + // When the first batch is trafficRouting rolling and the next steps are rolling release, + // We need to clean up the canary-related resources first and then rollout the rest of the batch. + currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[blueGreenStatus.CurrentStepIndex-1] + if currentStep.Traffic == nil && len(currentStep.Matches) == 0 { + tr := newTrafficRoutingContext(c) + done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr) + blueGreenStatus.LastUpdateTime = tr.LastUpdateTime + if err != nil { + return err + } else if !done { + klog.Infof("rollout(%s/%s) cleaning up canary-related resources", c.Rollout.Namespace, c.Rollout.Name) + expectedTime := time.Now().Add(tr.RecheckDuration) + c.RecheckTime = &expectedTime + return nil + } + } + + switch blueGreenStatus.CurrentStepState { + // before CanaryStepStateUpgrade, handle some special cases, to prevent traffic loss + case v1beta1.CanaryStepStateInit: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateInit) + tr := newTrafficRoutingContext(c) + if currentStep.Traffic == nil && len(currentStep.Matches) == 0 { + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, blueGreenStatus.CurrentStepState) + return nil + } + //TODO - consider istio subsets + if blueGreenStatus.CurrentStepIndex == 1 { + klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name) + retry, err := m.trafficRoutingManager.PatchStableService(tr) + if err != nil { + return err + } else if retry { + expectedTime := time.Now().Add(tr.RecheckDuration) + c.RecheckTime = &expectedTime + return nil + } + } + + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, blueGreenStatus.CurrentStepState) + + case v1beta1.CanaryStepStateUpgrade: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateUpgrade) + done, err := m.doCanaryUpgrade(c) + if err != nil { + return err + } else if done { + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateUpgrade, blueGreenStatus.CurrentStepState) + } + + case v1beta1.CanaryStepStateTrafficRouting: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateTrafficRouting) + tr := newTrafficRoutingContext(c) + done, err := m.trafficRoutingManager.DoTrafficRouting(tr) + blueGreenStatus.LastUpdateTime = tr.LastUpdateTime + if err != nil { + return err + } else if done { + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateMetricsAnalysis + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, blueGreenStatus.CurrentStepState) + } + expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second) + c.RecheckTime = &expectedTime + + case v1beta1.CanaryStepStateMetricsAnalysis: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateMetricsAnalysis) + done, err := m.doCanaryMetricsAnalysis(c) + if err != nil { + return err + } else if done { + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateMetricsAnalysis, blueGreenStatus.CurrentStepState) + } + + case v1beta1.CanaryStepStatePaused: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStatePaused) + done, err := m.doCanaryPaused(c) + if err != nil { + return err + } else if done { + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateReady + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStatePaused, blueGreenStatus.CurrentStepState) + } + + case v1beta1.CanaryStepStateReady: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateReady) + // run next step + if len(c.Rollout.Spec.Strategy.BlueGreen.Steps) > int(blueGreenStatus.CurrentStepIndex) { + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.CurrentStepIndex++ + blueGreenStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, blueGreenStatus.CurrentStepIndex) + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateInit + klog.Infof("rollout(%s/%s) bluegreen step from(%d) -> to(%d)", c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.CurrentStepIndex-1, blueGreenStatus.CurrentStepIndex) + } else { + klog.Infof("rollout(%s/%s) bluegreen run all steps, and completed", c.Rollout.Namespace, c.Rollout.Name) + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + return nil + } + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateReady, blueGreenStatus.CurrentStepState) + // canary completed + case v1beta1.CanaryStepStateCompleted: + klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateCompleted) + } + + return nil +} + +func (m *blueGreenReleaseManager) doCanaryUpgrade(c *RolloutContext) (bool, error) { + // verify whether batchRelease configuration is the latest + steps := len(c.Rollout.Spec.Strategy.BlueGreen.Steps) + blueGreenStatus := c.NewStatus.BlueGreenStatus + cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing) + cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and upgrade workload to new version", blueGreenStatus.CurrentStepIndex, steps) + c.NewStatus.Message = cond.Message + // run batch release to upgrade the workloads + done, br, err := runBatchRelease(m, c.Rollout, getRolloutID(c.Workload), blueGreenStatus.CurrentStepIndex, c.Workload.IsInRollback) + if err != nil { + return false, err + } else if !done { + return false, nil + } + if br.Status.ObservedReleasePlanHash != util.HashReleasePlanBatches(&br.Spec.ReleasePlan) || + br.Generation != br.Status.ObservedGeneration { + klog.Infof("rollout(%s/%s) batchRelease status is inconsistent, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) + return false, nil + } + // check whether batchRelease is ready(whether new pods is ready.) + if br.Status.CanaryStatus.CurrentBatchState != v1beta1.ReadyBatchState || + br.Status.CanaryStatus.CurrentBatch+1 < blueGreenStatus.CurrentStepIndex { + klog.Infof("rollout(%s/%s) batchRelease status(%s) is not ready, and wait a moment", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) + return false, nil + } + m.recorder.Eventf(c.Rollout, corev1.EventTypeNormal, "Progressing", fmt.Sprintf("upgrade step(%d) canary pods with new versions done", blueGreenStatus.CurrentStepIndex)) + klog.Infof("rollout(%s/%s) batch(%s) state(%s), and success", + c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status), br.Status.CanaryStatus.CurrentBatchState) + // set the latest PodTemplateHash to selector the latest pods. + blueGreenStatus.PodTemplateHash = c.Workload.PodTemplateHash + return true, nil +} + +func (m *blueGreenReleaseManager) doCanaryMetricsAnalysis(c *RolloutContext) (bool, error) { + // todo + return true, nil +} + +func (m *blueGreenReleaseManager) doCanaryPaused(c *RolloutContext) (bool, error) { + blueGreenStatus := c.NewStatus.BlueGreenStatus + currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[blueGreenStatus.CurrentStepIndex-1] + steps := len(c.Rollout.Spec.Strategy.BlueGreen.Steps) + cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing) + // need manual confirmation + if currentStep.Pause.Duration == nil { + klog.Infof("rollout(%s/%s) don't set pause duration, and need manual confirmation", c.Rollout.Namespace, c.Rollout.Name) + cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and you need manually confirm to enter the next step", blueGreenStatus.CurrentStepIndex, steps) + c.NewStatus.Message = cond.Message + return false, nil + } + cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and wait duration(%d seconds) to enter the next step", blueGreenStatus.CurrentStepIndex, steps, *currentStep.Pause.Duration) + c.NewStatus.Message = cond.Message + // wait duration time, then go to next step + duration := time.Second * time.Duration(*currentStep.Pause.Duration) + expectedTime := blueGreenStatus.LastUpdateTime.Add(duration) + if expectedTime.Before(time.Now()) { + klog.Infof("rollout(%s/%s) canary step(%d) paused duration(%d seconds), and go to the next step", + c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.CurrentStepIndex, *currentStep.Pause.Duration) + return true, nil + } + c.RecheckTime = &expectedTime + return false, nil +} + +func (m *blueGreenReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) { + bluegreenStatus := c.NewStatus.BlueGreenStatus + nextIndex := bluegreenStatus.NextStepIndex + /* + we set the CurrentStepIndex same as NextStepIndex to prevent currentStepIndex from out of range + for example, if we had a rollout with 4 steps and CurrentStepIndex was 2 + then, the user removed 3 steps from the plan, we can calculate NextStepIndex is 1 correctly, + but CurrentStepIndex remains 2, which could cause out of range. + */ + resetCurrentIndex := false + if int(bluegreenStatus.CurrentStepIndex) > len(c.Rollout.Spec.Strategy.BlueGreen.Steps) { + bluegreenStatus.CurrentStepIndex = nextIndex + resetCurrentIndex = true + } + currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[bluegreenStatus.CurrentStepIndex-1] + if resetCurrentIndex || nextIndex != util.NextBatchIndex(c.Rollout, bluegreenStatus.CurrentStepIndex) && nextIndex > 0 { + currentIndexBackup := bluegreenStatus.CurrentStepIndex + currentStepStateBackup := bluegreenStatus.CurrentStepState + bluegreenStatus.CurrentStepIndex = nextIndex + bluegreenStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, nextIndex) + nextStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[nextIndex-1] + // if the Replicas between currentStep and nextStep is same, we can jump to + // the TrafficRouting step; otherwise, we should start from the Init step + if reflect.DeepEqual(nextStep.Replicas, currentStep.Replicas) && !resetCurrentIndex { + bluegreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + bluegreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + bluegreenStatus.CurrentStepIndex, currentStepStateBackup, bluegreenStatus.CurrentStepState) + } else { + bluegreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + bluegreenStatus.CurrentStepState = v1beta1.CanaryStepStateInit + klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, + bluegreenStatus.CurrentStepIndex, currentStepStateBackup, v1beta1.CanaryStepStateInit) + } + klog.Infof("rollout(%s/%s) canary step from(%d) -> to(%d)", c.Rollout.Namespace, c.Rollout.Name, currentIndexBackup, bluegreenStatus.CurrentStepIndex) + return true + } + return false +} + +// cleanup after rollout is completed or finished +func (m *blueGreenReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) { + blueGreenStatus := c.NewStatus.BlueGreenStatus + if blueGreenStatus == nil { + return true, nil + } + // rollout progressing complete, remove rollout progressing annotation in workload + err := removeRolloutProgressingAnnotation(m.Client, c) + if err != nil { + return false, err + } + + tr := newTrafficRoutingContext(c) + // execute steps based on the predefined order for each reason + nextStep := nextBlueGreenTask(c.FinalizeReason, blueGreenStatus.FinalisingStep) + // if current step is empty, set it with the first step + // if current step is end, we just return + if len(blueGreenStatus.FinalisingStep) == 0 { + blueGreenStatus.FinalisingStep = nextStep + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + } else if blueGreenStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd { + klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name) + return true, nil + } + klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.FinalisingStep) + + var retry bool + // the order of steps is maitained by calculating thenextStep + switch blueGreenStatus.FinalisingStep { + // set workload.pause=false; set workload.partition=0 + case v1beta1.FinalisingStepTypeBatchRelease: + retry, err = finalizingBatchRelease(m.Client, c) + // delete batchRelease + case v1beta1.FinalisingStepTypeDeleteBR: + retry, err = removeBatchRelease(m.Client, c) + // restore the gateway resources (ingress/gatewayAPI/Istio), that means + // only stable Service will accept the traffic + case v1beta1.FinalisingStepTypeGateway: + retry, err = m.trafficRoutingManager.RestoreGateway(tr) + // restore the stable service + case v1beta1.FinalisingStepTypeStableService: + retry, err = m.trafficRoutingManager.RestoreStableService(tr) + // remove canary service + case v1beta1.FinalisingStepTypeRemoveCanaryService: + retry, err = m.trafficRoutingManager.RemoveCanaryService(tr) + // route all traffic to new version + case v1beta1.FinalisingStepTypeRouteAllTraffic: + retry, err = m.trafficRoutingManager.RouteAllTrafficToNewVersion(tr) + default: + nextStep = nextBlueGreenTask(c.FinalizeReason, "") + klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", blueGreenStatus.FinalisingStep, nextStep) + blueGreenStatus.FinalisingStep = nextStep + return false, nil + } + if err != nil || retry { + return false, err + } + // current step is done, run the next step + blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + blueGreenStatus.FinalisingStep = nextStep + if blueGreenStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd { + return true, nil + } + + return false, nil +} + +func (m *blueGreenReleaseManager) fetchClient() client.Client { + return m.Client +} + +func (m *blueGreenReleaseManager) createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease { + var batches []v1beta1.ReleaseBatch + for _, step := range rollout.Spec.Strategy.BlueGreen.Steps { + batches = append(batches, v1beta1.ReleaseBatch{CanaryReplicas: *step.Replicas}) + } + br := &v1beta1.BatchRelease{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: rollout.Namespace, + Name: rollout.Name, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(rollout, rolloutControllerKind)}, + }, + Spec: v1beta1.BatchReleaseSpec{ + WorkloadRef: v1beta1.ObjectRef{ + APIVersion: rollout.Spec.WorkloadRef.APIVersion, + Kind: rollout.Spec.WorkloadRef.Kind, + Name: rollout.Spec.WorkloadRef.Name, + }, + ReleasePlan: v1beta1.ReleasePlan{ + Batches: batches, + RolloutID: rolloutID, + BatchPartition: utilpointer.Int32Ptr(batch), + FailureThreshold: rollout.Spec.Strategy.BlueGreen.FailureThreshold, + RollingStyle: v1beta1.BlueGreenRollingStyle, + }, + }, + } + annotations := map[string]string{} + if isRollback { + annotations[v1alpha1.RollbackInBatchAnnotation] = rollout.Annotations[v1alpha1.RollbackInBatchAnnotation] + } + if len(annotations) > 0 { + br.Annotations = annotations + } + return br +} + +// syncBatchRelease sync status of br to blueGreenStatus, and sync rollout-id of blueGreenStatus to br. +func (m *blueGreenReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, blueGreenStatus *v1beta1.BlueGreenStatus) error { + // sync from BatchRelease status to Rollout blueGreenStatus + blueGreenStatus.UpdatedReplicas = br.Status.CanaryStatus.UpdatedReplicas + blueGreenStatus.UpdatedReadyReplicas = br.Status.CanaryStatus.UpdatedReadyReplicas + // Do not remove this line currently, otherwise, users will be not able to judge whether the BatchRelease works + // in the scene where only rollout-id changed. + // TODO: optimize the logic to better understand + blueGreenStatus.Message = fmt.Sprintf("BatchRelease is at state %s, rollout-id %s, step %d", + br.Status.CanaryStatus.CurrentBatchState, br.Status.ObservedRolloutID, br.Status.CanaryStatus.CurrentBatch+1) + + // sync rolloutId from blueGreenStatus to BatchRelease + if blueGreenStatus.ObservedRolloutID != br.Spec.ReleasePlan.RolloutID { + body := fmt.Sprintf(`{"spec":{"releasePlan":{"rolloutID":"%s"}}}`, blueGreenStatus.ObservedRolloutID) + return m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))) + } + return nil +} + +/* +- Rollback Scenario: +why the first step is to restore the gateway? (aka. route all traffic to stable version) +we cannot remove selector of the stable service firstly as canary does, because users are allowed to configure "0%" traffic +in bluegreen strategy. Consider the following example: + - replicas: 50% // step 1 + traffic: 0% + +if user is at step 1, and then attempts to rollback directly, Rollout should route all traffic to stable service +(keep unchanged actually). However, if we remove the selector of the stable service instead, we would inadvertently +route some traffic to the new version for a period, which is undesirable. + +- Rollout Deletion and Disabling Scenario: +If Rollout is being deleted or disabled, it suggests users want to release the new version using workload built-in strategy, +such as rollingUpdate for Deployment, instead of blue-green or canary. And thus, we can simply remove +the label selector of the stable service, routing traffic to reach both stable and updated pods. + +- Rollout success Scenario: +This indicates the rollout has completed its final batch and the user has confirmed to +transition fully to the new version. We can simply route all traffic to new version. Additionally, given that all +traffic is routed to the canary Service, it is safe to remove selector of stable Service, which additionally works +as a workaround for a bug caused by ingress-nginx controller (see https://github.com/kubernetes/ingress-nginx/issues/9635) +*/ +func nextBlueGreenTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { + var taskSequence []v1beta1.FinalisingStepType + switch reason { + case v1beta1.FinaliseReasonSuccess: // success + taskSequence = []v1beta1.FinalisingStepType{ + v1beta1.FinalisingStepTypeRouteAllTraffic, + v1beta1.FinalisingStepTypeStableService, + v1beta1.FinalisingStepTypeBatchRelease, + v1beta1.FinalisingStepTypeGateway, + + v1beta1.FinalisingStepTypeRemoveCanaryService, + v1beta1.FinalisingStepTypeDeleteBR, + } + + case v1beta1.FinaliseReasonRollback: // rollback + taskSequence = []v1beta1.FinalisingStepType{ + v1beta1.FinalisingStepTypeGateway, // route all traffic to stable version + v1beta1.FinalisingStepTypeBatchRelease, + v1beta1.FinalisingStepTypeStableService, + + v1beta1.FinalisingStepTypeRemoveCanaryService, + v1beta1.FinalisingStepTypeDeleteBR, + } + default: // others: disabled/deleting rollout + taskSequence = []v1beta1.FinalisingStepType{ + v1beta1.FinalisingStepTypeStableService, + v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old + v1beta1.FinalisingStepTypeGateway, + + v1beta1.FinalisingStepTypeRemoveCanaryService, + v1beta1.FinalisingStepTypeDeleteBR, + } + } + // if currentTask is empty, return first task + if len(currentTask) == 0 { + return taskSequence[0] + } + // find next task + for i := range taskSequence { + if currentTask == taskSequence[i] && i < len(taskSequence)-1 { + return taskSequence[i+1] + } + } + return v1beta1.FinalisingStepTypeEnd +} diff --git a/pkg/controller/rollout/rollout_bluegreen_test.go b/pkg/controller/rollout/rollout_bluegreen_test.go new file mode 100644 index 00000000..1666a700 --- /dev/null +++ b/pkg/controller/rollout/rollout_bluegreen_test.go @@ -0,0 +1,331 @@ +/* +Copyright 2022 The Kruise Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rollout + +import ( + "context" + "reflect" + "testing" + + "github.com/openkruise/rollouts/api/v1alpha1" + "github.com/openkruise/rollouts/api/v1beta1" + "github.com/openkruise/rollouts/pkg/trafficrouting" + "github.com/openkruise/rollouts/pkg/util" + apps "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + netv1 "k8s.io/api/networking/v1" + + // metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/tools/record" + utilpointer "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestBlueGreenRunCanary(t *testing.T) { + cases := []struct { + name string + getObj func() ([]*apps.Deployment, []*apps.ReplicaSet) + getNetwork func() ([]*corev1.Service, []*netv1.Ingress) + getRollout func() (*v1beta1.Rollout, *v1beta1.BatchRelease) + expectStatus func() *v1beta1.RolloutStatus + expectBr func() *v1beta1.BatchRelease + }{ + { + name: "run bluegreen upgrade1", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease) { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 1 + obj.Status.BlueGreenStatus.NextStepIndex = 2 + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(&obj.Status, *cond) + return obj, nil + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemoBlueGreen.Status.DeepCopy() + s.BlueGreenStatus.ObservedWorkloadGeneration = 2 + s.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + s.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + s.BlueGreenStatus.CurrentStepIndex = 1 + s.BlueGreenStatus.NextStepIndex = 2 + s.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(s, *cond) + return s + }, + expectBr: func() *v1beta1.BatchRelease { + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + return br + }, + }, + { + name: "run bluegreen traffic routing", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + rs1 := rsDemo.DeepCopy() + rs2 := rsDemo.DeepCopy() + rs2.Name = "echoserver-canary" + rs2.Labels["pod-template-hash"] = "pod-template-hash-v2" + rs2.Spec.Template.Spec.Containers[0].Image = "echoserver:v2" + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1, rs2} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease) { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 1 + obj.Status.BlueGreenStatus.NextStepIndex = 2 + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + br.Status = v1beta1.BatchReleaseStatus{ + ObservedGeneration: 1, + ObservedReleasePlanHash: util.HashReleasePlanBatches(&br.Spec.ReleasePlan), + CanaryStatus: v1beta1.BatchReleaseCanaryStatus{ + CurrentBatchState: v1beta1.ReadyBatchState, + CurrentBatch: 0, + UpdatedReplicas: 1, + UpdatedReadyReplicas: 1, + }, + } + return obj, br + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemoBlueGreen.Status.DeepCopy() + s.BlueGreenStatus.ObservedWorkloadGeneration = 2 + s.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + s.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + s.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + s.BlueGreenStatus.UpdatedReplicas = 1 + s.BlueGreenStatus.UpdatedReadyReplicas = 1 + s.BlueGreenStatus.CurrentStepIndex = 1 + s.BlueGreenStatus.NextStepIndex = 2 + s.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(s, *cond) + return s + }, + expectBr: func() *v1beta1.BatchRelease { + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + return br + }, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + deps, rss := cs.getObj() + rollout, br := cs.getRollout() + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rollout).Build() + for _, rs := range rss { + _ = fc.Create(context.TODO(), rs) + } + for _, dep := range deps { + _ = fc.Create(context.TODO(), dep) + } + if br != nil { + _ = fc.Create(context.TODO(), br) + } + ss, in := cs.getNetwork() + _ = fc.Create(context.TODO(), ss[0]) + _ = fc.Create(context.TODO(), in[0]) + r := &RolloutReconciler{ + Client: fc, + Scheme: scheme, + Recorder: record.NewFakeRecorder(10), + finder: util.NewControllerFinder(fc), + trafficRoutingManager: trafficrouting.NewTrafficRoutingManager(fc), + } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: fc, + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } + workload, err := r.finder.GetWorkloadForRef(rollout) + if err != nil { + t.Fatalf("GetWorkloadForRef failed: %s", err.Error()) + } + c := &RolloutContext{ + Rollout: rollout, + NewStatus: rollout.Status.DeepCopy(), + Workload: workload, + } + err = r.blueGreenManager.runCanary(c) + if err != nil { + t.Fatalf("reconcileRolloutProgressing failed: %s", err.Error()) + } + checkBatchReleaseEqual(fc, t, client.ObjectKey{Name: rollout.Name}, cs.expectBr()) + cStatus := c.NewStatus.DeepCopy() + cStatus.Message = "" + if cStatus.BlueGreenStatus != nil { + cStatus.BlueGreenStatus.LastUpdateTime = nil + cStatus.BlueGreenStatus.Message = "" + } + cond := util.GetRolloutCondition(*cStatus, v1beta1.RolloutConditionProgressing) + cond.Message = "" + util.SetRolloutCondition(cStatus, *cond) + expectStatus := cs.expectStatus() + if !reflect.DeepEqual(expectStatus, cStatus) { + t.Fatalf("expect(%s), but get(%s)", util.DumpJSON(cs.expectStatus()), util.DumpJSON(cStatus)) + } + }) + } +} + +func TestBlueGreenRunCanaryPaused(t *testing.T) { + cases := []struct { + name string + getRollout func() *v1beta1.Rollout + expectStatus func() *v1beta1.RolloutStatus + }{ + { + name: "paused, last step, 60% weight", + getRollout: func() *v1beta1.Rollout { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 3 + obj.Status.BlueGreenStatus.NextStepIndex = 4 + obj.Status.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused + return obj + }, + expectStatus: func() *v1beta1.RolloutStatus { + obj := rolloutDemoBlueGreen.Status.DeepCopy() + obj.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.BlueGreenStatus.CurrentStepIndex = 3 + obj.BlueGreenStatus.NextStepIndex = 4 + obj.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + obj.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused + return obj + }, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + rollout := cs.getRollout() + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rollout).Build() + r := &RolloutReconciler{ + Client: fc, + Scheme: scheme, + Recorder: record.NewFakeRecorder(10), + finder: util.NewControllerFinder(fc), + trafficRoutingManager: trafficrouting.NewTrafficRoutingManager(fc), + } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: fc, + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } + c := &RolloutContext{ + Rollout: rollout, + NewStatus: rollout.Status.DeepCopy(), + } + err := r.blueGreenManager.runCanary(c) + if err != nil { + t.Fatalf("reconcileRolloutProgressing failed: %s", err.Error()) + } + cStatus := c.NewStatus.DeepCopy() + cStatus.BlueGreenStatus.LastUpdateTime = nil + cStatus.BlueGreenStatus.Message = "" + cStatus.Message = "" + if !reflect.DeepEqual(cs.expectStatus(), cStatus) { + t.Fatalf("expect(%s), but get(%s)", util.DumpJSON(cs.expectStatus()), util.DumpJSON(cStatus)) + } + }) + } +} diff --git a/pkg/controller/rollout/rollout_canary.go b/pkg/controller/rollout/rollout_canary.go index 63c01703..7111e5ee 100644 --- a/pkg/controller/rollout/rollout_canary.go +++ b/pkg/controller/rollout/rollout_canary.go @@ -29,11 +29,9 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/retry" "k8s.io/klog/v2" utilpointer "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" @@ -47,7 +45,7 @@ type canaryReleaseManager struct { func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { canaryStatus := c.NewStatus.CanaryStatus - if br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) { + if br, err := fetchBatchRelease(m.Client, c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) { klog.Errorf("rollout(%s/%s) fetch batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return err } else if err == nil { @@ -194,13 +192,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, canaryStatus.CurrentStepState) } - // in two cases, we should wait the default grace period - // - a period after CanaryStepStateUpgrade is just done (https://github.com/openkruise/rollouts/pull/185) - // - a period after CanaryStepStateTrafficRouting is just done - if tr.RecheckDuration <= 0 { - tr.RecheckDuration = time.Duration(trafficrouting.GetGraceSeconds(c.Rollout.Spec.Strategy.GetTrafficRouting(), defaultGracePeriodSeconds)) * time.Second - } - expectedTime := time.Now().Add(tr.RecheckDuration) + expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second) c.RecheckTime = &expectedTime case v1beta1.CanaryStepStateMetricsAnalysis: @@ -259,7 +251,7 @@ func (m *canaryReleaseManager) doCanaryUpgrade(c *RolloutContext) (bool, error) cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and upgrade workload to new version", canaryStatus.CurrentStepIndex, steps) c.NewStatus.Message = cond.Message // run batch release to upgrade the workloads - done, br, err := m.runBatchRelease(c.Rollout, getRolloutID(c.Workload), canaryStatus.CurrentStepIndex, c.Workload.IsInRollback) + done, br, err := runBatchRelease(m, c.Rollout, getRolloutID(c.Workload), canaryStatus.CurrentStepIndex, c.Workload.IsInRollback) if err != nil { return false, err } else if !done { @@ -369,13 +361,13 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro return true, nil } // rollout progressing complete, remove rollout progressing annotation in workload - err := m.removeRolloutProgressingAnnotation(c) + err := removeRolloutProgressingAnnotation(m.Client, c) if err != nil { return false, err } tr := newTrafficRoutingContext(c) // execute steps based on the predefined order for each reason - nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep) + nextStep := nextCanaryTask(c.FinalizeReason, canaryStatus.FinalisingStep) // if current step is empty, set it with the first step // if current step is end, we just return if len(canaryStatus.FinalisingStep) == 0 { @@ -392,10 +384,10 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro switch canaryStatus.FinalisingStep { // set workload.pause=false; set workload.partition=0 case v1beta1.FinalisingStepTypeBatchRelease: - retry, err = m.finalizingBatchRelease(c) + retry, err = finalizingBatchRelease(m.Client, c) // delete batchRelease case v1beta1.FinalisingStepTypeDeleteBR: - retry, err = m.removeBatchRelease(c) + retry, err = removeBatchRelease(m.Client, c) // restore the gateway resources (ingress/gatewayAPI/Istio), that means // only stable Service will accept the traffic case v1beta1.FinalisingStepTypeGateway: @@ -408,7 +400,7 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro retry, err = m.trafficRoutingManager.RemoveCanaryService(tr) default: - nextStep = nextTask(c.FinalizeReason, "") + nextStep = nextCanaryTask(c.FinalizeReason, "") klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", canaryStatus.FinalisingStep, nextStep) canaryStatus.FinalisingStep = nextStep return false, nil @@ -425,75 +417,11 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro return false, nil } -func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error { - if c.Workload == nil { - return nil - } - if _, ok := c.Workload.Annotations[util.InRolloutProgressingAnnotation]; !ok { - return nil - } - workloadRef := c.Rollout.Spec.WorkloadRef - workloadGVK := schema.FromAPIVersionAndKind(workloadRef.APIVersion, workloadRef.Kind) - obj := util.GetEmptyWorkloadObject(workloadGVK) - obj.SetNamespace(c.Workload.Namespace) - obj.SetName(c.Workload.Name) - body := fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, util.InRolloutProgressingAnnotation) - if err := m.Patch(context.TODO(), obj, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { - klog.Errorf("rollout(%s/%s) patch workload(%s) failed: %s", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, err.Error()) - return err - } - klog.Infof("remove rollout(%s/%s) workload(%s) annotation[%s] success", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, util.InRolloutProgressingAnnotation) - return nil +func (m *canaryReleaseManager) fetchClient() client.Client { + return m.Client } -func (m *canaryReleaseManager) runBatchRelease(rollout *v1beta1.Rollout, rolloutId string, batch int32, isRollback bool) (bool, *v1beta1.BatchRelease, error) { - batch = batch - 1 - br, err := m.fetchBatchRelease(rollout.Namespace, rollout.Name) - if errors.IsNotFound(err) { - // create new BatchRelease Crd - br = createBatchRelease(rollout, rolloutId, batch, isRollback) - if err = m.Create(context.TODO(), br); err != nil && !errors.IsAlreadyExists(err) { - klog.Errorf("rollout(%s/%s) create BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - klog.Infof("rollout(%s/%s) create BatchRelease(%s) success", rollout.Namespace, rollout.Name, util.DumpJSON(br)) - return false, br, nil - } else if err != nil { - klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - - // check whether batchRelease configuration is the latest - newBr := createBatchRelease(rollout, rolloutId, batch, isRollback) - if reflect.DeepEqual(br.Spec, newBr.Spec) && reflect.DeepEqual(br.Annotations, newBr.Annotations) { - klog.Infof("rollout(%s/%s) do batchRelease batch(%d) success", rollout.Namespace, rollout.Name, batch+1) - return true, br, nil - } - // update batchRelease to the latest version - if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - if err = m.Get(context.TODO(), client.ObjectKey{Namespace: newBr.Namespace, Name: newBr.Name}, br); err != nil { - klog.Errorf("error getting BatchRelease(%s/%s) from client", newBr.Namespace, newBr.Name) - return err - } - br.Spec = newBr.Spec - br.Annotations = newBr.Annotations - return m.Client.Update(context.TODO(), br) - }); err != nil { - klog.Errorf("rollout(%s/%s) update batchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - klog.Infof("rollout(%s/%s) update batchRelease(%s) configuration to latest", rollout.Namespace, rollout.Name, util.DumpJSON(br)) - return false, br, nil -} - -func (m *canaryReleaseManager) fetchBatchRelease(ns, name string) (*v1beta1.BatchRelease, error) { - br := &v1beta1.BatchRelease{} - // batchRelease.name is equal related rollout.name - err := m.Get(context.TODO(), client.ObjectKey{Namespace: ns, Name: name}, br) - return br, err -} - -func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease { +func (m *canaryReleaseManager) createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease { var batches []v1beta1.ReleaseBatch for _, step := range rollout.Spec.Strategy.Canary.Steps { batches = append(batches, v1beta1.ReleaseBatch{CanaryReplicas: *step.Replicas}) @@ -531,83 +459,6 @@ func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, return br } -// bool means if we need retry; if error is not nil, always retry -func (m *canaryReleaseManager) removeBatchRelease(c *RolloutContext) (bool, error) { - batch := &v1beta1.BatchRelease{} - err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch) - if err != nil && errors.IsNotFound(err) { - return false, nil - } else if err != nil { - klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name) - return true, err - } - if !batch.DeletionTimestamp.IsZero() { - klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return true, nil - } - - //delete batchRelease - err = m.Delete(context.TODO(), batch) - if err != nil { - klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) - return true, err - } - klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return true, nil -} - -// bool means if we need retry; if error is not nil, always retry -func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, error) { - br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name) - if err != nil { - if errors.IsNotFound(err) { - return false, nil - } - return true, err - } - waitReady := c.WaitReady - // The Completed phase means batchRelease controller has processed all it - // should process. If BatchRelease phase is completed, we can do nothing. - if br.Spec.ReleasePlan.BatchPartition == nil && - br.Status.Phase == v1beta1.RolloutPhaseCompleted { - klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) - return false, nil - } - - // If BatchPartition is nil, BatchRelease will directly resume workload via: - // - * set workload Paused = false if it needs; - // - * set workload Partition = null if it needs. - if br.Spec.ReleasePlan.BatchPartition == nil { - // - If checkReady is true, finalizing policy must be "WaitResume"; - // - If checkReady is false, finalizing policy must be NOT "WaitResume"; - // Otherwise, we should correct it. - switch br.Spec.ReleasePlan.FinalizingPolicy { - case v1beta1.WaitResumeFinalizingPolicyType: - if waitReady { // no need to patch again - return true, nil - } - default: - if !waitReady { // no need to patch again - return true, nil - } - } - } - - // Correct finalizing policy. - policy := v1beta1.ImmediateFinalizingPolicyType - if waitReady { - policy = v1beta1.WaitResumeFinalizingPolicyType - } - - // Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here. - body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy) - if err = m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { - return true, err - } - klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body) - return true, nil -} - // syncBatchRelease sync status of br to canaryStatus, and sync rollout-id of canaryStatus to br. func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canaryStatus *v1beta1.CanaryStatus) error { // sync from BatchRelease status to Rollout canaryStatus @@ -628,7 +479,7 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary } // calculate next task -func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { +func nextCanaryTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { var taskSequence []v1beta1.FinalisingStepType //REVIEW - should we consider more complex scenarios? // like, user rollbacks the workload and disables the Rollout at the same time? diff --git a/pkg/controller/rollout/rollout_controller.go b/pkg/controller/rollout/rollout_controller.go index a1fe5606..cccc82a9 100755 --- a/pkg/controller/rollout/rollout_controller.go +++ b/pkg/controller/rollout/rollout_controller.go @@ -66,6 +66,7 @@ type RolloutReconciler struct { finder *util.ControllerFinder trafficRoutingManager *trafficrouting.Manager canaryManager *canaryReleaseManager + blueGreenManager *blueGreenReleaseManager } //+kubebuilder:rbac:groups=rollouts.kruise.io,resources=rollouts,verbs=get;list;watch;create;update;patch;delete @@ -198,5 +199,10 @@ func (r *RolloutReconciler) SetupWithManager(mgr ctrl.Manager) error { trafficRoutingManager: r.trafficRoutingManager, recorder: r.Recorder, } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: mgr.GetClient(), + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } return nil } diff --git a/pkg/controller/rollout/rollout_controller_test.go b/pkg/controller/rollout/rollout_controller_test.go index 02fec15e..3d29d379 100644 --- a/pkg/controller/rollout/rollout_controller_test.go +++ b/pkg/controller/rollout/rollout_controller_test.go @@ -106,6 +106,73 @@ var ( }, }, } + + rolloutDemoBlueGreen = &v1beta1.Rollout{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rollout-demo", + Labels: map[string]string{}, + Annotations: map[string]string{ + util.RolloutHashAnnotation: "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd", + }, + }, + Spec: v1beta1.RolloutSpec{ + WorkloadRef: v1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "echoserver", + }, + Strategy: v1beta1.RolloutStrategy{ + BlueGreen: &v1beta1.BlueGreenStrategy{ + Steps: []v1beta1.CanaryStep{ + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("0%"), + }, + Replicas: &intstr.IntOrString{StrVal: "50%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("0%"), + }, + Replicas: &intstr.IntOrString{StrVal: "100%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("50%"), + }, + Replicas: &intstr.IntOrString{StrVal: "100%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("100%"), + }, + Replicas: &intstr.IntOrString{StrVal: "100%", Type: intstr.String}, + }, + }, + TrafficRoutings: []v1beta1.TrafficRoutingRef{ + { + Service: "echoserver", + Ingress: &v1beta1.IngressTrafficRouting{ + Name: "echoserver", + }, + GracePeriodSeconds: 0, // To facilitate testing, don't wait after traffic routing operation + }, + }, + }, + }, + }, + Status: v1beta1.RolloutStatus{ + Phase: v1beta1.RolloutPhaseProgressing, + BlueGreenStatus: &v1beta1.BlueGreenStatus{}, + Conditions: []v1beta1.RolloutCondition{ + { + Type: v1beta1.RolloutConditionProgressing, + Reason: v1alpha1.ProgressingReasonInitializing, + Status: corev1.ConditionTrue, + }, + }, + }, + } maxUnavailable = intstr.FromString("20%") deploymentDemo = &apps.Deployment{ TypeMeta: metav1.TypeMeta{ diff --git a/pkg/controller/rollout/rollout_progressing.go b/pkg/controller/rollout/rollout_progressing.go index a1214cbf..a27f7d66 100644 --- a/pkg/controller/rollout/rollout_progressing.go +++ b/pkg/controller/rollout/rollout_progressing.go @@ -378,8 +378,7 @@ func (r *RolloutReconciler) getReleaseManager(rollout *v1beta1.Rollout) (Release if rollout.Spec.Strategy.IsCanaryStragegy() { return r.canaryManager, nil } else if rollout.Spec.Strategy.IsBlueGreenRelease() { - // placeholder for upcoming PR - // return r.blueGreenManager, nil + return r.blueGreenManager, nil } return nil, fmt.Errorf("unknown rolling style: %s, and thus cannot call corresponding release manager", rollout.Spec.Strategy.GetRollingStyle()) } @@ -420,7 +419,7 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) } // if no trafficRouting exists, simply remove batchRelease if !c.Rollout.Spec.Strategy.HasTrafficRoutings() { - retry, err := releaseManager.removeBatchRelease(c) + retry, err := removeBatchRelease(releaseManager.fetchClient(), c) if err != nil { klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err @@ -458,7 +457,7 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) // canary deployment, for other release, the v2 pods won't be deleted immediately // in both cases, only the stable pods (v1) accept the traffic case v1beta1.FinalisingStepTypeDeleteBR: - retry, err := releaseManager.removeBatchRelease(c) + retry, err := removeBatchRelease(releaseManager.fetchClient(), c) if err != nil { klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err @@ -495,7 +494,7 @@ func (r *RolloutReconciler) recalculateCanaryStep(c *RolloutContext) (int32, err if err != nil { return 0, err } - batch, err := releaseManager.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name) + batch, err := fetchBatchRelease(releaseManager.fetchClient(), c.Rollout.Namespace, c.Rollout.Name) if errors.IsNotFound(err) { return 1, nil } else if err != nil { @@ -506,7 +505,10 @@ func (r *RolloutReconciler) recalculateCanaryStep(c *RolloutContext) (int32, err if c.NewStatus != nil { currentIndex = c.NewStatus.GetSubStatus().CurrentStepIndex - 1 } - steps := append([]int{}, int(currentIndex)) + steps := make([]int, 0) + if ci := int(currentIndex); ci >= 0 && ci < len(c.Rollout.Spec.Strategy.GetSteps()) { + steps = append(steps, ci) + } // we don't distinguish between the changes in Replicas and Traffic // Whatever the change is, we recalculate the step. // we put the current step index first for retrieval, so that if Traffic is the only change, diff --git a/pkg/controller/rollout/rollout_releaseManager.go b/pkg/controller/rollout/rollout_releaseManager.go index ebc82f3e..960c95f7 100644 --- a/pkg/controller/rollout/rollout_releaseManager.go +++ b/pkg/controller/rollout/rollout_releaseManager.go @@ -17,7 +17,18 @@ limitations under the License. package rollout import ( + "context" + "fmt" + "reflect" + "github.com/openkruise/rollouts/api/v1beta1" + "github.com/openkruise/rollouts/pkg/util" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" ) type ReleaseManager interface { @@ -27,8 +38,154 @@ type ReleaseManager interface { doCanaryJump(c *RolloutContext) bool // called when user accomplishes a release / does a rollback, or disables/removes the Rollout Resource doCanaryFinalising(c *RolloutContext) (bool, error) - // fetch the BatchRelease object - fetchBatchRelease(ns, name string) (*v1beta1.BatchRelease, error) - // remove the BatchRelease object - removeBatchRelease(c *RolloutContext) (bool, error) + // create btach release + createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease + // retrun client + fetchClient() client.Client +} + +func fetchBatchRelease(cli client.Client, ns, name string) (*v1beta1.BatchRelease, error) { + br := &v1beta1.BatchRelease{} + // batchRelease.name is equal related rollout.name + err := cli.Get(context.TODO(), client.ObjectKey{Namespace: ns, Name: name}, br) + return br, err +} + +func removeRolloutProgressingAnnotation(cli client.Client, c *RolloutContext) error { + if c.Workload == nil { + return nil + } + if _, ok := c.Workload.Annotations[util.InRolloutProgressingAnnotation]; !ok { + return nil + } + workloadRef := c.Rollout.Spec.WorkloadRef + workloadGVK := schema.FromAPIVersionAndKind(workloadRef.APIVersion, workloadRef.Kind) + obj := util.GetEmptyWorkloadObject(workloadGVK) + obj.SetNamespace(c.Workload.Namespace) + obj.SetName(c.Workload.Name) + body := fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, util.InRolloutProgressingAnnotation) + if err := cli.Patch(context.TODO(), obj, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { + klog.Errorf("rollout(%s/%s) patch workload(%s) failed: %s", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, err.Error()) + return err + } + klog.Infof("remove rollout(%s/%s) workload(%s) annotation[%s] success", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, util.InRolloutProgressingAnnotation) + return nil +} + +// bool means if we need retry; if error is not nil, always retry +func removeBatchRelease(cli client.Client, c *RolloutContext) (bool, error) { + batch := &v1beta1.BatchRelease{} + err := cli.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch) + if err != nil && errors.IsNotFound(err) { + return false, nil + } else if err != nil { + klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name) + return true, err + } + if !batch.DeletionTimestamp.IsZero() { + klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) + return true, nil + } + + //delete batchRelease + err = cli.Delete(context.TODO(), batch) + if err != nil { + klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) + return true, err + } + klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) + return true, nil +} + +// bool means if we need retry; if error is not nil, always retry +func finalizingBatchRelease(cli client.Client, c *RolloutContext) (bool, error) { + br, err := fetchBatchRelease(cli, c.Rollout.Namespace, c.Rollout.Name) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return true, err + } + waitReady := c.WaitReady + // The Completed phase means batchRelease controller has processed all it + // should process. If BatchRelease phase is completed, we can do nothing. + if br.Spec.ReleasePlan.BatchPartition == nil && + br.Status.Phase == v1beta1.RolloutPhaseCompleted { + klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) + return false, nil + } + + // If BatchPartition is nil, BatchRelease will directly resume workload via: + // - * set workload Paused = false if it needs; + // - * set workload Partition = null if it needs. + if br.Spec.ReleasePlan.BatchPartition == nil { + // - If checkReady is true, finalizing policy must be "WaitResume"; + // - If checkReady is false, finalizing policy must be NOT "WaitResume"; + // Otherwise, we should correct it. + switch br.Spec.ReleasePlan.FinalizingPolicy { + case v1beta1.WaitResumeFinalizingPolicyType: + if waitReady { // no need to patch again + return true, nil + } + default: + if !waitReady { // no need to patch again + return true, nil + } + } + } + + // Correct finalizing policy. + policy := v1beta1.ImmediateFinalizingPolicyType + if waitReady { + policy = v1beta1.WaitResumeFinalizingPolicyType + } + + // Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here. + body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy) + if err = cli.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { + return true, err + } + klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body) + return true, nil +} + +func runBatchRelease(m ReleaseManager, rollout *v1beta1.Rollout, rolloutId string, batch int32, isRollback bool) (bool, *v1beta1.BatchRelease, error) { + cli := m.fetchClient() + batch = batch - 1 + br, err := fetchBatchRelease(cli, rollout.Namespace, rollout.Name) + if errors.IsNotFound(err) { + // create new BatchRelease Crd + br = m.createBatchRelease(rollout, rolloutId, batch, isRollback) + if err = cli.Create(context.TODO(), br); err != nil && !errors.IsAlreadyExists(err) { + klog.Errorf("rollout(%s/%s) create BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + klog.Infof("rollout(%s/%s) create BatchRelease(%s) success", rollout.Namespace, rollout.Name, util.DumpJSON(br)) + return false, br, nil + } else if err != nil { + klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + + // check whether batchRelease configuration is the latest + newBr := m.createBatchRelease(rollout, rolloutId, batch, isRollback) + if reflect.DeepEqual(br.Spec, newBr.Spec) && reflect.DeepEqual(br.Annotations, newBr.Annotations) { + klog.Infof("rollout(%s/%s) do batchRelease batch(%d) success", rollout.Namespace, rollout.Name, batch+1) + return true, br, nil + } + // update batchRelease to the latest version + if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error { + if err = cli.Get(context.TODO(), client.ObjectKey{Namespace: newBr.Namespace, Name: newBr.Name}, br); err != nil { + klog.Errorf("error getting BatchRelease(%s/%s) from client", newBr.Namespace, newBr.Name) + return err + } + br.Spec = newBr.Spec + br.Annotations = newBr.Annotations + return cli.Update(context.TODO(), br) + }); err != nil { + klog.Errorf("rollout(%s/%s) update batchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + klog.Infof("rollout(%s/%s) update batchRelease(%s) configuration to latest", rollout.Namespace, rollout.Name, util.DumpJSON(br)) + return false, br, nil } diff --git a/pkg/trafficrouting/manager.go b/pkg/trafficrouting/manager.go index b7a3edb6..55fb3fb7 100644 --- a/pkg/trafficrouting/manager.go +++ b/pkg/trafficrouting/manager.go @@ -34,6 +34,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "k8s.io/utils/integer" + utilpointer "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -223,6 +224,41 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext) (bool, erro return true, nil } +// returns: +// - if error is not nil, usually we need to retry later. Only if error is nil, we consider the bool. +// - The bool value indicates whether retry is needed. If true, it usually means +// gateway resources have been updated and we need to wait for `graceSeconds`. +// +// only if error is nil AND retry is false, this calling can be considered as completed +func (m *Manager) RouteAllTrafficToNewVersion(c *TrafficRoutingContext) (bool, error) { + klog.InfoS("route all traffic to new version", "rollout", c.Key) + if len(c.ObjectRef) == 0 { + return false, nil + } + // build up the network provider + stableService := c.ObjectRef[0].Service + cServiceName := getCanaryServiceName(stableService, c.OnlyTrafficRouting, c.DisableGenerateCanaryService) + trController, err := newNetworkProvider(m.Client, c, stableService, cServiceName) + if err != nil { + klog.Errorf("%s newTrafficRoutingController failed: %s", c.Key, err.Error()) + return false, err + } + graceSeconds := GetGraceSeconds(c.ObjectRef, defaultGracePeriodSeconds) + retry, remaining, err := grace.RunWithGraceSeconds(string(c.OwnerRef.UID), "updateRoute", graceSeconds, func() (bool, error) { + // route all traffic to new version + c.Strategy.Matches = nil + c.Strategy.Traffic = utilpointer.StringPtr("100%") + //NOTE - This return value "verified" has the opposite semantics with "modified" + verified, err := trController.EnsureRoutes(context.TODO(), &c.Strategy) + if !verified { + c.LastUpdateTime = &metav1.Time{Time: time.Now()} + } + return !verified, err + }) + UpdateRecheckDuration(c, remaining) + return retry, err +} + // returns: // - if error is not nil, usually we need to retry later. Only if error is nil, we consider the bool. // - The bool value indicates whether retry is needed. If true, it usually means @@ -419,11 +455,11 @@ func (m *Manager) RestoreStableService(c *TrafficRoutingContext) (bool, error) { serviceName := c.ObjectRef[0].Service err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: serviceName}, stableService) if errors.IsNotFound(err) { - return true, nil + return false, nil } if err != nil { klog.Errorf("%s get stable service(%s) failed: %s", c.Key, serviceName, err.Error()) - return false, err + return true, err } // restore stable Service diff --git a/pkg/trafficrouting/manager_test.go b/pkg/trafficrouting/manager_test.go index 2ce96fd3..354416e9 100644 --- a/pkg/trafficrouting/manager_test.go +++ b/pkg/trafficrouting/manager_test.go @@ -1293,8 +1293,10 @@ func TestRestoreGateway(t *testing.T) { // the second call, it should be no error and no retry time.Sleep(1 * time.Second) retry, err = manager.RestoreGateway(c) - if err != nil || retry { - t.Fatalf("RestoreGateway failed: %s", err) + if err != nil { + t.Fatalf("RestoreGateway failed: %s", err.Error()) + } else if retry { + t.Fatalf("RestoreGateway failed: retry should be false") } }) } @@ -1403,8 +1405,175 @@ func TestRemoveCanaryService(t *testing.T) { // the second call, it should be no error and no retry time.Sleep(1 * time.Second) retry, err = manager.RemoveCanaryService(c) - if err != nil || retry { - t.Fatalf("RemoveCanaryService failed: %s", err) + if err != nil { + t.Fatalf("RemoveCanaryService failed: %s", err.Error()) + } else if retry { + t.Fatalf("RemoveCanaryService failed: retry should be false") + } + }) + } +} + +func TestRouteAllTrafficToNewVersion(t *testing.T) { + cases := []struct { + name string + getObj func() ([]*corev1.Service, []*netv1.Ingress) + getRollout func() (*v1beta1.Rollout, *util.Workload) + onlyTrafficRouting bool + expectObj func() ([]*corev1.Service, []*netv1.Ingress) + expectNotFound func() ([]*corev1.Service, []*netv1.Ingress) + retry bool + }{ + { + name: "Route all traffic test1", + getObj: func() ([]*corev1.Service, []*netv1.Ingress) { + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + getRollout: func() (*v1beta1.Rollout, *util.Workload) { + obj := demoRollout.DeepCopy() + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} + return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} + }, + expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { + // service and ingress remain unchanged + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + expectNotFound: func() ([]*corev1.Service, []*netv1.Ingress) { + return nil, nil + }, + retry: false, + }, + { + name: "Route all traffic test2", + getObj: func() ([]*corev1.Service, []*netv1.Ingress) { + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "50" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + getRollout: func() (*v1beta1.Rollout, *util.Workload) { + obj := demoRollout.DeepCopy() + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} + return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} + }, + expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { + // service and ingress remain unchanged + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + expectNotFound: func() ([]*corev1.Service, []*netv1.Ingress) { + return nil, nil + }, + retry: true, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + ss, ig := cs.getObj() + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ig[0], ss[0], demoConf.DeepCopy()).Build() + if len(ss) == 2 { + _ = cli.Create(context.TODO(), ss[1]) + } + if len(ig) == 2 { + _ = cli.Create(context.TODO(), ig[1]) + } + rollout, workload := cs.getRollout() + newStatus := rollout.Status.DeepCopy() + currentStep := rollout.Spec.Strategy.Canary.Steps[newStatus.CanaryStatus.CurrentStepIndex-1] + c := &TrafficRoutingContext{ + Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name), + Namespace: rollout.Namespace, + ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings, + Strategy: currentStep.TrafficRoutingStrategy, + OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")), + RevisionLabelKey: workload.RevisionLabelKey, + StableRevision: newStatus.CanaryStatus.StableRevision, + CanaryRevision: newStatus.CanaryStatus.PodTemplateHash, + LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime, + OnlyTrafficRouting: cs.onlyTrafficRouting, + } + manager := NewTrafficRoutingManager(cli) + retry, err := manager.RouteAllTrafficToNewVersion(c) + if err != nil { + t.Fatalf("RouteAllTrafficToNewVersion first failed: %s", err.Error()) + } + if retry != cs.retry { + t.Fatalf("RouteAllTrafficToNewVersion expect(%v), but get(%v)", cs.retry, retry) + } + ss, ig = cs.expectObj() + for _, obj := range ss { + checkObjEqual(cli, t, obj) + } + for _, obj := range ig { + checkObjEqual(cli, t, obj) + } + + ss, ig = cs.expectNotFound() + for _, obj := range ss { + checkNotFound(cli, t, obj) + } + for _, obj := range ig { + checkNotFound(cli, t, obj) + } + // if done, no need check again + if !cs.retry { + return + } + // the second call, it should be no error and no retry + time.Sleep(1 * time.Second) + retry, err = manager.RouteAllTrafficToNewVersion(c) + if err != nil { + t.Fatalf("RouteAllTrafficToNewVersion failed: %s", err.Error()) + } else if retry { + t.Fatalf("RouteAllTrafficToNewVersion failed: retry should be false") + } + ss, ig = cs.expectObj() + for _, obj := range ss { + checkObjEqual(cli, t, obj) + } + for _, obj := range ig { + checkObjEqual(cli, t, obj) } }) } diff --git a/pkg/util/controller_finder.go b/pkg/util/controller_finder.go index 886b8191..e9ea1c6b 100644 --- a/pkg/util/controller_finder.go +++ b/pkg/util/controller_finder.go @@ -94,6 +94,13 @@ func (r *ControllerFinder) GetWorkloadForRef(rollout *rolloutv1beta1.Rollout) (* return workload, err } } + } else if rollout.Spec.Strategy.GetRollingStyle() == rolloutv1beta1.BlueGreenRollingStyle { + for _, finder := range r.bluegreenStyleFinders() { + workload, err := finder(rollout.Namespace, &workloadRef) + if workload != nil || err != nil { + return workload, err + } + } } else { for _, finder := range r.partitionStyleFinders() { workload, err := finder(rollout.Namespace, &workloadRef) @@ -115,6 +122,10 @@ func (r *ControllerFinder) partitionStyleFinders() []ControllerFinderFunc { return []ControllerFinderFunc{r.getKruiseCloneSet, r.getAdvancedDeployment, r.getStatefulSetLikeWorkload, r.getKruiseDaemonSet} } +func (r *ControllerFinder) bluegreenStyleFinders() []ControllerFinderFunc { + return []ControllerFinderFunc{r.getKruiseCloneSet, r.getAdvancedDeployment} +} + var ( ControllerKindRS = apps.SchemeGroupVersion.WithKind("ReplicaSet") ControllerKindDep = apps.SchemeGroupVersion.WithKind("Deployment")