Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the logic of grace period #226

Merged
merged 1 commit into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,19 +76,18 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
klog.Infof("rollout(%s/%s) canary step jumped", c.Rollout.Namespace, c.Rollout.Name)
return nil
}
gracePeriodSeconds := util.GracePeriodSecondsOrDefault(c.Rollout.Spec.Strategy.GetTrafficRouting(), defaultGracePeriodSeconds)
// When the first batch is trafficRouting rolling and the next steps are rolling release,
// We need to clean up the canary-related resources first and then rollout the rest of the batch.
currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
tr := newTrafficRoutingContext(c)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil {
return err
} else if !done {
klog.Infof("rollout(%s/%s) cleaning up canary-related resources", c.Rollout.Namespace, c.Rollout.Name)
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand Down Expand Up @@ -116,11 +115,11 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
expectedReplicas, _ := intstr.GetScaledValueFromIntOrPercent(currentStep.Replicas, int(c.Workload.Replicas), true)
if expectedReplicas >= int(c.Workload.Replicas) && v1beta1.IsRealPartition(c.Rollout) {
klog.Infof("special case detected: rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name)
done, err := m.trafficRoutingManager.RestoreStableService(tr)
retry, err := m.trafficRoutingManager.RestoreStableService(tr)
if err != nil {
return err
} else if !done {
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
} else if retry {
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand All @@ -146,11 +145,11 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
if canaryStatus.CurrentStepIndex == 1 {
if !tr.DisableGenerateCanaryService {
klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name)
done, err := m.trafficRoutingManager.PatchStableService(tr)
retry, err := m.trafficRoutingManager.PatchStableService(tr)
if err != nil {
return err
} else if !done {
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
} else if retry {
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand Down Expand Up @@ -195,7 +194,13 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, canaryStatus.CurrentStepState)
}
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
// in two cases, we should wait the default grace period
// - a period after CanaryStepStateUpgrade is just done (https://github.com/openkruise/rollouts/pull/185)
// - a period after CanaryStepStateTrafficRouting is just done
if tr.RecheckDuration <= 0 {
tr.RecheckDuration = time.Duration(trafficrouting.GetGraceSeconds(c.Rollout.Spec.Strategy.GetTrafficRouting(), defaultGracePeriodSeconds)) * time.Second
}
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime

case v1beta1.CanaryStepStateMetricsAnalysis:
Expand Down Expand Up @@ -369,7 +374,7 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
}
tr := newTrafficRoutingContext(c)
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, true)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
Expand All @@ -380,7 +385,7 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
return done, err
}
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/rollout/rollout_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ var (
Ingress: &v1beta1.IngressTrafficRouting{
Name: "echoserver",
},
GracePeriodSeconds: 0, // To facilitate testing, don't wait after traffic routing operation
},
},
},
Expand Down
22 changes: 4 additions & 18 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,11 +431,6 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
if subStatus == nil {
return true, nil
}
gracePeriodSeconds := c.Rollout.Spec.Strategy.GetTrafficRouting()[0].GracePeriodSeconds
// To ensure respect for graceful time between these steps, we set start timer before the first step
if len(subStatus.FinalisingStep) == 0 {
subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
}
tr := newTrafficRoutingContext(c)
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
switch subStatus.FinalisingStep {
Expand All @@ -446,21 +441,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
// firstly, restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
//TODO - RestoreGateway returns (bool, error) pair instead of error only.
// return (fasle, nil): gateway is patched successfully, but we need time to observe; recheck later
// return (true, nil): gateway is patched successfully, and accepts the update successfully; go to next step then
// return (false, error): gateway encounters error when patched, or the update is not accepted; recheck later
err := r.trafficRoutingManager.RestoreGateway(tr)
if err != nil {
retry, err := r.trafficRoutingManager.RestoreGateway(tr)
if err != nil || retry {
subStatus.LastUpdateTime = tr.LastUpdateTime
return false, err
}
// usually, GracePeriodSeconds means duration to wait after an operation is done,
// we use defaultGracePeriodSeconds+1 here because the timer started before the RestoreGateway step
if subStatus.LastUpdateTime != nil && time.Since(subStatus.LastUpdateTime.Time) < time.Second*time.Duration(gracePeriodSeconds+1) {
klog.Infof("rollout(%s/%s) in step (%s), and wait %d seconds", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep, gracePeriodSeconds+1)
return false, nil
}
klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
subStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
Expand Down Expand Up @@ -490,7 +475,8 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
first step of v3 release.
*/
case v1beta1.FinalisingStepTypeDeleteCanaryService:
err := r.trafficRoutingManager.RemoveCanaryService(tr)
// ignore the grace period because it is the last step
_, err := r.trafficRoutingManager.RemoveCanaryService(tr)
if err != nil {
subStatus.LastUpdateTime = tr.LastUpdateTime
return false, err
Expand Down
10 changes: 7 additions & 3 deletions pkg/controller/rollout/rollout_progressing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
},
},
{
name: "ReconcileRolloutProgressing rolling -> continueRelease1",
name: "ReconcileRolloutProgressing rolling -> continueRelease1", // add grace time to test the first step: restoring the gateway
getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) {
dep1 := deploymentDemo.DeepCopy()
dep1.Spec.Template.Spec.Containers[0].Image = "echoserver:v3"
Expand All @@ -558,10 +558,14 @@ func TestReconcileRolloutProgressing(t *testing.T) {
return []*apps.Deployment{dep1, dep2}, []*apps.ReplicaSet{rs1, rs2}
},
getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) {
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()}
c1 := demoIngress.DeepCopy()
c2 := demoIngress.DeepCopy()
c2.Name = c2.Name + "-canary"
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{c1, c2}
},
getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) {
obj := rolloutDemo.DeepCopy()
obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1 // add grace time to test fine step in continuous logic
obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2
obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
Expand Down Expand Up @@ -635,7 +639,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
obj.Status.CanaryStatus.CanaryReadyReplicas = 3
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteCanaryService
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
cond.Reason = v1alpha1.ProgressingReasonInRolling
util.SetRolloutCondition(&obj.Status, *cond)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/trafficrouting/trafficrouting_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ func (r *TrafficRoutingReconciler) Reconcile(ctx context.Context, req ctrl.Reque
done, err = r.trafficRoutingManager.DoTrafficRouting(newTrafficRoutingContext(tr))
}
case v1alpha1.TrafficRoutingPhaseFinalizing:
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
if done {
newStatus.Phase = v1alpha1.TrafficRoutingPhaseHealthy
newStatus.Message = "TrafficRouting is Healthy"
}
case v1alpha1.TrafficRoutingPhaseTerminating:
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
if done {
// remove trafficRouting finalizer
err = r.handleFinalizer(tr)
Expand Down
Loading
Loading