From 72e1c0b936c8eb7c55de4f68c0afce751b998cfe Mon Sep 17 00:00:00 2001 From: Wei-Xiang Sun Date: Wed, 12 Jul 2023 17:52:13 +0800 Subject: [PATCH] Advanced deployment scale down old unhealthy pods firstly (#150) * advanced deployment scale down old unhealthy pods firstly Signed-off-by: mingzhou.swx * add e2e for advanced deployment scale down old unhealthy pod first Signed-off-by: mingzhou.swx --------- Signed-off-by: mingzhou.swx Co-authored-by: mingzhou.swx --- pkg/controller/deployment/sync.go | 24 ++++++++++++++++++++++++ test/e2e/deployment_test.go | 21 +++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/pkg/controller/deployment/sync.go b/pkg/controller/deployment/sync.go index 9b821e96..afa6c4df 100644 --- a/pkg/controller/deployment/sync.go +++ b/pkg/controller/deployment/sync.go @@ -302,6 +302,30 @@ func (dc *DeploymentController) scale(ctx context.Context, deployment *apps.Depl // replica sets. deploymentReplicasToAdd := allowedSize - allRSsReplicas + // Scale down the unhealthy replicas in old replica sets firstly to avoid some bad cases. + // For example: + // _______________________________________________________________________________________ + // | ReplicaSet | oldRS-1 | oldRS-2 | newRS | + // | --------------| -------------------|----------------------|--------------------------| + // | Replicas | 4 healthy Pods | 2 unhealthy Pods | 4 unhealthy Pods | + // --------------------------------------------------------------------------------------- + // If we want to scale down these replica sets from 10 to 6, we expect to scale down the oldRS-2 + // from 2 to 0 firstly, then scale down oldRS-1 1 Pod and newRS 1 Pod based on proportion. + // + // We do not scale down the newRS unhealthy Pods with higher priority, because these new revision + // Pods may be just created, not the one with the crash or other problems. + var err error + var cleanupCount int32 + if deploymentReplicasToAdd < 0 { + oldRSs, cleanupCount, err = dc.cleanupUnhealthyReplicas(ctx, oldRSs, deployment, -deploymentReplicasToAdd) + if err != nil { + return err + } + klog.V(4).Infof("Cleaned up unhealthy replicas from old RSes by %d during scaling", cleanupCount) + deploymentReplicasToAdd += cleanupCount + allRSs = deploymentutil.FilterActiveReplicaSets(append(oldRSs, newRS)) + } + // The additional replicas should be distributed proportionally amongst the active // replica sets from the larger to the smaller in size replica set. Scaling direction // drives what happens in case we are trying to scale replica sets of the same size. diff --git a/test/e2e/deployment_test.go b/test/e2e/deployment_test.go index a6a12cda..b857f929 100644 --- a/test/e2e/deployment_test.go +++ b/test/e2e/deployment_test.go @@ -44,7 +44,7 @@ var _ = SIGDescribe("Advanced Deployment", func() { return k8sClient.Get(context.TODO(), key, object) } - UpdateDeployment := func(deployment *apps.Deployment, version string) *apps.Deployment { + UpdateDeployment := func(deployment *apps.Deployment, version string, images ...string) *apps.Deployment { By(fmt.Sprintf("update deployment %v to version: %v", client.ObjectKeyFromObject(deployment), version)) var clone *apps.Deployment Expect(retry.RetryOnConflict(defaultRetry, func() error { @@ -53,7 +53,11 @@ var _ = SIGDescribe("Advanced Deployment", func() { if err != nil { return err } - clone.Spec.Template.Spec.Containers[0].Image = deployment.Spec.Template.Spec.Containers[0].Image + if len(images) == 0 { + clone.Spec.Template.Spec.Containers[0].Image = deployment.Spec.Template.Spec.Containers[0].Image + } else { + clone.Spec.Template.Spec.Containers[0].Image = images[0] + } clone.Spec.Template.Spec.Containers[0].Env[0].Value = version strategy := unmarshal(clone.Annotations[rolloutsv1alpha1.DeploymentStrategyAnnotation]) strategy.Paused = true @@ -324,6 +328,19 @@ var _ = SIGDescribe("Advanced Deployment", func() { UpdatePartitionWithCheck(deployment, intstr.FromInt(3)) UpdatePartitionWithCheck(deployment, intstr.FromInt(5)) }) + + It("scale down old unhealthy first", func() { + deployment := &apps.Deployment{} + deployment.Namespace = namespace + Expect(ReadYamlToObject("./test_data/deployment/deployment.yaml", deployment)).ToNot(HaveOccurred()) + deployment.Annotations["rollouts.kruise.io/deployment-strategy"] = `{"rollingUpdate":{"maxUnavailable":0,"maxSurge":1}}` + CreateObject(deployment) + UpdateDeployment(deployment, "version2", "busybox:not-exists") + UpdatePartitionWithoutCheck(deployment, intstr.FromInt(1)) + CheckReplicas(deployment, 6, 5, 1) + UpdateDeployment(deployment, "version3", "busybox:1.32") + CheckReplicas(deployment, 5, 5, 0) + }) }) })