diff --git a/.pipelines/run-autoscaling-e2e.yml b/.pipelines/run-autoscaling-e2e.yml
index 3b656902b2..4204e59c19 100644
--- a/.pipelines/run-autoscaling-e2e.yml
+++ b/.pipelines/run-autoscaling-e2e.yml
@@ -29,7 +29,7 @@ stages:
           - template: ./cnm-image.yml
       - job: create_aks_cluster_and_run_tests_job
         dependsOn: [build_push_ccm_image_job, build_push_cnm_image_job]
-        timeoutInMinutes: 300
+        timeoutInMinutes: 340
         steps:
           - task: GoTool@0
             inputs:
diff --git a/.pipelines/run-autoscaling-multipool-e2e.yml b/.pipelines/run-autoscaling-multipool-e2e.yml
index 72e1daffa4..55c5718854 100644
--- a/.pipelines/run-autoscaling-multipool-e2e.yml
+++ b/.pipelines/run-autoscaling-multipool-e2e.yml
@@ -29,7 +29,7 @@ stages:
           - template: ./cnm-image.yml
       - job: create_aks_cluster_and_run_tests_job
         dependsOn: [build_push_ccm_image_job, build_push_cnm_image_job]
-        timeoutInMinutes: 300
+        timeoutInMinutes: 340
         steps:
           - task: GoTool@0
             inputs:
diff --git a/tests/e2e/autoscaling/autoscaler.go b/tests/e2e/autoscaling/autoscaler.go
index cdd5693990..fc7e56aea9 100644
--- a/tests/e2e/autoscaling/autoscaler.go
+++ b/tests/e2e/autoscaling/autoscaler.go
@@ -140,7 +140,11 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(err).NotTo(HaveOccurred())
 
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 
 	It("should scale up, deploy a statefulset with disks attached, scale down, and certain pods + disks should be evicted to a new node", func() {
@@ -154,6 +158,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}()
 		Expect(err).NotTo(HaveOccurred())
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Deploying a StatefulSet")
 		statefulSetManifest := createStatefulSetWithPVCManifest(basename+"-statefulset", int32(2), map[string]string{"app": basename + "-statefulset"})
@@ -183,6 +189,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}
 
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Waiting for certain StatefulSet's pods + disks to be evicted to new nodes")
 		err = waitForStatefulSetComplete(cs, ns, statefulSet)
@@ -247,6 +255,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}()
 		Expect(err).NotTo(HaveOccurred())
 		waitForScaleUpToComplete(cs, ns, len(nodes)+10)
+		err = utils.HoldAutoScaleNodes(cs, len(nodes)+10)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Checking the balancing state of the node groups")
 		nodes, err = utils.GetAgentNodes(cs)
@@ -256,6 +266,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(isBalance).To(BeTrue())
 
 		waitForScaleDownToComplete(cs, ns, initNodeCount, scaleUpDeployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 
 	It("should support one node pool with slow scaling", Label(utils.TestSuiteLabelSingleNodePool), func() {
@@ -413,7 +425,11 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(err).NotTo(HaveOccurred())
 
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 })
diff --git a/tests/e2e/utils/node_utils.go b/tests/e2e/utils/node_utils.go
index 5631358633..341ea18bab 100644
--- a/tests/e2e/utils/node_utils.go
+++ b/tests/e2e/utils/node_utils.go
@@ -221,6 +221,7 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 	poll := 60 * time.Second
 	autoScaleTimeOut := 90 * time.Minute
 	nodeConditions := map[string][]v1.NodeCondition{}
+	previousNodeCount := -1
 	if err = wait.PollImmediate(poll, autoScaleTimeOut, func() (bool, error) {
 		nodes, err = GetAgentNodes(cs)
 		if err != nil {
@@ -237,13 +238,31 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 		for _, node := range nodes {
 			nodeConditions[node.Name] = node.Status.Conditions
 		}
+
 		Logf("Detect %v nodes, target %v", len(nodes), targetNodeCount)
-		if len(nodes) > targetNodeCount && !isScaleDown {
+
+		// Overscaling validation
+		if isScaleDown && len(nodes) < targetNodeCount {
+			Logf("error: less nodes than expected, Node conditions: %v", nodeConditions)
+			return false, fmt.Errorf("there are less nodes than expected")
+		} else if !isScaleDown && len(nodes) > targetNodeCount {
 			Logf("error: more nodes than expected, Node conditions: %v", nodeConditions)
-			err = fmt.Errorf("there are more nodes than expected")
-			return false, err
+			return false, fmt.Errorf("there are more nodes than expected")
 		}
-		return (targetNodeCount > len(nodes) && isScaleDown) || targetNodeCount == len(nodes), nil
+
+		// Monotonic autoscaling progress validation
+		if previousNodeCount != -1 {
+			if isScaleDown && previousNodeCount < len(nodes) {
+				Logf("error: unexpected scale up while expecting scale down, Node conditions: %v", nodeConditions)
+				return false, fmt.Errorf("unexpected scale up while expecting scale down")
+			} else if !isScaleDown && previousNodeCount > len(nodes) {
+				Logf("error: unexpected scale down while expecting scale up, Node conditions: %v", nodeConditions)
+				return false, fmt.Errorf("unexpected scale down while expecting scale up")
+			}
+		}
+		previousNodeCount = len(nodes)
+
+		return len(nodes) == targetNodeCount, nil
 	}); errors.Is(err, wait.ErrWaitTimeout) {
 		Logf("Node conditions: %v", nodeConditions)
 		return fmt.Errorf("Fail to get target node count in limited time")
@@ -252,6 +271,45 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 	return err
 }
 
+// HoldAutoScaleNodes verifies that the node count stays at targetNodeCount for a few minutes
+func HoldAutoScaleNodes(cs clientset.Interface, targetNodeCount int) error {
+	Logf("checking node count stability... Target node count: %v", targetNodeCount)
+	var nodes []v1.Node
+	var err error
+	poll := 60 * time.Second
+	checkDuration := 5 * time.Minute
+	nodeConditions := map[string][]v1.NodeCondition{}
+	if err = wait.PollImmediate(poll, checkDuration, func() (bool, error) {
+		nodes, err = GetAgentNodes(cs)
+		if err != nil {
+			if IsRetryableAPIError(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		if nodes == nil {
+			err = fmt.Errorf("Unexpected nil node list")
+			return false, err
+		}
+		nodeConditions = map[string][]v1.NodeCondition{}
+		for _, node := range nodes {
+			nodeConditions[node.Name] = node.Status.Conditions
+		}
+
+		if len(nodes) != targetNodeCount {
+			Logf("error: unexpected node count changes, Node conditions: %v", nodeConditions)
+			return false, fmt.Errorf("unexpected node count changes")
+		}
+
+		return false, nil
+	}); errors.Is(err, wait.ErrWaitTimeout) {
+		// Survived the whole check window without a node count change; clear the timeout error
+		err = nil
+	}
+	Logf("Node conditions: %v", nodeConditions)
+	return err
+}
+
 // IsControlPlaneNode returns true if the node has a control-plane role label.
 // The control-plane role is determined by looking for:
 // * a node-role.kubernetes.io/control-plane or node-role.kubernetes.io/master="" label