test: improve autoscaling E2E to prevent unexpected scale operations #5576

Merged
2 changes: 1 addition & 1 deletion .pipelines/run-autoscaling-e2e.yml
@@ -29,7 +29,7 @@ stages:
   - template: ./cnm-image.yml
   - job: create_aks_cluster_and_run_tests_job
     dependsOn: [build_push_ccm_image_job, build_push_cnm_image_job]
-    timeoutInMinutes: 300
+    timeoutInMinutes: 340
     steps:
     - task: GoTool@0
       inputs:
2 changes: 1 addition & 1 deletion .pipelines/run-autoscaling-multipool-e2e.yml
@@ -29,7 +29,7 @@ stages:
   - template: ./cnm-image.yml
   - job: create_aks_cluster_and_run_tests_job
     dependsOn: [build_push_ccm_image_job, build_push_cnm_image_job]
-    timeoutInMinutes: 300
+    timeoutInMinutes: 340
     steps:
     - task: GoTool@0
      inputs:
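The only pipeline change, applied to both the single-pool and multi-pool autoscaling pipelines, is the job timeout bump from 300 to 340 minutes. The diff does not state the rationale, but it lines up with the test changes below: eight new utils.HoldAutoScaleNodes calls are added, each of which can poll for up to 5 minutes, so the suites may need roughly 40 extra minutes in the worst case.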
16 changes: 16 additions & 0 deletions tests/e2e/autoscaling/autoscaler.go
@@ -140,7 +140,11 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(err).NotTo(HaveOccurred())
 
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 
 	It("should scale up, deploy a statefulset with disks attached, scale down, and certain pods + disks should be evicted to a new node", func() {
@@ -154,6 +158,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}()
 		Expect(err).NotTo(HaveOccurred())
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Deploying a StatefulSet")
 		statefulSetManifest := createStatefulSetWithPVCManifest(basename+"-statefulset", int32(2), map[string]string{"app": basename + "-statefulset"})
@@ -183,6 +189,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}
 
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Waiting for certain StatefulSet's pods + disks to be evicted to new nodes")
 		err = waitForStatefulSetComplete(cs, ns, statefulSet)
@@ -247,6 +255,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		}()
 		Expect(err).NotTo(HaveOccurred())
 		waitForScaleUpToComplete(cs, ns, len(nodes)+10)
+		err = utils.HoldAutoScaleNodes(cs, len(nodes)+10)
+		Expect(err).NotTo(HaveOccurred())
 
 		By("Checking the balancing state of the node groups")
 		nodes, err = utils.GetAgentNodes(cs)
@@ -256,6 +266,8 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(isBalance).To(BeTrue())
 
 		waitForScaleDownToComplete(cs, ns, initNodeCount, scaleUpDeployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 
 	It("should support one node pool with slow scaling", Label(utils.TestSuiteLabelSingleNodePool), func() {
@@ -413,7 +425,11 @@ var _ = Describe("Cluster size autoscaler", Label(utils.TestSuiteLabelFeatureAut
 		Expect(err).NotTo(HaveOccurred())
 
 		waitForScaleUpToComplete(cs, ns, initNodeCount+1)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount+1)
+		Expect(err).NotTo(HaveOccurred())
 		waitForScaleDownToComplete(cs, ns, initNodeCount, deployment)
+		err = utils.HoldAutoScaleNodes(cs, initNodeCount)
+		Expect(err).NotTo(HaveOccurred())
 	})
 })
 
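Every affected test case now follows the same pattern: each waitForScaleUpToComplete or waitForScaleDownToComplete is immediately followed by a utils.HoldAutoScaleNodes assertion on the same target count, so a cluster that keeps scaling past the expected size, or flaps back, fails the test right away instead of corrupting a later step.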
66 changes: 62 additions & 4 deletions tests/e2e/utils/node_utils.go
@@ -221,6 +221,7 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 	poll := 60 * time.Second
 	autoScaleTimeOut := 90 * time.Minute
 	nodeConditions := map[string][]v1.NodeCondition{}
+	previousNodeCount := -1
 	if err = wait.PollImmediate(poll, autoScaleTimeOut, func() (bool, error) {
 		nodes, err = GetAgentNodes(cs)
 		if err != nil {
@@ -237,13 +238,31 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 		for _, node := range nodes {
 			nodeConditions[node.Name] = node.Status.Conditions
 		}
+
 		Logf("Detect %v nodes, target %v", len(nodes), targetNodeCount)
-		if len(nodes) > targetNodeCount && !isScaleDown {
+
+		// Overscaling validation
+		if isScaleDown && len(nodes) < targetNodeCount {
+			Logf("error: less nodes than expected, Node conditions: %v", nodeConditions)
+			return false, fmt.Errorf("there are less nodes than expected")
+		} else if !isScaleDown && len(nodes) > targetNodeCount {
 			Logf("error: more nodes than expected, Node conditions: %v", nodeConditions)
-			err = fmt.Errorf("there are more nodes than expected")
-			return false, err
+			return false, fmt.Errorf("there are more nodes than expected")
 		}
-		return (targetNodeCount > len(nodes) && isScaleDown) || targetNodeCount == len(nodes), nil
+
+		// Monotonous autoscaling progress validation
+		if previousNodeCount != -1 {
+			if isScaleDown && previousNodeCount < len(nodes) {
+				Logf("error: unexpected scale up while expecting scale down, Node conditions: %v", nodeConditions)
+				return false, fmt.Errorf("unexpected scale up while expecting scale down")
+			} else if !isScaleDown && previousNodeCount > len(nodes) {
+				Logf("error: unexpected scale down while expecting scale up, Node conditions: %v", nodeConditions)
+				return false, fmt.Errorf("unexpected scale down while expecting scale up")
+			}
+		}
+		previousNodeCount = len(nodes)
+
+		return len(nodes) == targetNodeCount, nil
 	}); errors.Is(err, wait.ErrWaitTimeout) {
 		Logf("Node conditions: %v", nodeConditions)
 		return fmt.Errorf("Fail to get target node count in limited time")
@@ -252,6 +271,45 @@ func WaitAutoScaleNodes(cs clientset.Interface, targetNodeCount int, isScaleDown
 	return err
 }
 
+// HoldAutoScaleNodes validate node count to not change for few minutes
+func HoldAutoScaleNodes(cs clientset.Interface, targetNodeCount int) error {
+	Logf(fmt.Sprintf("checking node count stability... Target node count: %v", targetNodeCount))
+	var nodes []v1.Node
+	var err error
+	poll := 60 * time.Second
+	checkDuration := 5 * time.Minute
+	nodeConditions := map[string][]v1.NodeCondition{}
+	if err = wait.PollImmediate(poll, checkDuration, func() (bool, error) {
+		nodes, err = GetAgentNodes(cs)
+		if err != nil {
+			if IsRetryableAPIError(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		if nodes == nil {
+			err = fmt.Errorf("Unexpected nil node list")
+			return false, err
+		}
+		nodeConditions = map[string][]v1.NodeCondition{}
+		for _, node := range nodes {
+			nodeConditions[node.Name] = node.Status.Conditions
+		}
+
+		if len(nodes) != targetNodeCount {
+			Logf("error: unexpected node count changes, Node conditions: %v", nodeConditions)
+			return false, fmt.Errorf("unexpected node count changes")
+		}
+
+		return false, nil
+	}); errors.Is(err, wait.ErrWaitTimeout) {
+		// Survived
+		err = nil
+	}
+	Logf("Node conditions: %v", nodeConditions)
+	return err
+}
+
 // IsControlPlaneNode returns true if the node has a control-plane role label.
 // The control-plane role is determined by looking for:
 // * a node-role.kubernetes.io/control-plane or node-role.kubernetes.io/master="" label
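A note on the new helper's control flow: the poll condition in HoldAutoScaleNodes never returns true, so the only successful outcome is exhausting the 5-minute window and hitting wait.ErrWaitTimeout, which the helper then swallows (the "Survived" branch). The sketch below is a self-contained illustration of that inverted-polling pattern; holdStable, getCount, and the toy durations are illustrative names, not part of the PR.

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// holdStable polls getCount every interval for checkDuration and fails fast if the
// observed count ever deviates from want. Reaching the timeout without a deviation
// is the success case, mirroring how HoldAutoScaleNodes treats wait.ErrWaitTimeout.
// The real helper additionally records node conditions and tolerates retryable API errors.
func holdStable(getCount func() (int, error), want int, interval, checkDuration time.Duration) error {
	err := wait.PollImmediate(interval, checkDuration, func() (bool, error) {
		got, err := getCount()
		if err != nil {
			return false, err
		}
		if got != want {
			return false, fmt.Errorf("unexpected node count change: got %d, want %d", got, want)
		}
		// Never report "done": the loop only ends on a deviation (error) or on timeout (stable).
		return false, nil
	})
	if errors.Is(err, wait.ErrWaitTimeout) {
		// Survived the whole window without the count changing.
		return nil
	}
	return err
}

func main() {
	// Toy usage: a fake counter that always reports 3 nodes, checked once per second for 5 seconds.
	getCount := func() (int, error) { return 3, nil }
	if err := holdStable(getCount, 3, time.Second, 5*time.Second); err != nil {
		fmt.Println("node count changed:", err)
		return
	}
	fmt.Println("node count held steady")
}
```

Swallowing only the timeout error keeps the contract simple: a genuine count change surfaces immediately through the returned error, while any other failure from the polling machinery still propagates to the caller.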