test: add budgets replacement tests (#5602)
Co-authored-by: Jonathan Innis <[email protected]>
njtran and jonathan-innis authored Feb 5, 2024
1 parent 6bf42a2 commit ce9f728
Showing 4 changed files with 290 additions and 14 deletions.
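The new cases exercise disruption budgets when consolidation or drift must launch replacement nodes: each suite creates 5 nodes, sets a budget of 3, and verifies that at most 3 nodes are disrupted at once, so the cluster temporarily grows to 8 nodes (5 existing plus 3 replacements) before settling back to 5. As a minimal sketch, this is the budget configuration the new tests apply (taken from the suite code in this commit):

    // Allow at most 3 nodes in this NodePool to be disrupted at any one time.
    nodePool.Spec.Disruption.Budgets = []corev1beta1.Budget{{
        Nodes: "3",
    }}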
29 changes: 24 additions & 5 deletions test/pkg/environment/common/expectations.go
@@ -486,6 +486,12 @@ func (env *Environment) ExpectNodeClaimCount(comparator string, count int) {
Expect(len(nodeClaimList.Items)).To(BeNumerically(comparator, count))
}

func NodeClaimNames(nodeClaims []*corev1beta1.NodeClaim) []string {
return lo.Map(nodeClaims, func(n *corev1beta1.NodeClaim, index int) string {
return n.Name
})
}

func NodeNames(nodes []*v1.Node) []string {
return lo.Map(nodes, func(n *v1.Node, index int) string {
return n.Name
@@ -506,29 +512,30 @@ func (env *Environment) ConsistentlyExpectNodeCount(comparator string, count int

func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) (taintedNodes []*v1.Node) {
GinkgoHelper()
return env.ConsistentlyExpectDisruptingNodesWithNodeCount(0, nodeCount, duration)
return env.ConsistentlyExpectDisruptionsWithNodeCount(0, nodeCount, duration)
}

func (env *Environment) ConsistentlyExpectDisruptingNodesWithNodeCount(taintedNodeCount int, nodeCount int, duration time.Duration) (taintedNodes []*v1.Node) {
// ConsistentlyExpectDisruptionsWithNodeCount continually asserts that exactly disruptingNodes nodes are tainted as disrupting while the cluster holds totalNodes nodes in total (existing nodes plus replacements)
func (env *Environment) ConsistentlyExpectDisruptionsWithNodeCount(disruptingNodes, totalNodes int, duration time.Duration) (taintedNodes []*v1.Node) {
GinkgoHelper()
nodes := []v1.Node{}
Consistently(func(g Gomega) {
// Ensure we don't change our NodeClaims
nodeClaimList := &corev1beta1.NodeClaimList{}
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeClaimList.Items).To(HaveLen(nodeCount))
g.Expect(nodeClaimList.Items).To(HaveLen(totalNodes))

nodeList := &v1.NodeList{}
g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeList.Items).To(HaveLen(nodeCount))
g.Expect(nodeList.Items).To(HaveLen(totalNodes))

nodes = lo.Filter(nodeList.Items, func(n v1.Node, _ int) bool {
_, ok := lo.Find(n.Spec.Taints, func(t v1.Taint) bool {
return corev1beta1.IsDisruptingTaint(t)
})
return ok
})
g.Expect(nodes).To(HaveLen(taintedNodeCount))
g.Expect(nodes).To(HaveLen(disruptingNodes))
}, duration).Should(Succeed())
return lo.ToSlicePtr(nodes)
}
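// As a usage sketch, the non-empty replace tests added in this commit call the renamed helper as
//
//	nodes := env.ConsistentlyExpectDisruptionsWithNodeCount(3, 8, 5*time.Second)
//
// which asserts that exactly 3 nodes stay tainted as disrupting while the cluster holds
// 8 nodes (5 existing plus 3 replacements), consistently for 5 seconds.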
@@ -556,6 +563,18 @@ func (env *Environment) EventuallyExpectNodesUntaintedWithTimeout(timeout time.D
}).WithTimeout(timeout).Should(Succeed())
}

func (env *Environment) EventuallyExpectNodeClaimCount(comparator string, count int) []*corev1beta1.NodeClaim {
GinkgoHelper()
By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count))
nodeClaimList := &corev1beta1.NodeClaimList{}
Eventually(func(g Gomega) {
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(len(nodeClaimList.Items)).To(BeNumerically(comparator, count),
fmt.Sprintf("expected %d nodeclaims, had %d (%v)", count, len(nodeClaimList.Items), NodeClaimNames(lo.ToSlicePtr(nodeClaimList.Items))))
}).Should(Succeed())
return lo.ToSlicePtr(nodeClaimList.Items)
}

func (env *Environment) EventuallyExpectNodeCount(comparator string, count int) []*v1.Node {
GinkgoHelper()
By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count))
109 changes: 105 additions & 4 deletions test/suites/consolidation/suite_test.go
@@ -126,7 +126,7 @@ var _ = Describe("Consolidation", func() {

// Ensure that we get two nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptingNodesWithNodeCount(2, 5, time.Second*5)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 5, 5*time.Second)

// Remove the finalizer from each node so that we can terminate
for _, node := range nodes {
@@ -139,7 +139,7 @@

// This check ensures that we are consolidating nodes at the same time
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptingNodesWithNodeCount(2, 3, time.Second*5)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
@@ -206,16 +206,117 @@ var _ = Describe("Consolidation", func() {
nodePool.Spec.Disruption.ConsolidateAfter = nil
env.ExpectUpdated(nodePool)

// Ensure that we get two nodes tainted, and they have overlap during the drift
// Ensure that we get two nodes tainted, and they have overlap during consolidation
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptingNodesWithNodeCount(2, 3, time.Second*5)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
env.EventuallyExpectNotFound(nodes[0], nodes[1])
env.ExpectNodeCount("==", 1)
})
It("should respect budgets for non-empty replace consolidation", func() {
appLabels := map[string]string{"app": "large-app"}
// This test will hold consolidation until we are ready to execute it
nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{}

nodePool = test.ReplaceRequirements(nodePool,
v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceSize,
Operator: v1.NodeSelectorOpIn,
Values: []string{"xlarge", "2xlarge"},
},
// Add an Exists operator so that we can select on a fake partition later
v1.NodeSelectorRequirement{
Key: "test-partition",
Operator: v1.NodeSelectorOpExists,
},
)
nodePool.Labels = appLabels
// We're expecting to create 5 nodes, so with a budget of 3 we should see at most 3 nodes disrupting at one time.
nodePool.Spec.Disruption.Budgets = []corev1beta1.Budget{{
Nodes: "3",
}}

ds := test.DaemonSet(test.DaemonSetOptions{
Selector: appLabels,
PodOptions: test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: appLabels,
},
// Each 2xlarge has 8 cpu, so each node should fit no more than 1 deployment pod, since each node
// will also run an equivalently sized daemonset pod.
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("3"),
},
},
},
})

env.ExpectCreated(ds)

// Make 5 pods all with different deployments and different test partitions, so that each pod can be put
// on a separate node.
selector = labels.SelectorFromSet(appLabels)
numPods = 5
deployments := make([]*appsv1.Deployment, numPods)
for i := range lo.Range(int(numPods)) {
deployments[i] = test.Deployment(test.DeploymentOptions{
Replicas: 1,
PodOptions: test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: appLabels,
},
NodeSelector: map[string]string{"test-partition": fmt.Sprintf("%d", i)},
// Each 2xlarge has 8 cpu, so each node should fit no more than 1 deployment pod, since each node
// will also run an equivalently sized daemonset pod.
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("3"),
},
},
},
})
}

env.ExpectCreated(nodeClass, nodePool, deployments[0], deployments[1], deployments[2], deployments[3], deployments[4])

env.EventuallyExpectCreatedNodeClaimCount("==", 5)
nodes := env.EventuallyExpectCreatedNodeCount("==", 5)
// Check that all daemonsets and deployment pods are online
env.EventuallyExpectHealthyPodCount(selector, int(numPods)*2)

By("cordoning and adding finalizer to the nodes")
// Add a finalizer to each node so that we can stop termination disruptions
for _, node := range nodes {
Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
node.Finalizers = append(node.Finalizers, common.TestingFinalizer)
env.ExpectUpdated(node)
}

// Delete the daemonset so that the nodes can be consolidated to smaller size
env.ExpectDeleted(ds)
// Check that only the deployment pods remain online now that the daemonset is deleted
env.EventuallyExpectHealthyPodCount(selector, int(numPods))

By("enabling consolidation")
nodePool.Spec.Disruption.ConsolidateAfter = nil
env.ExpectUpdated(nodePool)

// Ensure that we get three nodes tainted, and that they overlap during the consolidation
env.EventuallyExpectTaintedNodeCount("==", 3)
env.EventuallyExpectNodeClaimCount("==", 8)
env.EventuallyExpectNodeCount("==", 8)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(3, 8, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
env.EventuallyExpectNotFound(nodes[0], nodes[1], nodes[2])
env.ExpectNodeCount("==", 5)
})
It("should not allow consolidation if the budget is fully blocking", func() {
// We're going to define a budget that doesn't allow any consolidation to happen
nodePool.Spec.Disruption.Budgets = []corev1beta1.Budget{{
82 changes: 80 additions & 2 deletions test/suites/drift/suite_test.go
@@ -161,7 +161,7 @@ var _ = Describe("Drift", func() {

// Ensure that we get two nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptingNodesWithNodeCount(2, 3, 5*time.Second)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

// Remove the finalizer from each node so that we can terminate
for _, node := range nodes {
@@ -253,7 +253,7 @@ var _ = Describe("Drift", func() {

// Ensure that we get two nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptingNodesWithNodeCount(2, 3, time.Minute)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 30*time.Second)

By("removing the finalizer from the nodes")
Expect(env.ExpectTestingFinalizerRemoved(nodes[0])).To(Succeed())
@@ -263,6 +263,84 @@
// the node should be gone
env.EventuallyExpectNotFound(nodes[0], nodes[1])
})
It("should respect budgets for non-empty replace drift", func() {
appLabels := map[string]string{"app": "large-app"}

nodePool = coretest.ReplaceRequirements(nodePool,
v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceSize,
Operator: v1.NodeSelectorOpIn,
Values: []string{"xlarge"},
},
// Add an Exists operator so that we can select on a fake partition later
v1.NodeSelectorRequirement{
Key: "test-partition",
Operator: v1.NodeSelectorOpExists,
},
)
nodePool.Labels = appLabels
// We're expecting to create 5 nodes, so with a budget of 3 we should see at most 3 nodes disrupting at one time.
nodePool.Spec.Disruption.Budgets = []corev1beta1.Budget{{
Nodes: "3",
}}

// Make 5 pods all with different deployments and different test partitions, so that each pod can be put
// on a separate node.
selector = labels.SelectorFromSet(appLabels)
numPods = 5
deployments := make([]*appsv1.Deployment, numPods)
for i := range lo.Range(numPods) {
deployments[i] = coretest.Deployment(coretest.DeploymentOptions{
Replicas: 1,
PodOptions: coretest.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: appLabels,
},
NodeSelector: map[string]string{"test-partition": fmt.Sprintf("%d", i)},
// Each xlarge has 4 cpu, so each node should fit no more than 1 pod.
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("3"),
},
},
},
})
}

env.ExpectCreated(nodeClass, nodePool, deployments[0], deployments[1], deployments[2], deployments[3], deployments[4])

env.EventuallyExpectCreatedNodeClaimCount("==", 5)
nodes := env.EventuallyExpectCreatedNodeCount("==", 5)
// Check that all deployment pods are online
env.EventuallyExpectHealthyPodCount(selector, numPods)

By("cordoning and adding finalizer to the nodes")
// Add a finalizer to each node so that we can stop termination disruptions
for _, node := range nodes {
Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
node.Finalizers = append(node.Finalizers, common.TestingFinalizer)
env.ExpectUpdated(node)
}

// Check that all deployment pods are still online
env.EventuallyExpectHealthyPodCount(selector, numPods)

By("drifting the nodepool")
nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{"test-annotation": "drift"})
env.ExpectUpdated(nodePool)

// Ensure that we get three nodes tainted, and that they overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 3)
env.EventuallyExpectNodeClaimCount("==", 8)
env.EventuallyExpectNodeCount("==", 8)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(3, 8, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
env.EventuallyExpectNotFound(nodes[0], nodes[1], nodes[2])
env.ExpectNodeCount("==", 5)
})
It("should not allow drift if the budget is fully blocking", func() {
// We're going to define a budget that doesn't allow any drift to happen
nodePool.Spec.Disruption.Budgets = []corev1beta1.Budget{{