Skip to content

Commit

Permalink
ci: Fix E2E test flakes (#7827)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Mar 3, 2025
1 parent ee54205 commit 51e7f86
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 37 deletions.
24 changes: 11 additions & 13 deletions pkg/controllers/nodeclass/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ import (
"github.com/mitchellh/hashstructure/v2"
"github.com/patrickmn/go-cache"
"github.com/samber/lo"

"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -153,8 +151,8 @@ func (v *Validation) validateCreateFleetAuthorization(
_ *karpv1.NodeClaim,
tags map[string]string,
) (reason string, err error) {
createFleetInput := instance.GetCreateFleetInput(nodeClass, string(karpv1.CapacityTypeOnDemand), tags, mockLaunchTemplateConfig())
createFleetInput.DryRun = aws.Bool(true)
createFleetInput := instance.GetCreateFleetInput(nodeClass, karpv1.CapacityTypeOnDemand, tags, mockLaunchTemplateConfig())
createFleetInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateFleet(ctx, createFleetInput); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
Expand All @@ -173,7 +171,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
tags map[string]string,
) (reason string, err error) {
createLaunchTemplateInput := launchtemplate.GetCreateLaunchTemplateInput(ctx, mockOptions(*nodeClaim, nodeClass, tags), corev1.IPv4Protocol, "")
createLaunchTemplateInput.DryRun = aws.Bool(true)
createLaunchTemplateInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateLaunchTemplate(ctx, createLaunchTemplateInput); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
Expand Down Expand Up @@ -207,16 +205,16 @@ func (v *Validation) validateRunInstancesAuthorization(

runInstancesInput := &ec2.RunInstancesInput{
DryRun: lo.ToPtr(true),
MaxCount: aws.Int32(1),
MinCount: aws.Int32(1),
MaxCount: lo.ToPtr[int32](1),
MinCount: lo.ToPtr[int32](1),
InstanceType: instanceType,
MetadataOptions: &ec2types.InstanceMetadataOptionsRequest{
HttpEndpoint: ec2types.InstanceMetadataEndpointState(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPEndpoint)),
HttpTokens: ec2types.HttpTokensState(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPTokens)),
HttpProtocolIpv6: ec2types.InstanceMetadataProtocolState(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPProtocolIPv6)),
//aws sdk v2 changed this type to *int32 instead of *int64
//nolint: gosec
HttpPutResponseHopLimit: aws.Int32(int32(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPPutResponseHopLimit))),
HttpPutResponseHopLimit: lo.ToPtr(int32(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPPutResponseHopLimit))),
},
TagSpecifications: []ec2types.TagSpecification{
{
Expand Down Expand Up @@ -290,18 +288,18 @@ func mockLaunchTemplateConfig() []ec2types.FleetLaunchTemplateConfigRequest {
return []ec2types.FleetLaunchTemplateConfigRequest{
{
LaunchTemplateSpecification: &ec2types.FleetLaunchTemplateSpecificationRequest{
LaunchTemplateName: aws.String("mock-lt-name"),
LaunchTemplateId: aws.String("lt-1234567890abcdef0"),
Version: aws.String("1"),
LaunchTemplateName: lo.ToPtr("mock-lt-name"),
LaunchTemplateId: lo.ToPtr("lt-1234567890abcdef0"),
Version: lo.ToPtr("1"),
},
Overrides: []ec2types.FleetLaunchTemplateOverridesRequest{
{
InstanceType: ec2types.InstanceTypeT3Micro,
SubnetId: aws.String("subnet-1234567890abcdef0"),
SubnetId: lo.ToPtr("subnet-1234567890abcdef0"),
},
{
InstanceType: ec2types.InstanceTypeT3Small,
SubnetId: aws.String("subnet-1234567890abcdef1"),
SubnetId: lo.ToPtr("subnet-1234567890abcdef1"),
},
},
},
Expand Down
2 changes: 1 addition & 1 deletion test/pkg/debug/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (c *PodController) Register(_ context.Context, m manager.Manager) error {
},
},
predicate.NewPredicateFuncs(func(o client.Object) bool {
return o.GetNamespace() != "kube-system"
return o.GetNamespace() != "kube-system" && o.GetNamespace() != "prometheus"
}),
)).
WithOptions(controller.Options{MaxConcurrentReconciles: 10, SkipNameValidation: lo.ToPtr(true)}).
Expand Down
4 changes: 2 additions & 2 deletions test/pkg/environment/common/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,13 +691,13 @@ func (env *Environment) EventuallyExpectNodesUntaintedWithTimeout(timeout time.D
}).WithTimeout(timeout).Should(Succeed())
}

func (env *Environment) EventuallyExpectNodeClaimCount(comparator string, count int) []*karpv1.NodeClaim {
func (env *Environment) EventuallyExpectLaunchedNodeClaimCount(comparator string, count int) []*karpv1.NodeClaim {
GinkgoHelper()
By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count))
nodeClaimList := &karpv1.NodeClaimList{}
Eventually(func(g Gomega) {
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(len(nodeClaimList.Items)).To(BeNumerically(comparator, count),
g.Expect(lo.CountBy(nodeClaimList.Items, func(nc karpv1.NodeClaim) bool { return nc.StatusConditions().IsTrue(karpv1.ConditionTypeLaunched) })).To(BeNumerically(comparator, count),
fmt.Sprintf("expected %d nodeclaims, had %d (%v)", count, len(nodeClaimList.Items), NodeClaimNames(lo.ToSlicePtr(nodeClaimList.Items))))
}).Should(Succeed())
return lo.ToSlicePtr(nodeClaimList.Items)
Expand Down
6 changes: 3 additions & 3 deletions test/suites/consolidation/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ var _ = Describe("Consolidation", Ordered, func() {

// Ensure that we get three nodes tainted, and they have overlap during the consolidation
env.EventuallyExpectTaintedNodeCount("==", 3)
env.EventuallyExpectNodeClaimCount("==", 8)
env.EventuallyExpectLaunchedNodeClaimCount("==", 8)
env.EventuallyExpectNodeCount("==", 8)

env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute)
Expand Down Expand Up @@ -925,7 +925,7 @@ var _ = Describe("Consolidation", Ordered, func() {
Replicas: 1,
})
env.ExpectCreated(nodePool, nodeClass, dep)
env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectNodeClaimCount("==", 1)...)
env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectLaunchedNodeClaimCount("==", 1)...)
n := env.EventuallyExpectNodeCount("==", int(1))[0]
Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Large)))
Expect(n.Labels).To(HaveKeyWithValue(karpv1.CapacityTypeLabelKey, karpv1.CapacityTypeOnDemand))
Expand Down Expand Up @@ -969,7 +969,7 @@ var _ = Describe("Consolidation", Ordered, func() {
// Start by only enabling the m5.xlarge capacity reservation, ensuring it's provisioned
nodeClass.Spec.CapacityReservationSelectorTerms = []v1.CapacityReservationSelectorTerm{{ID: xlargeCapacityReservationID}}
env.ExpectCreated(nodePool, nodeClass, dep)
env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectNodeClaimCount("==", 1)...)
env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectLaunchedNodeClaimCount("==", 1)...)
n := env.EventuallyExpectNodeCount("==", int(1))[0]
Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Xlarge)))
Expect(n.Labels).To(HaveKeyWithValue(karpv1.CapacityTypeLabelKey, karpv1.CapacityTypeReserved))
Expand Down
4 changes: 2 additions & 2 deletions test/suites/drift/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,7 @@ var _ = Describe("Drift", Ordered, func() {
},
})
env.ExpectCreated(nodePool, nodeClass, pod)
nc := env.EventuallyExpectNodeClaimCount("==", 1)[0]
nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0]
env.EventuallyExpectNodeClaimsReady(nc)
n := env.EventuallyExpectCreatedNodeCount("==", 1)[0]
Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Large)))
Expand Down Expand Up @@ -1037,7 +1037,7 @@ var _ = Describe("Drift", Ordered, func() {
})
env.ExpectCreated(nodePool, nodeClass, pod)

nc := env.EventuallyExpectNodeClaimCount("==", 1)[0]
nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0]
req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool {
return req.Key == v1.LabelCapacityReservationID
})
Expand Down
4 changes: 2 additions & 2 deletions test/suites/integration/instance_profile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ var _ = Describe("InstanceProfile Generation", func() {
nodeClass.Spec.Role = fmt.Sprintf("KarpenterNodeRole-%s", "invalidRole")
env.ExpectCreated(nodeClass)
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeInstanceProfileReady, Status: metav1.ConditionUnknown})
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeValidationSucceeded, Status: metav1.ConditionFalse})
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionFalse})
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeValidationSucceeded, Status: metav1.ConditionUnknown})
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionUnknown})
})
})
22 changes: 10 additions & 12 deletions test/suites/integration/nodeclass_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"

Expand Down Expand Up @@ -54,29 +55,26 @@ var _ = Describe("NodeClass IAM Permissions", func() {
}`, action)

deployment := &appsv1.Deployment{}
err := env.Client.Get(env.Context, types.NamespacedName{
Expect(env.Client.Get(env.Context, types.NamespacedName{
Namespace: "kube-system",
Name: "karpenter",
}, deployment)
Expect(err).To(BeNil())
}, deployment)).To(Succeed())

sa := &corev1.ServiceAccount{}
err = env.Client.Get(env.Context, types.NamespacedName{
Expect(env.Client.Get(env.Context, types.NamespacedName{
Namespace: "kube-system",
Name: deployment.Spec.Template.Spec.ServiceAccountName,
}, sa)
Expect(err).To(BeNil())
}, sa)).To(Succeed())

roleName = strings.Split(sa.Annotations["eks.amazonaws.com/role-arn"], "/")[1]
policyName = fmt.Sprintf("TestPolicy-%s", uuid.New().String())

_, err = env.IAMAPI.PutRolePolicy(env.Context, &iam.PutRolePolicyInput{
_, err := env.IAMAPI.PutRolePolicy(env.Context, &iam.PutRolePolicyInput{
RoleName: aws.String(roleName),
PolicyName: aws.String(policyName),
PolicyDocument: aws.String(policyDoc),
})
Expect(err).To(BeNil())

DeferCleanup(func() {
_, err := env.IAMAPI.DeleteRolePolicy(env.Context, &iam.DeleteRolePolicyInput{
RoleName: aws.String(roleName),
Expand All @@ -87,10 +85,10 @@ var _ = Describe("NodeClass IAM Permissions", func() {

env.ExpectCreated(nodeClass)
Eventually(func(g Gomega) {
env.ExpectUpdated(nodeClass)
g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsFalse()).To(BeTrue())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).Reason).To(Equal(expectedMessage))
}, "240s", "5s").Should(Succeed())
}).Should(Succeed())
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionFalse, Message: "ValidationSucceeded=False"})
},
Entry("should fail when CreateFleet is denied",
Expand All @@ -107,8 +105,8 @@ var _ = Describe("NodeClass IAM Permissions", func() {
It("should succeed with all required permissions", func() {
env.ExpectCreated(nodeClass)
Eventually(func(g Gomega) {
env.ExpectUpdated(nodeClass)
g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue())
}, "60s", "5s").Should(Succeed())
}).Should(Succeed())
})
})
4 changes: 2 additions & 2 deletions test/suites/scheduling/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
})
env.ExpectCreated(nodePool, nodeClass, pod)

nc := env.EventuallyExpectNodeClaimCount("==", 1)[0]
nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0]
req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool {
return req.Key == v1.LabelCapacityReservationID
})
Expand Down Expand Up @@ -808,7 +808,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
env.ExpectCreated(nodePool, nodeClass, pods[0], pods[1])

reservedCount := 0
for _, nc := range env.EventuallyExpectNodeClaimCount("==", 2) {
for _, nc := range env.EventuallyExpectLaunchedNodeClaimCount("==", 2) {
req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool {
return req.Key == v1.LabelCapacityReservationID
})
Expand Down

0 comments on commit 51e7f86

Please sign in to comment.