Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cherrypick(v0.32): feat: Add Versioned for EC2NodeClass Hash to Prevent Drift on EC2NodeClass CRD Upgrade #5929

Merged
merged 3 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/avast/retry-go v3.0.0+incompatible
github.com/aws/aws-sdk-go v1.48.0
github.com/aws/karpenter-core v0.32.8
github.com/aws/karpenter-core v0.32.9-0.20240326160553-6642110e7c0c
github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c
github.com/go-logr/zapr v1.3.0
github.com/imdario/mergo v0.3.16
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-sdk-go v1.48.0 h1:1SeJ8agckRDQvnSCt1dGZYAwUaoD2Ixj6IaXB4LCv8Q=
github.com/aws/aws-sdk-go v1.48.0/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/aws/karpenter-core v0.32.8 h1:w8XqihzF19w1P/P3+2h/DYJNX7pixNQ9QQAepHcQrSk=
github.com/aws/karpenter-core v0.32.8/go.mod h1:DY6qKd/bLlTrWFDtEdYQtapSUtCArP72DpkseN5EB8U=
github.com/aws/karpenter-core v0.32.9-0.20240326160553-6642110e7c0c h1:lrPySM7TrIohE0TfZmTQqYzsZ1SPILFPQrr3IrsgNuU=
github.com/aws/karpenter-core v0.32.9-0.20240326160553-6642110e7c0c/go.mod h1:DY6qKd/bLlTrWFDtEdYQtapSUtCArP72DpkseN5EB8U=
github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M=
github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/v1beta1/ec2nodeclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,12 @@ type EC2NodeClass struct {
IsNodeTemplate bool `json:"-" hash:"ignore"`
}

// We need to bump the EC2NodeClassHashVersion when we make an update to the EC2NodeClass CRD under these conditions:
// 1. A field changes its default value for an existing field that is already hashed
// 2. A field is added to the hash calculation with an already-set value
// 3. A field is removed from the hash calculations
const EC2NodeClassHashVersion = "v1"

func (in *EC2NodeClass) Hash() string {
return fmt.Sprint(lo.Must(hashstructure.Hash(in.Spec, hashstructure.FormatV2, &hashstructure.HashOptions{
SlicesAsSets: true,
Expand Down
3 changes: 2 additions & 1 deletion pkg/apis/v1beta1/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ var (
LabelInstanceAcceleratorName = Group + "/instance-accelerator-name"
LabelInstanceAcceleratorManufacturer = Group + "/instance-accelerator-manufacturer"
LabelInstanceAcceleratorCount = Group + "/instance-accelerator-count"
AnnotationNodeClassHash = Group + "/ec2nodeclass-hash"
AnnotationEC2NodeClassHash = Group + "/ec2nodeclass-hash"
AnnotationEC2NodeClassHashVersion = Group + "/ec2nodeclass-hash-version"
AnnotationInstanceTagged = Group + "/tagged"
)
15 changes: 14 additions & 1 deletion pkg/cloudprovider/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,26 @@ func (c *CloudProvider) areStaticFieldsDrifted(nodeClaim *corev1beta1.NodeClaim,
if nodeClaim.IsMachine {
ownerHashKey = v1alpha1.AnnotationNodeTemplateHash
} else {
ownerHashKey = v1beta1.AnnotationNodeClassHash
ownerHashKey = v1beta1.AnnotationEC2NodeClassHash
}
nodeClassHash, foundHashNodeClass := nodeClass.Annotations[ownerHashKey]
nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[ownerHashKey]
if !foundHashNodeClass || !foundHashNodeClaim {
return ""
}

if !nodeClass.IsNodeTemplate {
nodeClassHashVersion, foundNodeClassHashVersion := nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion]
nodeClaimHashVersion, foundNodeClaimHashVersion := nodeClaim.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion]
if !foundNodeClassHashVersion || !foundNodeClaimHashVersion {
return ""
}
// validate that the hash version for the EC2NodeClass is the same as the NodeClaim before evaluating for static drift
if nodeClassHashVersion != nodeClaimHashVersion {
return ""
}
}

if nodeClassHash != nodeClaimHash {
return lo.Ternary(nodeClaim.IsMachine, NodeTemplateDrift, NodeClassDrift)
}
Expand Down
64 changes: 58 additions & 6 deletions pkg/cloudprovider/nodeclaim_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
cloudProviderNodeClaim, err := cloudProvider.Create(ctx, nodeClaim)
Expect(err).To(BeNil())
Expect(cloudProviderNodeClaim).ToNot(BeNil())
_, ok := cloudProviderNodeClaim.ObjectMeta.Annotations[v1beta1.AnnotationNodeClassHash]
_, ok := cloudProviderNodeClaim.ObjectMeta.Annotations[v1beta1.AnnotationEC2NodeClassHash]
Expect(ok).To(BeTrue())
})
Context("EC2 Context", func() {
Expand Down Expand Up @@ -194,10 +194,14 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
Reservations: []*ec2.Reservation{{Instances: []*ec2.Instance{instance}}},
})
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
v1beta1.AnnotationNodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})
nodeClaim.Status.ProviderID = fake.ProviderID(lo.FromPtr(instance.InstanceId))
nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{v1beta1.AnnotationNodeClassHash: nodeClass.Hash()})
nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})
nodeClaim.Labels = lo.Assign(nodeClaim.Labels, map[string]string{v1.LabelInstanceTypeStable: selectedInstanceType.Name})
})
It("should not fail if NodeClass does not exist", func() {
Expand Down Expand Up @@ -226,7 +230,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
instance.SecurityGroups = []*ec2.GroupIdentifier{{GroupId: aws.String(fake.SecurityGroupID())}}
// Assign a fake hash
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
v1beta1.AnnotationNodeClassHash: "abcdefghijkl",
v1beta1.AnnotationEC2NodeClassHash: "abcdefghijkl",
})
ExpectApplied(ctx, env.Client, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expand Down Expand Up @@ -327,7 +331,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
Expect(isDrifted).To(BeEmpty())

Expect(mergo.Merge(nodeClass, changes, mergo.WithOverride))
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationNodeClassHash: nodeClass.Hash()})
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()})

ExpectApplied(ctx, env.Client, nodeClass)
isDrifted, err = cloudProvider.IsDrifted(ctx, nodeClaim)
Expand All @@ -350,7 +354,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
Expect(isDrifted).To(BeEmpty())

Expect(mergo.Merge(nodeClass, changes, mergo.WithOverride))
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationNodeClassHash: nodeClass.Hash()})
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()})

ExpectApplied(ctx, env.Client, nodeClass)
isDrifted, err = cloudProvider.IsDrifted(ctx, nodeClaim)
Expand All @@ -371,6 +375,54 @@ var _ = Describe("NodeClaim/CloudProvider", func() {
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if the NodeClaim's karpenter.k8s.aws/ec2nodeclass-hash-version annotation does not match the EC2NodeClass's", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-1",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-2",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if karpenter.k8s.aws/ec2nodeclass-hash-version annotation is not present on the NodeClass", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-2",
}
// should trigger drift
nodeClass.Spec.Tags = map[string]string{
"Test Key": "Test Value",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if karpenter.k8s.aws/ec2nodeclass-hash-version annotation is not present on the NodeClaim", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-1",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
}
// should trigger drift
nodeClass.Spec.Tags = map[string]string{
"Test Key": "Test Value",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
})
})
Context("Subnet Compatibility", func() {
Expand Down
51 changes: 50 additions & 1 deletion pkg/controllers/nodeclass/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,13 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClass *v1beta1.EC2NodeCl
if !nodeClass.IsNodeTemplate {
controllerutil.AddFinalizer(nodeClass, v1beta1.TerminationFinalizer)
}
if nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion] != v1beta1.EC2NodeClassHashVersion {
if err := c.updateNodeClaimHash(ctx, nodeClass); err != nil {
return reconcile.Result{}, err
}
}
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, nodeclassutil.HashAnnotation(nodeClass))

err := multierr.Combine(
c.resolveSubnets(ctx, nodeClass),
c.resolveSecurityGroups(ctx, nodeClass),
Expand Down Expand Up @@ -216,6 +222,49 @@ func (c *Controller) resolveInstanceProfile(ctx context.Context, nodeClass *v1be
return nil
}

// Updating `ec2nodeclass-hash-version` annotation inside the karpenter controller means a breaking change has been made to the hash calculation.
// `ec2nodeclass-hash` annotation on the EC2NodeClass will be updated, due to the breaking change, making the `ec2nodeclass-hash` on the NodeClaim different from
// EC2NodeClass. Since, we cannot rely on the `ec2nodeclass-hash` on the NodeClaims, due to the breaking change, we will need to re-calculate the hash and update the annotation.
// For more information on the Drift Hash Versioning: https://github.com/kubernetes-sigs/karpenter/blob/main/designs/drift-hash-versioning.md
func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) error {
if nodeClass.IsNodeTemplate {
return nil
}

ncList := &corev1beta1.NodeClaimList{}
if err := c.kubeClient.List(ctx, ncList, client.MatchingFields{"spec.nodeClassRef.name": nodeClass.Name}); err != nil {
return err
}

errs := make([]error, len(ncList.Items))
for i := range ncList.Items {
nc := ncList.Items[i]
stored := nc.DeepCopy()

if nc.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion] != v1beta1.EC2NodeClassHashVersion {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().GetCondition(corev1beta1.Drifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
})
}

if !equality.Semantic.DeepEqual(stored, nc) {
if err := c.kubeClient.Patch(ctx, &nc, client.MergeFrom(stored)); err != nil {
errs[i] = client.IgnoreNotFound(err)
}
}
}
}

return multierr.Combine(errs...)
}

var _ corecontroller.FinalizingTypedController[*v1beta1.EC2NodeClass] = (*NodeClassController)(nil)

//nolint:revive
Expand All @@ -230,7 +279,7 @@ func NewNodeClassController(kubeClient client.Client, recorder events.Recorder,
})
}

func (c *NodeClassController) Name() string {
func (c *Controller) Name() string {
return "nodeclass"
}

Expand Down
Loading
Loading