Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add Versioned for EC2NodeClass Hash to Prevent Drift on EC2NodeClass CRD Upgrade #5770

Merged
merged 3 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pkg/apis/v1beta1/ec2nodeclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,12 @@ type EC2NodeClass struct {
Status EC2NodeClassStatus `json:"status,omitempty"`
}

// We need to bump the EC2NodeClassHashVersion when we make an update to the EC2NodeClass CRD under these conditions:
// 1. A field changes its default value for an existing field that is already hashed
// 2. A field is added to the hash calculation with an already-set value
// 3. A field is removed from the hash calculations
const EC2NodeClassHashVersion = "v1"

func (in *EC2NodeClass) Hash() string {
return fmt.Sprint(lo.Must(hashstructure.Hash(in.Spec, hashstructure.FormatV2, &hashstructure.HashOptions{
SlicesAsSets: true,
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/v1beta1/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ var (
LabelInstanceAcceleratorManufacturer = Group + "/instance-accelerator-manufacturer"
LabelInstanceAcceleratorCount = Group + "/instance-accelerator-count"
AnnotationEC2NodeClassHash = Group + "/ec2nodeclass-hash"
AnnotationEC2NodeClassHashVersion = Group + "/ec2nodeclass-hash-version"
AnnotationInstanceTagged = Group + "/tagged"

TagNodeClaim = v1beta1.Group + "/nodeclaim"
Expand Down
5 changes: 4 additions & 1 deletion pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,10 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *corev1beta1.NodeC
return i.Name == instance.Type
})
nc := c.instanceToNodeClaim(instance, instanceType)
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()})
nc.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})
return nc, nil
}

Expand Down
13 changes: 10 additions & 3 deletions pkg/cloudprovider/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,16 @@ func (c *CloudProvider) areSecurityGroupsDrifted(ctx context.Context, ec2Instanc
}

func (c *CloudProvider) areStaticFieldsDrifted(nodeClaim *corev1beta1.NodeClaim, nodeClass *v1beta1.EC2NodeClass) cloudprovider.DriftReason {
nodeClassHash, foundHashNodeClass := nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHash]
nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[v1beta1.AnnotationEC2NodeClassHash]
if !foundHashNodeClass || !foundHashNodeClaim {
nodeClassHash, foundNodeClassHash := nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHash]
nodeClassHashVersion, foundNodeClassHashVersion := nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion]
nodeClaimHash, foundNodeClaimHash := nodeClaim.Annotations[v1beta1.AnnotationEC2NodeClassHash]
nodeClaimHashVersion, foundNodeClaimHashVersion := nodeClaim.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion]

if !foundNodeClassHash || !foundNodeClaimHash || !foundNodeClassHashVersion || !foundNodeClaimHashVersion {
return ""
}
// validate that the hash version for the EC2NodeClass is the same as the NodeClaim before evaluating for static drift
if nodeClassHashVersion != nodeClaimHashVersion {
return ""
}
return lo.Ternary(nodeClassHash != nodeClaimHash, NodeClassDrift, "")
Expand Down
62 changes: 58 additions & 4 deletions pkg/cloudprovider/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -593,10 +593,14 @@ var _ = Describe("CloudProvider", func() {
Reservations: []*ec2.Reservation{{Instances: []*ec2.Instance{instance}}},
})
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})
nodeClaim.Status.ProviderID = fake.ProviderID(lo.FromPtr(instance.InstanceId))
nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()})
nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})
nodeClaim.Labels = lo.Assign(nodeClaim.Labels, map[string]string{v1.LabelInstanceTypeStable: selectedInstanceType.Name})
})
It("should not fail if NodeClass does not exist", func() {
Expand Down Expand Up @@ -760,8 +764,58 @@ var _ = Describe("CloudProvider", func() {
Entry("Subnet Drift", v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{{Tags: map[string]string{"sn-key-1": "sn-value-1"}}}}}),
Entry("SecurityGroup Drift", v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{{Tags: map[string]string{"sg-key": "sg-value"}}}}}),
)
It("should not return drifted if karpenter.k8s.aws/nodeclass-hash annotation is not present on the NodeClaim", func() {
nodeClaim.Annotations = map[string]string{}
It("should not return drifted if karpenter.k8s.aws/ec2nodeclass-hash annotation is not present on the NodeClaim", func() {
nodeClaim.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
}
nodeClass.Spec.Tags = map[string]string{
"Test Key": "Test Value",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if the NodeClaim's karpenter.k8s.aws/ec2nodeclass-hash-version annotation does not match the EC2NodeClass's", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-1",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-2",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if karpenter.k8s.aws/ec2nodeclass-hash-version annotation is not present on the NodeClass", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-2",
}
// should trigger drift
nodeClass.Spec.Tags = map[string]string{
"Test Key": "Test Value",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim)
Expect(err).NotTo(HaveOccurred())
Expect(isDrifted).To(BeEmpty())
})
It("should not return drifted if karpenter.k8s.aws/ec2nodeclass-hash-version annotation is not present on the NodeClaim", func() {
nodeClass.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-111111",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-hash-version-1",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "test-hash-222222",
}
// should trigger drift
nodeClass.Spec.Tags = map[string]string{
"Test Key": "Test Value",
}
Expand Down
51 changes: 50 additions & 1 deletion pkg/controllers/nodeclass/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,17 @@ func NewController(kubeClient client.Client, recorder events.Recorder, subnetPro
func (c *Controller) Reconcile(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) (reconcile.Result, error) {
stored := nodeClass.DeepCopy()
controllerutil.AddFinalizer(nodeClass, v1beta1.TerminationFinalizer)
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()})

if nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion] != v1beta1.EC2NodeClassHashVersion {
if err := c.updateNodeClaimHash(ctx, nodeClass); err != nil {
return reconcile.Result{}, err
}
}
nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})

err := multierr.Combine(
c.resolveSubnets(ctx, nodeClass),
c.resolveSecurityGroups(ctx, nodeClass),
Expand Down Expand Up @@ -222,6 +232,45 @@ func (c *Controller) resolveInstanceProfile(ctx context.Context, nodeClass *v1be
return nil
}

// Updating `ec2nodeclass-hash-version` annotation inside the karpenter controller means a breaking change has been made to the hash calculation.
// `ec2nodeclass-hash` annotation on the EC2NodeClass will be updated, due to the breaking change, making the `ec2nodeclass-hash` on the NodeClaim different from
// EC2NodeClass. Since, we cannot rely on the `ec2nodeclass-hash` on the NodeClaims, due to the breaking change, we will need to re-calculate the hash and update the annotation.
// For more information on the Drift Hash Versioning: https://github.com/kubernetes-sigs/karpenter/blob/main/designs/drift-hash-versioning.md
func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) error {
ncList := &corev1beta1.NodeClaimList{}
if err := c.kubeClient.List(ctx, ncList, client.MatchingFields{"spec.nodeClassRef.name": nodeClass.Name}); err != nil {
return err
}

errs := make([]error, len(ncList.Items))
for i := range ncList.Items {
nc := ncList.Items[i]
stored := nc.DeepCopy()

if nc.Annotations[v1beta1.AnnotationEC2NodeClassHashVersion] != v1beta1.EC2NodeClassHashVersion {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().GetCondition(corev1beta1.Drifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash(),
})
}

if !equality.Semantic.DeepEqual(stored, nc) {
if err := c.kubeClient.Patch(ctx, &nc, client.MergeFrom(stored)); err != nil {
engedaam marked this conversation as resolved.
Show resolved Hide resolved
errs[i] = client.IgnoreNotFound(err)
}
}
}
}

return multierr.Combine(errs...)
}

func (c *Controller) Name() string {
return "nodeclass"
}
Expand Down
127 changes: 125 additions & 2 deletions pkg/controllers/nodeclass/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/imdario/mergo"
"github.com/samber/lo"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
_ "knative.dev/pkg/system/testing"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -833,7 +834,7 @@ var _ = Describe("NodeClassController", func() {
})
})
Context("Static Drift Hash", func() {
DescribeTable("should update the static drift hash when static field is updated", func(changes *v1beta1.EC2NodeClass) {
DescribeTable("should update the drift hash when static field is updated", func(changes *v1beta1.EC2NodeClass) {
ExpectApplied(ctx, env.Client, nodeClass)
ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
Expand All @@ -860,7 +861,7 @@ var _ = Describe("NodeClassController", func() {
Entry("MetadataOptions Drift", &v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{MetadataOptions: &v1beta1.MetadataOptions{HTTPEndpoint: aws.String("disabled")}}}),
Entry("Context Drift", &v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{Context: aws.String("context-2")}}),
)
It("should not update the static drift hash when dynamic field is updated", func() {
It("should not update the drift hash when dynamic field is updated", func() {
ExpectApplied(ctx, env.Client, nodeClass)
ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
Expand Down Expand Up @@ -889,6 +890,128 @@ var _ = Describe("NodeClassController", func() {
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
Expect(nodeClass.Annotations[v1beta1.AnnotationEC2NodeClassHash]).To(Equal(expectedHash))
})
It("should update ec2nodeclass-hash-version annotation when the ec2nodeclass-hash-version on the NodeClass does not match with the controller hash version", func() {
nodeClass.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "abceduefed",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
}
ExpectApplied(ctx, env.Client, nodeClass)

ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClass = ExpectExists(ctx, env.Client, nodeClass)

expectedHash := nodeClass.Hash()
// Expect ec2nodeclass-hash on the NodeClass to be updated
Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, expectedHash))
Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
})
It("should update ec2nodeclass-hash-versions on all NodeClaims when the ec2nodeclass-hash-version does not match with the controller hash version", func() {
nodeClass.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "abceduefed",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
}
nodeClaimOne := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "123456",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})
nodeClaimTwo := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "123456",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})

ExpectApplied(ctx, env.Client, nodeClass, nodeClaimOne, nodeClaimTwo)

ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
nodeClaimOne = ExpectExists(ctx, env.Client, nodeClaimOne)
nodeClaimTwo = ExpectExists(ctx, env.Client, nodeClaimTwo)

expectedHash := nodeClass.Hash()
// Expect ec2nodeclass-hash on the NodeClaims to be updated
Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, expectedHash))
Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, expectedHash))
Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
})
It("should not update ec2nodeclass-hash on all NodeClaims when the ec2nodeclass-hash-version matches the controller hash version", func() {
nodeClass.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "abceduefed",
v1beta1.AnnotationEC2NodeClassHashVersion: "test-version",
}
nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "1234564654",
v1beta1.AnnotationEC2NodeClassHashVersion: v1beta1.EC2NodeClassHashVersion,
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})
ExpectApplied(ctx, env.Client, nodeClass, nodeClaim)

ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

expectedHash := nodeClass.Hash()

// Expect ec2nodeclass-hash on the NodeClass to be updated
Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, expectedHash))
Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
// Expect ec2nodeclass-hash on the NodeClaims to stay the same
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, "1234564654"))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
})
It("should not update ec2nodeclass-hash on the NodeClaim if it's drifted and the ec2nodeclass-hash-version does not match the controller hash version", func() {
nodeClass.Annotations = map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "abceduefed",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
}
nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.AnnotationEC2NodeClassHash: "123456",
v1beta1.AnnotationEC2NodeClassHashVersion: "test",
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})
nodeClaim.StatusConditions().MarkTrue(corev1beta1.Drifted)
ExpectApplied(ctx, env.Client, nodeClass, nodeClaim)

ExpectReconcileSucceeded(ctx, nodeClassController, client.ObjectKeyFromObject(nodeClass))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

// Expect ec2nodeclass-hash on the NodeClaims to stay the same
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHash, "123456"))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationEC2NodeClassHashVersion, v1beta1.EC2NodeClassHashVersion))
})
})
Context("NodeClass Termination", func() {
var profileName string
Expand Down
Loading
Loading