Skip to content

Commit

Permalink
Merge branch 'main' into nodeclass-validation-bypass
Browse files Browse the repository at this point in the history
  • Loading branch information
rschalo authored Dec 11, 2024
2 parents 59a9404 + ed92913 commit d2b2db7
Show file tree
Hide file tree
Showing 19 changed files with 217 additions and 192 deletions.
30 changes: 26 additions & 4 deletions .github/workflows/stale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,39 @@ jobs:
if: github.repository == 'aws/karpenter-provider-aws'
name: Stale issue bot
steps:
# PR stale-out
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'This issue has been inactive for 14 days. StaleBot will close this stale issue after 14 more days of inactivity.'
exempt-issue-labels: 'bug,chore,feature,documentation,testing,operational-excellence,automation,roadmap'
stale-issue-label: 'lifecycle/stale'
close-issue-label: 'lifecycle/closed'
only-issue-labels: 'ignore' # Ignore this step for Issues
stale-pr-message: 'This PR has been inactive for 14 days. StaleBot will close this stale PR after 14 more days of inactivity.'
exempt-pr-labels: 'blocked,needs-review,needs-design'
stale-pr-label: 'lifecycle/stale'
close-pr-label: 'lifecycle/closed'
days-before-stale: 14
days-before-close: 14
operations-per-run: 300
# Issue stale-out for "triage/needs-information"
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'This issue has been inactive for 14 days. StaleBot will close this stale issue after 14 more days of inactivity.'
only-issue-labels: 'triage/needs-information'
stale-issue-label: 'lifecycle/stale'
close-issue-label: 'lifecycle/closed'
only-pr-labels: 'ignore' # Ignore this step for PRs
days-before-stale: 14
days-before-close: 14
operations-per-run: 300
# Issue stale-out for "triage/solved"
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'This issue has been inactive for 7 days and is marked as "triage/solved". StaleBot will close this stale issue after 7 more days of inactivity.'
only-issue-labels: 'triage/solved'
stale-issue-label: 'lifecycle/stale'
close-issue-label: 'lifecycle/closed'
only-pr-labels: 'ignore' # Ignore this step for PRs
days-before-stale: 7
days-before-close: 7
operations-per-run: 300
1 change: 1 addition & 0 deletions ADOPTERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ If you are open to others contacting you about your use of Karpenter on Slack, a
| GlobalDots | Using Karpenter to scale Kubernetes clusters for a lot of our clients & for internal needs | `@vainkop` | [GlobalDots](https://globaldots.com) |
| Grafana Labs | Using Karpenter as our Autoscaling tool on EKS | `@paulajulve`, `@logyball` | [Homepage](https://grafana.com/) & [Blog](https://grafana.com/blog/2023/11/09/how-grafana-labs-switched-to-karpenter-to-reduce-costs-and-complexities-in-amazon-eks/) |
| H2O.ai | Dynamically scaling CPU and GPU nodes for AI workloads | `@Ophir Zahavi`, `@Asaf Oren` | [H2O.ai](https://h2o.ai/) |
| HENNGE K.K. | Dynamically scaling production workloads in Tokyo region | `@furqan.habibi`, `@Hans Gunawan` | [HENNGE](https://hennge.com/global/) |
| Homa | Using Karpenter to manage dynamically big instances and save cost effectively with disruptions | `@afreyermuth98`, `@alexbescond` | [Homa](https://www.homagames.com/) |
| idealo | Scaling multi-arch IPv6 clusters hosting web and event-driven applications | `@Heiko Rothe` | [Homepage](https://www.idealo.de) |
| Kaltura | Using karpenter to deliver video to millions of end users | `@Ido Ziv` | [Homepage](https://corp.kaltura.com/) |
Expand Down
1 change: 0 additions & 1 deletion pkg/apis/v1/ec2nodeclass_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ const (
ConditionTypeSubnetsReady = "SubnetsReady"
ConditionTypeSecurityGroupsReady = "SecurityGroupsReady"
ConditionTypeAMIsReady = "AMIsReady"
ConditionTypeAMIsDeprecated = "AMIsDeprecated"
ConditionTypeInstanceProfileReady = "InstanceProfileReady"
ConditionTypeValidationSucceeded = "ValidationSucceeded"
)
Expand Down
15 changes: 3 additions & 12 deletions pkg/controllers/nodeclass/status/ami.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ func (a *AMI) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) (reconc
if len(amis) == 0 {
nodeClass.Status.AMIs = nil
nodeClass.StatusConditions().SetFalse(v1.ConditionTypeAMIsReady, "AMINotFound", "AMISelector did not match any AMIs")
return reconcile.Result{}, nil
// If users omitted the necessary tags from their AMIs and add them later, we need to reprocess the information.
// Returning without a requeue here would leave the nodeclass in an unready state until the component is restarted.
return reconcile.Result{RequeueAfter: time.Minute}, nil
}
nodeClass.Status.AMIs = lo.Map(amis, func(ami amifamily.AMI, _ int) v1.AMI {
reqs := lo.Map(ami.Requirements.NodeSelectorRequirements(), func(item karpv1.NodeSelectorRequirementWithMinValues, _ int) corev1.NodeSelectorRequirement {
Expand All @@ -63,17 +65,6 @@ func (a *AMI) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) (reconc
}
})

// If deprecated AMIs are discovered set the AMIsDeprecated status condition
// If no deprecated AMIs are present, and previous status condition for AMIsDeprecated exists, remove the condition
hasDeprecatedAMIs := lo.Filter(nodeClass.Status.AMIs, func(ami v1.AMI, _ int) bool {
return ami.Deprecated
})
hasDeprecatedCondition := nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsDeprecated) != nil
if len(hasDeprecatedAMIs) > 0 {
nodeClass.StatusConditions().SetTrue(v1.ConditionTypeAMIsDeprecated)
} else if hasDeprecatedCondition {
_ = nodeClass.StatusConditions().Clear(v1.ConditionTypeAMIsDeprecated)
}
nodeClass.StatusConditions().SetTrue(v1.ConditionTypeAMIsReady)
return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil
}
3 changes: 0 additions & 3 deletions pkg/controllers/nodeclass/status/ami_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,6 @@ var _ = Describe("NodeClass AMI Status Controller", func() {
},
},
))
Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsDeprecated)).To(BeTrue())
Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsReady)).To(BeTrue())
})
It("should remove AMIDeprecated status condition when non deprecated AMIs are discovered", func() {
Expand Down Expand Up @@ -678,7 +677,6 @@ var _ = Describe("NodeClass AMI Status Controller", func() {
},
))
// Checks if both AMIsReady and AMIsDeprecated status conditions are set
Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsDeprecated)).To(BeTrue())
Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsReady)).To(BeTrue())

// rediscover AMIs again and reconcile
Expand Down Expand Up @@ -740,7 +738,6 @@ var _ = Describe("NodeClass AMI Status Controller", func() {
},
))
// Since all AMIs discovered are non deprecated, the status conditions should remove AMIsDeprecated and only set AMIsReady
Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsDeprecated)).To(BeNil())
Expect(nodeClass.StatusConditions().IsTrue(v1.ConditionTypeAMIsReady)).To(BeTrue())
})
})
Expand Down
4 changes: 3 additions & 1 deletion pkg/controllers/nodeclass/status/securitygroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ func (sg *SecurityGroup) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeCla
if len(securityGroups) == 0 && len(nodeClass.Spec.SecurityGroupSelectorTerms) > 0 {
nodeClass.Status.SecurityGroups = nil
nodeClass.StatusConditions().SetFalse(v1.ConditionTypeSecurityGroupsReady, "SecurityGroupsNotFound", "SecurityGroupSelector did not match any SecurityGroups")
return reconcile.Result{}, nil
// If users omitted the necessary tags from their SecurityGroups and add them later, we need to reprocess the information.
// Returning without a requeue here would leave the nodeclass in an unready state until the component is restarted.
return reconcile.Result{RequeueAfter: time.Minute}, nil
}
sort.Slice(securityGroups, func(i, j int) bool {
return *securityGroups[i].GroupId < *securityGroups[j].GroupId
Expand Down
4 changes: 3 additions & 1 deletion pkg/controllers/nodeclass/status/subnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ func (s *Subnet) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass) (rec
if len(subnets) == 0 {
nodeClass.Status.Subnets = nil
nodeClass.StatusConditions().SetFalse(v1.ConditionTypeSubnetsReady, "SubnetsNotFound", "SubnetSelector did not match any Subnets")
return reconcile.Result{}, nil
// If users omitted the necessary tags from their Subnets and add them later, we need to reprocess the information.
// Returning without a requeue here would leave the nodeclass in an unready state until the component is restarted.
return reconcile.Result{RequeueAfter: time.Minute}, nil
}
sort.Slice(subnets, func(i, j int) bool {
if int(*subnets[i].AvailableIpAddressCount) != int(*subnets[j].AvailableIpAddressCount) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/amifamily/ami.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func NewDefaultProvider(clk clock.Clock, versionProvider version.Provider, ssmPr
}
}

// Get Returning a list of AMIs with its associated requirements
// List returns a list of AMIs with their associated requirements.
func (p *DefaultProvider) List(ctx context.Context, nodeClass *v1.EC2NodeClass) (AMIs, error) {
p.Lock()
defer p.Unlock()
Expand Down
5 changes: 4 additions & 1 deletion pkg/providers/instancetype/zz_generated.bandwidth.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,7 @@ var (
"c7i.12xlarge": 18750,
"i4g.8xlarge": 18750,
"i4i.8xlarge": 18750,
"i8g.16xlarge": 18750,
"m6a.12xlarge": 18750,
"m6i.12xlarge": 18750,
"m6id.12xlarge": 18750,
Expand Down Expand Up @@ -689,7 +690,6 @@ var (
"i4g.16xlarge": 37500,
"i4i.16xlarge": 37500,
"i7ie.18xlarge": 37500,
"i8g.16xlarge": 37500,
"m6a.24xlarge": 37500,
"m6i.24xlarge": 37500,
"m6id.24xlarge": 37500,
Expand Down Expand Up @@ -892,5 +892,8 @@ var (
"trn1n.32xlarge": 1600000,
"p5.48xlarge": 3200000,
"p5e.48xlarge": 3200000,
"p5en.48xlarge": 3200000,
"trn2.48xlarge": 3200000,
"trn2u.48xlarge": 3200000,
}
)
6 changes: 5 additions & 1 deletion pkg/providers/pricing/zz_generated.pricing_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

package pricing

// generated at 2024-12-02T13:14:31Z for us-east-1
// generated at 2024-12-09T13:15:15Z for us-east-1

import ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"

Expand Down Expand Up @@ -167,6 +167,10 @@ var InitialOnDemandPricesAWS = map[string]map[ec2types.InstanceType]float64{
"i7ie.12xlarge": 6.237600, "i7ie.18xlarge": 9.356400, "i7ie.24xlarge": 12.475200, "i7ie.2xlarge": 1.039600,
"i7ie.3xlarge": 1.559400, "i7ie.48xlarge": 24.950400, "i7ie.6xlarge": 3.118800, "i7ie.large": 0.259900,
"i7ie.xlarge": 0.519800,
// i8g family
"i8g.12xlarge": 4.118400, "i8g.16xlarge": 5.491200, "i8g.24xlarge": 8.236800, "i8g.2xlarge": 0.686400,
"i8g.4xlarge": 1.372800, "i8g.8xlarge": 2.745600, "i8g.large": 0.171600, "i8g.metal-24xl": 9.060480,
"i8g.xlarge": 0.343200,
// im4gn family
"im4gn.16xlarge": 5.820670, "im4gn.2xlarge": 0.727580, "im4gn.4xlarge": 1.455170, "im4gn.8xlarge": 2.910340,
"im4gn.large": 0.181900, "im4gn.xlarge": 0.363790,
Expand Down
4 changes: 3 additions & 1 deletion pkg/providers/pricing/zz_generated.pricing_aws_us_gov.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

package pricing

// generated at 2024-12-02T13:14:37Z for us-east-1
// generated at 2024-12-09T13:15:20Z for us-east-1

import ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"

Expand Down Expand Up @@ -438,6 +438,8 @@ var InitialOnDemandPricesUSGov = map[string]map[ec2types.InstanceType]float64{
"u-6tb1.112xlarge": 65.433000, "u-6tb1.56xlarge": 55.610750,
// u-9tb1 family
"u-9tb1.112xlarge": 98.150000,
// u7in-24tb family
"u7in-24tb.224xlarge": 366.422000,
// x1 family
"x1.16xlarge": 8.003000, "x1.32xlarge": 16.006000,
// x1e family
Expand Down
1 change: 0 additions & 1 deletion test/suites/ami/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ var _ = Describe("AMI", func() {
Expect(len(nc.Status.AMIs)).To(BeNumerically("==", 1))
Expect(nc.Status.AMIs[0].Deprecated).To(BeTrue())
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeAMIsReady, Status: metav1.ConditionTrue})
ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeAMIsDeprecated, Status: metav1.ConditionTrue})
})
It("should prioritize launch with non-deprecated AMIs", func() {
nodeClass.Spec.AMIFamily = lo.ToPtr(v1.AMIFamilyAL2023)
Expand Down
5 changes: 2 additions & 3 deletions test/suites/drift/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ var _ = Describe("Drift", func() {

By("validating the deprecated status condition has propagated")
Eventually(func(g Gomega) {
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsDeprecated).IsTrue()).To(BeTrue())
g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).Should(Succeed())
g.Expect(nodeClass.Status.AMIs[0].Deprecated).To(BeTrue())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).IsTrue()).To(BeTrue())
}).Should(Succeed())

Expand All @@ -397,10 +398,8 @@ var _ = Describe("Drift", func() {
pod = env.EventuallyExpectHealthyPodCount(selector, numPods)[0]
env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(amdAMI))))

By("validating the deprecated status condition has been removed")
Eventually(func(g Gomega) {
g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).Should(Succeed())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsDeprecated)).To(BeNil())
g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeAMIsReady).IsTrue()).To(BeTrue())
}).Should(Succeed())
})
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cat <<EOF | envsubst | kubectl apply -f -
---
apiVersion: flowcontrol.apiserver.k8s.io/v1beta3
apiVersion: flowcontrol.apiserver.k8s.io/v1
kind: FlowSchema
metadata:
name: karpenter-leader-election
Expand All @@ -13,22 +13,23 @@ spec:
rules:
- resourceRules:
- apiGroups:
- coordination.k8s.io
- coordination.k8s.io
namespaces:
- '*'
- '*'
resources:
- leases
- leases
verbs:
- get
- create
- update
- get
- create
- update
subjects:
- kind: ServiceAccount
serviceAccount:
name: karpenter
namespace: "${KARPENTER_NAMESPACE}"
- kind: ServiceAccount
serviceAccount:
name: karpenter
namespace: "${KARPENTER_NAMESPACE}"
---
apiVersion: flowcontrol.apiserver.k8s.io/v1beta3
apiVersion: flowcontrol.apiserver.k8s.io/v1
kind: FlowSchema
metadata:
name: karpenter-workload
Expand All @@ -39,24 +40,24 @@ spec:
priorityLevelConfiguration:
name: workload-high
rules:
- nonResourceRules:
- nonResourceURLs:
- '*'
verbs:
- '*'
resourceRules:
- apiGroups:
- '*'
clusterScope: true
namespaces:
- '*'
resources:
- '*'
verbs:
- '*'
subjects:
- kind: ServiceAccount
serviceAccount:
name: karpenter
namespace: "${KARPENTER_NAMESPACE}"
- nonResourceRules:
- nonResourceURLs:
- '*'
verbs:
- '*'
resourceRules:
- apiGroups:
- '*'
clusterScope: true
namespaces:
- '*'
resources:
- '*'
verbs:
- '*'
subjects:
- kind: ServiceAccount
serviceAccount:
name: karpenter
namespace: "${KARPENTER_NAMESPACE}"
EOF
3 changes: 1 addition & 2 deletions website/content/en/preview/concepts/nodeclasses.md
Original file line number Diff line number Diff line change
Expand Up @@ -1566,8 +1566,7 @@ NodeClasses have the following status conditions:
| SubnetsReady | Subnets are discovered. |
| SecurityGroupsReady | Security Groups are discovered. |
| InstanceProfileReady | Instance Profile is discovered. |
| AMIsReady | AMIs are discovered. |
| AMIsDeprecated | AMIs are discovered, but they are deprecated. Individual deprecated AMIs can be identified by reviewing the `status.amis`. |
| AMIsReady | AMIs are discovered. |
| Ready | Top level condition that indicates if the nodeClass is ready. If any of the underlying conditions is `False` then this condition is set to `False` and `Message` on the condition indicates the dependency that was not resolved. |

If a NodeClass is not ready, NodePools that reference it through their `nodeClassRef` will not be considered for scheduling.
Loading

0 comments on commit d2b2db7

Please sign in to comment.