From 4965d7cacc5df29cb9c17ceff928bc8b97dc2610 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Wed, 14 Aug 2024 15:22:54 -0700 Subject: [PATCH 01/21] remove alpha refs from troubleshooting --- website/content/en/docs/troubleshooting.md | 132 ++++------------- website/content/en/preview/troubleshooting.md | 134 ++++-------------- website/content/en/v1.0/troubleshooting.md | 134 ++++-------------- 3 files changed, 83 insertions(+), 317 deletions(-) diff --git a/website/content/en/docs/troubleshooting.md b/website/content/en/docs/troubleshooting.md index df04ea95b956..c51d4f341ef5 100644 --- a/website/content/en/docs/troubleshooting.md +++ b/website/content/en/docs/troubleshooting.md @@ -75,12 +75,12 @@ If a long cluster name causes the Karpenter node role name to exceed 64 characte Keep in mind that `KarpenterNodeRole-` is just a recommendation from the getting started guide. Instead of using the eksctl role, you can shorten the name to anything you like, as long as it has the right permissions. -### Unknown field in Provisioner spec +### Unknown field in NodePool or EC2NodeClass spec If you are upgrading from an older version of Karpenter, there may have been changes in the CRD between versions. Attempting to utilize newer functionality which is surfaced in newer versions of the CRD may result in the following error message: ``` -error: error validating "STDIN": error validating data: ValidationError(Provisioner.spec): unknown field "" in sh.karpenter.v1alpha5.Provisioner.spec; if you choose to ignore these errors, turn validation off with --validate=false +Error from server (BadRequest): error when creating "STDIN": NodePool in version "v1" cannot be handled as a NodePool: strict decoding error: unknown field "spec.template.spec.nodeClassRef.foo" ``` If you see this error, you can solve the problem by following the [Custom Resource Definition Upgrade Guidance](../upgrade-guide/#custom-resource-definition-crd-upgrades). @@ -91,11 +91,10 @@ Info on whether there has been a change to the CRD between versions of Karpenter `0.16.0` changed the default replicas from 1 to 2. -Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/provisioner-name DoesNotExist requirement`) +Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/nodepool DoesNotExist requirement`) so it can't provision for the second Karpenter pod. -To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter -(these are instances with the name `karpenter.sh/provisioner-name/`) to run both pods. +To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter to run both pods. To do so on AWS increase the `minimum` and `desired` parameters on the node group autoscaling group to launch at lease 2 instances. @@ -144,52 +143,6 @@ You can fix this by patching the node objects: kubectl get nodes -ojsonpath='{range .items[*].metadata}{@.name}:{@.finalizers}{"\n"}' | grep "karpenter.sh/termination" | cut -d ':' -f 1 | xargs kubectl patch node --type='json' -p='[{"op": "remove", "path": "/metadata/finalizers"}]' ``` -## Webhooks - -### Failed calling webhook "validation.webhook.provisioners.karpenter.sh" - -If you are not able to create a provisioner due to `Internal error occurred: failed calling webhook "validation.webhook.provisioners.karpenter.sh":` - -Webhooks were renamed in `0.19.0`. 
There's a bug in ArgoCD's upgrade workflow where webhooks are leaked. This results in Provisioner's failing to be validated, since the validation server no longer corresponds to the webhook definition. - -Delete the stale webhooks. - -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.provisioners.karpenter.sh -kubectl delete validatingwebhookconfiguration validation.webhook.provisioners.karpenter.sh -``` - -### Failed calling webhook "defaulting.webhook.karpenter.sh" - -The `defaulting.webhook.karpenter.sh` mutating webhook was removed in `0.27.3`. If you are coming from an older version of Karpenter where this webhook existed and the webhook was not managed by Helm, you may need to delete the stale webhook. - -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.karpenter.sh -``` - -If you are not able to create a provisioner due to `Error from server (InternalError): error when creating "provisioner.yaml": Internal error occurred: failed calling webhook "defaulting.webhook.karpenter.sh": Post "https://karpenter-webhook.karpenter.svc:443/default-resource?timeout=10s": context deadline exceeded` - -Verify that the karpenter pod is running (should see 2/2 containers with a "Ready" status) - -```text -kubectl get po -A -l app.kubernetes.io/name=karpenter -NAME READY STATUS RESTARTS AGE -karpenter-7b46fb5c-gcr9z 2/2 Running 0 17h -``` - -Karpenter service has endpoints assigned to it - -```text -kubectl get ep -A -l app.kubernetes.io/name=karpenter -NAMESPACE NAME ENDPOINTS AGE -karpenter karpenter 192.168.39.88:8443,192.168.39.88:8080 16d -``` - -Your security groups are not blocking you from reaching your webhook. - -This is especially relevant if you have used `terraform-eks-module` version `>=18` since that version changed its security -approach, and now it's much more restrictive. - ## Provisioning ### Instances with swap volumes fail to register with control plane @@ -201,7 +154,7 @@ Some instance types (c1.medium and m1.small) are given limited amount of memory ``` ##### Solutions -Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the Provisioner requirements to use larger instance types. +Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the NodePool requirements to use larger instance types. ### DaemonSets can result in deployment failures @@ -209,7 +162,7 @@ For Karpenter versions `0.5.3` and earlier, DaemonSets were not properly conside This sometimes caused nodes to be deployed that could not meet the needs of the requested DaemonSets and workloads. This issue no longer occurs after Karpenter version `0.5.3` (see [PR #1155](https://github.com/aws/karpenter/pull/1155)). -If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your provisioner to only use larger instance types that you know will be big enough for the DaemonSet and the workload. +If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your NodePool to only use larger instance types that you know will be big enough for the DaemonSet and the workload. For more information, see [Issue #1084](https://github.com/aws/karpenter/issues/1084). Examples of this behavior are included in [Issue #1180](https://github.com/aws/karpenter/issues/1180). 
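For illustration, the larger-instance-type workaround described above could be expressed against the current NodePool API roughly as follows. This is a minimal sketch: the `karpenter.k8s.aws/instance-size` values shown are assumptions and should be chosen based on the actual footprint of your DaemonSets and workloads.

```yaml
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        # Assumption: excluding the smallest sizes leaves enough headroom for DaemonSets
        - key: karpenter.k8s.aws/instance-size
          operator: NotIn
          values: ["nano", "micro", "small", "medium"]
```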
@@ -224,55 +177,24 @@ This behavior is not unique to Karpenter and can also occur with the standard `k To prevent this, you can set LimitRanges on pod deployments on a per-namespace basis. See the Karpenter [Best Practices Guide](https://aws.github.io/aws-eks-best-practices/karpenter/#use-limitranges-to-configure-defaults-for-resource-requests-and-limits) for further information on the use of LimitRanges. -### Missing subnetSelector and securityGroupSelector tags causes provisioning failures - -Starting with Karpenter `0.5.5`, if you are using Karpenter-generated launch template, provisioners require that [subnetSelector]({{}}) and [securityGroupSelector]({{}}) tags be set to match your cluster. -The [Provisioner]({{}}) section in the Karpenter Getting Started Guide uses the following example: - -```text -kind: AWSNodeTemplate -spec: - subnetSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} - securityGroupSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} -``` - -To check your subnet and security group selectors, type the following: - -```bash -aws ec2 describe-subnets --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns subnets matching the selector* - -```bash -aws ec2 describe-security-groups --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns security groups matching the selector* - -Provisioners created without those tags and run in more recent Karpenter versions will fail with this message when you try to run the provisioner: - -```text - field(s): spec.provider.securityGroupSelector, spec.provider.subnetSelector -``` - ### Pods using Security Groups for Pods stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running" -When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). +When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". +This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. +More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). -To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter Provisioner spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). +To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter NodePool spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). 
```yaml -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +apiVersion: karpenter.sh/v1 +kind: NodePool metadata: name: default spec: - labels: - vpc.amazonaws.com/has-trunk-attached: "false" - ttlSecondsAfterEmpty: 30 + template + metadata: + labels: + vpc.amazonaws.com/has-trunk-attached: "false" ``` ### Pods using PVCs can hit volume limits and fail to scale-up @@ -341,19 +263,19 @@ Note that Karpenter is not aware if [Security Groups per Pod](https://aws.github To avoid this discrepancy between `maxPods` and the supported pod density of the EC2 instance based on ENIs and allocatable IPs, you can perform one of the following actions on your cluster: 1. Enable [Prefix Delegation](https://www.eksworkshop.com/docs/networking/prefix/) to increase the number of allocatable IPs for the ENIs on each instance type -2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your Provisioner -3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. +2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your NodePods +3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. -For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). +For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). #### IP exhaustion in a subnet -When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`AWSNodeTemplate`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. +When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`EC2NodeClass`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. ##### Solutions 1. Use `topologySpreadConstraints` on `topology.kubernetes.io/zone` to spread your pods and nodes more evenly across zones -2. Increase the IP address space (CIDR) for the subnets selected by your `AWSNodeTemplate` +2. Increase the IP address space (CIDR) for the subnets selected by your `EC2NodeClass` 3. Use [custom networking](https://www.eksworkshop.com/docs/networking/custom-networking/) to assign separate IP address spaces to your pods and your nodes 4. [Run your EKS cluster on IPv6](https://aws.github.io/aws-eks-best-practices/networking/ipv6/) (Note: IPv6 clusters have some known limitations which should be well-understood before choosing to use one) @@ -479,7 +401,7 @@ Karpenter determines node initialization using three factors: 1. Node readiness 2. Expected resources are registered -3. Provisioner startup taints are removed +3. 
NodePool startup taints are removed #### Node Readiness @@ -496,9 +418,9 @@ Common resources that don't register and leave nodes in a non-initialized state: 1. `nvidia.com/gpu` (or any gpu-based resource): A GPU instance type that supports the `nvidia.com/gpu` resource is launched but the daemon/daemonset to register the resource on the node doesn't exist 2. `vpc.amazonaws.com/pod-eni`: An instance type is launched by the `ENABLE_POD_ENI` value is set to `false` in the `vpc-cni` plugin. Karpenter will expect that the `vpc.amazonaws.com/pod-eni` will be registered, but it never will. -#### Provisioner startup taints are removed +#### NodePool startup taints are removed -Karpenter expects all startup taints specified in `.spec.startupTaints` of the provisioner to be completely removed from node `.spec.taints` before it will consider the node initialized. +Karpenter expects all startup taints specified in `.spec.template.spec.startupTaints` of the NodePool to be completely removed from node `.spec.taints` before it will consider the node initialized. ### Node NotReady @@ -513,7 +435,7 @@ The easiest way to start debugging is to connect to the instance and get the Kub ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -526,7 +448,7 @@ For Bottlerocket, you'll need to get access to the root filesystem: ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -613,7 +535,7 @@ This means that your CNI plugin is out of date. You can find instructions on how ### Node terminates before ready on failed encrypted EBS volume If you are using a custom launch template and an encrypted EBS volume, the IAM principal launching the node may not have sufficient permissions to use the KMS customer managed key (CMK) for the EC2 EBS root volume. -This issue also applies to [Block Device Mappings]({{}}) specified in the Provisioner. +This issue also applies to [Block Device Mappings]({{}}) specified in the EC2NodeClass. In either case, this results in the node terminating almost immediately upon creation. Keep in mind that it is possible that EBS Encryption can be enabled without your knowledge. diff --git a/website/content/en/preview/troubleshooting.md b/website/content/en/preview/troubleshooting.md index 2627508f2954..c51d4f341ef5 100644 --- a/website/content/en/preview/troubleshooting.md +++ b/website/content/en/preview/troubleshooting.md @@ -75,12 +75,12 @@ If a long cluster name causes the Karpenter node role name to exceed 64 characte Keep in mind that `KarpenterNodeRole-` is just a recommendation from the getting started guide. Instead of using the eksctl role, you can shorten the name to anything you like, as long as it has the right permissions. -### Unknown field in Provisioner spec +### Unknown field in NodePool or EC2NodeClass spec If you are upgrading from an older version of Karpenter, there may have been changes in the CRD between versions. 
Attempting to utilize newer functionality which is surfaced in newer versions of the CRD may result in the following error message: ``` -error: error validating "STDIN": error validating data: ValidationError(Provisioner.spec): unknown field "" in sh.karpenter.v1alpha5.Provisioner.spec; if you choose to ignore these errors, turn validation off with --validate=false +Error from server (BadRequest): error when creating "STDIN": NodePool in version "v1" cannot be handled as a NodePool: strict decoding error: unknown field "spec.template.spec.nodeClassRef.foo" ``` If you see this error, you can solve the problem by following the [Custom Resource Definition Upgrade Guidance](../upgrade-guide/#custom-resource-definition-crd-upgrades). @@ -91,11 +91,10 @@ Info on whether there has been a change to the CRD between versions of Karpenter `0.16.0` changed the default replicas from 1 to 2. -Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/provisioner-name DoesNotExist requirement`) +Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/nodepool DoesNotExist requirement`) so it can't provision for the second Karpenter pod. -To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter -(these are instances with the name `karpenter.sh/provisioner-name/`) to run both pods. +To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter to run both pods. To do so on AWS increase the `minimum` and `desired` parameters on the node group autoscaling group to launch at lease 2 instances. @@ -144,52 +143,6 @@ You can fix this by patching the node objects: kubectl get nodes -ojsonpath='{range .items[*].metadata}{@.name}:{@.finalizers}{"\n"}' | grep "karpenter.sh/termination" | cut -d ':' -f 1 | xargs kubectl patch node --type='json' -p='[{"op": "remove", "path": "/metadata/finalizers"}]' ``` -## Webhooks - -### Failed calling webhook "validation.webhook.provisioners.karpenter.sh" - -If you are not able to create a provisioner due to `Internal error occurred: failed calling webhook "validation.webhook.provisioners.karpenter.sh":` - -Webhooks were renamed in `0.19.0`. There's a bug in ArgoCD's upgrade workflow where webhooks are leaked. This results in Provisioner's failing to be validated, since the validation server no longer corresponds to the webhook definition. - -Delete the stale webhooks. - -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.provisioners.karpenter.sh -kubectl delete validatingwebhookconfiguration validation.webhook.provisioners.karpenter.sh -``` - -### Failed calling webhook "defaulting.webhook.karpenter.sh" - -The `defaulting.webhook.karpenter.sh` mutating webhook was removed in `0.27.3`. If you are coming from an older version of Karpenter where this webhook existed and the webhook was not managed by Helm, you may need to delete the stale webhook. 
- -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.karpenter.sh -``` - -If you are not able to create a provisioner due to `Error from server (InternalError): error when creating "provisioner.yaml": Internal error occurred: failed calling webhook "defaulting.webhook.karpenter.sh": Post "https://karpenter-webhook.karpenter.svc:443/default-resource?timeout=10s": context deadline exceeded` - -Verify that the karpenter pod is running (should see 2/2 containers with a "Ready" status) - -```text -kubectl get po -A -l app.kubernetes.io/name=karpenter -NAME READY STATUS RESTARTS AGE -karpenter-7b46fb5c-gcr9z 2/2 Running 0 17h -``` - -Karpenter service has endpoints assigned to it - -```text -kubectl get ep -A -l app.kubernetes.io/name=karpenter -NAMESPACE NAME ENDPOINTS AGE -karpenter karpenter 192.168.39.88:8443,192.168.39.88:8080 16d -``` - -Your security groups are not blocking you from reaching your webhook. - -This is especially relevant if you have used `terraform-eks-module` version `>=18` since that version changed its security -approach, and now it's much more restrictive. - ## Provisioning ### Instances with swap volumes fail to register with control plane @@ -201,7 +154,7 @@ Some instance types (c1.medium and m1.small) are given limited amount of memory ``` ##### Solutions -Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the Provisioner requirements to use larger instance types. +Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the NodePool requirements to use larger instance types. ### DaemonSets can result in deployment failures @@ -209,7 +162,7 @@ For Karpenter versions `0.5.3` and earlier, DaemonSets were not properly conside This sometimes caused nodes to be deployed that could not meet the needs of the requested DaemonSets and workloads. This issue no longer occurs after Karpenter version `0.5.3` (see [PR #1155](https://github.com/aws/karpenter/pull/1155)). -If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your provisioner to only use larger instance types that you know will be big enough for the DaemonSet and the workload. +If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your NodePool to only use larger instance types that you know will be big enough for the DaemonSet and the workload. For more information, see [Issue #1084](https://github.com/aws/karpenter/issues/1084). Examples of this behavior are included in [Issue #1180](https://github.com/aws/karpenter/issues/1180). @@ -224,55 +177,24 @@ This behavior is not unique to Karpenter and can also occur with the standard `k To prevent this, you can set LimitRanges on pod deployments on a per-namespace basis. See the Karpenter [Best Practices Guide](https://aws.github.io/aws-eks-best-practices/karpenter/#use-limitranges-to-configure-defaults-for-resource-requests-and-limits) for further information on the use of LimitRanges. -### Missing subnetSelector and securityGroupSelector tags causes provisioning failures - -Starting with Karpenter `0.5.5`, if you are using Karpenter-generated launch template, provisioners require that [subnetSelector]({{}}) and [securityGroupSelector]({{}}) tags be set to match your cluster. 
-The [Provisioner]({{}}) section in the Karpenter Getting Started Guide uses the following example: - -```text -kind: AWSNodeTemplate -spec: - subnetSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} - securityGroupSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} -``` - -To check your subnet and security group selectors, type the following: - -```bash -aws ec2 describe-subnets --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns subnets matching the selector* - -```bash -aws ec2 describe-security-groups --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns security groups matching the selector* - -Provisioners created without those tags and run in more recent Karpenter versions will fail with this message when you try to run the provisioner: - -```text - field(s): spec.provider.securityGroupSelector, spec.provider.subnetSelector -``` - ### Pods using Security Groups for Pods stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running" -When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). +When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". +This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. +More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). -To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter Provisioner spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). +To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter NodePool spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). ```yaml -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +apiVersion: karpenter.sh/v1 +kind: NodePool metadata: name: default spec: - labels: - vpc.amazonaws.com/has-trunk-attached: "false" - ttlSecondsAfterEmpty: 30 + template + metadata: + labels: + vpc.amazonaws.com/has-trunk-attached: "false" ``` ### Pods using PVCs can hit volume limits and fail to scale-up @@ -329,7 +251,7 @@ time=2023-06-12T19:18:15Z type=Warning reason=FailedCreatePodSandBox from=kubele By default, the number of pods on a node is limited by both the number of networking interfaces (ENIs) that may be attached to an instance type and the number of IP addresses that can be assigned to each ENI. 
See [IP addresses per network interface per instance type](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#AvailableIpPerENI) for a more detailed information on these instance types' limits. -If the max-pods (configured through your Provisioner [`kubeletConfiguration`]({{}})) is greater than the number of supported IPs for a given instance type, the CNI will fail to assign an IP to the pod and your pod will be left in a `ContainerCreating` state. +If the max-pods (configured through your EC2NodeClass [`kubeletConfiguration`]({{}})) is greater than the number of supported IPs for a given instance type, the CNI will fail to assign an IP to the pod and your pod will be left in a `ContainerCreating` state. If you've enabled [Security Groups per Pod](https://aws.github.io/aws-eks-best-practices/networking/sgpp/), one of the instance's ENIs is reserved as the trunk interface and uses branch interfaces off of that trunk interface to assign different security groups. If you do not have any `SecurityGroupPolicies` configured for your pods, they will be unable to utilize branch interfaces attached to the trunk interface, and IPs will only be available from the non-trunk ENIs. @@ -341,19 +263,19 @@ Note that Karpenter is not aware if [Security Groups per Pod](https://aws.github To avoid this discrepancy between `maxPods` and the supported pod density of the EC2 instance based on ENIs and allocatable IPs, you can perform one of the following actions on your cluster: 1. Enable [Prefix Delegation](https://www.eksworkshop.com/docs/networking/prefix/) to increase the number of allocatable IPs for the ENIs on each instance type -2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your Provisioner -3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. +2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your NodePods +3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. -For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). +For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). #### IP exhaustion in a subnet -When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`AWSNodeTemplate`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. +When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`EC2NodeClass`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. ##### Solutions 1. 
Use `topologySpreadConstraints` on `topology.kubernetes.io/zone` to spread your pods and nodes more evenly across zones -2. Increase the IP address space (CIDR) for the subnets selected by your `AWSNodeTemplate` +2. Increase the IP address space (CIDR) for the subnets selected by your `EC2NodeClass` 3. Use [custom networking](https://www.eksworkshop.com/docs/networking/custom-networking/) to assign separate IP address spaces to your pods and your nodes 4. [Run your EKS cluster on IPv6](https://aws.github.io/aws-eks-best-practices/networking/ipv6/) (Note: IPv6 clusters have some known limitations which should be well-understood before choosing to use one) @@ -479,7 +401,7 @@ Karpenter determines node initialization using three factors: 1. Node readiness 2. Expected resources are registered -3. Provisioner startup taints are removed +3. NodePool startup taints are removed #### Node Readiness @@ -496,9 +418,9 @@ Common resources that don't register and leave nodes in a non-initialized state: 1. `nvidia.com/gpu` (or any gpu-based resource): A GPU instance type that supports the `nvidia.com/gpu` resource is launched but the daemon/daemonset to register the resource on the node doesn't exist 2. `vpc.amazonaws.com/pod-eni`: An instance type is launched by the `ENABLE_POD_ENI` value is set to `false` in the `vpc-cni` plugin. Karpenter will expect that the `vpc.amazonaws.com/pod-eni` will be registered, but it never will. -#### Provisioner startup taints are removed +#### NodePool startup taints are removed -Karpenter expects all startup taints specified in `.spec.startupTaints` of the provisioner to be completely removed from node `.spec.taints` before it will consider the node initialized. +Karpenter expects all startup taints specified in `.spec.template.spec.startupTaints` of the NodePool to be completely removed from node `.spec.taints` before it will consider the node initialized. ### Node NotReady @@ -513,7 +435,7 @@ The easiest way to start debugging is to connect to the instance and get the Kub ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -526,7 +448,7 @@ For Bottlerocket, you'll need to get access to the root filesystem: ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -613,7 +535,7 @@ This means that your CNI plugin is out of date. You can find instructions on how ### Node terminates before ready on failed encrypted EBS volume If you are using a custom launch template and an encrypted EBS volume, the IAM principal launching the node may not have sufficient permissions to use the KMS customer managed key (CMK) for the EC2 EBS root volume. -This issue also applies to [Block Device Mappings]({{}}) specified in the Provisioner. +This issue also applies to [Block Device Mappings]({{}}) specified in the EC2NodeClass. In either case, this results in the node terminating almost immediately upon creation. Keep in mind that it is possible that EBS Encryption can be enabled without your knowledge. 
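As a hedged sketch of the Block Device Mappings case mentioned above, an EC2NodeClass that references a customer managed key might look roughly like the following. The device name, sizes, and key ARN are placeholders, and the IAM principal launching nodes still needs permission to use that key.

```yaml
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  blockDeviceMappings:
    - deviceName: /dev/xvda  # assumption: root device for your AMI family
      ebs:
        volumeSize: 100Gi
        volumeType: gp3
        encrypted: true
        kmsKeyID: arn:aws:kms:us-west-2:111122223333:key/placeholder  # placeholder CMK ARN
```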
diff --git a/website/content/en/v1.0/troubleshooting.md b/website/content/en/v1.0/troubleshooting.md index 2627508f2954..c51d4f341ef5 100644 --- a/website/content/en/v1.0/troubleshooting.md +++ b/website/content/en/v1.0/troubleshooting.md @@ -75,12 +75,12 @@ If a long cluster name causes the Karpenter node role name to exceed 64 characte Keep in mind that `KarpenterNodeRole-` is just a recommendation from the getting started guide. Instead of using the eksctl role, you can shorten the name to anything you like, as long as it has the right permissions. -### Unknown field in Provisioner spec +### Unknown field in NodePool or EC2NodeClass spec If you are upgrading from an older version of Karpenter, there may have been changes in the CRD between versions. Attempting to utilize newer functionality which is surfaced in newer versions of the CRD may result in the following error message: ``` -error: error validating "STDIN": error validating data: ValidationError(Provisioner.spec): unknown field "" in sh.karpenter.v1alpha5.Provisioner.spec; if you choose to ignore these errors, turn validation off with --validate=false +Error from server (BadRequest): error when creating "STDIN": NodePool in version "v1" cannot be handled as a NodePool: strict decoding error: unknown field "spec.template.spec.nodeClassRef.foo" ``` If you see this error, you can solve the problem by following the [Custom Resource Definition Upgrade Guidance](../upgrade-guide/#custom-resource-definition-crd-upgrades). @@ -91,11 +91,10 @@ Info on whether there has been a change to the CRD between versions of Karpenter `0.16.0` changed the default replicas from 1 to 2. -Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/provisioner-name DoesNotExist requirement`) +Karpenter won't launch capacity to run itself (log related to the `karpenter.sh/nodepool DoesNotExist requirement`) so it can't provision for the second Karpenter pod. -To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter -(these are instances with the name `karpenter.sh/provisioner-name/`) to run both pods. +To solve this you can either reduce the replicas back from 2 to 1, or ensure there is enough capacity that isn't being managed by Karpenter to run both pods. To do so on AWS increase the `minimum` and `desired` parameters on the node group autoscaling group to launch at lease 2 instances. @@ -144,52 +143,6 @@ You can fix this by patching the node objects: kubectl get nodes -ojsonpath='{range .items[*].metadata}{@.name}:{@.finalizers}{"\n"}' | grep "karpenter.sh/termination" | cut -d ':' -f 1 | xargs kubectl patch node --type='json' -p='[{"op": "remove", "path": "/metadata/finalizers"}]' ``` -## Webhooks - -### Failed calling webhook "validation.webhook.provisioners.karpenter.sh" - -If you are not able to create a provisioner due to `Internal error occurred: failed calling webhook "validation.webhook.provisioners.karpenter.sh":` - -Webhooks were renamed in `0.19.0`. There's a bug in ArgoCD's upgrade workflow where webhooks are leaked. This results in Provisioner's failing to be validated, since the validation server no longer corresponds to the webhook definition. - -Delete the stale webhooks. 
- -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.provisioners.karpenter.sh -kubectl delete validatingwebhookconfiguration validation.webhook.provisioners.karpenter.sh -``` - -### Failed calling webhook "defaulting.webhook.karpenter.sh" - -The `defaulting.webhook.karpenter.sh` mutating webhook was removed in `0.27.3`. If you are coming from an older version of Karpenter where this webhook existed and the webhook was not managed by Helm, you may need to delete the stale webhook. - -```text -kubectl delete mutatingwebhookconfigurations defaulting.webhook.karpenter.sh -``` - -If you are not able to create a provisioner due to `Error from server (InternalError): error when creating "provisioner.yaml": Internal error occurred: failed calling webhook "defaulting.webhook.karpenter.sh": Post "https://karpenter-webhook.karpenter.svc:443/default-resource?timeout=10s": context deadline exceeded` - -Verify that the karpenter pod is running (should see 2/2 containers with a "Ready" status) - -```text -kubectl get po -A -l app.kubernetes.io/name=karpenter -NAME READY STATUS RESTARTS AGE -karpenter-7b46fb5c-gcr9z 2/2 Running 0 17h -``` - -Karpenter service has endpoints assigned to it - -```text -kubectl get ep -A -l app.kubernetes.io/name=karpenter -NAMESPACE NAME ENDPOINTS AGE -karpenter karpenter 192.168.39.88:8443,192.168.39.88:8080 16d -``` - -Your security groups are not blocking you from reaching your webhook. - -This is especially relevant if you have used `terraform-eks-module` version `>=18` since that version changed its security -approach, and now it's much more restrictive. - ## Provisioning ### Instances with swap volumes fail to register with control plane @@ -201,7 +154,7 @@ Some instance types (c1.medium and m1.small) are given limited amount of memory ``` ##### Solutions -Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the Provisioner requirements to use larger instance types. +Disabling swap will allow kubelet to join the cluster successfully, however users should be mindful of performance, and consider adjusting the NodePool requirements to use larger instance types. ### DaemonSets can result in deployment failures @@ -209,7 +162,7 @@ For Karpenter versions `0.5.3` and earlier, DaemonSets were not properly conside This sometimes caused nodes to be deployed that could not meet the needs of the requested DaemonSets and workloads. This issue no longer occurs after Karpenter version `0.5.3` (see [PR #1155](https://github.com/aws/karpenter/pull/1155)). -If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your provisioner to only use larger instance types that you know will be big enough for the DaemonSet and the workload. +If you are using a pre `0.5.3` version of Karpenter, one workaround is to set your NodePool to only use larger instance types that you know will be big enough for the DaemonSet and the workload. For more information, see [Issue #1084](https://github.com/aws/karpenter/issues/1084). Examples of this behavior are included in [Issue #1180](https://github.com/aws/karpenter/issues/1180). @@ -224,55 +177,24 @@ This behavior is not unique to Karpenter and can also occur with the standard `k To prevent this, you can set LimitRanges on pod deployments on a per-namespace basis. 
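A minimal LimitRange sketch (the namespace and resource values are illustrative assumptions) that applies default requests and limits to containers in a namespace:

```yaml
apiVersion: v1
kind: LimitRange
metadata:
  name: default-resources
  namespace: my-namespace  # assumption: replace with your application namespace
spec:
  limits:
    - type: Container
      defaultRequest:   # applied when a container omits resource requests
        cpu: 250m
        memory: 256Mi
      default:          # applied when a container omits resource limits
        cpu: "1"
        memory: 1Gi
```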
See the Karpenter [Best Practices Guide](https://aws.github.io/aws-eks-best-practices/karpenter/#use-limitranges-to-configure-defaults-for-resource-requests-and-limits) for further information on the use of LimitRanges. -### Missing subnetSelector and securityGroupSelector tags causes provisioning failures - -Starting with Karpenter `0.5.5`, if you are using Karpenter-generated launch template, provisioners require that [subnetSelector]({{}}) and [securityGroupSelector]({{}}) tags be set to match your cluster. -The [Provisioner]({{}}) section in the Karpenter Getting Started Guide uses the following example: - -```text -kind: AWSNodeTemplate -spec: - subnetSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} - securityGroupSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} -``` - -To check your subnet and security group selectors, type the following: - -```bash -aws ec2 describe-subnets --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns subnets matching the selector* - -```bash -aws ec2 describe-security-groups --filters Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME} -``` - -*Returns security groups matching the selector* - -Provisioners created without those tags and run in more recent Karpenter versions will fail with this message when you try to run the provisioner: - -```text - field(s): spec.provider.securityGroupSelector, spec.provider.subnetSelector -``` - ### Pods using Security Groups for Pods stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running" -When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). +When leveraging [Security Groups for Pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html), Karpenter will launch nodes as expected but pods will be stuck in "ContainerCreating" state for up to 30 minutes before transitioning to "Running". +This is related to an interaction between Karpenter and the [amazon-vpc-resource-controller](https://github.com/aws/amazon-vpc-resource-controller-k8s) when a pod requests `vpc.amazonaws.com/pod-eni` resources. +More info can be found in [issue #1252](https://github.com/aws/karpenter/issues/1252). -To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter Provisioner spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). +To workaround this problem, add the `vpc.amazonaws.com/has-trunk-attached: "false"` label in your Karpenter NodePool spec and ensure instance-type requirements include [instance-types which support ENI trunking](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go). 
```yaml -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +apiVersion: karpenter.sh/v1 +kind: NodePool metadata: name: default spec: - labels: - vpc.amazonaws.com/has-trunk-attached: "false" - ttlSecondsAfterEmpty: 30 + template + metadata: + labels: + vpc.amazonaws.com/has-trunk-attached: "false" ``` ### Pods using PVCs can hit volume limits and fail to scale-up @@ -329,7 +251,7 @@ time=2023-06-12T19:18:15Z type=Warning reason=FailedCreatePodSandBox from=kubele By default, the number of pods on a node is limited by both the number of networking interfaces (ENIs) that may be attached to an instance type and the number of IP addresses that can be assigned to each ENI. See [IP addresses per network interface per instance type](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#AvailableIpPerENI) for a more detailed information on these instance types' limits. -If the max-pods (configured through your Provisioner [`kubeletConfiguration`]({{}})) is greater than the number of supported IPs for a given instance type, the CNI will fail to assign an IP to the pod and your pod will be left in a `ContainerCreating` state. +If the max-pods (configured through your EC2NodeClass [`kubeletConfiguration`]({{}})) is greater than the number of supported IPs for a given instance type, the CNI will fail to assign an IP to the pod and your pod will be left in a `ContainerCreating` state. If you've enabled [Security Groups per Pod](https://aws.github.io/aws-eks-best-practices/networking/sgpp/), one of the instance's ENIs is reserved as the trunk interface and uses branch interfaces off of that trunk interface to assign different security groups. If you do not have any `SecurityGroupPolicies` configured for your pods, they will be unable to utilize branch interfaces attached to the trunk interface, and IPs will only be available from the non-trunk ENIs. @@ -341,19 +263,19 @@ Note that Karpenter is not aware if [Security Groups per Pod](https://aws.github To avoid this discrepancy between `maxPods` and the supported pod density of the EC2 instance based on ENIs and allocatable IPs, you can perform one of the following actions on your cluster: 1. Enable [Prefix Delegation](https://www.eksworkshop.com/docs/networking/prefix/) to increase the number of allocatable IPs for the ENIs on each instance type -2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your Provisioner -3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. +2. Reduce your `maxPods` value to be under the maximum pod density for the instance types assigned to your NodePods +3. Remove the `maxPods` value from your [`kubeletConfiguration`]({{}}) if you no longer need it and instead rely on the defaulted values from Karpenter and EKS AMIs. -For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). +For more information on pod density, view the [Pod Density Section in the NodePools doc]({{}}). #### IP exhaustion in a subnet -When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`AWSNodeTemplate`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. 
In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. +When a node is launched by Karpenter, it is assigned to a subnet within your VPC based on the [`subnetSelector`]({{}}) value in your [`EC2NodeClass`]({{}})). When a subnet becomes IP address constrained, EC2 may think that it can successfully launch an instance in the subnet; however, when the CNI tries to assign IPs to the pods, there are none remaining. In this case, your pod will stay in a `ContainerCreating` state until an IP address is freed in the subnet and the CNI can assign one to the pod. ##### Solutions 1. Use `topologySpreadConstraints` on `topology.kubernetes.io/zone` to spread your pods and nodes more evenly across zones -2. Increase the IP address space (CIDR) for the subnets selected by your `AWSNodeTemplate` +2. Increase the IP address space (CIDR) for the subnets selected by your `EC2NodeClass` 3. Use [custom networking](https://www.eksworkshop.com/docs/networking/custom-networking/) to assign separate IP address spaces to your pods and your nodes 4. [Run your EKS cluster on IPv6](https://aws.github.io/aws-eks-best-practices/networking/ipv6/) (Note: IPv6 clusters have some known limitations which should be well-understood before choosing to use one) @@ -479,7 +401,7 @@ Karpenter determines node initialization using three factors: 1. Node readiness 2. Expected resources are registered -3. Provisioner startup taints are removed +3. NodePool startup taints are removed #### Node Readiness @@ -496,9 +418,9 @@ Common resources that don't register and leave nodes in a non-initialized state: 1. `nvidia.com/gpu` (or any gpu-based resource): A GPU instance type that supports the `nvidia.com/gpu` resource is launched but the daemon/daemonset to register the resource on the node doesn't exist 2. `vpc.amazonaws.com/pod-eni`: An instance type is launched by the `ENABLE_POD_ENI` value is set to `false` in the `vpc-cni` plugin. Karpenter will expect that the `vpc.amazonaws.com/pod-eni` will be registered, but it never will. -#### Provisioner startup taints are removed +#### NodePool startup taints are removed -Karpenter expects all startup taints specified in `.spec.startupTaints` of the provisioner to be completely removed from node `.spec.taints` before it will consider the node initialized. +Karpenter expects all startup taints specified in `.spec.template.spec.startupTaints` of the NodePool to be completely removed from node `.spec.taints` before it will consider the node initialized. ### Node NotReady @@ -513,7 +435,7 @@ The easiest way to start debugging is to connect to the instance and get the Kub ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -526,7 +448,7 @@ For Bottlerocket, you'll need to get access to the root filesystem: ```bash # List the nodes managed by Karpenter -kubectl get node -l karpenter.sh/provisioner-name +kubectl get node -l karpenter.sh/nodepool # Extract the instance ID (replace with a node name from the above listing) INSTANCE_ID=$(kubectl get node -ojson | jq -r ".spec.providerID" | cut -d \/ -f5) # Connect to the instance @@ -613,7 +535,7 @@ This means that your CNI plugin is out of date. 
You can find instructions on how ### Node terminates before ready on failed encrypted EBS volume If you are using a custom launch template and an encrypted EBS volume, the IAM principal launching the node may not have sufficient permissions to use the KMS customer managed key (CMK) for the EC2 EBS root volume. -This issue also applies to [Block Device Mappings]({{}}) specified in the Provisioner. +This issue also applies to [Block Device Mappings]({{}}) specified in the EC2NodeClass. In either case, this results in the node terminating almost immediately upon creation. Keep in mind that it is possible that EBS Encryption can be enabled without your knowledge. From ff13d7463c7b5a968efa1c803feb58a8528001b2 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Thu, 15 Aug 2024 09:41:24 -0700 Subject: [PATCH 02/21] docs: Update for the Group for nodeclassref docs (#6770) --- website/content/en/docs/concepts/nodeclasses.md | 2 +- website/content/en/preview/concepts/nodeclasses.md | 2 +- website/content/en/v1.0/concepts/nodeclasses.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/content/en/docs/concepts/nodeclasses.md b/website/content/en/docs/concepts/nodeclasses.md index 5b33cd080947..4749988e69e9 100644 --- a/website/content/en/docs/concepts/nodeclasses.md +++ b/website/content/en/docs/concepts/nodeclasses.md @@ -19,7 +19,7 @@ spec: template: spec: nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default --- diff --git a/website/content/en/preview/concepts/nodeclasses.md b/website/content/en/preview/concepts/nodeclasses.md index 854c02a7f63c..939e052da3f7 100644 --- a/website/content/en/preview/concepts/nodeclasses.md +++ b/website/content/en/preview/concepts/nodeclasses.md @@ -19,7 +19,7 @@ spec: template: spec: nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default --- diff --git a/website/content/en/v1.0/concepts/nodeclasses.md b/website/content/en/v1.0/concepts/nodeclasses.md index 5b33cd080947..4749988e69e9 100644 --- a/website/content/en/v1.0/concepts/nodeclasses.md +++ b/website/content/en/v1.0/concepts/nodeclasses.md @@ -19,7 +19,7 @@ spec: template: spec: nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default --- From 65ea5cc2b5a2f9e9a06173eb638b1b2041f12ff0 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:02:40 -0700 Subject: [PATCH 03/21] ci: Fire metric when no resources is cleaned (#6759) --- test/hack/resource/clean/main.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test/hack/resource/clean/main.go b/test/hack/resource/clean/main.go index dd8b1aa9ca46..2cb51b27dcf5 100644 --- a/test/hack/resource/clean/main.go +++ b/test/hack/resource/clean/main.go @@ -90,19 +90,20 @@ func main() { if err != nil { resourceLogger.Errorf("%v", err) } + cleaned := []string{} resourceLogger.With("ids", ids, "count", len(ids)).Infof("discovered resourceTypes") if len(ids) > 0 { - cleaned, err := resourceTypes[i].Cleanup(ctx, ids) + cleaned, err = resourceTypes[i].Cleanup(ctx, ids) if err != nil { resourceLogger.Errorf("%v", err) } - // Should only fire metrics if the resource have expired - if lo.FromPtr(clusterName) == "" { - if err = metricsClient.FireMetric(ctx, sweeperCleanedResourcesTableName, fmt.Sprintf("%sDeleted", resourceTypes[i].String()), 
float64(len(cleaned)), lo.Ternary(resourceTypes[i].Global(), "global", cfg.Region)); err != nil { - resourceLogger.Errorf("%v", err) - } - } resourceLogger.With("ids", cleaned, "count", len(cleaned)).Infof("deleted resourceTypes") } + // Should only fire metrics if the resource have expired + if lo.FromPtr(clusterName) == "" { + if err = metricsClient.FireMetric(ctx, sweeperCleanedResourcesTableName, fmt.Sprintf("%sDeleted", resourceTypes[i].String()), float64(len(cleaned)), lo.Ternary(resourceTypes[i].Global(), "global", cfg.Region)); err != nil { + resourceLogger.Errorf("%v", err) + } + } } } From 67273e27152230a2b7ec259758465754434d6d02 Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:16:37 -0700 Subject: [PATCH 04/21] chore: update gomod and add cloudprovider method (#6773) --- go.mod | 4 ++-- go.sum | 8 ++++---- pkg/cloudprovider/cloudprovider.go | 4 ++++ pkg/fake/cloudprovider.go | 4 ++++ 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 8e20376615ee..41b37882c58e 100644 --- a/go.mod +++ b/go.mod @@ -31,8 +31,8 @@ require ( k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20240102154912-e7106e64919e knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd - sigs.k8s.io/controller-runtime v0.18.4 - sigs.k8s.io/karpenter v1.0.0 + sigs.k8s.io/controller-runtime v0.18.5 + sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index 72b001cad9e2..5395ecd1a912 100644 --- a/go.sum +++ b/go.sum @@ -757,12 +757,12 @@ knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd/go.mod h1:36cYnaOVHkzmhgybmYX rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/controller-runtime v0.18.4 h1:87+guW1zhvuPLh1PHybKdYFLU0YJp4FhJRmiHvm5BZw= -sigs.k8s.io/controller-runtime v0.18.4/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= +sigs.k8s.io/controller-runtime v0.18.5 h1:nTHio/W+Q4aBlQMgbnC5hZb4IjIidyrizMai9P6n4Rk= +sigs.k8s.io/controller-runtime v0.18.5/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v1.0.0 h1:aucPhMbulRSzqu3x4ndUGYJaiinwDpwtQx/U5uwenCk= -sigs.k8s.io/karpenter v1.0.0/go.mod h1:3NLmsnHHw8p4VutpjTOPUZyhE3qH6yGTs8O94Lsu8uw= +sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758 h1:VEibnW+C/lW8QVgGlsZadhhTPXwhkR2CQj828zHu8Ao= +sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758/go.mod h1:SGH7B5ZSeaCXBnwvj4cSmIPC6TqRq7kPZmQyJRdxC6k= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 81ee525ea522..acde46ed2276 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -209,6 +209,10 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *karpv1.NodeClaim) return c.instanceProvider.Delete(ctx, id) } +func (c *CloudProvider) DisruptionReasons() []karpv1.DisruptionReason { + 
return nil +} + func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *karpv1.NodeClaim) (cloudprovider.DriftReason, error) { // Not needed when GetInstanceTypes removes nodepool dependency nodePoolName, ok := nodeClaim.Labels[karpv1.NodePoolLabelKey] diff --git a/pkg/fake/cloudprovider.go b/pkg/fake/cloudprovider.go index 98b05ad876c3..75d9ba516725 100644 --- a/pkg/fake/cloudprovider.go +++ b/pkg/fake/cloudprovider.go @@ -77,6 +77,10 @@ func (c *CloudProvider) Delete(context.Context, *karpv1.NodeClaim) error { return nil } +func (c *CloudProvider) DisruptionReasons() []karpv1.DisruptionReason { + return nil +} + // Name returns the CloudProvider implementation name. func (c *CloudProvider) Name() string { return "fake" From 3e6cbc5a09c47c187300841238ee03903a08617e Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:06:03 -0700 Subject: [PATCH 05/21] docs: fix cloud formation resources (#6774) --- website/content/en/docs/upgrading/v1-migration.md | 6 +++--- website/content/en/preview/upgrading/v1-migration.md | 6 +++--- website/content/en/v1.0/upgrading/v1-migration.md | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/website/content/en/docs/upgrading/v1-migration.md b/website/content/en/docs/upgrading/v1-migration.md index 36e018253e4e..69930b2ea9c1 100644 --- a/website/content/en/docs/upgrading/v1-migration.md +++ b/website/content/en/docs/upgrading/v1-migration.md @@ -107,7 +107,7 @@ The upgrade guide will first require upgrading to your latest patch version prio ```bash TEMPOUT=$(mktemp) - curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -293,7 +293,7 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU **v0.33.6 and v0.34.7:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -303,7 +303,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/preview/upgrading/v1-migration.md b/website/content/en/preview/upgrading/v1-migration.md index 
36e018253e4e..69930b2ea9c1 100644 --- a/website/content/en/preview/upgrading/v1-migration.md +++ b/website/content/en/preview/upgrading/v1-migration.md @@ -107,7 +107,7 @@ The upgrade guide will first require upgrading to your latest patch version prio ```bash TEMPOUT=$(mktemp) - curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -293,7 +293,7 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU **v0.33.6 and v0.34.7:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -303,7 +303,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/v1.0/upgrading/v1-migration.md b/website/content/en/v1.0/upgrading/v1-migration.md index 36e018253e4e..69930b2ea9c1 100644 --- a/website/content/en/v1.0/upgrading/v1-migration.md +++ b/website/content/en/v1.0/upgrading/v1-migration.md @@ -107,7 +107,7 @@ The upgrade guide will first require upgrading to your latest patch version prio ```bash TEMPOUT=$(mktemp) - curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -293,7 +293,7 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU **v0.33.6 and v0.34.7:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation 
deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ @@ -303,7 +303,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ From 75a7aa5b7d5f205f4114ec6e05de09ec074198f6 Mon Sep 17 00:00:00 2001 From: Kaji Date: Fri, 16 Aug 2024 05:40:26 +0900 Subject: [PATCH 06/21] docs: invalid Getting Started NodePool yaml (#6769) Co-authored-by: kajihiro --- .../scripts/step12-add-nodepool.sh | 5 +++-- .../migrating-from-cas/scripts/step10-create-nodepool.sh | 5 +++-- .../scripts/step12-add-nodepool.sh | 5 +++-- .../migrating-from-cas/scripts/step10-create-nodepool.sh | 5 +++-- .../scripts/step12-add-nodepool.sh | 5 +++-- .../migrating-from-cas/scripts/step10-create-nodepool.sh | 5 +++-- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh index f3625e936810..85213a3457c3 100755 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass diff --git a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh index f3625e936810..85213a3457c3 100644 --- a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh +++ b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh index f3625e936810..85213a3457c3 100755 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt 
values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass diff --git a/website/content/en/preview/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh b/website/content/en/preview/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh index f3625e936810..85213a3457c3 100644 --- a/website/content/en/preview/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh +++ b/website/content/en/preview/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass diff --git a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh index f3625e936810..85213a3457c3 100755 --- a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh +++ b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step12-add-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass diff --git a/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh b/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh index f3625e936810..85213a3457c3 100644 --- a/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh +++ b/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh @@ -23,14 +23,15 @@ spec: operator: Gt values: ["2"] nodeClassRef: - apiVersion: karpenter.k8s.aws/v1 + group: karpenter.k8s.aws kind: EC2NodeClass name: default + expireAfter: 720h # 30 * 24h = 720h limits: cpu: 1000 disruption: consolidationPolicy: WhenEmptyOrUnderutilized - expireAfter: 720h # 30 * 24h = 720h + consolidateAfter: 1m --- apiVersion: karpenter.k8s.aws/v1 kind: EC2NodeClass From 79dea10f9f439805b5ca75838f18dc51da566ed9 Mon Sep 17 00:00:00 2001 From: Boseok Son Date: Fri, 16 Aug 2024 05:41:48 +0900 Subject: [PATCH 07/21] docs: fix typo (#6763) Co-authored-by: Jonathan Innis --- .../getting-started/getting-started-with-karpenter/_index.md | 2 +- .../getting-started/getting-started-with-karpenter/_index.md | 2 +- .../getting-started/getting-started-with-karpenter/_index.md | 2 +- .../getting-started/getting-started-with-karpenter/_index.md | 2 +- .../getting-started/getting-started-with-karpenter/_index.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md index 11c0b9589ecc..14879bbe720e 100644 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md @@ -125,7 +125,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.0.0 \ ``` {{% alert title="DNS Policy Notice" color="warning" %}} -Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpetner can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. +Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpenter can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. If you need Karpenter to manage the DNS service pods' capacity, this means that DNS won't be running when Karpenter starts-up. In this case, you will need to set the pod DNS policy to `Default` with `--set dnsPolicy=Default`. This will tell Karpenter to use the host's DNS resolution instead of the internal DNS resolution, ensuring that you don't have a dependency on the DNS service pods to run. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). {{% /alert %}} diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md index b2b3f28396bc..4bde420fb1ae 100644 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md @@ -125,7 +125,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:{{< param "latest_release_versi ``` {{% alert title="DNS Policy Notice" color="warning" %}} -Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpetner can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. +Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpenter can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. 
If you need Karpenter to manage the DNS service pods' capacity, this means that DNS won't be running when Karpenter starts-up. In this case, you will need to set the pod DNS policy to `Default` with `--set dnsPolicy=Default`. This will tell Karpenter to use the host's DNS resolution instead of the internal DNS resolution, ensuring that you don't have a dependency on the DNS service pods to run. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). {{% /alert %}} diff --git a/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md index 5fbe1e7c0438..b4f5f5fcf72e 100644 --- a/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md @@ -122,7 +122,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:0.36.2 \ ``` {{% alert title="DNS Policy Notice" color="warning" %}} -Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpetner can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. +Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpenter can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. If you need Karpenter to manage the DNS service pods' capacity, this means that DNS won't be running when Karpenter starts-up. In this case, you will need to set the pod DNS policy to `Default` with `--set dnsPolicy=Default`. This will tell Karpenter to use the host's DNS resolution instead of the internal DNS resolution, ensuring that you don't have a dependency on the DNS service pods to run. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). {{% /alert %}} diff --git a/website/content/en/v0.37/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/v0.37/getting-started/getting-started-with-karpenter/_index.md index 26b6b5de8ad8..1264f6c46cde 100644 --- a/website/content/en/v0.37/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/v0.37/getting-started/getting-started-with-karpenter/_index.md @@ -122,7 +122,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:0.37.0 \ ``` {{% alert title="DNS Policy Notice" color="warning" %}} -Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpetner can reach-out to internal Kubernetes services during its lifetime. 
There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. +Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpenter can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. If you need Karpenter to manage the DNS service pods' capacity, this means that DNS won't be running when Karpenter starts-up. In this case, you will need to set the pod DNS policy to `Default` with `--set dnsPolicy=Default`. This will tell Karpenter to use the host's DNS resolution instead of the internal DNS resolution, ensuring that you don't have a dependency on the DNS service pods to run. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). {{% /alert %}} diff --git a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/_index.md index 11c0b9589ecc..14879bbe720e 100644 --- a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/_index.md @@ -125,7 +125,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.0.0 \ ``` {{% alert title="DNS Policy Notice" color="warning" %}} -Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpetner can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. +Karpenter uses the `ClusterFirst` pod DNS policy by default. This is the Kubernetes cluster default and this ensures that Karpenter can reach-out to internal Kubernetes services during its lifetime. There may be cases where you do not have the DNS service that you are using on your cluster up-and-running before Karpenter starts up. The most common case of this is you want Karpenter to manage the node capacity where your DNS service pods are running. If you need Karpenter to manage the DNS service pods' capacity, this means that DNS won't be running when Karpenter starts-up. In this case, you will need to set the pod DNS policy to `Default` with `--set dnsPolicy=Default`. This will tell Karpenter to use the host's DNS resolution instead of the internal DNS resolution, ensuring that you don't have a dependency on the DNS service pods to run. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). 
{{% /alert %}} From cdcab9445b816cd1e7534d131451364ce8444b9c Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sun, 18 Aug 2024 08:56:20 +0900 Subject: [PATCH 08/21] docs: update node-ownership.md (#6785) --- designs/node-ownership.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/designs/node-ownership.md b/designs/node-ownership.md index 8862b7409c7c..dff14e170254 100644 --- a/designs/node-ownership.md +++ b/designs/node-ownership.md @@ -17,7 +17,7 @@ _Note: This internal Machine CR will come in as an alpha API and an internal des ## Background -Karpenter currently creates the node object on the Kubernetes api server immediately after creating the VM instance. Kubernetes cloud providers (EKS, AKS, GKE, etc.) assume that, ultimately, the kubelet will be the entity responsible for registering the node to the api-server. This is reflected [through the userData](https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh) where KubeletConfig can be set [that is only properly propogated for all values when the kubelet is the node creator](https://github.com/kubernetes/kubernetes/blob/39c76ba2edeadb84a115cc3fbd9204a2177f1c28/pkg/kubelet/kubelet_node_status.go#L286). However, Karpenter’s current architecture necessitates that it both launches the VM instance and creates the node object on the Kubernetes API server in succession (more on this [below](#why-does-karpenter-createoperate-on-the-node-at-all)). +Karpenter currently creates the node object on the Kubernetes api server immediately after creating the VM instance. Kubernetes cloud providers (EKS, AKS, GKE, etc.) assume that, ultimately, the kubelet will be the entity responsible for registering the node to the api-server. This is reflected [through the userData](https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh) where KubeletConfig can be set [that is only properly propagated for all values when the kubelet is the node creator](https://github.com/kubernetes/kubernetes/blob/39c76ba2edeadb84a115cc3fbd9204a2177f1c28/pkg/kubelet/kubelet_node_status.go#L286). However, Karpenter’s current architecture necessitates that it both launches the VM instance and creates the node object on the Kubernetes API server in succession (more on this [below](#why-does-karpenter-createoperate-on-the-node-at-all)). This document describes the current node creation flow for Karpenter as well as the rationale for why Karpenter originally created the node object. It then calls out the specific problems with this approach and recommends an alternative approach to creating the Node object that solves for the current approach’s problems. 
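The DNS Policy Notice above tells users to switch the pod DNS policy with `--set dnsPolicy=Default`, but does not show where that flag fits. A minimal sketch of the Helm invocation, assuming the `KARPENTER_NAMESPACE`, `KARPENTER_VERSION`, and `CLUSTER_NAME` variables used by the getting-started scripts; the other values mirror the getting-started flow and may differ for your cluster:

```bash
# Hypothetical re-install of the chart with host DNS resolution enabled.
# Only the dnsPolicy flag is the change discussed in the DNS Policy Notice;
# the remaining settings follow the getting-started flow and are assumptions.
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
  --version "${KARPENTER_VERSION}" \
  --set "settings.clusterName=${CLUSTER_NAME}" \
  --set dnsPolicy=Default \
  --wait
```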
From 10484ba9421e5c8cbf8614a583f6ef52a4e3e270 Mon Sep 17 00:00:00 2001 From: Greg Roodt Date: Sun, 18 Aug 2024 10:28:17 +1000 Subject: [PATCH 09/21] chore: Adds g6e instances (#6781) --- .../instancetype/zz_generated.bandwidth.go | 8 + .../pricing/zz_generated.pricing_aws.go | 5 +- .../en/preview/reference/instance-types.md | 287 ++++++++++++++++++ 3 files changed, 299 insertions(+), 1 deletion(-) diff --git a/pkg/providers/instancetype/zz_generated.bandwidth.go b/pkg/providers/instancetype/zz_generated.bandwidth.go index d0f8cbbc83ed..2008bfd272d5 100644 --- a/pkg/providers/instancetype/zz_generated.bandwidth.go +++ b/pkg/providers/instancetype/zz_generated.bandwidth.go @@ -235,6 +235,7 @@ var ( "g5.xlarge": 2500, "g5g.2xlarge": 2500, "g6.xlarge": 2500, + "g6e.xlarge": 2500, "h1.2xlarge": 2500, "i3.2xlarge": 2500, "m5.2xlarge": 2500, @@ -314,6 +315,7 @@ var ( "g5.2xlarge": 5000, "g5g.4xlarge": 5000, "g6.2xlarge": 5000, + "g6e.2xlarge": 5000, "h1.4xlarge": 5000, "i3.4xlarge": 5000, "inf1.2xlarge": 5000, @@ -515,6 +517,7 @@ var ( "c6g.12xlarge": 20000, "c6gd.12xlarge": 20000, "g4dn.4xlarge": 20000, + "g6e.4xlarge": 20000, "m5.16xlarge": 20000, "m5a.24xlarge": 20000, "m5ad.24xlarge": 20000, @@ -565,6 +568,7 @@ var ( "g5g.metal": 25000, "g6.16xlarge": 25000, "g6.8xlarge": 25000, + "g6e.8xlarge": 25000, "gr6.8xlarge": 25000, "h1.16xlarge": 25000, "i3.16xlarge": 25000, @@ -639,6 +643,7 @@ var ( "r7gd.16xlarge": 30000, "r7gd.metal": 30000, "r8g.16xlarge": 30000, + "g6e.16xlarge": 35000, "c6a.24xlarge": 37500, "c6i.24xlarge": 37500, "c6id.24xlarge": 37500, @@ -757,6 +762,7 @@ var ( "g4dn.metal": 100000, "g5.48xlarge": 100000, "g6.48xlarge": 100000, + "g6e.12xlarge": 100000, "hpc6a.48xlarge": 100000, "i3en.24xlarge": 100000, "i3en.metal": 100000, @@ -806,6 +812,7 @@ var ( "c6in.metal": 200000, "c7gn.16xlarge": 200000, "c7gn.metal": 200000, + "g6e.24xlarge": 200000, "hpc6id.32xlarge": 200000, "hpc7g.16xlarge": 200000, "hpc7g.4xlarge": 200000, @@ -826,6 +833,7 @@ var ( "hpc7a.48xlarge": 300000, "hpc7a.96xlarge": 300000, "dl1.24xlarge": 400000, + "g6e.48xlarge": 400000, "p4d.24xlarge": 400000, "p4de.24xlarge": 400000, "trn1.32xlarge": 800000, diff --git a/pkg/providers/pricing/zz_generated.pricing_aws.go b/pkg/providers/pricing/zz_generated.pricing_aws.go index d729268b5d82..a469ec5fb821 100644 --- a/pkg/providers/pricing/zz_generated.pricing_aws.go +++ b/pkg/providers/pricing/zz_generated.pricing_aws.go @@ -16,7 +16,7 @@ limitations under the License. 
package pricing -// generated at 2024-07-10T14:30:58Z for us-east-1 +// generated at 2024-08-16T05:23:42Z for us-east-1 var InitialOnDemandPricesAWS = map[string]map[string]float64{ // us-east-1 @@ -130,6 +130,9 @@ var InitialOnDemandPricesAWS = map[string]map[string]float64{ // g6 family "g6.12xlarge": 4.601600, "g6.16xlarge": 3.396800, "g6.24xlarge": 6.675200, "g6.2xlarge": 0.977600, "g6.48xlarge": 13.350400, "g6.4xlarge": 1.323200, "g6.8xlarge": 2.014400, "g6.xlarge": 0.804800, + // g6e family + "g6e.12xlarge": 10.492640, "g6e.16xlarge": 7.577190, "g6e.24xlarge": 15.065590, "g6e.2xlarge": 2.242080, + "g6e.48xlarge": 30.131180, "g6e.4xlarge": 3.004240, "g6e.8xlarge": 4.528560, "g6e.xlarge": 1.861000, // gr6 family "gr6.4xlarge": 1.539200, "gr6.8xlarge": 2.446400, // h1 family diff --git a/website/content/en/preview/reference/instance-types.md b/website/content/en/preview/reference/instance-types.md index 29dd6b22a6e4..8b9ebc3c8438 100644 --- a/website/content/en/preview/reference/instance-types.md +++ b/website/content/en/preview/reference/instance-types.md @@ -6351,6 +6351,260 @@ below are the resources available with some assumptions and after the instance o |pods|737| |vpc.amazonaws.com/efa|1| |vpc.amazonaws.com/pod-eni|107| +## g6e Family +### `g6e.xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|4| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|5000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|1| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|45776| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|250| + |karpenter.k8s.aws/instance-memory|32768| + |karpenter.k8s.aws/instance-network-bandwidth|2500| + |karpenter.k8s.aws/instance-size|xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|3920m| + |ephemeral-storage|17Gi| + |memory|29317Mi| + |nvidia.com/gpu|1| + |pods|58| +### `g6e.2xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|8| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|5000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|1| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|45776| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|450| + |karpenter.k8s.aws/instance-memory|65536| + |karpenter.k8s.aws/instance-network-bandwidth|5000| + |karpenter.k8s.aws/instance-size|2xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.2xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|7910m| + |ephemeral-storage|17Gi| + |memory|59627Mi| + |nvidia.com/gpu|1| + |pods|58| +### `g6e.4xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|16| + 
|karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|8000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|1| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|45776| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|600| + |karpenter.k8s.aws/instance-memory|131072| + |karpenter.k8s.aws/instance-network-bandwidth|20000| + |karpenter.k8s.aws/instance-size|4xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.4xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|15890m| + |ephemeral-storage|17Gi| + |memory|118312Mi| + |nvidia.com/gpu|1| + |pods|234| +### `g6e.8xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|32| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|16000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|1| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|45776| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|900| + |karpenter.k8s.aws/instance-memory|262144| + |karpenter.k8s.aws/instance-network-bandwidth|25000| + |karpenter.k8s.aws/instance-size|8xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.8xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|31850m| + |ephemeral-storage|17Gi| + |memory|239554Mi| + |nvidia.com/gpu|1| + |pods|234| + |vpc.amazonaws.com/efa|1| +### `g6e.12xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|48| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|20000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|4| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|183105| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|3800| + |karpenter.k8s.aws/instance-memory|393216| + |karpenter.k8s.aws/instance-network-bandwidth|100000| + |karpenter.k8s.aws/instance-size|12xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.12xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|47810m| + |ephemeral-storage|17Gi| + |memory|360157Mi| + |nvidia.com/gpu|4| + |pods|292| + |vpc.amazonaws.com/efa|1| +### `g6e.16xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|64| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|20000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + 
|karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|1| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|45776| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|1900| + |karpenter.k8s.aws/instance-memory|524288| + |karpenter.k8s.aws/instance-network-bandwidth|35000| + |karpenter.k8s.aws/instance-size|16xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.16xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|63770m| + |ephemeral-storage|17Gi| + |memory|476504Mi| + |nvidia.com/gpu|1| + |pods|737| + |vpc.amazonaws.com/efa|1| +### `g6e.24xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|96| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|30000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|4| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|183105| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|3800| + |karpenter.k8s.aws/instance-memory|786432| + |karpenter.k8s.aws/instance-network-bandwidth|200000| + |karpenter.k8s.aws/instance-size|24xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.24xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|95690m| + |ephemeral-storage|17Gi| + |memory|721682Mi| + |nvidia.com/gpu|4| + |pods|492| + |vpc.amazonaws.com/efa|2| +### `g6e.48xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|g| + |karpenter.k8s.aws/instance-cpu|192| + |karpenter.k8s.aws/instance-cpu-manufacturer|amd| + |karpenter.k8s.aws/instance-ebs-bandwidth|60000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|g6e| + |karpenter.k8s.aws/instance-generation|6| + |karpenter.k8s.aws/instance-gpu-count|8| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|366211| + |karpenter.k8s.aws/instance-gpu-name|l40s| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|7600| + |karpenter.k8s.aws/instance-memory|1572864| + |karpenter.k8s.aws/instance-network-bandwidth|400000| + |karpenter.k8s.aws/instance-size|48xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|g6e.48xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|191450m| + |ephemeral-storage|17Gi| + |memory|1449132Mi| + |nvidia.com/gpu|8| + |pods|492| + |vpc.amazonaws.com/efa|4| ## gr6 Family ### `gr6.4xlarge` #### Labels @@ -13383,6 +13637,39 @@ below are the resources available with some assumptions and after the instance o |pods|737| |vpc.amazonaws.com/efa|4| |vpc.amazonaws.com/pod-eni|62| +## p4de Family +### `p4de.24xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|p| + |karpenter.k8s.aws/instance-cpu|96| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|19000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + 
|karpenter.k8s.aws/instance-family|p4de| + |karpenter.k8s.aws/instance-generation|4| + |karpenter.k8s.aws/instance-gpu-count|8| + |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| + |karpenter.k8s.aws/instance-gpu-memory|81920| + |karpenter.k8s.aws/instance-gpu-name|a100| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-local-nvme|8000| + |karpenter.k8s.aws/instance-memory|1179648| + |karpenter.k8s.aws/instance-network-bandwidth|400000| + |karpenter.k8s.aws/instance-size|24xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|p4de.24xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|95690m| + |ephemeral-storage|17Gi| + |memory|1082712Mi| + |nvidia.com/gpu|8| + |pods|737| + |vpc.amazonaws.com/efa|4| ## p5 Family ### `p5.48xlarge` #### Labels From e28477489c64c3433498e087811072ce0a8c9ed6 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Sun, 18 Aug 2024 19:41:07 -0700 Subject: [PATCH 10/21] docs: Fix doc link references to preview (#6793) --- .../scripts/step02-create-cluster-fargate.sh | 2 +- .../scripts/step02-create-cluster.sh | 2 +- .../scripts/step03-iam-cloud-formation.sh | 2 +- .../migrating-from-cas/scripts/step09-deploy.sh | 6 +++--- .../scripts/step02-create-cluster-fargate.sh | 2 +- .../scripts/step02-create-cluster.sh | 2 +- .../scripts/step03-iam-cloud-formation.sh | 2 +- .../migrating-from-cas/scripts/step09-deploy.sh | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh index 08d877b6ded0..fa577d724e9f 100755 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh @@ -1,4 +1,4 @@ -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > $TEMPOUT \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > $TEMPOUT \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh index 7c654b3b1140..0ab6f5f464bd 100755 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh @@ -1,4 +1,4 @@ -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git 
a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh index 7b456047096b..54e826db269b 100755 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh @@ -1,6 +1,6 @@ TEMPOUT="$(mktemp)" -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/docs/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step09-deploy.sh b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step09-deploy.sh index 95b963af4e88..e46742fd22ea 100644 --- a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step09-deploy.sh +++ b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step09-deploy.sh @@ -1,8 +1,8 @@ kubectl create namespace "${KARPENTER_NAMESPACE}" || true kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.sh_nodepools.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodepools.yaml" kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml" kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.sh_nodeclaims.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodeclaims.yaml" kubectl apply -f karpenter.yaml diff --git a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh index 07e9e5add716..fa577d724e9f 100755 --- a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh +++ b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster-fargate.sh @@ -1,4 +1,4 @@ -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/v1.0/getting-started/getting-started-with-karpenter/cloudformation.yaml > $TEMPOUT \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > $TEMPOUT \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh 
b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh index 47c290b4fdb3..0ab6f5f464bd 100755 --- a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh +++ b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh @@ -1,4 +1,4 @@ -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/v1.0/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh index 169e5f1902bc..54e826db269b 100755 --- a/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh +++ b/website/content/en/v1.0/getting-started/getting-started-with-karpenter/scripts/step03-iam-cloud-formation.sh @@ -1,6 +1,6 @@ TEMPOUT="$(mktemp)" -curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/website/content/en/v1.0/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ +curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ diff --git a/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step09-deploy.sh b/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step09-deploy.sh index 95b963af4e88..e46742fd22ea 100644 --- a/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step09-deploy.sh +++ b/website/content/en/v1.0/getting-started/migrating-from-cas/scripts/step09-deploy.sh @@ -1,8 +1,8 @@ kubectl create namespace "${KARPENTER_NAMESPACE}" || true kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.sh_nodepools.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodepools.yaml" kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml" kubectl create -f \ - "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.0.0/pkg/apis/crds/karpenter.sh_nodeclaims.yaml" + "https://raw.githubusercontent.com/aws/karpenter-provider-aws/v${KARPENTER_VERSION}/pkg/apis/crds/karpenter.sh_nodeclaims.yaml" kubectl apply -f karpenter.yaml From f2b727506993694e452d0ad13dc4b710bb5e78c1 Mon Sep 17 00:00:00 2001 From: jigisha620 Date: Sun, 18 Aug 2024 13:10:26 -0700 Subject: [PATCH 11/21] docs: update cpuCFSquota is supported --- website/content/en/docs/concepts/nodeclasses.md | 1 - website/content/en/preview/concepts/nodeclasses.md | 1 - 
website/content/en/v1.0/concepts/nodeclasses.md | 1 - 3 files changed, 3 deletions(-) diff --git a/website/content/en/docs/concepts/nodeclasses.md b/website/content/en/docs/concepts/nodeclasses.md index 4749988e69e9..ae526778c0d3 100644 --- a/website/content/en/docs/concepts/nodeclasses.md +++ b/website/content/en/docs/concepts/nodeclasses.md @@ -280,7 +280,6 @@ The Bottlerocket AMIFamily does not support the following fields: * `evictionSoft` * `evictionSoftGracePeriod` * `evictionMaxPodGracePeriod` -* `cpuCFSQuota` If any of these fields are specified on a Bottlerocket EC2NodeClass, they will be ommited from generated UserData and ignored for scheduling purposes. Support for these fields can be tracked via GitHub issue [#3722](https://github.com/aws/karpenter-provider-aws/issues/3722). diff --git a/website/content/en/preview/concepts/nodeclasses.md b/website/content/en/preview/concepts/nodeclasses.md index 939e052da3f7..d6f66ad13674 100644 --- a/website/content/en/preview/concepts/nodeclasses.md +++ b/website/content/en/preview/concepts/nodeclasses.md @@ -280,7 +280,6 @@ The Bottlerocket AMIFamily does not support the following fields: * `evictionSoft` * `evictionSoftGracePeriod` * `evictionMaxPodGracePeriod` -* `cpuCFSQuota` If any of these fields are specified on a Bottlerocket EC2NodeClass, they will be ommited from generated UserData and ignored for scheduling purposes. Support for these fields can be tracked via GitHub issue [#3722](https://github.com/aws/karpenter-provider-aws/issues/3722). diff --git a/website/content/en/v1.0/concepts/nodeclasses.md b/website/content/en/v1.0/concepts/nodeclasses.md index 4749988e69e9..ae526778c0d3 100644 --- a/website/content/en/v1.0/concepts/nodeclasses.md +++ b/website/content/en/v1.0/concepts/nodeclasses.md @@ -280,7 +280,6 @@ The Bottlerocket AMIFamily does not support the following fields: * `evictionSoft` * `evictionSoftGracePeriod` * `evictionMaxPodGracePeriod` -* `cpuCFSQuota` If any of these fields are specified on a Bottlerocket EC2NodeClass, they will be ommited from generated UserData and ignored for scheduling purposes. Support for these fields can be tracked via GitHub issue [#3722](https://github.com/aws/karpenter-provider-aws/issues/3722). From 8213882820b3137d43288869bca8e321b82bfe76 Mon Sep 17 00:00:00 2001 From: jigisha620 Date: Sun, 18 Aug 2024 12:38:04 -0700 Subject: [PATCH 12/21] docs: Update docs for status conditions --- .../content/en/docs/concepts/nodeclasses.md | 32 ++++++------------- website/content/en/docs/concepts/nodepools.md | 10 ++++++ .../content/en/docs/upgrading/v1-migration.md | 2 ++ .../en/preview/concepts/nodeclasses.md | 32 ++++++------------- .../content/en/preview/concepts/nodepools.md | 10 ++++++ .../en/preview/upgrading/v1-migration.md | 2 ++ .../content/en/v1.0/concepts/nodeclasses.md | 32 ++++++------------- website/content/en/v1.0/concepts/nodepools.md | 10 ++++++ .../content/en/v1.0/upgrading/v1-migration.md | 2 ++ 9 files changed, 63 insertions(+), 69 deletions(-) diff --git a/website/content/en/docs/concepts/nodeclasses.md b/website/content/en/docs/concepts/nodeclasses.md index ae526778c0d3..bd7a70d50629 100644 --- a/website/content/en/docs/concepts/nodeclasses.md +++ b/website/content/en/docs/concepts/nodeclasses.md @@ -1524,28 +1524,14 @@ status: [`status.conditions`]({{< ref "#statusconditions" >}}) indicates EC2NodeClass readiness. 
This will be `Ready` when Karpenter successfully discovers AMIs, Instance Profile, Subnets, Cluster CIDR (AL2023 only) and SecurityGroups for the EC2NodeClass. -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:04:45Z - Message: Ready - Reason: Ready - Status: True - Type: Ready -``` +NodeClasses have the following status conditions: -If any of the underlying conditions are not resolved then `Status` is `False` and `Message` indicates the dependency that was not resolved. +| Condition Type | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| SubnetsReady | Subnets are discovered. | +| SecurityGroupsReady | Security Groups are discovered. | +| InstanceProfileReady | Instance Profile is discovered. | +| AMIsReady | AMIs are discovered | +| Ready | Top level condition that indicates if the nodeClass is ready. If any of the underlying conditions is `False` then this condition is set to `False` and `Message` on the condition indicates the dependency that was not resolved. | -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:19:46Z - Message: unable to resolve instance profile for node class - Reason: NodeClassNotReady - Status: False - Type: Ready -``` +If a NodeClass is not ready, NodePools that reference it through their `nodeClassRef` will not be considered for scheduling. \ No newline at end of file diff --git a/website/content/en/docs/concepts/nodepools.md b/website/content/en/docs/concepts/nodepools.md index 69198bb0a6b7..0e099bccff39 100644 --- a/website/content/en/docs/concepts/nodepools.md +++ b/website/content/en/docs/concepts/nodepools.md @@ -417,6 +417,16 @@ For more information on weighting NodePools, see the [Weighted NodePools section * The `status.conditions.reason` object indicates the reason for the condition's previous transition. * The `status.conditions.message` object provides human-readable details about the condition's previous transition. +NodePools have the following status conditions: + +| Condition Type | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| NodeClassReady | Underlying nodeClass is ready | +| ValidationSucceeded | NodePool CRD validation succeeded | +| Ready | Top level condition that indicates if the nodePool is ready. This condition will not be true until all the other conditions on nodePool are true. | + +If a NodePool is not ready, it will not be considered for scheduling. + ## status.resources Objects under `status.resources` provide information about the status of resources such as `cpu`, `memory`, and `ephemeral-storage`. diff --git a/website/content/en/docs/upgrading/v1-migration.md b/website/content/en/docs/upgrading/v1-migration.md index 69930b2ea9c1..068448f1a414 100644 --- a/website/content/en/docs/upgrading/v1-migration.md +++ b/website/content/en/docs/upgrading/v1-migration.md @@ -353,6 +353,8 @@ Karpenter should now be pulling and operating against the v1beta1 APIVersion as * Karpenter now adds a `karpenter.sh/unregistered:NoExecute` taint to nodes in injected UserData when using alias in AMISelectorTerms or non-Custom AMIFamily. 
When using `amiFamily: Custom`, users will need to add this taint into their UserData, where Karpenter will automatically remove it when provisioning nodes. * Discovered standard AL2023 AMIs will no longer be considered compatible with GPU / accelerator workloads. If you're using an AL2023 EC2NodeClass (without AMISelectorTerms) for these workloads, you will need to select your AMI via AMISelectorTerms (non-alias). * Karpenter now waits for underlying instances to be completely terminated before removing the associated nodes. This means it may take longer for nodes to be deleted and for nodeclaims to get cleaned up. + * NodePools now have [status conditions]({{< relref "../concepts/nodepools/#statusconditions" >}}) that indicate if they are ready. If not, then they will not be considered during scheduling. + * NodeClasses now have [status conditions]({{< relref "../concepts/nodeclasses/#statusconditions" >}}) that indicate if they are ready. If they are not ready, NodePools that reference them through their `nodeClassRef` will not be considered during scheduling. * API Moves: * ExpireAfter has moved from the `NodePool.Spec.Disruption` block to `NodePool.Spec.Template.Spec`, and is now a drift-able field. * `Kubelet` was moved to the EC2NodeClass from the NodePool. diff --git a/website/content/en/preview/concepts/nodeclasses.md b/website/content/en/preview/concepts/nodeclasses.md index d6f66ad13674..0c3e1a1d76e5 100644 --- a/website/content/en/preview/concepts/nodeclasses.md +++ b/website/content/en/preview/concepts/nodeclasses.md @@ -1524,28 +1524,14 @@ status: [`status.conditions`]({{< ref "#statusconditions" >}}) indicates EC2NodeClass readiness. This will be `Ready` when Karpenter successfully discovers AMIs, Instance Profile, Subnets, Cluster CIDR (AL2023 only) and SecurityGroups for the EC2NodeClass. -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:04:45Z - Message: Ready - Reason: Ready - Status: True - Type: Ready -``` +NodeClasses have the following status conditions: -If any of the underlying conditions are not resolved then `Status` is `False` and `Message` indicates the dependency that was not resolved. +| Condition Type | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| SubnetsReady | Subnets are discovered. | +| SecurityGroupsReady | Security Groups are discovered. | +| InstanceProfileReady | Instance Profile is discovered. | +| AMIsReady | AMIs are discovered | +| Ready | Top level condition that indicates if the nodeClass is ready. If any of the underlying conditions is `False` then this condition is set to `False` and `Message` on the condition indicates the dependency that was not resolved. | -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:19:46Z - Message: unable to resolve instance profile for node class - Reason: NodeClassNotReady - Status: False - Type: Ready -``` +If a NodeClass is not ready, NodePools that reference it through their `nodeClassRef` will not be considered for scheduling. 
\ No newline at end of file diff --git a/website/content/en/preview/concepts/nodepools.md b/website/content/en/preview/concepts/nodepools.md index 4610bac5afab..fad5644b732b 100644 --- a/website/content/en/preview/concepts/nodepools.md +++ b/website/content/en/preview/concepts/nodepools.md @@ -417,6 +417,16 @@ For more information on weighting NodePools, see the [Weighted NodePools section * The `status.conditions.reason` object indicates the reason for the condition's previous transition. * The `status.conditions.message` object provides human-readable details about the condition's previous transition. +NodePools have the following status conditions: + +| Condition Type | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| NodeClassReady | Underlying nodeClass is ready | +| ValidationSucceeded | NodePool CRD validation succeeded | +| Ready | Top level condition that indicates if the nodePool is ready. This condition will not be true until all the other conditions on nodePool are true. | + +If a NodePool is not ready, it will not be considered for scheduling. + ## status.resources Objects under `status.resources` provide information about the status of resources such as `cpu`, `memory`, and `ephemeral-storage`. diff --git a/website/content/en/preview/upgrading/v1-migration.md b/website/content/en/preview/upgrading/v1-migration.md index 69930b2ea9c1..068448f1a414 100644 --- a/website/content/en/preview/upgrading/v1-migration.md +++ b/website/content/en/preview/upgrading/v1-migration.md @@ -353,6 +353,8 @@ Karpenter should now be pulling and operating against the v1beta1 APIVersion as * Karpenter now adds a `karpenter.sh/unregistered:NoExecute` taint to nodes in injected UserData when using alias in AMISelectorTerms or non-Custom AMIFamily. When using `amiFamily: Custom`, users will need to add this taint into their UserData, where Karpenter will automatically remove it when provisioning nodes. * Discovered standard AL2023 AMIs will no longer be considered compatible with GPU / accelerator workloads. If you're using an AL2023 EC2NodeClass (without AMISelectorTerms) for these workloads, you will need to select your AMI via AMISelectorTerms (non-alias). * Karpenter now waits for underlying instances to be completely terminated before removing the associated nodes. This means it may take longer for nodes to be deleted and for nodeclaims to get cleaned up. + * NodePools now have [status conditions]({{< relref "../concepts/nodepools/#statusconditions" >}}) that indicate if they are ready. If not, then they will not be considered during scheduling. + * NodeClasses now have [status conditions]({{< relref "../concepts/nodeclasses/#statusconditions" >}}) that indicate if they are ready. If they are not ready, NodePools that reference them through their `nodeClassRef` will not be considered during scheduling. * API Moves: * ExpireAfter has moved from the `NodePool.Spec.Disruption` block to `NodePool.Spec.Template.Spec`, and is now a drift-able field. * `Kubelet` was moved to the EC2NodeClass from the NodePool. 
diff --git a/website/content/en/v1.0/concepts/nodeclasses.md b/website/content/en/v1.0/concepts/nodeclasses.md index ae526778c0d3..bd7a70d50629 100644 --- a/website/content/en/v1.0/concepts/nodeclasses.md +++ b/website/content/en/v1.0/concepts/nodeclasses.md @@ -1524,28 +1524,14 @@ status: [`status.conditions`]({{< ref "#statusconditions" >}}) indicates EC2NodeClass readiness. This will be `Ready` when Karpenter successfully discovers AMIs, Instance Profile, Subnets, Cluster CIDR (AL2023 only) and SecurityGroups for the EC2NodeClass. -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:04:45Z - Message: Ready - Reason: Ready - Status: True - Type: Ready -``` +NodeClasses have the following status conditions: -If any of the underlying conditions are not resolved then `Status` is `False` and `Message` indicates the dependency that was not resolved. +| Condition Type | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| SubnetsReady | Subnets are discovered. | +| SecurityGroupsReady | Security Groups are discovered. | +| InstanceProfileReady | Instance Profile is discovered. | +| AMIsReady | AMIs are discovered | +| Ready | Top level condition that indicates if the nodeClass is ready. If any of the underlying conditions is `False` then this condition is set to `False` and `Message` on the condition indicates the dependency that was not resolved. | -```yaml -spec: - role: "KarpenterNodeRole-${CLUSTER_NAME}" -status: - conditions: - Last Transition Time: 2024-05-06T06:19:46Z - Message: unable to resolve instance profile for node class - Reason: NodeClassNotReady - Status: False - Type: Ready -``` +If a NodeClass is not ready, NodePools that reference it through their `nodeClassRef` will not be considered for scheduling. \ No newline at end of file diff --git a/website/content/en/v1.0/concepts/nodepools.md b/website/content/en/v1.0/concepts/nodepools.md index 69198bb0a6b7..0e099bccff39 100644 --- a/website/content/en/v1.0/concepts/nodepools.md +++ b/website/content/en/v1.0/concepts/nodepools.md @@ -417,6 +417,16 @@ For more information on weighting NodePools, see the [Weighted NodePools section * The `status.conditions.reason` object indicates the reason for the condition's previous transition. * The `status.conditions.message` object provides human-readable details about the condition's previous transition. +NodePools have the following status conditions: + +| Condition Type | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| NodeClassReady | Underlying nodeClass is ready | +| ValidationSucceeded | NodePool CRD validation succeeded | +| Ready | Top level condition that indicates if the nodePool is ready. This condition will not be true until all the other conditions on nodePool are true. | + +If a NodePool is not ready, it will not be considered for scheduling. + ## status.resources Objects under `status.resources` provide information about the status of resources such as `cpu`, `memory`, and `ephemeral-storage`. 
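The conditions above can be inspected directly on a live cluster. As a minimal sketch (the resource name `default` is an assumption; substitute your own EC2NodeClass), the following prints each condition type with its status and message, which surfaces the unresolved dependency whenever `Ready` is `False`:

```bash
# Print every status condition on the EC2NodeClass "default" as TYPE=STATUS (MESSAGE).
kubectl get ec2nodeclass default \
  -o jsonpath='{range .status.conditions[*]}{.type}={.status} ({.message}){"\n"}{end}'
```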
diff --git a/website/content/en/v1.0/upgrading/v1-migration.md b/website/content/en/v1.0/upgrading/v1-migration.md index 69930b2ea9c1..068448f1a414 100644 --- a/website/content/en/v1.0/upgrading/v1-migration.md +++ b/website/content/en/v1.0/upgrading/v1-migration.md @@ -353,6 +353,8 @@ Karpenter should now be pulling and operating against the v1beta1 APIVersion as * Karpenter now adds a `karpenter.sh/unregistered:NoExecute` taint to nodes in injected UserData when using alias in AMISelectorTerms or non-Custom AMIFamily. When using `amiFamily: Custom`, users will need to add this taint into their UserData, where Karpenter will automatically remove it when provisioning nodes. * Discovered standard AL2023 AMIs will no longer be considered compatible with GPU / accelerator workloads. If you're using an AL2023 EC2NodeClass (without AMISelectorTerms) for these workloads, you will need to select your AMI via AMISelectorTerms (non-alias). * Karpenter now waits for underlying instances to be completely terminated before removing the associated nodes. This means it may take longer for nodes to be deleted and for nodeclaims to get cleaned up. + * NodePools now have [status conditions]({{< relref "../concepts/nodepools/#statusconditions" >}}) that indicate if they are ready. If not, then they will not be considered during scheduling. + * NodeClasses now have [status conditions]({{< relref "../concepts/nodeclasses/#statusconditions" >}}) that indicate if they are ready. If they are not ready, NodePools that reference them through their `nodeClassRef` will not be considered during scheduling. * API Moves: * ExpireAfter has moved from the `NodePool.Spec.Disruption` block to `NodePool.Spec.Template.Spec`, and is now a drift-able field. * `Kubelet` was moved to the EC2NodeClass from the NodePool. 
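NodePool readiness can be spot-checked the same way before expecting a pool to participate in scheduling. A minimal sketch, assuming the v1 CRDs are installed (the `custom-columns` expression is illustrative; adjust resource names to your cluster):

```bash
# List each NodePool with its top-level Ready condition; a NodePool that is not Ready is skipped during scheduling.
kubectl get nodepools -o custom-columns='NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status'
```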
From 6b26b3a8f8edd634d987977b585b8d71af825762 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:28:41 -0700 Subject: [PATCH 13/21] chore(deps): bump the go-deps group with 2 updates (#6796) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 16 ++++++++-------- go.sum | 35 ++++++++++++++++++----------------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index 41b37882c58e..41ab1408f359 100644 --- a/go.mod +++ b/go.mod @@ -18,8 +18,8 @@ require ( github.com/onsi/gomega v1.34.1 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pelletier/go-toml/v2 v2.2.2 - github.com/prometheus/client_golang v1.19.1 - github.com/samber/lo v1.46.0 + github.com/prometheus/client_golang v1.20.0 + github.com/samber/lo v1.47.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 @@ -45,7 +45,7 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/blendle/zapdriver v1.3.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/evanphx/json-patch v5.7.0+incompatible // indirect @@ -73,6 +73,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kelseyhightower/envconfig v1.4.0 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect @@ -82,8 +83,8 @@ require ( github.com/olekukonko/tablewriter v0.0.5 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect - github.com/prometheus/common v0.53.0 // indirect - github.com/prometheus/procfs v0.12.0 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/prometheus/statsd_exporter v0.24.0 // indirect github.com/rivo/uniseg v0.4.4 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect @@ -92,7 +93,7 @@ require ( go.opencensus.io v0.24.0 // indirect go.uber.org/automaxprocs v1.5.3 // indirect golang.org/x/net v0.28.0 // indirect - golang.org/x/oauth2 v0.18.0 // indirect + golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.23.0 // indirect golang.org/x/term v0.23.0 // indirect golang.org/x/text v0.17.0 // indirect @@ -100,12 +101,11 @@ require ( golang.org/x/tools v0.24.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/api v0.146.0 // indirect - google.golang.org/appengine v1.6.8 // indirect google.golang.org/genproto v0.0.0-20231009173412-8bfb1ae86b6c // indirect google.golang.org/genproto/googleapis/api v0.0.0-20231009173412-8bfb1ae86b6c // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20231009173412-8bfb1ae86b6c // indirect google.golang.org/grpc v1.58.3 // indirect - google.golang.org/protobuf v1.34.1 // indirect + google.golang.org/protobuf 
v1.34.2 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 5395ecd1a912..fce34c403587 100644 --- a/go.sum +++ b/go.sum @@ -75,8 +75,8 @@ github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMr github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= @@ -238,6 +238,8 @@ github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dv github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -249,6 +251,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= @@ -295,8 +299,8 @@ github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqr github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.12.2/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= 
github.com/prometheus/client_golang v1.13.0/go.mod h1:vTeo+zgvILHsnnj/39Ou/1fPN5nJFOEMgftOUOmlvYQ= -github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= -github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_golang v1.20.0 h1:jBzTZ7B099Rg24tny+qngoynol8LtVYlA2bqx3vEloI= +github.com/prometheus/client_golang v1.20.0/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -309,16 +313,16 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9 github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.35.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= -github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+aLCE= -github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= -github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= -github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI= github.com/prometheus/statsd_exporter v0.24.0 h1:aZmN6CzS2H1Non1JKZdjkQlAkDtGoQBYIESk2SlU1OI= github.com/prometheus/statsd_exporter v0.24.0/go.mod h1:+dQiRTqn9DnPmN5mI5Xond+k8nuRKzdgh1omxh9OgFY= @@ -332,8 +336,8 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/samber/lo v1.46.0 
h1:w8G+oaCPgz1PoCJztqymCFaKwXt+5cCXn51uPxExFfQ= -github.com/samber/lo v1.46.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= +github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc= +github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= @@ -471,8 +475,8 @@ golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4Iltr golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= -golang.org/x/oauth2 v0.18.0 h1:09qnuIAgzdx1XplqJvW6CQqMCtGZykZWcXzPMPUusvI= -golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -547,7 +551,6 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= @@ -634,8 +637,6 @@ google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= @@ -702,8 +703,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf 
v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= -google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From ca1bdc44566fcbadec5ff994126d206f1d2f90d1 Mon Sep 17 00:00:00 2001 From: avielb-navina <166999213+avielb-navina@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:59:00 +0300 Subject: [PATCH 14/21] fix: right order to environment variable exports (#6798) Co-authored-by: Jonathan Innis --- .../content/en/docs/upgrading/v1-migration.md | 29 ++++++++++++------- .../en/preview/upgrading/v1-migration.md | 29 ++++++++++++------- .../content/en/v1.0/upgrading/v1-migration.md | 29 ++++++++++++------- 3 files changed, 57 insertions(+), 30 deletions(-) diff --git a/website/content/en/docs/upgrading/v1-migration.md b/website/content/en/docs/upgrading/v1-migration.md index 068448f1a414..7f0d3418cdb4 100644 --- a/website/content/en/docs/upgrading/v1-migration.md +++ b/website/content/en/docs/upgrading/v1-migration.md @@ -32,12 +32,12 @@ The upgrade guide will first require upgrading to your latest patch version prio 1. Set environment variables for your cluster to upgrade to the latest patch version of the current Karpenter version you're running on: ```bash - export KARPENTER_NAMESPACE=kube-system - export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov export CLUSTER_NAME="${USER}-karpenter-demo" export AWS_REGION="us-west-2" export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + export KARPENTER_NAMESPACE=kube-system + export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" ``` @@ -106,7 +106,7 @@ The upgrade guide will first require upgrading to your latest patch version prio Notable Changes to the IAM Policy include additional tag-scoping for the `eks:eks-cluster-name` tag for instances and instance profiles. ```bash - TEMPOUT=$(mktemp) + export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -271,12 +271,19 @@ Since both v1beta1 and v1 will be served, `kubectl` will default to returning th 1. 
Set environment variables ```bash -export KARPENTER_NAMESPACE="kube-system" +export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov +export CLUSTER_NAME="${USER}-karpenter-demo" +export AWS_REGION="us-west-2" +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export KARPENTER_NAMESPACE=kube-system +export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +``` + +2. Set Karpenter Version + +```bash # Note: v0.33.6 and v0.34.7 include the v prefix, omit it for versions v0.35+ export KARPENTER_VERSION="" -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -export CLUSTER_NAME="" -export TEMPOUT="$(mktemp)" ``` {{% alert title="Warning" color="warning" %}} @@ -289,10 +296,11 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU {{% /alert %}} -2. Rollback the Karpenter Policy +3. Rollback the Karpenter Policy **v0.33.6 and v0.34.7:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -303,6 +311,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -311,7 +320,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARP --parameter-overrides "ClusterName=${CLUSTER_NAME}" ``` -3. Rollback the CRDs +4. Rollback the CRDs ```bash helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \ @@ -321,7 +330,7 @@ helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-cr --set webhook.port=8443 ``` -4. Rollback the Karpenter Controller +5. Rollback the Karpenter Controller ```bash # Service account annotation can be dropped when using pod identity diff --git a/website/content/en/preview/upgrading/v1-migration.md b/website/content/en/preview/upgrading/v1-migration.md index 068448f1a414..7f0d3418cdb4 100644 --- a/website/content/en/preview/upgrading/v1-migration.md +++ b/website/content/en/preview/upgrading/v1-migration.md @@ -32,12 +32,12 @@ The upgrade guide will first require upgrading to your latest patch version prio 1. 
Set environment variables for your cluster to upgrade to the latest patch version of the current Karpenter version you're running on: ```bash - export KARPENTER_NAMESPACE=kube-system - export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov export CLUSTER_NAME="${USER}-karpenter-demo" export AWS_REGION="us-west-2" export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + export KARPENTER_NAMESPACE=kube-system + export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" ``` @@ -106,7 +106,7 @@ The upgrade guide will first require upgrading to your latest patch version prio Notable Changes to the IAM Policy include additional tag-scoping for the `eks:eks-cluster-name` tag for instances and instance profiles. ```bash - TEMPOUT=$(mktemp) + export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -271,12 +271,19 @@ Since both v1beta1 and v1 will be served, `kubectl` will default to returning th 1. Set environment variables ```bash -export KARPENTER_NAMESPACE="kube-system" +export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov +export CLUSTER_NAME="${USER}-karpenter-demo" +export AWS_REGION="us-west-2" +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export KARPENTER_NAMESPACE=kube-system +export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +``` + +2. Set Karpenter Version + +```bash # Note: v0.33.6 and v0.34.7 include the v prefix, omit it for versions v0.35+ export KARPENTER_VERSION="" -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -export CLUSTER_NAME="" -export TEMPOUT="$(mktemp)" ``` {{% alert title="Warning" color="warning" %}} @@ -289,10 +296,11 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU {{% /alert %}} -2. Rollback the Karpenter Policy +3. Rollback the Karpenter Policy **v0.33.6 and v0.34.7:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -303,6 +311,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -311,7 +320,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARP --parameter-overrides "ClusterName=${CLUSTER_NAME}" ``` -3. Rollback the CRDs +4. 
Rollback the CRDs ```bash helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \ @@ -321,7 +330,7 @@ helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-cr --set webhook.port=8443 ``` -4. Rollback the Karpenter Controller +5. Rollback the Karpenter Controller ```bash # Service account annotation can be dropped when using pod identity diff --git a/website/content/en/v1.0/upgrading/v1-migration.md b/website/content/en/v1.0/upgrading/v1-migration.md index 068448f1a414..7f0d3418cdb4 100644 --- a/website/content/en/v1.0/upgrading/v1-migration.md +++ b/website/content/en/v1.0/upgrading/v1-migration.md @@ -32,12 +32,12 @@ The upgrade guide will first require upgrading to your latest patch version prio 1. Set environment variables for your cluster to upgrade to the latest patch version of the current Karpenter version you're running on: ```bash - export KARPENTER_NAMESPACE=kube-system - export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov export CLUSTER_NAME="${USER}-karpenter-demo" export AWS_REGION="us-west-2" export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" + export KARPENTER_NAMESPACE=kube-system + export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" ``` @@ -106,7 +106,7 @@ The upgrade guide will first require upgrading to your latest patch version prio Notable Changes to the IAM Policy include additional tag-scoping for the `eks:eks-cluster-name` tag for instances and instance profiles. ```bash - TEMPOUT=$(mktemp) + export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -271,12 +271,19 @@ Since both v1beta1 and v1 will be served, `kubectl` will default to returning th 1. Set environment variables ```bash -export KARPENTER_NAMESPACE="kube-system" +export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov +export CLUSTER_NAME="${USER}-karpenter-demo" +export AWS_REGION="us-west-2" +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export KARPENTER_NAMESPACE=kube-system +export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +``` + +2. Set Karpenter Version + +```bash # Note: v0.33.6 and v0.34.7 include the v prefix, omit it for versions v0.35+ export KARPENTER_VERSION="" -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -export CLUSTER_NAME="" -export TEMPOUT="$(mktemp)" ``` {{% alert title="Warning" color="warning" %}} @@ -289,10 +296,11 @@ echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${CLUSTER_NAME}" "${TEMPOU {{% /alert %}} -2. Rollback the Karpenter Policy +3. 
Rollback the Karpenter Policy **v0.33.6 and v0.34.7:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -303,6 +311,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/"${KARPE **v0.35+:** ```bash +export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > ${TEMPOUT} \ && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -311,7 +320,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARP --parameter-overrides "ClusterName=${CLUSTER_NAME}" ``` -3. Rollback the CRDs +4. Rollback the CRDs ```bash helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \ @@ -321,7 +330,7 @@ helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-cr --set webhook.port=8443 ``` -4. Rollback the Karpenter Controller +5. Rollback the Karpenter Controller ```bash # Service account annotation can be dropped when using pod identity From c4015934ec799caa397f799eb0db5d06d1f1ed80 Mon Sep 17 00:00:00 2001 From: Elliot Maincourt Date: Mon, 19 Aug 2024 23:19:13 +0200 Subject: [PATCH 15/21] chore: add support for `nodeSelector` to post-install hook (#6800) --- charts/karpenter/templates/post-install-hook.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/charts/karpenter/templates/post-install-hook.yaml b/charts/karpenter/templates/post-install-hook.yaml index b2fd22824b8d..8110fb38d018 100644 --- a/charts/karpenter/templates/post-install-hook.yaml +++ b/charts/karpenter/templates/post-install-hook.yaml @@ -21,6 +21,10 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} containers: - name: post-install-job image: {{ include "karpenter.postInstallHook.image" . }} From add3933944e0e5815f635d5217c956dbb8044c50 Mon Sep 17 00:00:00 2001 From: Thomas Krisch <8754535+kriths@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:40:44 +0200 Subject: [PATCH 16/21] docs: fix example link to example NodePools (#6811) --- .../en/docs/getting-started/migrating-from-cas/_index.md | 2 +- .../en/preview/getting-started/migrating-from-cas/_index.md | 2 +- .../en/v1.0/getting-started/migrating-from-cas/_index.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/content/en/docs/getting-started/migrating-from-cas/_index.md b/website/content/en/docs/getting-started/migrating-from-cas/_index.md index 6fc30cd8cd85..5ae3fea172ca 100644 --- a/website/content/en/docs/getting-started/migrating-from-cas/_index.md +++ b/website/content/en/docs/getting-started/migrating-from-cas/_index.md @@ -132,7 +132,7 @@ Now that our deployment is ready we can create the karpenter namespace, create t ## Create default NodePool -We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v1.0.0/examples/v1beta1) for specific needs. 
+We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v1.0.0/examples/v1) for specific needs. {{% script file="./content/en/{VERSION}/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh" language="bash" %}} diff --git a/website/content/en/preview/getting-started/migrating-from-cas/_index.md b/website/content/en/preview/getting-started/migrating-from-cas/_index.md index 24b03d368c53..5373c65defa2 100644 --- a/website/content/en/preview/getting-started/migrating-from-cas/_index.md +++ b/website/content/en/preview/getting-started/migrating-from-cas/_index.md @@ -132,7 +132,7 @@ Now that our deployment is ready we can create the karpenter namespace, create t ## Create default NodePool -We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree{{< githubRelRef >}}examples/v1beta1) for specific needs. +We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree{{< githubRelRef >}}examples/v1) for specific needs. {{% script file="./content/en/{VERSION}/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh" language="bash" %}} diff --git a/website/content/en/v1.0/getting-started/migrating-from-cas/_index.md b/website/content/en/v1.0/getting-started/migrating-from-cas/_index.md index 6fc30cd8cd85..5ae3fea172ca 100644 --- a/website/content/en/v1.0/getting-started/migrating-from-cas/_index.md +++ b/website/content/en/v1.0/getting-started/migrating-from-cas/_index.md @@ -132,7 +132,7 @@ Now that our deployment is ready we can create the karpenter namespace, create t ## Create default NodePool -We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v1.0.0/examples/v1beta1) for specific needs. +We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v1.0.0/examples/v1) for specific needs. 
{{% script file="./content/en/{VERSION}/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh" language="bash" %}} From 325423eee571fb6f3ecd7a17a1c5cb05bb168e92 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Tue, 20 Aug 2024 22:48:09 -0700 Subject: [PATCH 17/21] fix: ensure alias version isn't dropped round-trip (#6777) --- pkg/apis/v1/ec2nodeclass_conversion.go | 18 +++++++++++++++++- pkg/apis/v1/ec2nodeclass_conversion_test.go | 14 ++++++++++++++ pkg/apis/v1/labels.go | 2 ++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pkg/apis/v1/ec2nodeclass_conversion.go b/pkg/apis/v1/ec2nodeclass_conversion.go index caa5284d13c3..86ccdf5e6a20 100644 --- a/pkg/apis/v1/ec2nodeclass_conversion.go +++ b/pkg/apis/v1/ec2nodeclass_conversion.go @@ -51,6 +51,17 @@ func (in *EC2NodeClass) ConvertTo(ctx context.Context, to apis.Convertible) erro v1beta1enc.Spec.AMIFamily = lo.ToPtr(in.AMIFamily()) } + if term, ok := lo.Find(in.Spec.AMISelectorTerms, func(term AMISelectorTerm) bool { + return term.Alias != "" + }); ok { + version := AMIVersionFromAlias(term.Alias) + if version != "latest" { + v1beta1enc.Annotations = lo.Assign(v1beta1enc.Annotations, map[string]string{ + AnnotationAliasVersionCompatibilityKey: version, + }) + } + } + in.Spec.convertTo(&v1beta1enc.Spec) in.Status.convertTo((&v1beta1enc.Status)) return nil @@ -127,6 +138,7 @@ func (in *EC2NodeClassStatus) convertTo(v1beta1enc *v1beta1.EC2NodeClassStatus) func (in *EC2NodeClass) ConvertFrom(ctx context.Context, from apis.Convertible) error { v1beta1enc := from.(*v1beta1.EC2NodeClass) in.ObjectMeta = v1beta1enc.ObjectMeta + in.Annotations = lo.OmitByKeys(in.Annotations, []string{AnnotationAliasVersionCompatibilityKey}) switch lo.FromPtr(v1beta1enc.Spec.AMIFamily) { case AMIFamilyAL2, AMIFamilyAL2023, AMIFamilyBottlerocket, AMIFamilyWindows2019, AMIFamilyWindows2022: @@ -135,7 +147,11 @@ func (in *EC2NodeClass) ConvertFrom(ctx context.Context, from apis.Convertible) if len(v1beta1enc.Spec.AMISelectorTerms) == 0 { in.Spec.AMIFamily = nil in.Spec.AMISelectorTerms = []AMISelectorTerm{{ - Alias: fmt.Sprintf("%s@latest", strings.ToLower(lo.FromPtr(v1beta1enc.Spec.AMIFamily))), + Alias: fmt.Sprintf( + "%s@%s", + strings.ToLower(lo.FromPtr(v1beta1enc.Spec.AMIFamily)), + lo.ValueOr(v1beta1enc.Annotations, AnnotationAliasVersionCompatibilityKey, "latest"), + ), }} } else { in.Spec.AMIFamily = v1beta1enc.Spec.AMIFamily diff --git a/pkg/apis/v1/ec2nodeclass_conversion_test.go b/pkg/apis/v1/ec2nodeclass_conversion_test.go index 4c25eec22ca7..a598e72f030b 100644 --- a/pkg/apis/v1/ec2nodeclass_conversion_test.go +++ b/pkg/apis/v1/ec2nodeclass_conversion_test.go @@ -121,6 +121,12 @@ var _ = Describe("Convert v1 to v1beta1 EC2NodeClass API", func() { Expect(v1ec2nodeclass.ConvertTo(ctx, v1beta1ec2nodeclass)).To(Succeed()) Expect(lo.FromPtr(v1beta1ec2nodeclass.Spec.AMIFamily)).To(Equal(v1beta1.AMIFamilyAL2023)) }) + It("should convert v1 ec2nodeclass alias (pinned)", func() { + v1ec2nodeclass.Spec.AMISelectorTerms = []AMISelectorTerm{{Alias: "al2023@v20240807"}} + Expect(v1ec2nodeclass.ConvertTo(ctx, v1beta1ec2nodeclass)).To(Succeed()) + Expect(lo.FromPtr(v1beta1ec2nodeclass.Spec.AMIFamily)).To(Equal(v1beta1.AMIFamilyAL2023)) + Expect(v1beta1ec2nodeclass.Annotations).To(HaveKeyWithValue(AnnotationAliasVersionCompatibilityKey, "v20240807")) + }) It("should convert v1 ec2nodeclass ami selector terms with the Ubuntu compatibility annotation", func() { v1ec2nodeclass.Annotations = lo.Assign(v1ec2nodeclass.Annotations, map[string]string{ 
AnnotationUbuntuCompatibilityKey: fmt.Sprintf("%s,%s", AnnotationUbuntuCompatibilityAMIFamily, AnnotationUbuntuCompatibilityBlockDeviceMappings), @@ -402,6 +408,14 @@ var _ = Describe("Convert v1beta1 to v1 EC2NodeClass API", func() { Expect(v1ec2nodeclass.ConvertFrom(ctx, v1beta1ec2nodeclass)).To(Succeed()) Expect(v1ec2nodeclass.Spec.AMISelectorTerms).To(ContainElement(AMISelectorTerm{Alias: "al2023@latest"})) }) + It("should convert v1beta1 ec2nodeclass ami family (alias version annotation)", func() { + v1beta1ec2nodeclass.Spec.AMIFamily = &v1beta1.AMIFamilyAL2023 + v1beta1ec2nodeclass.Annotations = lo.Assign(v1beta1ec2nodeclass.Annotations, map[string]string{ + AnnotationAliasVersionCompatibilityKey: "v20240807", + }) + Expect(v1ec2nodeclass.ConvertFrom(ctx, v1beta1ec2nodeclass)).To(Succeed()) + Expect(v1ec2nodeclass.Spec.AMISelectorTerms).To(ContainElement(AMISelectorTerm{Alias: "al2023@v20240807"})) + }) It("should convert v1beta1 ec2nodeclass ami family with non-custom ami family and ami selector terms", func() { v1beta1ec2nodeclass.Spec.AMIFamily = &v1beta1.AMIFamilyAL2023 v1beta1ec2nodeclass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ diff --git a/pkg/apis/v1/labels.go b/pkg/apis/v1/labels.go index c391b0faab01..9ffa96c0d925 100644 --- a/pkg/apis/v1/labels.go +++ b/pkg/apis/v1/labels.go @@ -131,6 +131,8 @@ var ( AnnotationUbuntuCompatibilityAMIFamily = "amiFamily" AnnotationUbuntuCompatibilityBlockDeviceMappings = "blockDeviceMappings" + AnnotationAliasVersionCompatibilityKey = apis.CompatibilityGroup + "/v1-alias-version" + TagNodeClaim = coreapis.Group + "/nodeclaim" TagManagedLaunchTemplate = apis.Group + "/cluster" TagName = "Name" From c0b1c42f03976e6099fe56780bc9e83ab96dd600 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Tue, 20 Aug 2024 22:48:56 -0700 Subject: [PATCH 18/21] docs: add troubleshooting note on VM_MEMORY_OVERHEAD_PERCENT (#6805) --- website/content/en/docs/troubleshooting.md | 44 +++++++++++++++++++ website/content/en/preview/troubleshooting.md | 44 +++++++++++++++++++ website/content/en/v0.32/troubleshooting.md | 22 ++++++++++ website/content/en/v0.36/troubleshooting.md | 22 ++++++++++ website/content/en/v0.37/troubleshooting.md | 22 ++++++++++ website/content/en/v1.0/troubleshooting.md | 44 +++++++++++++++++++ 6 files changed, 198 insertions(+) diff --git a/website/content/en/docs/troubleshooting.md b/website/content/en/docs/troubleshooting.md index c51d4f341ef5..bebc0625c5a3 100644 --- a/website/content/en/docs/troubleshooting.md +++ b/website/content/en/docs/troubleshooting.md @@ -325,6 +325,50 @@ then the following solution(s) may resolve your issue. ... ``` +### Karpenter incorrectly computes available resources for a node + +When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`). +Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type. +The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`. 
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
+
+To detect instances of Karpenter overestimating resource availability, the following status condition can be monitored:
+
+```bash
+$ kubectl get nodeclaim $NODECLAIM_NAME -o jsonpath='{.status.conditions[?(@.type=="ConsistentStateFound")]}'
+```
+
+```json
+{
+    "type": "ConsistentStateFound",
+    "status": "False",
+    "reason": "ConsistencyCheckFailed",
+    "message": "Consistency Check Failed",
+    "lastTransitionTime": "2024-08-19T20:02:16Z"
+}
+```
+
+This can be spot-checked as shown above, or monitored via the following metric:
+
+```
+operator_status_condition_count{type="ConsistentStateFound",kind="NodeClaim",status="False"}
+```
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned
diff --git a/website/content/en/preview/troubleshooting.md b/website/content/en/preview/troubleshooting.md
index c51d4f341ef5..bebc0625c5a3 100644
--- a/website/content/en/preview/troubleshooting.md
+++ b/website/content/en/preview/troubleshooting.md
@@ -325,6 +325,50 @@ then the following solution(s) may resolve your issue.
   ...
   ```
 
+### Karpenter incorrectly computes available resources for a node
+
+When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`).
+Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type.
+The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`.
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
+
+To detect instances of Karpenter overestimating resource availability, the following status condition can be monitored:
+
+```bash
+$ kubectl get nodeclaim $NODECLAIM_NAME -o jsonpath='{.status.conditions[?(@.type=="ConsistentStateFound")]}'
+```
+
+```json
+{
+    "type": "ConsistentStateFound",
+    "status": "False",
+    "reason": "ConsistencyCheckFailed",
+    "message": "Consistency Check Failed",
+    "lastTransitionTime": "2024-08-19T20:02:16Z"
+}
+```
+
+This can be spot-checked as shown above, or monitored via the following metric:
+
+```
+operator_status_condition_count{type="ConsistentStateFound",kind="NodeClaim",status="False"}
+```
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned
diff --git a/website/content/en/v0.32/troubleshooting.md b/website/content/en/v0.32/troubleshooting.md
index d0c79437f962..b4744d02da9c 100644
--- a/website/content/en/v0.32/troubleshooting.md
+++ b/website/content/en/v0.32/troubleshooting.md
@@ -391,6 +391,28 @@ then the following solution(s) may resolve your issue.
   ...
   ```
 
+### Karpenter incorrectly computes available resources for a node
+
+When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`).
+Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type.
+The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`.
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned
diff --git a/website/content/en/v0.36/troubleshooting.md b/website/content/en/v0.36/troubleshooting.md
index 6b362e504e86..ed3dc4a21c6d 100644
--- a/website/content/en/v0.36/troubleshooting.md
+++ b/website/content/en/v0.36/troubleshooting.md
@@ -403,6 +403,28 @@ then the following solution(s) may resolve your issue.
 ...
 ```
 
+### Karpenter incorrectly computes available resources for a node
+
+When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`).
+Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type.
+The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`.
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned
diff --git a/website/content/en/v0.37/troubleshooting.md b/website/content/en/v0.37/troubleshooting.md
index 6b362e504e86..ed3dc4a21c6d 100644
--- a/website/content/en/v0.37/troubleshooting.md
+++ b/website/content/en/v0.37/troubleshooting.md
@@ -403,6 +403,28 @@ then the following solution(s) may resolve your issue.
 ...
 ```
 
+### Karpenter incorrectly computes available resources for a node
+
+When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`).
+Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type.
+The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`.
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned
diff --git a/website/content/en/v1.0/troubleshooting.md b/website/content/en/v1.0/troubleshooting.md
index c51d4f341ef5..bebc0625c5a3 100644
--- a/website/content/en/v1.0/troubleshooting.md
+++ b/website/content/en/v1.0/troubleshooting.md
@@ -325,6 +325,50 @@ then the following solution(s) may resolve your issue.
 ...
 ```
 
+### Karpenter incorrectly computes available resources for a node
+
+When creating nodes, the allocatable resources Karpenter computed (as seen in logs and `nodeClaim.status.allocatable`) do not always match the allocatable resources on the created node (`node.status.allocatable`).
+Karpenter uses the results from `ec2:DescribeInstanceTypes` to determine the resources available on a node launched with a given instance type.
+The following computation is used to determine allocatable CPU, memory, and ephemeral storage based on the results returned from `ec2:DescribeInstanceTypes`.
+
+```
+nodeClaim.allocatable.cpu = instance.cpu - kubeReserved.cpu - systemReserved.cpu
+nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT)) - kubeReserved.memory - systemReserved.memory - max(evictionSoft.memory.available, evictionHard.memory.available)
+nodeClaim.allocatable.ephemeralStorage = instance.storage - kubeReserved.ephemeralStorage - systemReserved.ephemeralStorage - max(evictionSoft.nodefs.available, evictionHard.nodefs.available)
+```
+
+Most of these factors directly model user configuration (i.e. the KubeletConfiguration options).
+On the other hand, `VM_MEMORY_OVERHEAD_PERCENT` models an implicit reduction of available memory that varies by instance type and AMI.
+Karpenter can't compute the exact value being modeled, so `VM_MEMORY_OVERHEAD_PERCENT` is a [global setting]({{< ref "./reference/settings.md" >}}) used across all instance type and AMI combinations.
+The default value (`7.5%`) has been tuned to closely match reality for the majority of instance types while not overestimating.
+As a result, Karpenter will typically underestimate the memory available on a node for a given instance type.
+If you know the real `VM_MEMORY_OVERHEAD_PERCENT` for the specific instances you're provisioning in your cluster, you can tune this value to tighten the bound.
+However, this should be done with caution.
+A `VM_MEMORY_OVERHEAD_PERCENT` which results in Karpenter overestimating the memory available on a node can result in Karpenter launching nodes which are too small for your workload.
+In the worst case, this can result in an instance launch loop and your workload remaining unschedulable indefinitely.
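+
+As a rough, hypothetical illustration of the memory formula above (this is a sketch, not Karpenter source code; the reserved and eviction values are assumptions that you should replace with your own KubeletConfiguration settings), the following Go snippet estimates allocatable memory for an instance type reporting 8192Mi of memory with the default `VM_MEMORY_OVERHEAD_PERCENT` of 7.5%:
+
+```go
+package main
+
+import "fmt"
+
+const (
+	instanceMemoryMi        = 8192.0 // hypothetical instance type with 8 GiB of memory
+	vmMemoryOverheadPercent = 0.075  // the default global setting (7.5%)
+	kubeReservedMemoryMi    = 893.0  // assumed kubeReserved.memory
+	systemReservedMemoryMi  = 100.0  // assumed systemReserved.memory
+	evictionSoftMemoryMi    = 0.0    // assumed evictionSoft.memory.available
+	evictionHardMemoryMi    = 100.0  // assumed evictionHard.memory.available
+)
+
+func main() {
+	// nodeClaim.allocatable.memory = (instance.memory * (1.0 - VM_MEMORY_OVERHEAD_PERCENT))
+	//   - kubeReserved.memory - systemReserved.memory - max(evictionSoft, evictionHard)
+	allocatable := instanceMemoryMi*(1.0-vmMemoryOverheadPercent) -
+		kubeReservedMemoryMi - systemReservedMemoryMi -
+		max(evictionSoftMemoryMi, evictionHardMemoryMi)
+	fmt.Printf("estimated nodeClaim.allocatable.memory: %.1f Mi\n", allocatable) // 6484.6 Mi
+}
+```
+
+Raising `VM_MEMORY_OVERHEAD_PERCENT` in this sketch lowers the estimate (a safer underestimate), while lowering it raises the estimate and, if it drops below the real overhead, reproduces the overestimation problem described above.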
+
+To detect instances of Karpenter overestimating resource availability, the following status condition can be monitored:
+
+```bash
+$ kubectl get nodeclaim $NODECLAIM_NAME -o jsonpath='{.status.conditions[?(@.type=="ConsistentStateFound")]}'
+```
+
+```json
+{
+  "type": "ConsistentStateFound",
+  "status": "False",
+  "reason": "ConsistencyCheckFailed",
+  "message": "Consistency Check Failed",
+  "lastTransitionTime": "2024-08-19T20:02:16Z"
+}
+```
+
+This can be spot-checked as shown above, or monitored via the following metric:
+
+```
+operator_status_condition_count{type="ConsistentStateFound",kind="NodeClaim",status="False"}
+```
+
 ## Deprovisioning
 
 ### Nodes not deprovisioned

From 28da0b96b6086679f75e656d31ac65bd7fca2bc0 Mon Sep 17 00:00:00 2001
From: Ruben Laguna
Date: Wed, 21 Aug 2024 08:11:57 +0200
Subject: [PATCH 19/21] docs: clarify terminationGracePeriod (#6783)

Co-authored-by: Jonathan Innis
Co-authored-by: Jonathan Innis
---
 website/content/en/docs/concepts/disruption.md    | 2 +-
 website/content/en/preview/concepts/disruption.md | 2 +-
 website/content/en/v1.0/concepts/disruption.md    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/content/en/docs/concepts/disruption.md b/website/content/en/docs/concepts/disruption.md
index cccb8d297a8c..e1833a5e43f7 100644
--- a/website/content/en/docs/concepts/disruption.md
+++ b/website/content/en/docs/concepts/disruption.md
@@ -202,7 +202,7 @@ To enable interruption handling, configure the `--interruption-queue` CLI argume
 
 ### TerminationGracePeriod
 
-This is the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted. Pods will be deleted preemptively based on its TerminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to cleanup as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to cleanup.
+You can set a NodePool's `terminationGracePeriod` through the `spec.template.spec.terminationGracePeriod` field. This field defines the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted. Pods will be deleted preemptively based on their terminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to clean up as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to clean up.
 
 This is especially useful in combination with `nodepool.spec.template.spec.expireAfter` to define an absolute maximum on the lifetime of a node, where a node is deleted at `expireAfter` and finishes draining within the `terminationGracePeriod` thereafter. Pods blocking eviction like PDBs and do-not-disrupt will block full draining until the `terminationGracePeriod` is reached.
 
diff --git a/website/content/en/preview/concepts/disruption.md b/website/content/en/preview/concepts/disruption.md
index cccb8d297a8c..e1833a5e43f7 100644
--- a/website/content/en/preview/concepts/disruption.md
+++ b/website/content/en/preview/concepts/disruption.md
@@ -202,7 +202,7 @@ To enable interruption handling, configure the `--interruption-queue` CLI argume
 
 ### TerminationGracePeriod
 
-This is the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted.
Pods will be deleted preemptively based on its TerminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to cleanup as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to cleanup.
+You can set a NodePool's `terminationGracePeriod` through the `spec.template.spec.terminationGracePeriod` field. This field defines the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted. Pods will be deleted preemptively based on their terminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to clean up as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to clean up.
 
 This is especially useful in combination with `nodepool.spec.template.spec.expireAfter` to define an absolute maximum on the lifetime of a node, where a node is deleted at `expireAfter` and finishes draining within the `terminationGracePeriod` thereafter. Pods blocking eviction like PDBs and do-not-disrupt will block full draining until the `terminationGracePeriod` is reached.
 
diff --git a/website/content/en/v1.0/concepts/disruption.md b/website/content/en/v1.0/concepts/disruption.md
index cccb8d297a8c..e1833a5e43f7 100644
--- a/website/content/en/v1.0/concepts/disruption.md
+++ b/website/content/en/v1.0/concepts/disruption.md
@@ -202,7 +202,7 @@ To enable interruption handling, configure the `--interruption-queue` CLI argume
 
 ### TerminationGracePeriod
 
-This is the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted. Pods will be deleted preemptively based on its TerminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to cleanup as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to cleanup.
+You can set a NodePool's `terminationGracePeriod` through the `spec.template.spec.terminationGracePeriod` field. This field defines the duration of time that a node can be draining before it's forcibly deleted. A node begins draining when it's deleted. Pods will be deleted preemptively based on their terminationGracePeriodSeconds before this terminationGracePeriod ends to give as much time to clean up as possible. Note that if your pod's terminationGracePeriodSeconds is larger than this terminationGracePeriod, Karpenter may forcibly delete the pod before it has its full terminationGracePeriod to clean up.
 
 This is especially useful in combination with `nodepool.spec.template.spec.expireAfter` to define an absolute maximum on the lifetime of a node, where a node is deleted at `expireAfter` and finishes draining within the `terminationGracePeriod` thereafter. Pods blocking eviction like PDBs and do-not-disrupt will block full draining until the `terminationGracePeriod` is reached.
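+
+As a small worked example of this timing (the durations below are hypothetical, not defaults, and the snippet only illustrates the arithmetic described above rather than any Karpenter API): a pod with a 5-minute `terminationGracePeriodSeconds` on a node with a 30-minute `terminationGracePeriod` starts being deleted 25 minutes after the node begins draining, so it still receives its full grace period before the node's deadline.
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+func main() {
+	nodeTerminationGracePeriod := 30 * time.Minute // hypothetical NodePool terminationGracePeriod
+	podGracePeriod := 300 * time.Second            // hypothetical pod terminationGracePeriodSeconds
+
+	// The pod is deleted at T = node timeout - pod terminationGracePeriodSeconds,
+	// measured from when the node starts draining.
+	deleteAt := nodeTerminationGracePeriod - podGracePeriod
+	fmt.Printf("pod deletion begins %s after the node starts draining\n", deleteAt) // 25m0s
+}
+```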
From eded27e7d12398aa227775fa0cd9986d08e51fe6 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 21 Aug 2024 10:05:38 -0700 Subject: [PATCH 20/21] Add support for pulling instance types from multiple regions (#6815) --- .../compatibility.yaml | 6 +- hack/docs/instancetypes_gen/main.go | 141 +++++++------ hack/docs/version_compatibility_gen/main.go | 4 +- .../en/preview/reference/instance-types.md | 191 +++++++++++++++--- .../en/preview/upgrading/compatibility.md | 4 +- 5 files changed, 247 insertions(+), 99 deletions(-) diff --git a/hack/docs/compatibilitymatrix_gen/compatibility.yaml b/hack/docs/compatibilitymatrix_gen/compatibility.yaml index 19b0701ff966..47e7d5d82964 100644 --- a/hack/docs/compatibilitymatrix_gen/compatibility.yaml +++ b/hack/docs/compatibilitymatrix_gen/compatibility.yaml @@ -45,12 +45,12 @@ compatibility: - appVersion: 0.35.x minK8sVersion: 1.23 maxK8sVersion: 1.29 - - appVersion: 0.36.0 + - appVersion: 0.36.x minK8sVersion: 1.23 maxK8sVersion: 1.29 - - appVersion: 0.37.0 + - appVersion: 0.37.x minK8sVersion: 1.23 maxK8sVersion: 1.30 - - appVersion: 1.0.0 + - appVersion: 1.0.x minK8sVersion: 1.25 maxK8sVersion: 1.30 \ No newline at end of file diff --git a/hack/docs/instancetypes_gen/main.go b/hack/docs/instancetypes_gen/main.go index e5cafcde2803..3a08e342807f 100644 --- a/hack/docs/instancetypes_gen/main.go +++ b/hack/docs/instancetypes_gen/main.go @@ -23,25 +23,29 @@ import ( "sort" "strings" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/patrickmn/go-cache" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/manager" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - coreoperator "sigs.k8s.io/karpenter/pkg/operator" coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" coretest "sigs.k8s.io/karpenter/pkg/test" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" - "github.com/aws/karpenter-provider-aws/pkg/operator" + awscache "github.com/aws/karpenter-provider-aws/pkg/cache" "github.com/aws/karpenter-provider-aws/pkg/operator/options" + "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" + "github.com/aws/karpenter-provider-aws/pkg/providers/pricing" + "github.com/aws/karpenter-provider-aws/pkg/providers/subnet" "github.com/aws/karpenter-provider-aws/pkg/test" "sigs.k8s.io/karpenter/pkg/cloudprovider" @@ -83,7 +87,6 @@ func main() { lo.Must0(os.Setenv("SYSTEM_NAMESPACE", "karpenter")) lo.Must0(os.Setenv("AWS_SDK_LOAD_CONFIG", "true")) - lo.Must0(os.Setenv("AWS_REGION", "us-east-1")) ctx := coreoptions.ToContext(context.Background(), coretest.Options()) ctx = options.ToContext(ctx, test.Options(test.OptionsFields{ @@ -92,46 +95,6 @@ func main() { IsolatedVPC: lo.ToPtr(true), // disable pricing lookup })) - ctx, op := operator.NewOperator(ctx, &coreoperator.Operator{ - Manager: &FakeManager{}, - KubernetesInterface: kubernetes.NewForConfigOrDie(&rest.Config{}), - }) - if err := op.InstanceTypesProvider.UpdateInstanceTypes(ctx); err != nil { - log.Fatalf("updating instance types, %s", err) - } - if err := op.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx); err != nil { - log.Fatalf("updating instance 
types offerings, %s", err) - } - // Fake a NodeClass so we can use it to get InstanceTypes - nodeClass := &v1.EC2NodeClass{ - Spec: v1.EC2NodeClassSpec{ - AMISelectorTerms: []v1.AMISelectorTerm{{ - Alias: "al2023@latest", - }}, - SubnetSelectorTerms: []v1.SubnetSelectorTerm{ - { - Tags: map[string]string{ - "*": "*", - }, - }, - }, - }, - } - subnets, err := op.SubnetProvider.List(ctx, nodeClass) - if err != nil { - log.Fatalf("listing subnets, %s", err) - } - nodeClass.Status.Subnets = lo.Map(subnets, func(ec2subnet *ec2.Subnet, _ int) v1.Subnet { - return v1.Subnet{ - ID: *ec2subnet.SubnetId, - Zone: *ec2subnet.AvailabilityZone, - } - }) - instanceTypes, err := op.InstanceTypesProvider.List(ctx, &v1.KubeletConfiguration{}, nodeClass) - if err != nil { - log.Fatalf("listing instance types, %s", err) - } - outputFileName := flag.Arg(0) f, err := os.Create(outputFileName) if err != nil { @@ -154,21 +117,80 @@ below are the resources available with some assumptions and after the instance o - `+"`blockDeviceMappings` are not configured"+` - `+"`amiFamily` is set to `AL2023`") - // generate a map of family -> instance types along with some other sorted lists. The sorted lists ensure we + // generate a map of family -> map[instance type name]instance types along with some other sorted lists. The sorted lists ensure we // generate consistent docs every run. - families := map[string][]*cloudprovider.InstanceType{} - labelNameMap := sets.String{} - resourceNameMap := sets.String{} - for _, it := range instanceTypes { - familyName := strings.Split(it.Name, ".")[0] - families[familyName] = append(families[familyName], it) - for labelName := range it.Requirements { - labelNameMap.Insert(labelName) + families := map[string]map[string]*cloudprovider.InstanceType{} + labelNameMap := sets.New[string]() + resourceNameMap := sets.New[string]() + + // Iterate through regions and take the union of instance types we discover across both + for _, region := range []string{"us-east-1", "us-west-2"} { + sess := session.Must(session.NewSession(&aws.Config{Region: lo.ToPtr(region)})) + ec2api := ec2.New(sess) + subnetProvider := subnet.NewDefaultProvider(ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AvailableIPAddressTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AssociatePublicIPAddressTTL, awscache.DefaultCleanupInterval)) + instanceTypeProvider := instancetype.NewDefaultProvider( + region, + cache.New(awscache.InstanceTypesAndZonesTTL, awscache.DefaultCleanupInterval), + ec2api, + subnetProvider, + awscache.NewUnavailableOfferings(), + pricing.NewDefaultProvider( + ctx, + pricing.NewAPI(sess, *sess.Config.Region), + ec2api, + *sess.Config.Region, + ), + ) + if err = instanceTypeProvider.UpdateInstanceTypes(ctx); err != nil { + log.Fatalf("updating instance types, %s", err) } - for resourceName := range it.Capacity { - resourceNameMap.Insert(string(resourceName)) + if err = instanceTypeProvider.UpdateInstanceTypeOfferings(ctx); err != nil { + log.Fatalf("updating instance types offerings, %s", err) + } + // Fake a NodeClass so we can use it to get InstanceTypes + nodeClass := &v1.EC2NodeClass{ + Spec: v1.EC2NodeClassSpec{ + AMISelectorTerms: []v1.AMISelectorTerm{{ + Alias: "al2023@latest", + }}, + SubnetSelectorTerms: []v1.SubnetSelectorTerm{ + { + Tags: map[string]string{ + "*": "*", + }, + }, + }, + }, + } + subnets, err := subnetProvider.List(ctx, nodeClass) + if err != nil { + log.Fatalf("listing subnets, %s", err) + } + nodeClass.Status.Subnets = 
lo.Map(subnets, func(ec2subnet *ec2.Subnet, _ int) v1.Subnet { + return v1.Subnet{ + ID: *ec2subnet.SubnetId, + Zone: *ec2subnet.AvailabilityZone, + } + }) + instanceTypes, err := instanceTypeProvider.List(ctx, &v1.KubeletConfiguration{}, nodeClass) + if err != nil { + log.Fatalf("listing instance types, %s", err) + } + for _, it := range instanceTypes { + familyName := strings.Split(it.Name, ".")[0] + if _, ok := families[familyName]; !ok { + families[familyName] = map[string]*cloudprovider.InstanceType{} + } + families[familyName][it.Name] = it + for labelName := range it.Requirements { + labelNameMap.Insert(labelName) + } + for resourceName := range it.Capacity { + resourceNameMap.Insert(string(resourceName)) + } } } + familyNames := lo.Keys(families) sort.Strings(familyNames) @@ -186,10 +208,11 @@ below are the resources available with some assumptions and after the instance o for _, familyName := range familyNames { fmt.Fprintf(f, "## %s Family\n", familyName) + instanceTypes := lo.MapToSlice(families[familyName], func(_ string, it *cloudprovider.InstanceType) *cloudprovider.InstanceType { return it }) // sort the instance types within the family, we sort by CPU and memory which should be a pretty good ordering - sort.Slice(families[familyName], func(a, b int) bool { - lhs := families[familyName][a] - rhs := families[familyName][b] + sort.Slice(instanceTypes, func(a, b int) bool { + lhs := instanceTypes[a] + rhs := instanceTypes[b] lhsResources := lhs.Capacity rhsResources := rhs.Capacity if cpuCmp := resources.Cmp(*lhsResources.Cpu(), *rhsResources.Cpu()); cpuCmp != 0 { @@ -201,7 +224,7 @@ below are the resources available with some assumptions and after the instance o return lhs.Name < rhs.Name }) - for _, it := range families[familyName] { + for _, it := range instanceTypes { fmt.Fprintf(f, "### `%s`\n", it.Name) minusOverhead := resources.Subtract(it.Capacity, it.Overhead.Total()) fmt.Fprintln(f, "#### Labels") diff --git a/hack/docs/version_compatibility_gen/main.go b/hack/docs/version_compatibility_gen/main.go index d64ca8180cdd..9f1248a71025 100644 --- a/hack/docs/version_compatibility_gen/main.go +++ b/hack/docs/version_compatibility_gen/main.go @@ -32,10 +32,10 @@ func main() { os.Exit(0) } - v := strings.TrimPrefix(os.Args[2], "v") + v := strings.TrimSuffix(strings.TrimPrefix(os.Args[2], "v"), ".0") appendVersion := fmt.Sprintf( ` - - appVersion: %s + - appVersion: %s.x minK8sVersion: %s maxK8sVersion: %s`, v, diff --git a/website/content/en/preview/reference/instance-types.md b/website/content/en/preview/reference/instance-types.md index 8b9ebc3c8438..d11285618aee 100644 --- a/website/content/en/preview/reference/instance-types.md +++ b/website/content/en/preview/reference/instance-types.md @@ -4646,6 +4646,132 @@ below are the resources available with some assumptions and after the instance o |pods|737| |vpc.amazonaws.com/efa|1| |vpc.amazonaws.com/pod-eni|107| +## c7i-flex Family +### `c7i-flex.large` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|c| + |karpenter.k8s.aws/instance-cpu|2| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|10000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|c7i-flex| + |karpenter.k8s.aws/instance-generation|7| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|4096| + |karpenter.k8s.aws/instance-network-bandwidth|390| + |karpenter.k8s.aws/instance-size|large| + 
|kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|c7i-flex.large| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|1930m| + |ephemeral-storage|17Gi| + |memory|3114Mi| + |pods|29| +### `c7i-flex.xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|c| + |karpenter.k8s.aws/instance-cpu|4| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|10000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|c7i-flex| + |karpenter.k8s.aws/instance-generation|7| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|8192| + |karpenter.k8s.aws/instance-network-bandwidth|781| + |karpenter.k8s.aws/instance-size|xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|c7i-flex.xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|3920m| + |ephemeral-storage|17Gi| + |memory|6584Mi| + |pods|58| +### `c7i-flex.2xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|c| + |karpenter.k8s.aws/instance-cpu|8| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|10000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|c7i-flex| + |karpenter.k8s.aws/instance-generation|7| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|16384| + |karpenter.k8s.aws/instance-network-bandwidth|1562| + |karpenter.k8s.aws/instance-size|2xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|c7i-flex.2xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|7910m| + |ephemeral-storage|17Gi| + |memory|14162Mi| + |pods|58| +### `c7i-flex.4xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|c| + |karpenter.k8s.aws/instance-cpu|16| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|10000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|c7i-flex| + |karpenter.k8s.aws/instance-generation|7| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|32768| + |karpenter.k8s.aws/instance-network-bandwidth|3125| + |karpenter.k8s.aws/instance-size|4xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|c7i-flex.4xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|15890m| + |ephemeral-storage|17Gi| + |memory|27381Mi| + |pods|234| +### `c7i-flex.8xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-category|c| + |karpenter.k8s.aws/instance-cpu|32| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|10000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|c7i-flex| + |karpenter.k8s.aws/instance-generation|7| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|65536| + |karpenter.k8s.aws/instance-network-bandwidth|6250| + |karpenter.k8s.aws/instance-size|8xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|c7i-flex.8xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |cpu|31850m| + |ephemeral-storage|17Gi| + |memory|57691Mi| + 
|pods|234| ## d2 Family ### `d2.xlarge` #### Labels @@ -5050,6 +5176,38 @@ below are the resources available with some assumptions and after the instance o |pods|737| |vpc.amazonaws.com/efa|4| |vpc.amazonaws.com/pod-eni|62| +## dl2q Family +### `dl2q.24xlarge` +#### Labels + | Label | Value | + |--|--| + |karpenter.k8s.aws/instance-accelerator-count|8| + |karpenter.k8s.aws/instance-accelerator-manufacturer|qualcomm| + |karpenter.k8s.aws/instance-accelerator-name|qualcomm-ai100| + |karpenter.k8s.aws/instance-category|dl| + |karpenter.k8s.aws/instance-cpu|96| + |karpenter.k8s.aws/instance-cpu-manufacturer|intel| + |karpenter.k8s.aws/instance-ebs-bandwidth|19000| + |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| + |karpenter.k8s.aws/instance-family|dl2q| + |karpenter.k8s.aws/instance-generation|2| + |karpenter.k8s.aws/instance-hypervisor|nitro| + |karpenter.k8s.aws/instance-memory|786432| + |karpenter.k8s.aws/instance-network-bandwidth|100000| + |karpenter.k8s.aws/instance-size|24xlarge| + |kubernetes.io/arch|amd64| + |kubernetes.io/os|linux| + |node.kubernetes.io/instance-type|dl2q.24xlarge| +#### Resources + | Resource | Quantity | + |--|--| + |aws.amazon.com/neuron|8| + |cpu|95690m| + |ephemeral-storage|17Gi| + |memory|718987Mi| + |pods|737| + |vpc.amazonaws.com/efa|1| + |vpc.amazonaws.com/pod-eni|107| ## f1 Family ### `f1.2xlarge` #### Labels @@ -13637,39 +13795,6 @@ below are the resources available with some assumptions and after the instance o |pods|737| |vpc.amazonaws.com/efa|4| |vpc.amazonaws.com/pod-eni|62| -## p4de Family -### `p4de.24xlarge` -#### Labels - | Label | Value | - |--|--| - |karpenter.k8s.aws/instance-category|p| - |karpenter.k8s.aws/instance-cpu|96| - |karpenter.k8s.aws/instance-cpu-manufacturer|intel| - |karpenter.k8s.aws/instance-ebs-bandwidth|19000| - |karpenter.k8s.aws/instance-encryption-in-transit-supported|true| - |karpenter.k8s.aws/instance-family|p4de| - |karpenter.k8s.aws/instance-generation|4| - |karpenter.k8s.aws/instance-gpu-count|8| - |karpenter.k8s.aws/instance-gpu-manufacturer|nvidia| - |karpenter.k8s.aws/instance-gpu-memory|81920| - |karpenter.k8s.aws/instance-gpu-name|a100| - |karpenter.k8s.aws/instance-hypervisor|nitro| - |karpenter.k8s.aws/instance-local-nvme|8000| - |karpenter.k8s.aws/instance-memory|1179648| - |karpenter.k8s.aws/instance-network-bandwidth|400000| - |karpenter.k8s.aws/instance-size|24xlarge| - |kubernetes.io/arch|amd64| - |kubernetes.io/os|linux| - |node.kubernetes.io/instance-type|p4de.24xlarge| -#### Resources - | Resource | Quantity | - |--|--| - |cpu|95690m| - |ephemeral-storage|17Gi| - |memory|1082712Mi| - |nvidia.com/gpu|8| - |pods|737| - |vpc.amazonaws.com/efa|4| ## p5 Family ### `p5.48xlarge` #### Labels diff --git a/website/content/en/preview/upgrading/compatibility.md b/website/content/en/preview/upgrading/compatibility.md index 01d6b0541c37..266c89cbc319 100644 --- a/website/content/en/preview/upgrading/compatibility.md +++ b/website/content/en/preview/upgrading/compatibility.md @@ -15,8 +15,8 @@ Before you begin upgrading Karpenter, consider Karpenter compatibility issues re [comment]: <> (the content below is generated from hack/docs/compataiblitymetrix_gen_docs.go) -| KUBERNETES | 1.24 | 1.25 | 1.26 | 1.27 | 1.28 | 1.29 | 1.30 | -|------------|---------------------|----------|----------|----------|----------|----------|------------| +| KUBERNETES | 1.24 | 1.25 | 1.26 | 1.27 | 1.28 | 1.29 | 1.30 | +|------------|-------------------|----------|----------|----------|----------|----------|----------| | 
karpenter | \>= 0.21 \<= 0.37 | \>= 0.25 | \>= 0.28 | \>= 0.28 | \>= 0.31 | \>= 0.34 | \>= 0.37 | [comment]: <> (end docs generated content from hack/docs/compataiblitymetrix_gen_docs.go) From bb87d5b9db8dd83a66e464bd2431ef9765997d97 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 21 Aug 2024 10:33:46 -0700 Subject: [PATCH 21/21] chore: Bump go to use 1.23 (#6809) --- .../karpenter.k8s.aws_ec2nodeclasses.yaml | 46 ++----------------- .../templates/karpenter.sh_nodeclaims.yaml | 20 ++------ .../templates/karpenter.sh_nodepools.yaml | 20 ++------ go.mod | 4 +- go.sum | 4 +- .../karpenter.k8s.aws_ec2nodeclasses.yaml | 46 ++----------------- pkg/apis/crds/karpenter.sh_nodeclaims.yaml | 20 ++------ pkg/apis/crds/karpenter.sh_nodepools.yaml | 20 ++------ pkg/apis/v1/ec2nodeclass.go | 5 +- pkg/apis/v1beta1/ec2nodeclass.go | 4 +- pkg/cloudprovider/cloudprovider.go | 3 +- pkg/providers/instance/instance.go | 10 ++-- 12 files changed, 34 insertions(+), 168 deletions(-) diff --git a/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml b/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml index 4d81b475cb1d..1cd12095de75 100644 --- a/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: ec2nodeclasses.karpenter.k8s.aws spec: group: karpenter.k8s.aws @@ -164,24 +164,18 @@ spec: gp2 volumes, this represents the baseline performance of the volume and the rate at which the volume accumulates I/O credits for bursting. - The following are the supported values for each volume type: - * gp3: 3,000-16,000 IOPS - * io1: 100-64,000 IOPS - * io2: 100-64,000 IOPS - For io1 and io2 volumes, we guarantee 64,000 IOPS only for Instances built on the Nitro System (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances). Other instance families guarantee performance up to 32,000 IOPS. - This parameter is supported for io1, io2, and gp3 volumes only. This parameter is not supported for gp2, st1, sc1, or standard volumes. format: int64 @@ -204,16 +198,12 @@ spec: a volume size. The following are the supported volumes sizes for each volume type: - * gp2 and gp3: 1-16,384 - * io1 and io2: 4-16,384 - * st1 and sc1: 125-16,384 - * standard: 1-1,024 pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ type: string @@ -392,14 +382,12 @@ spec: description: |- MetadataOptions for the generated launch template of provisioned nodes. - This specifies the exposure of the Instance Metadata Service to provisioned EC2 nodes. For more information, see Instance Metadata and User Data (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) in the Amazon Elastic Compute Cloud User Guide. - Refer to recommended, security best practices (https://aws.github.io/aws-eks-best-practices/security/docs/iam/#restrict-access-to-the-instance-profile-assigned-to-the-worker-node) for limiting exposure of Instance Metadata and User Data to pods. @@ -414,7 +402,6 @@ spec: nodes. If metadata options is non-nil, but this parameter is not specified, the default state is "enabled". 
- If you specify a value of "disabled", instance metadata will not be accessible on the node. enum: @@ -450,14 +437,12 @@ spec: requests. If metadata options is non-nil, but this parameter is not specified, the default state is "required". - If the state is optional, one can choose to retrieve instance metadata with or without a signed token header on the request. If one retrieves the IAM role credentials without a token, the version 1.0 role credentials are returned. If one retrieves the IAM role credentials using a valid signed token, the version 2.0 role credentials are returned. - If the state is "required", one must send a signed token header with any instance metadata retrieval requests. In this state, retrieving the IAM role credentials always returns the version 2.0 credentials; the version @@ -693,12 +678,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -864,24 +844,18 @@ spec: gp2 volumes, this represents the baseline performance of the volume and the rate at which the volume accumulates I/O credits for bursting. - The following are the supported values for each volume type: - * gp3: 3,000-16,000 IOPS - * io1: 100-64,000 IOPS - * io2: 100-64,000 IOPS - For io1 and io2 volumes, we guarantee 64,000 IOPS only for Instances built on the Nitro System (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances). Other instance families guarantee performance up to 32,000 IOPS. - This parameter is supported for io1, io2, and gp3 volumes only. This parameter is not supported for gp2, st1, sc1, or standard volumes. format: int64 @@ -904,16 +878,12 @@ spec: a volume size. The following are the supported volumes sizes for each volume type: - * gp2 and gp3: 1-16,384 - * io1 and io2: 4-16,384 - * st1 and sc1: 125-16,384 - * standard: 1-1,024 pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ type: string @@ -978,14 +948,12 @@ spec: description: |- MetadataOptions for the generated launch template of provisioned nodes. - This specifies the exposure of the Instance Metadata Service to provisioned EC2 nodes. For more information, see Instance Metadata and User Data (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) in the Amazon Elastic Compute Cloud User Guide. - Refer to recommended, security best practices (https://aws.github.io/aws-eks-best-practices/security/docs/iam/#restrict-access-to-the-instance-profile-assigned-to-the-worker-node) for limiting exposure of Instance Metadata and User Data to pods. @@ -1000,7 +968,6 @@ spec: nodes. If metadata options is non-nil, but this parameter is not specified, the default state is "enabled". - If you specify a value of "disabled", instance metadata will not be accessible on the node. enum: @@ -1036,14 +1003,12 @@ spec: requests. 
If metadata options is non-nil, but this parameter is not specified, the default state is "required". - If the state is optional, one can choose to retrieve instance metadata with or without a signed token header on the request. If one retrieves the IAM role credentials without a token, the version 1.0 role credentials are returned. If one retrieves the IAM role credentials using a valid signed token, the version 2.0 role credentials are returned. - If the state is "required", one must send a signed token header with any instance metadata retrieval requests. In this state, retrieving the IAM role credentials always returns the version 2.0 credentials; the version @@ -1269,12 +1234,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml b/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml index d32ab39da36c..4aef381296e4 100644 --- a/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml +++ b/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -262,19 +262,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -350,12 +346,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. 
- The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -798,12 +789,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml b/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml index f656ac273252..6de83288ad0e 100644 --- a/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml +++ b/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -392,19 +392,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -476,12 +472,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -1047,12 +1038,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. 
- --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/go.mod b/go.mod index 41ab1408f359..4be72649ec49 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/aws/karpenter-provider-aws -go 1.22.5 +go 1.23.0 require ( github.com/Pallinder/go-randomdata v1.2.0 @@ -32,7 +32,7 @@ require ( k8s.io/utils v0.0.0-20240102154912-e7106e64919e knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd sigs.k8s.io/controller-runtime v0.18.5 - sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758 + sigs.k8s.io/karpenter v1.0.1-0.20240820174000-8e40c0c92224 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index fce34c403587..95a57abcaab0 100644 --- a/go.sum +++ b/go.sum @@ -762,8 +762,8 @@ sigs.k8s.io/controller-runtime v0.18.5 h1:nTHio/W+Q4aBlQMgbnC5hZb4IjIidyrizMai9P sigs.k8s.io/controller-runtime v0.18.5/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758 h1:VEibnW+C/lW8QVgGlsZadhhTPXwhkR2CQj828zHu8Ao= -sigs.k8s.io/karpenter v1.0.1-0.20240815170320-bb7468a3a758/go.mod h1:SGH7B5ZSeaCXBnwvj4cSmIPC6TqRq7kPZmQyJRdxC6k= +sigs.k8s.io/karpenter v1.0.1-0.20240820174000-8e40c0c92224 h1:T1OTA/jwiqWp55+gb8CCT5PoyZjIywjspe9UuLU/dgc= +sigs.k8s.io/karpenter v1.0.1-0.20240820174000-8e40c0c92224/go.mod h1:e6yDwyO/5+h2NqTkvMmHf9ae/UnKbsOdSYuAnV0NErQ= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml index 09a8d1e9dba0..453b758d8683 100644 --- a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: ec2nodeclasses.karpenter.k8s.aws spec: group: karpenter.k8s.aws @@ -164,24 +164,18 @@ spec: gp2 volumes, this represents the baseline performance of the volume and the rate at which the volume accumulates I/O credits for bursting. - The following are the supported values for each volume type: - * gp3: 3,000-16,000 IOPS - * io1: 100-64,000 IOPS - * io2: 100-64,000 IOPS - For io1 and io2 volumes, we guarantee 64,000 IOPS only for Instances built on the Nitro System (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances). Other instance families guarantee performance up to 32,000 IOPS. - This parameter is supported for io1, io2, and gp3 volumes only. This parameter is not supported for gp2, st1, sc1, or standard volumes. 
format: int64 @@ -204,16 +198,12 @@ spec: a volume size. The following are the supported volumes sizes for each volume type: - * gp2 and gp3: 1-16,384 - * io1 and io2: 4-16,384 - * st1 and sc1: 125-16,384 - * standard: 1-1,024 pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ type: string @@ -392,14 +382,12 @@ spec: description: |- MetadataOptions for the generated launch template of provisioned nodes. - This specifies the exposure of the Instance Metadata Service to provisioned EC2 nodes. For more information, see Instance Metadata and User Data (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) in the Amazon Elastic Compute Cloud User Guide. - Refer to recommended, security best practices (https://aws.github.io/aws-eks-best-practices/security/docs/iam/#restrict-access-to-the-instance-profile-assigned-to-the-worker-node) for limiting exposure of Instance Metadata and User Data to pods. @@ -414,7 +402,6 @@ spec: nodes. If metadata options is non-nil, but this parameter is not specified, the default state is "enabled". - If you specify a value of "disabled", instance metadata will not be accessible on the node. enum: @@ -450,14 +437,12 @@ spec: requests. If metadata options is non-nil, but this parameter is not specified, the default state is "required". - If the state is optional, one can choose to retrieve instance metadata with or without a signed token header on the request. If one retrieves the IAM role credentials without a token, the version 1.0 role credentials are returned. If one retrieves the IAM role credentials using a valid signed token, the version 2.0 role credentials are returned. - If the state is "required", one must send a signed token header with any instance metadata retrieval requests. In this state, retrieving the IAM role credentials always returns the version 2.0 credentials; the version @@ -693,12 +678,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -864,24 +844,18 @@ spec: gp2 volumes, this represents the baseline performance of the volume and the rate at which the volume accumulates I/O credits for bursting. - The following are the supported values for each volume type: - * gp3: 3,000-16,000 IOPS - * io1: 100-64,000 IOPS - * io2: 100-64,000 IOPS - For io1 and io2 volumes, we guarantee 64,000 IOPS only for Instances built on the Nitro System (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances). Other instance families guarantee performance up to 32,000 IOPS. - This parameter is supported for io1, io2, and gp3 volumes only. This parameter is not supported for gp2, st1, sc1, or standard volumes. format: int64 @@ -904,16 +878,12 @@ spec: a volume size. 
The following are the supported volumes sizes for each volume type: - * gp2 and gp3: 1-16,384 - * io1 and io2: 4-16,384 - * st1 and sc1: 125-16,384 - * standard: 1-1,024 pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ type: string @@ -978,14 +948,12 @@ spec: description: |- MetadataOptions for the generated launch template of provisioned nodes. - This specifies the exposure of the Instance Metadata Service to provisioned EC2 nodes. For more information, see Instance Metadata and User Data (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) in the Amazon Elastic Compute Cloud User Guide. - Refer to recommended, security best practices (https://aws.github.io/aws-eks-best-practices/security/docs/iam/#restrict-access-to-the-instance-profile-assigned-to-the-worker-node) for limiting exposure of Instance Metadata and User Data to pods. @@ -1000,7 +968,6 @@ spec: nodes. If metadata options is non-nil, but this parameter is not specified, the default state is "enabled". - If you specify a value of "disabled", instance metadata will not be accessible on the node. enum: @@ -1036,14 +1003,12 @@ spec: requests. If metadata options is non-nil, but this parameter is not specified, the default state is "required". - If the state is optional, one can choose to retrieve instance metadata with or without a signed token header on the request. If one retrieves the IAM role credentials without a token, the version 1.0 role credentials are returned. If one retrieves the IAM role credentials using a valid signed token, the version 2.0 role credentials are returned. - If the state is "required", one must send a signed token header with any instance metadata retrieval requests. In this state, retrieving the IAM role credentials always returns the version 2.0 credentials; the version @@ -1269,12 +1234,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 064e0dad59bc..2931cb94def2 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -262,19 +262,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. 
- This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -350,12 +346,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -798,12 +789,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 3e5b2bb2888d..c9ac685b3945 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.15.0 + controller-gen.kubebuilder.io/version: v0.16.1 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -392,19 +392,15 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. 
If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -476,12 +472,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string @@ -1047,12 +1038,7 @@ spec: - Unknown type: string type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + description: type of condition in CamelCase or in foo.example.com/CamelCase. maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/v1/ec2nodeclass.go b/pkg/apis/v1/ec2nodeclass.go index 5ad59c7d8552..a6ef5fb02265 100644 --- a/pkg/apis/v1/ec2nodeclass.go +++ b/pkg/apis/v1/ec2nodeclass.go @@ -325,14 +325,15 @@ type MetadataOptions struct { type BlockDeviceMapping struct { // The device name (for example, /dev/sdh or xvdh). - // +required + // +optional DeviceName *string `json:"deviceName,omitempty"` // EBS contains parameters used to automatically set up EBS volumes when an instance is launched. // +kubebuilder:validation:XValidation:message="snapshotID or volumeSize must be defined",rule="has(self.snapshotID) || has(self.volumeSize)" - // +required + // +optional EBS *BlockDevice `json:"ebs,omitempty"` // RootVolume is a flag indicating if this device is mounted as kubelet root dir. You can // configure at most one root volume in BlockDeviceMappings. + // +optional RootVolume bool `json:"rootVolume,omitempty"` } diff --git a/pkg/apis/v1beta1/ec2nodeclass.go b/pkg/apis/v1beta1/ec2nodeclass.go index 833dcdd5683e..df6808913d1b 100644 --- a/pkg/apis/v1beta1/ec2nodeclass.go +++ b/pkg/apis/v1beta1/ec2nodeclass.go @@ -227,11 +227,11 @@ type MetadataOptions struct { type BlockDeviceMapping struct { // The device name (for example, /dev/sdh or xvdh). - // +required + // +optional DeviceName *string `json:"deviceName,omitempty"` // EBS contains parameters used to automatically set up EBS volumes when an instance is launched. // +kubebuilder:validation:XValidation:message="snapshotID or volumeSize must be defined",rule="has(self.snapshotID) || has(self.volumeSize)" - // +required + // +optional EBS *BlockDevice `json:"ebs,omitempty"` // RootVolume is a flag indicating if this device is mounted as kubelet root dir. You can // configure at most one root volume in BlockDeviceMappings. 
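Aside on the TerminationGracePeriod description carried in the NodeClaim and NodePool hunks above: it encodes a small piece of arithmetic, namely that a pod is deleted at `node timeout - pod terminationGracePeriodSeconds` so it still receives its full grace period before the node is forcibly removed. The sketch below is illustrative only (it is not code from this patch, and the 30-minute node terminationGracePeriod and 300-second pod grace period are made-up values, not defaults):

```go
package main

import (
	"fmt"
	"time"
)

// latestPodDeletion returns the latest offset, measured from when node deletion
// was initiated, at which a pod can be deleted and still receive its full
// terminationGracePeriodSeconds before the node's terminationGracePeriod
// expires. Illustrative helper only; the controller's real logic lives elsewhere.
func latestPodDeletion(nodeTGP, podGracePeriod time.Duration) time.Duration {
	return nodeTGP - podGracePeriod
}

func main() {
	nodeTGP := 30 * time.Minute   // NodePool spec.template.spec.terminationGracePeriod
	podGrace := 300 * time.Second // the pod's terminationGracePeriodSeconds

	// Prints "25m0s": the pod is deleted 25 minutes into the drain so its
	// 5-minute grace period finishes exactly at the node timeout.
	fmt.Println(latestPodDeletion(nodeTGP, podGrace))
}
```

A pod whose grace period exceeds the node's terminationGracePeriod yields a non-positive result here, which matches the documented intent that such pods start terminating as soon as draining begins (presumably clamped to the start of the drain; the CRD text does not spell that out).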
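Note on the `+required` to `+optional` marker changes on BlockDeviceMapping: they remove `deviceName`, `ebs`, and `rootVolume` from the generated CRD's `required` list, while the CEL rule on `ebs` (`has(self.snapshotID) || has(self.volumeSize)`) continues to guard partially specified volumes. The sketch below is a standalone illustration, not the real API types: the structs are simplified stand-ins (for instance, `volumeSize` is a `resource.Quantity` upstream but a plain string here) and only show how optional pointer fields with `omitempty` serialize when left unset.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Simplified stand-ins for BlockDeviceMapping/BlockDevice; most fields omitted.
type blockDevice struct {
	SnapshotID *string `json:"snapshotID,omitempty"`
	VolumeSize *string `json:"volumeSize,omitempty"`
}

type blockDeviceMapping struct {
	DeviceName *string      `json:"deviceName,omitempty"`
	EBS        *blockDevice `json:"ebs,omitempty"`
	RootVolume bool         `json:"rootVolume,omitempty"`
}

func main() {
	size := "100Gi"
	m := blockDeviceMapping{
		// deviceName deliberately omitted: with the +optional marker the CRD
		// schema no longer rejects a mapping that leaves it out.
		EBS: &blockDevice{VolumeSize: &size},
	}
	out, _ := json.Marshal(m)

	// Prints {"ebs":{"volumeSize":"100Gi"}}. The CEL rule on ebs is still
	// satisfied because volumeSize is set.
	fmt.Println(string(out))
}
```

Keeping the fields as pointers with `omitempty` is what lets an omitted `deviceName` round-trip cleanly instead of serializing as an empty string.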
diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index acde46ed2276..37fb6853d14c 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -16,6 +16,7 @@ package cloudprovider import ( "context" + stderrors "errors" "fmt" "net/http" "time" @@ -103,7 +104,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim) } nodeClassReady := nodeClass.StatusConditions().Get(status.ConditionReady) if nodeClassReady.IsFalse() { - return nil, cloudprovider.NewNodeClassNotReadyError(fmt.Errorf(nodeClassReady.Message)) + return nil, cloudprovider.NewNodeClassNotReadyError(stderrors.New(nodeClassReady.Message)) } if nodeClassReady.IsUnknown() { return nil, fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 0fe9e783a17c..b46e101e6872 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -493,17 +493,17 @@ func instancesFromOutput(out *ec2.DescribeInstancesOutput) ([]*Instance, error) return lo.Map(instances, func(i *ec2.Instance, _ int) *Instance { return NewInstance(i) }), nil } -func combineFleetErrors(errors []*ec2.CreateFleetError) (errs error) { +func combineFleetErrors(fleetErrs []*ec2.CreateFleetError) (errs error) { unique := sets.NewString() - for _, err := range errors { + for _, err := range fleetErrs { unique.Insert(fmt.Sprintf("%s: %s", aws.StringValue(err.ErrorCode), aws.StringValue(err.ErrorMessage))) } for errorCode := range unique { - errs = multierr.Append(errs, fmt.Errorf(errorCode)) + errs = multierr.Append(errs, errors.New(errorCode)) } // If all the Fleet errors are ICE errors then we should wrap the combined error in the generic ICE error - iceErrorCount := lo.CountBy(errors, func(err *ec2.CreateFleetError) bool { return awserrors.IsUnfulfillableCapacity(err) }) - if iceErrorCount == len(errors) { + iceErrorCount := lo.CountBy(fleetErrs, func(err *ec2.CreateFleetError) bool { return awserrors.IsUnfulfillableCapacity(err) }) + if iceErrorCount == len(fleetErrs) { return cloudprovider.NewInsufficientCapacityError(fmt.Errorf("with fleet error(s), %w", errs)) } return fmt.Errorf("with fleet error(s), %w", errs)
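The two Go hunks above address the same class of lint finding: `fmt.Errorf` was being handed a dynamic message as its format string, and in `combineFleetErrors` the parameter named `errors` shadowed the standard `errors` package, which is why the slice is renamed to `fleetErrs` before `errors.New` can be called there. The sketch below is a standalone illustration, not code from this patch, showing why a non-constant format string is unsafe: any `%` in the message is interpreted as a formatting verb.

```go
package main

import (
	"errors"
	"fmt"
)

func main() {
	// A message that happens to contain '%' characters, as an EC2 or fleet
	// error message plausibly could.
	msg := `instance profile "foo%bar" not found (100% of attempts failed)`

	// Non-constant format string: %b and the stray % are parsed as verbs with
	// no operands, so the error text comes back mangled with
	// "%!b(MISSING)"-style markers. go vet's printf check flags this pattern.
	bad := fmt.Errorf(msg)

	// errors.New treats the message as opaque text, so it round-trips intact.
	good := errors.New(msg)

	fmt.Println(bad.Error())
	fmt.Println(good.Error())
}
```

Wrapping with `fmt.Errorf("with fleet error(s), %w", errs)` afterwards remains safe, since there the format string is a constant and the dynamic content is passed as an argument.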