diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml index ff78d03eb305..122214994bd8 100644 --- a/.github/actions/e2e/install-karpenter/action.yaml +++ b/.github/actions/e2e/install-karpenter/action.yaml @@ -71,6 +71,7 @@ runs: --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::$ACCOUNT_ID:role/karpenter-irsa-$CLUSTER_NAME" \ --set settings.clusterName="$CLUSTER_NAME" \ --set settings.interruptionQueue="$CLUSTER_NAME" \ + --set settings.featureGates.spotToSpotConsolidation=true \ --set controller.resources.requests.cpu=3 \ --set controller.resources.requests.memory=3Gi \ --set controller.resources.limits.cpu=3 \ diff --git a/Makefile b/Makefile index df8de6b952fc..ad33b12e5422 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ --set controller.resources.limits.memory=1Gi \ + --set settings.featureGates.spotToSpotConsolidation=true \ --create-namespace # CR for local builds of Karpenter diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md index c377479af27e..29db446c547f 100644 --- a/charts/karpenter/README.md +++ b/charts/karpenter/README.md @@ -63,8 +63,8 @@ helm upgrade --install --namespace karpenter --create-namespace \ | podAnnotations | object | `{}` | Additional annotations for the pod. | | podDisruptionBudget.maxUnavailable | int | `1` | | | podDisruptionBudget.name | string | `"karpenter"` | | -| podSecurityContext | object | `{"fsGroup":65536}` | SecurityContext for the pod. | | podLabels | object | `{}` | Additional labels for the pod. | +| podSecurityContext | object | `{"fsGroup":65536}` | SecurityContext for the pod. | | priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | | replicas | int | `2` | Number of replicas. 
| | revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | @@ -74,7 +74,7 @@ helm upgrade --install --namespace karpenter --create-namespace \ | serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | | serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | | serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","featureGates":{"drift":true},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | +| settings | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","featureGates":{"drift":true,"spotToSpotConsolidation":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | | settings.assumeRoleARN | string | `""` | Role to assume for calling AWS services. | | settings.assumeRoleDuration | string | `"15m"` | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless assumeRoleARN set. | | settings.batchIdleDuration | string | `"1s"` | The maximum amount of time with no new ending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. | @@ -82,8 +82,9 @@ helm upgrade --install --namespace karpenter --create-namespace \ | settings.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. 
If not set, this is taken from the controller's TLS configuration for the API server. | | settings.clusterEndpoint | string | `""` | Cluster endpoint. If not set, will be discovered during startup (EKS only) | | settings.clusterName | string | `""` | Cluster name. | -| settings.featureGates | object | `{"drift":true}` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | +| settings.featureGates | object | `{"drift":true,"spotToSpotConsolidation":false}` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | | settings.featureGates.drift | bool | `true` | drift is in BETA and is enabled by default. Setting drift to false disables the drift disruption method to watch for drift between currently deployed nodes and the desired state of nodes set in nodepools and nodeclasses | +| settings.featureGates.spotToSpotConsolidation | bool | `false` | spotToSpotConsolidation is disabled by default. Setting this to true will enable spot replacement consolidation for both single and multi-node consolidation. | | settings.interruptionQueue | string | `""` | interruptionQueue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. 
| | settings.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | | settings.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index 3eb5fc9340ae..4b4f9450a569 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -103,7 +103,7 @@ spec: divisor: "0" resource: limits.memory - name: FEATURE_GATES - value: "Drift={{ .Values.settings.featureGates.drift }}" + value: "Drift={{ .Values.settings.featureGates.drift }},SpotToSpotConsolidation={{ .Values.settings.featureGates.spotToSpotConsolidation }}" {{- with .Values.settings.batchMaxDuration }} - name: BATCH_MAX_DURATION value: "{{ . }}" diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index cd6c9a661b22..3e15f84945ad 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -202,3 +202,6 @@ settings: # Setting drift to false disables the drift disruption method to watch for drift between currently deployed nodes # and the desired state of nodes set in nodepools and nodeclasses drift: true + # -- spotToSpotConsolidation is disabled by default. + # Setting this to true will enable spot replacement consolidation for both single and multi-node consolidation. 
+ spotToSpotConsolidation: false diff --git a/go.mod b/go.mod index 2df55f90e470..efe30a7c306a 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( k8s.io/utils v0.0.0-20230726121419-3b25d923346b knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd sigs.k8s.io/controller-runtime v0.16.3 - sigs.k8s.io/karpenter v0.33.1-0.20240110172322-1fc448d0415d + sigs.k8s.io/karpenter v0.33.1-0.20240112201343-c383004c469a ) require ( diff --git a/go.sum b/go.sum index 4b5c4bfa1bdb..714a5aa6084e 100644 --- a/go.sum +++ b/go.sum @@ -763,8 +763,8 @@ sigs.k8s.io/controller-runtime v0.16.3 h1:2TuvuokmfXvDUamSx1SuAOO3eTyye+47mJCigw sigs.k8s.io/controller-runtime v0.16.3/go.mod h1:j7bialYoSn142nv9sCOJmQgDXQXxnroFU4VnX/brVJ0= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v0.33.1-0.20240110172322-1fc448d0415d h1:xB/ckmh8WlR416uEI+NcgUR8+yPnEOIwjU19gvOuHZw= -sigs.k8s.io/karpenter v0.33.1-0.20240110172322-1fc448d0415d/go.mod h1:h/O8acLmwFmYYmDD9b57+Fknlf7gQThuY19l7jpThYs= +sigs.k8s.io/karpenter v0.33.1-0.20240112201343-c383004c469a h1:EuQ5KFs1PHLfPTGJV+0/EnMkdYBaPgt5CCLKWzwjfGE= +sigs.k8s.io/karpenter v0.33.1-0.20240112201343-c383004c469a/go.mod h1:h/O8acLmwFmYYmDD9b57+Fknlf7gQThuY19l7jpThYs= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index caaa628f4a14..d1c5de336d87 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -240,7 +240,7 @@ func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *cor reqs := 
scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...) return lo.Filter(instanceTypes, func(i *cloudprovider.InstanceType, _ int) bool { return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil && - len(i.Offerings.Requirements(reqs).Available()) > 0 && + len(i.Offerings.Compatible(reqs).Available()) > 0 && resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable()) }), nil } diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 9774e0a13d37..7d393356da9d 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -374,11 +374,11 @@ func orderInstanceTypesByPrice(instanceTypes []*cloudprovider.InstanceType, requ sort.Slice(instanceTypes, func(i, j int) bool { iPrice := math.MaxFloat64 jPrice := math.MaxFloat64 - if len(instanceTypes[i].Offerings.Available().Requirements(requirements)) > 0 { - iPrice = instanceTypes[i].Offerings.Available().Requirements(requirements).Cheapest().Price + if len(instanceTypes[i].Offerings.Available().Compatible(requirements)) > 0 { + iPrice = instanceTypes[i].Offerings.Available().Compatible(requirements).Cheapest().Price } - if len(instanceTypes[j].Offerings.Available().Requirements(requirements)) > 0 { - jPrice = instanceTypes[j].Offerings.Available().Requirements(requirements).Cheapest().Price + if len(instanceTypes[j].Offerings.Available().Compatible(requirements)) > 0 { + jPrice = instanceTypes[j].Offerings.Available().Compatible(requirements).Cheapest().Price } if iPrice == jPrice { return instanceTypes[i].Name < instanceTypes[j].Name diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 79763ba7db6c..6e2802e10095 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -339,8 +339,8 @@ var _ = Describe("InstanceTypes", func() { // We need some way to deterministically order them if their prices match reqs := 
scheduling.NewNodeSelectorRequirements(nodePool.Spec.Template.Spec.Requirements...) sort.Slice(its, func(i, j int) bool { - iPrice := its[i].Offerings.Requirements(reqs).Cheapest().Price - jPrice := its[j].Offerings.Requirements(reqs).Cheapest().Price + iPrice := its[i].Offerings.Compatible(reqs).Cheapest().Price + jPrice := its[j].Offerings.Compatible(reqs).Cheapest().Price if iPrice == jPrice { return its[i].Name < its[j].Name } @@ -397,8 +397,8 @@ var _ = Describe("InstanceTypes", func() { // We need some way to deterministically order them if their prices match reqs := scheduling.NewNodeSelectorRequirements(nodePool.Spec.Template.Spec.Requirements...) sort.Slice(its, func(i, j int) bool { - iPrice := its[i].Offerings.Requirements(reqs).Cheapest().Price - jPrice := its[j].Offerings.Requirements(reqs).Cheapest().Price + iPrice := its[i].Offerings.Compatible(reqs).Cheapest().Price + jPrice := its[j].Offerings.Compatible(reqs).Cheapest().Price if iPrice == jPrice { return its[i].Name < its[j].Name } diff --git a/test/suites/consolidation/suite_test.go b/test/suites/consolidation/suite_test.go index ea2a3ef02c19..8c3257e2e597 100644 --- a/test/suites/consolidation/suite_test.go +++ b/test/suites/consolidation/suite_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/aws/aws-sdk-go/aws" + "github.com/samber/lo" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -30,8 +31,8 @@ import ( "sigs.k8s.io/karpenter/pkg/test" "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1" - "github.com/aws/karpenter-provider-aws/test/pkg/debug" + "github.com/aws/karpenter-provider-aws/test/pkg/debug" environmentaws "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws" "github.com/aws/karpenter-provider-aws/test/pkg/environment/common" @@ -62,196 +63,202 @@ var _ = AfterEach(func() { env.Cleanup() }) var _ = AfterEach(func() { env.AfterEach() }) var _ = Describe("Consolidation", func() { - It("should 
consolidate nodes (delete)", Label(debug.NoWatch), Label(debug.NoEvents), func() { - nodePool := test.NodePool(corev1beta1.NodePool{ - Spec: corev1beta1.NodePoolSpec{ - Disruption: corev1beta1.Disruption{ - ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, - // Disable Consolidation until we're ready - ConsolidateAfter: &corev1beta1.NillableDuration{}, - }, - Template: corev1beta1.NodeClaimTemplate{ - Spec: corev1beta1.NodeClaimSpec{ - Requirements: []v1.NodeSelectorRequirement{ - { - Key: corev1beta1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - // we don't replace spot nodes, so this forces us to only delete nodes - Values: []string{corev1beta1.CapacityTypeSpot}, - }, - { - Key: v1beta1.LabelInstanceSize, - Operator: v1.NodeSelectorOpIn, - Values: []string{"medium", "large", "xlarge"}, - }, - { - Key: v1beta1.LabelInstanceFamily, - Operator: v1.NodeSelectorOpNotIn, - // remove some cheap burstable and the odd c1 instance types so we have - // more control over what gets provisioned - Values: []string{"t2", "t3", "c1", "t3a", "t4g"}, + DescribeTable("should consolidate nodes (delete)", Label(debug.NoWatch), Label(debug.NoEvents), + func(spotToSpot bool) { + nodePool := test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Disruption: corev1beta1.Disruption{ + ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, + // Disable Consolidation until we're ready + ConsolidateAfter: &corev1beta1.NillableDuration{}, + }, + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: lo.Ternary(spotToSpot, []string{corev1beta1.CapacityTypeSpot}, []string{corev1beta1.CapacityTypeOnDemand}), + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"medium", "large", "xlarge"}, + }, + { + Key: v1beta1.LabelInstanceFamily, + 
Operator: v1.NodeSelectorOpNotIn, + // remove some cheap burstable and the odd c1 instance types so we have + // more control over what gets provisioned + Values: []string{"t2", "t3", "c1", "t3a", "t4g"}, + }, }, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, }, - NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, }, }, - }, - }) - - var numPods int32 = 100 - dep := test.Deployment(test.DeploymentOptions{ - Replicas: numPods, - PodOptions: test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": "large-app"}, - }, - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, - }, - }, - }) - - selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - env.ExpectCreatedNodeCount("==", 0) - env.ExpectCreated(nodePool, nodeClass, dep) - - env.EventuallyExpectHealthyPodCount(selector, int(numPods)) - - // reduce the number of pods by 60% - dep.Spec.Replicas = aws.Int32(40) - env.ExpectUpdated(dep) - env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) - - // Enable consolidation as WhenUnderutilized doesn't allow a consolidateAfter value - nodePool.Spec.Disruption.ConsolidateAfter = nil - env.ExpectUpdated(nodePool) - - // With consolidation enabled, we now must delete nodes - env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.6) - - env.ExpectDeleted(dep) - }) - It("should consolidate on-demand nodes (replace)", func() { - nodePool := test.NodePool(corev1beta1.NodePool{ - Spec: corev1beta1.NodePoolSpec{ - Disruption: corev1beta1.Disruption{ - ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, - // Disable Consolidation until we're ready - ConsolidateAfter: &corev1beta1.NillableDuration{}, + }) + + var numPods int32 = 100 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, 
+ }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, + }, }, - Template: corev1beta1.NodeClaimTemplate{ - Spec: corev1beta1.NodeClaimSpec{ - Requirements: []v1.NodeSelectorRequirement{ - { - Key: corev1beta1.CapacityTypeLabelKey, - Operator: v1.NodeSelectorOpIn, - // we don't replace spot nodes, so this forces us to only delete nodes - Values: []string{corev1beta1.CapacityTypeOnDemand}, - }, - { - Key: v1beta1.LabelInstanceSize, - Operator: v1.NodeSelectorOpIn, - Values: []string{"large", "2xlarge"}, + }) + + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(nodePool, nodeClass, dep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + // reduce the number of pods by 60% + dep.Spec.Replicas = aws.Int32(40) + env.ExpectUpdated(dep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + + // Enable consolidation as WhenUnderutilized doesn't allow a consolidateAfter value + nodePool.Spec.Disruption.ConsolidateAfter = nil + env.ExpectUpdated(nodePool) + + // With consolidation enabled, we now must delete nodes + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.6) + + env.ExpectDeleted(dep) + }, + Entry("if the nodes are on-demand nodes", false), + Entry("if the nodes are spot nodes", true), + ) + DescribeTable("should consolidate nodes (replace)", + func(spotToSpot bool) { + nodePool := test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Disruption: corev1beta1.Disruption{ + ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, + // Disable Consolidation until we're ready + ConsolidateAfter: &corev1beta1.NillableDuration{}, + }, + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: 
lo.Ternary(spotToSpot, []string{corev1beta1.CapacityTypeSpot}, []string{corev1beta1.CapacityTypeOnDemand}), + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"large", "2xlarge"}, + }, }, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, }, - NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, }, }, - }, - }) - - var numPods int32 = 3 - largeDep := test.Deployment(test.DeploymentOptions{ - Replicas: numPods, - PodOptions: test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": "large-app"}, - }, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: v1.LabelHostname, - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "large-app", + }) + + var numPods int32 = 3 + largeDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "large-app", + }, }, }, }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("4")}, + }, }, - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("4")}, - }, - }, - }) - smallDep := test.Deployment(test.DeploymentOptions{ - Replicas: numPods, - PodOptions: test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": "small-app"}, - }, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: v1.LabelHostname, - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - 
MatchLabels: map[string]string{ - "app": "small-app", + }) + smallDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "small-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "small-app", + }, }, }, }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}, + }, }, - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}, - }, - }, - }) + }) - selector := labels.SelectorFromSet(largeDep.Spec.Selector.MatchLabels) - env.ExpectCreatedNodeCount("==", 0) - env.ExpectCreated(nodePool, nodeClass, largeDep, smallDep) + selector := labels.SelectorFromSet(largeDep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(nodePool, nodeClass, largeDep, smallDep) - env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) - // 3 nodes due to the anti-affinity rules - env.ExpectCreatedNodeCount("==", 3) + // 3 nodes due to the anti-affinity rules + env.ExpectCreatedNodeCount("==", 3) - // scaling down the large deployment leaves only small pods on each node - largeDep.Spec.Replicas = aws.Int32(0) - env.ExpectUpdated(largeDep) - env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + // scaling down the large deployment leaves only small pods on each node + largeDep.Spec.Replicas = aws.Int32(0) + env.ExpectUpdated(largeDep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) - nodePool.Spec.Disruption.ConsolidateAfter = nil - env.ExpectUpdated(nodePool) + nodePool.Spec.Disruption.ConsolidateAfter = nil + env.ExpectUpdated(nodePool) - // With 
consolidation enabled, we now must replace each node in turn to consolidate due to the anti-affinity - // rules on the smaller deployment. The 2xl nodes should go to a large - env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.8) - - var nodes v1.NodeList - Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) - numLargeNodes := 0 - numOtherNodes := 0 - for _, n := range nodes.Items { - // only count the nodes created by the provisoiner - if n.Labels[corev1beta1.NodePoolLabelKey] != nodePool.Name { - continue - } - if strings.HasSuffix(n.Labels[v1.LabelInstanceTypeStable], ".large") { - numLargeNodes++ - } else { - numOtherNodes++ + // With consolidation enabled, we now must replace each node in turn to consolidate due to the anti-affinity + // rules on the smaller deployment. The 2xl nodes should go to a large + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.8) + + var nodes v1.NodeList + Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) + numLargeNodes := 0 + numOtherNodes := 0 + for _, n := range nodes.Items { + // only count the nodes created by the provisoiner + if n.Labels[corev1beta1.NodePoolLabelKey] != nodePool.Name { + continue + } + if strings.HasSuffix(n.Labels[v1.LabelInstanceTypeStable], ".large") { + numLargeNodes++ + } else { + numOtherNodes++ + } } - } - // all of the 2xlarge nodes should have been replaced with large instance types - Expect(numLargeNodes).To(Equal(3)) - // and we should have no other nodes - Expect(numOtherNodes).To(Equal(0)) + // all of the 2xlarge nodes should have been replaced with large instance types + Expect(numLargeNodes).To(Equal(3)) + // and we should have no other nodes + Expect(numOtherNodes).To(Equal(0)) - env.ExpectDeleted(largeDep, smallDep) - }) + env.ExpectDeleted(largeDep, smallDep) + }, + Entry("if the nodes are on-demand nodes", false), + Entry("if the nodes are spot nodes", true), + ) It("should consolidate on-demand nodes to spot (replace)", func() { nodePool := 
test.NodePool(corev1beta1.NodePool{ Spec: corev1beta1.NodePoolSpec{ @@ -266,8 +273,7 @@ var _ = Describe("Consolidation", func() { { Key: corev1beta1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, - // we don't replace spot nodes, so this forces us to only delete nodes - Values: []string{corev1beta1.CapacityTypeOnDemand}, + Values: []string{corev1beta1.CapacityTypeOnDemand}, }, { Key: v1beta1.LabelInstanceSize, diff --git a/website/content/en/preview/concepts/disruption.md b/website/content/en/preview/concepts/disruption.md index 27f76bd63c12..c33a344250d8 100644 --- a/website/content/en/preview/concepts/disruption.md +++ b/website/content/en/preview/concepts/disruption.md @@ -117,9 +117,17 @@ Events: Using preferred anti-affinity and topology spreads can reduce the effectiveness of consolidation. At node launch, Karpenter attempts to satisfy affinity and topology spread preferences. In order to reduce node churn, consolidation must also attempt to satisfy these constraints to avoid immediately consolidating nodes after they launch. This means that consolidation may not disrupt nodes in order to avoid violating preferences, even if kube-scheduler can fit the host pods elsewhere. Karpenter reports these pods via logging to bring awareness to the possible issues they can cause (e.g. `pod default/inflate-anti-self-55894c5d8b-522jd has a preferred Anti-Affinity which can prevent consolidation`). {{% /alert %}} -{{% alert title="Note" color="primary" %}} -For spot nodes, Karpenter only uses the deletion consolidation mechanism. It will not replace a spot node with a cheaper spot node. Spot instance types are selected with the `price-capacity-optimized` strategy and often the cheapest spot instance type is not launched due to the likelihood of interruption. Consolidation would then replace the spot instance with a cheaper instance negating the `price-capacity-optimized` strategy entirely and increasing interruption rate. 
-{{% /alert %}} +#### Spot consolidation +For spot nodes, Karpenter has deletion consolidation enabled by default. If you would like to enable replacement consolidation for spot nodes, you need to enable the feature through the [`SpotToSpotConsolidation` feature flag]({{<ref "../reference/settings#feature-gates" >}}). + +Cheaper spot instance types are selected with the [`price-capacity-optimized` strategy](https://aws.amazon.com/blogs/compute/introducing-price-capacity-optimized-allocation-strategy-for-ec2-spot-instances/). Often, the cheapest spot instance type is not launched due to the likelihood of interruption. As a result, Karpenter uses the number of available instance type options cheaper than the currently launched spot instance as a heuristic for evaluating whether it should launch a replacement for the current spot node. + +We refer to the number of instance types that Karpenter has within its launch decision as a launch's "instance type flexibility." When Karpenter is considering performing a spot-to-spot consolidation replacement, it will check whether replacing the instance type will lead to enough instance type flexibility in the subsequent launch request. As a result, we get the following properties when evaluating for consolidation: +1) We shouldn't continually consolidate down to the cheapest spot instance which might have very high rates of interruption. +2) We launch with enough instance types that there's a high likelihood that our replacement instance has comparable availability to our current one. + +Karpenter requires a minimum instance type flexibility of 15 instance types when performing single node spot-to-spot consolidations (1 node to 1 node). It does not have the same instance type flexibility requirement for multi-node spot-to-spot consolidations (many nodes to 1 node) since doing so without requiring flexibility won't lead to "race to the bottom" scenarios. + ### Drift Drift handles changes to the NodePool/EC2NodeClass. 
For Drift, values in the NodePool/EC2NodeClass are reflected in the NodeClaimTemplateSpec/EC2NodeClassSpec in the same way that they’re set. A NodeClaim will be detected as drifted if the values in its owning NodePool/EC2NodeClass do not match the values in the NodeClaim. Similar to the upstream `deployment.spec.template` relationship to pods, Karpenter will annotate the owning NodePool and EC2NodeClass with a hash of the NodeClaimTemplateSpec to check for drift. Some special cases will be discovered either from Karpenter or through the CloudProvider interface, triggered by NodeClaim/Instance/NodePool/EC2NodeClass changes. diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md index 438c24244d26..4150586483ea 100644 --- a/website/content/en/preview/reference/settings.md +++ b/website/content/en/preview/reference/settings.md @@ -21,7 +21,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery.| | DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks| | ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint| -| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift (default = Drift=true)| +| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false)| | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)| | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. 
Additional permissions are outlined in the docs.| | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint.| @@ -41,12 +41,13 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf ### Feature Gates -Karpenter uses [feature gates](https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features) You can enable the feature gates through the `--feature-gates` CLI environment variable or the `FEATURE_GATES` environment variable in the Karpenter deployment. For example, you can configure drift by setting the following CLI argument: `--feature-gates Drift=true`. +Karpenter uses [feature gates](https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features). You can enable feature gates through the `--feature-gates` CLI argument or the `FEATURE_GATES` environment variable in the Karpenter deployment. For example, you can enable both Drift and SpotToSpotConsolidation by setting the following CLI argument: `--feature-gates Drift=true,SpotToSpotConsolidation=true`. -| Feature | Default | Stage | Since | Until | -|---------|---------|-------|---------|---------| -| Drift | false | Alpha | v0.21.x | v0.32.x | -| Drift | true | Beta | v0.33.x | | +| Feature | Default | Stage | Since | Until | +|-------------------------|---------|-------|---------|---------| +| Drift | false | Alpha | v0.21.x | v0.32.x | +| Drift | true | Beta | v0.33.x | | +| SpotToSpotConsolidation | false | Beta | v0.34.x | | ### Batching Parameters