From 52b9ea9f9d0bad8ccbf06fe59dd2c128bae91d05 Mon Sep 17 00:00:00 2001 From: Akshay Raj Date: Mon, 28 Mar 2022 12:58:57 -0700 Subject: [PATCH] Add support for using topologyKey=karpenter.sh/capacity-type in topologySpreadConstraints --- .../provisioning/scheduling/suite_test.go | 351 ++++++++++++++++++ .../provisioning/scheduling/topology.go | 10 + pkg/controllers/selection/controller.go | 2 +- website/content/en/preview/provisioner.md | 1 + .../content/en/preview/tasks/scheduling.md | 14 +- 5 files changed, 376 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go index d97997e2eee7..7c341f6a2f80 100644 --- a/pkg/controllers/provisioning/scheduling/suite_test.go +++ b/pkg/controllers/provisioning/scheduling/suite_test.go @@ -23,6 +23,7 @@ import ( "github.com/Pallinder/go-randomdata" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/fake" "github.com/aws/karpenter/pkg/cloudprovider/registry" "github.com/aws/karpenter/pkg/controllers/provisioning" @@ -900,6 +901,153 @@ var _ = Describe("Topology", func() { }) }) + Context("CapacityType", func() { + It("should balance pods across capacity types", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: 
metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2, 2)) + }) + It("should respect provisioner capacity type constraints", func() { + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeSpot, v1alpha1.CapacityTypeOnDemand}}) + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2, 2)) + }) + It("should not violate max-skew when unsat = do not schedule", func() { + Skip("enable after scheduler no longer violates max-skew") + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + // force this pod onto spot + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: 
[]string{v1alpha1.CapacityTypeSpot}}) + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1)) + + // now only allow scheduling pods on on-demand + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeOnDemand}}) + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + + // max skew of 1, so on-demand will have 2 pods and the rest of the pods will fail to schedule + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 2)) + }) + It("should violate max-skew when unsat = schedule anyway", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeSpot}}) + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1)) + + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeOnDemand}}) + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, 
provisioner, + MakePods(5, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + + // max skew of 1, on-demand will end up with 5 pods even though spot has a single pod + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 5)) + }) + It("should only count running/scheduled pods with matching labels scheduled to nodes with a corresponding domain", func() { + wrongNamespace := strings.ToLower(randomdata.SillyName()) + firstNode := test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot}}}) + secondNode := test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeOnDemand}}}) + thirdNode := test.Node(test.NodeOptions{}) // missing topology capacity type + ExpectCreated(ctx, env.Client, provisioner, firstNode, secondNode, thirdNode, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: wrongNamespace}}) + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.Pod(test.PodOptions{NodeName: firstNode.Name}), // ignored, missing labels + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}}), // ignored, pending + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: thirdNode.Name}), // ignored, no domain on node + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels, Namespace: wrongNamespace}, NodeName: firstNode.Name}), // ignored, wrong namespace + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels, DeletionTimestamp: &metav1.Time{Time: time.Now().Add(10 * time.Second)}}, NodeName: firstNode.Name}), // ignored, terminating + 
test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: firstNode.Name, Phase: v1.PodFailed}), // ignored, phase=Failed + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: firstNode.Name, Phase: v1.PodSucceeded}), // ignored, phase=Succeeded + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: firstNode.Name}), + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: firstNode.Name}), + test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, NodeName: secondNode.Name}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}), + ) + nodes := v1.NodeList{} + Expect(env.Client.List(ctx, &nodes)).To(Succeed()) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2, 3)) + }) + It("should match all pods when labelSelector is not specified", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1)) + }) + It("should handle interdependent selectors", func() { + Skip("enable after scheduler handles non-self selecting topology") + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + pods := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{TopologySpreadConstraints: topology})..., + ) + // This is weird, but the topology label selector is used for 
determining domain counts. The pod that + // owns the topology is what the spread actually applies to. In this test case, there are no pods matching + // the label selector, so the max skew is zero. This means we can pack all the pods onto the same node since + // it doesn't violate the topology spread constraint (i.e. adding new pods doesn't increase skew since the + // pods we are adding don't count toward skew). This behavior is called out at + // https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ , though it's not + // recommended for users. + nodeNames := sets.NewString() + for _, p := range pods { + nodeNames.Insert(p.Spec.NodeName) + } + Expect(nodeNames).To(HaveLen(1)) + }) + }) + Context("Combined Hostname and Zonal Topology", func() { It("should spread pods while respecting both constraints", func() { topology := []v1.TopologySpreadConstraint{{ @@ -939,6 +1087,132 @@ var _ = Describe("Topology", func() { }) }) + Context("Combined Hostname and Capacity Type Topology", func() { + It("should spread pods while respecting both constraints", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }, { + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 3, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(2, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 1)) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 3))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(3, test.PodOptions{ObjectMeta: 
metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(3, 2)) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 3))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5, 5)) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 3))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(11, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(11, 10)) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 3))) + }) + }) + + Context("Combined Zonal and Capacity Type Topology", func() { + It("should spread pods while respecting both constraints", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }, { + TopologyKey: v1.LabelTopologyZone, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(2, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 1))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 1))) + + ExpectProvisioned(ctx, env.Client, 
selectionController, provisioners, provisioner, + MakePods(3, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 3))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 2))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 5))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 4))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(11, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 11))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 7))) + }) + }) + + Context("Combined Hostname, Zonal, and Capacity Type Topology", func() { + It("should spread pods while respecting all constraints", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }, { + TopologyKey: v1.LabelTopologyZone, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 2, + }, { + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 3, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(2, 
test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 1))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 2))) + ExpectSkew(ctx, env.Client, "default", &topology[2]).ToNot(ContainElements(BeNumerically(">", 3))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(3, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 3))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 2))) + ExpectSkew(ctx, env.Client, "default", &topology[2]).ToNot(ContainElements(BeNumerically(">", 3))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 5))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 4))) + ExpectSkew(ctx, env.Client, "default", &topology[2]).ToNot(ContainElements(BeNumerically(">", 5))) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(11, test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology})..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).ToNot(ContainElements(BeNumerically(">", 11))) + ExpectSkew(ctx, env.Client, "default", &topology[1]).ToNot(ContainElements(BeNumerically(">", 8))) + ExpectSkew(ctx, env.Client, "default", &topology[2]).ToNot(ContainElements(BeNumerically(">", 9))) + }) + }) + // 
https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/#interaction-with-node-affinity-and-node-selectors Context("Combined Zonal Topology and Node Affinity", func() { It("should limit spread options by nodeSelector", func() { @@ -1011,6 +1285,78 @@ var _ = Describe("Topology", func() { }) }) + // https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/#interaction-with-node-affinity-and-node-selectors + Context("Combined Capacity Type Topology and Node Affinity", func() { + It("should limit spread options by nodeSelector", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + append( + MakePods(5, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + TopologySpreadConstraints: topology, + NodeSelector: map[string]string{v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot}, + }), + MakePods(5, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + TopologySpreadConstraints: topology, + NodeSelector: map[string]string{v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeOnDemand}, + })..., + )..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5, 5)) + }) + It("should limit spread options by node affinity", func() { + topology := []v1.TopologySpreadConstraint{{ + TopologyKey: v1alpha5.LabelCapacityType, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + MaxSkew: 1, + }} + + // need to limit the provisioner to spot or else it will know that on-demand has 0 pods and won't violate + // the max-skew + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: 
[]string{v1alpha1.CapacityTypeSpot}}) + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + TopologySpreadConstraints: topology, + NodeRequirements: []v1.NodeSelectorRequirement{{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{ + v1alpha1.CapacityTypeSpot, v1alpha1.CapacityTypeOnDemand, + }}}, + })...) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(3)) + + // open the provisioner back up so it can see all capacity types + provisioner.Spec.Requirements = v1alpha5.NewRequirements( + v1.NodeSelectorRequirement{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeSpot, v1alpha1.CapacityTypeOnDemand}}) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, MakePods(1, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + TopologySpreadConstraints: topology, + NodeRequirements: []v1.NodeSelectorRequirement{{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{ + v1alpha1.CapacityTypeOnDemand, + }}}, + })...) 
+ + // it will schedule on the currently empty on-demand even though max-skew is violated as it improves max-skew + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(3, 1)) + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + MakePods(5, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + TopologySpreadConstraints: topology, + })..., + ) + ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5, 4)) + }) + }) + Context("Pod Affinity", func() { It("should schedule a pod with empty pod affinity and anti-affinity", func() { Skip("enable after pod-affinity is finished") @@ -2035,6 +2381,11 @@ func ExpectSkew(ctx context.Context, c client.Client, namespace string, constrai skew[key]++ } } + if constraint.TopologyKey == v1alpha5.LabelCapacityType { + if key, ok := node.Labels[constraint.TopologyKey]; ok { + skew[key]++ + } + } } } } diff --git a/pkg/controllers/provisioning/scheduling/topology.go b/pkg/controllers/provisioning/scheduling/topology.go index e7a7ed24f0ed..64ea769326cb 100644 --- a/pkg/controllers/provisioning/scheduling/topology.go +++ b/pkg/controllers/provisioning/scheduling/topology.go @@ -83,6 +83,8 @@ func (t *Topology) computeCurrentTopology(ctx context.Context, constraints *v1al return t.computeHostnameTopology(topologyGroup, constraints) case v1.LabelTopologyZone: return t.computeZonalTopology(ctx, constraints, topologyGroup) + case v1alpha5.LabelCapacityType: + return t.computeLabelCapacity(ctx, constraints, topologyGroup) default: return nil } @@ -119,6 +121,14 @@ func (t *Topology) computeZonalTopology(ctx context.Context, constraints *v1alph return nil } +func (t *Topology) computeLabelCapacity(ctx context.Context, constraints *v1alpha5.Constraints, topologyGroup *TopologyGroup) error { + topologyGroup.Register(constraints.Requirements.CapacityTypes().UnsortedList()...) 
+ if err := t.countMatchingPods(ctx, topologyGroup); err != nil { + return fmt.Errorf("getting matching pods, %w", err) + } + return nil +} + func (t *Topology) countMatchingPods(ctx context.Context, topologyGroup *TopologyGroup) error { pods := &v1.PodList{} if err := t.kubeClient.List(ctx, pods, TopologyListOptions(topologyGroup.Pods[0].Namespace, &topologyGroup.Constraint)); err != nil { diff --git a/pkg/controllers/selection/controller.go b/pkg/controllers/selection/controller.go index e55926c3d934..b7c84bb520fa 100644 --- a/pkg/controllers/selection/controller.go +++ b/pkg/controllers/selection/controller.go @@ -131,7 +131,7 @@ func validate(p *v1.Pod) error { func validateTopology(pod *v1.Pod) (errs error) { for _, constraint := range pod.Spec.TopologySpreadConstraints { - if supported := sets.NewString(v1.LabelHostname, v1.LabelTopologyZone); !supported.Has(constraint.TopologyKey) { + if supported := sets.NewString(v1.LabelHostname, v1.LabelTopologyZone, v1alpha5.LabelCapacityType); !supported.Has(constraint.TopologyKey) { errs = multierr.Append(errs, fmt.Errorf("unsupported topology key, %s not in %s", constraint.TopologyKey, supported)) } } diff --git a/website/content/en/preview/provisioner.md b/website/content/en/preview/provisioner.md index 97e949e1988a..c04fa1ee886e 100644 --- a/website/content/en/preview/provisioner.md +++ b/website/content/en/preview/provisioner.md @@ -168,6 +168,7 @@ Karpenter supports specifying capacity type, which is analogous to [EC2 purchase Karpenter prioritizes Spot offerings if the provisioner allows Spot and on-demand instances. If the provider API (e.g. EC2 Fleet's API) indicates Spot capacity is unavailable, Karpenter caches that result across all attempts to provision EC2 capacity for that instance type and zone for the next 45 seconds. If there are no other possible offerings available for Spot, Karpenter will attempt to provision on-demand instances, generally within milliseconds. 
+Karpenter also allows `karpenter.sh/capacity-type` to be used as a topology key for enforcing topology-spread. ## spec.kubeletConfiguration diff --git a/website/content/en/preview/tasks/scheduling.md b/website/content/en/preview/tasks/scheduling.md index 8077a99267c1..e29fa85c6a71 100755 --- a/website/content/en/preview/tasks/scheduling.md +++ b/website/content/en/preview/tasks/scheduling.md @@ -229,17 +229,29 @@ spec: labelSelector: matchLabels: dev: jjones + - maxSkew: 1 + topologyKey: "karpenter.sh/capacity-type" + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + dev: jjones ``` Adding this to your podspec would result in: -* Pods being spread across both zones and hosts (`topologyKey`). +* Pods being spread across zones, hosts, and capacity-type (`topologyKey`). * The `dev` `labelSelector` will include all pods with the label of `dev=jjones` in topology calculations. It is recommended to use a selector to match all pods in a deployment. * No more than one pod difference in the number of pods on each host (`maxSkew`). For example, if there were three nodes and five pods the pods could be spread 1, 2, 2 or 2, 1, 2 and so on. If instead the spread were 5, pods could be 5, 0, 0 or 3, 2, 0, or 2, 1, 2 and so on. * Karpenter is always able to improve skew by launching new nodes in the right zones. Therefore, `whenUnsatisfiable` does not change provisioning behavior. +The three `topologyKey` values that Karpenter supports are: +- `topology.kubernetes.io/zone` +- `kubernetes.io/hostname` +- `karpenter.sh/capacity-type` + + See [Pod Topology Spread Constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/) for details. ## Persistent Volume Topology