fix: do not consider provisioners without a provider (aws#491)
njtran authored Aug 29, 2023
1 parent 6bca755 commit 8ad12e5
Showing 5 changed files with 143 additions and 2 deletions.
3 changes: 3 additions & 0 deletions pkg/controllers/deprovisioning/helpers.go
@@ -209,6 +209,9 @@ func buildNodePoolMap(ctx context.Context, kubeClient client.Client, cloudProvid
 		if err != nil {
 			return nil, nil, fmt.Errorf("listing instance types for %s, %w", np.Name, err)
 		}
+		if len(nodePoolInstanceTypes) == 0 {
+			continue
+		}
 		nodePoolToInstanceTypesMap[key] = map[string]*cloudprovider.InstanceType{}
 		for _, it := range nodePoolInstanceTypes {
 			nodePoolToInstanceTypesMap[key][it.Name] = it
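
For illustration, here is the added guard in isolation: a minimal, runnable Go sketch of the same "skip pools that resolve no instance types" pattern. The instanceType struct and the pool data are hypothetical stand-ins, not Karpenter's types.

	package main

	import "fmt"

	// instanceType stands in for cloudprovider.InstanceType.
	type instanceType struct{ name string }

	func main() {
		pools := map[string][]instanceType{
			"default":     {{name: "m5.large"}, {name: "m5.xlarge"}},
			"no-provider": {}, // a provisioner without a provider resolves nothing
		}
		poolToTypes := map[string]map[string]instanceType{}
		for pool, types := range pools {
			// Without this guard, "no-provider" would get an empty entry and
			// later be treated as a candidate it can never actually satisfy.
			if len(types) == 0 {
				continue
			}
			poolToTypes[pool] = map[string]instanceType{}
			for _, it := range types {
				poolToTypes[pool][it.name] = it
			}
		}
		fmt.Println(len(poolToTypes)) // 1: only "default" is kept
	}
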
123 changes: 123 additions & 0 deletions pkg/controllers/deprovisioning/suite_test.go
@@ -268,6 +268,82 @@ var _ = Describe("Replace Nodes", func() {
 		// and delete the old one
 		ExpectNotFound(ctx, env.Client, machine, node)
 	})
+	It("can replace nodes if another provisioner has no node template", func() {
+		labels := map[string]string{
+			"app": "test",
+		}
+		// create our RS so we can link a pod to it
+		rs := test.ReplicaSet()
+		ExpectApplied(ctx, env.Client, rs)
+		Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed())
+
+		pod := test.Pod(test.PodOptions{
+			ObjectMeta: metav1.ObjectMeta{Labels: labels,
+				OwnerReferences: []metav1.OwnerReference{
+					{
+						APIVersion:         "apps/v1",
+						Kind:               "ReplicaSet",
+						Name:               rs.Name,
+						UID:                rs.UID,
+						Controller:         ptr.Bool(true),
+						BlockOwnerDeletion: ptr.Bool(true),
+					},
+				}}})
+
+		prov := test.Provisioner(test.ProvisionerOptions{
+			Consolidation: &v1alpha5.Consolidation{Enabled: ptr.Bool(true)},
+		})
+		machine, node := test.MachineAndNode(v1alpha5.Machine{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{
+					v1alpha5.ProvisionerNameLabelKey: prov.Name,
+					v1.LabelInstanceTypeStable:       mostExpensiveInstance.Name,
+					v1alpha5.LabelCapacityType:       mostExpensiveOffering.CapacityType,
+					v1.LabelTopologyZone:             mostExpensiveOffering.Zone,
+				},
+			},
+			Status: v1alpha5.MachineStatus{
+				ProviderID:  test.RandomProviderID(),
+				Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("32")},
+			},
+		})
+
+		noTemplateProv := test.Provisioner()
+		noTemplateProv.Spec.Provider = nil
+		ExpectApplied(ctx, env.Client, rs, pod, node, machine, prov, noTemplateProv)
+
+		// bind pods to node
+		ExpectManualBinding(ctx, env.Client, pod, node)
+
+		// inform cluster state about nodes and machines
+		ExpectMakeInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine})
+
+		fakeClock.Step(10 * time.Minute)
+
+		// consolidation won't delete the old machine until the new machine is ready
+		var wg sync.WaitGroup
+		ExpectTriggerVerifyAction(&wg)
+		ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1)
+		ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{})
+		wg.Wait()
+
+		// Cascade any deletion of the machine to the node
+		ExpectMachinesCascadeDeletion(ctx, env.Client, machine)
+
+		// should create a new machine as there is a cheaper one that can hold the pod
+		machines := ExpectMachines(ctx, env.Client)
+		nodes := ExpectNodes(ctx, env.Client)
+		Expect(machines).To(HaveLen(1))
+		Expect(nodes).To(HaveLen(1))
+
+		// Expect that the new machine does not request the most expensive instance type
+		Expect(machines[0].Name).ToNot(Equal(machine.Name))
+		Expect(scheduling.NewNodeSelectorRequirements(machines[0].Spec.Requirements...).Has(v1.LabelInstanceTypeStable)).To(BeTrue())
+		Expect(scheduling.NewNodeSelectorRequirements(machines[0].Spec.Requirements...).Get(v1.LabelInstanceTypeStable).Has(mostExpensiveInstance.Name)).To(BeFalse())
+
+		// and delete the old one
+		ExpectNotFound(ctx, env.Client, machine, node)
+	})
 	It("can replace nodes, considers PDB", func() {
 		labels := map[string]string{
 			"app": "test",
@@ -1012,6 +1088,53 @@ var _ = Describe("Delete Node", func() {
 		// and delete the old one
 		ExpectNotFound(ctx, env.Client, machine2, node2)
 	})
+	It("can delete nodes if another provisioner has no node template", func() {
+		labels := map[string]string{
+			"app": "test",
+		}
+		// create our RS so we can link a pod to it
+		rs := test.ReplicaSet()
+		ExpectApplied(ctx, env.Client, rs)
+		pods := test.Pods(3, test.PodOptions{
+			ObjectMeta: metav1.ObjectMeta{Labels: labels,
+				OwnerReferences: []metav1.OwnerReference{
+					{
+						APIVersion:         "apps/v1",
+						Kind:               "ReplicaSet",
+						Name:               rs.Name,
+						UID:                rs.UID,
+						Controller:         ptr.Bool(true),
+						BlockOwnerDeletion: ptr.Bool(true),
+					},
+				}}})
+		noTemplateProv := test.Provisioner()
+		noTemplateProv.Spec.Provider = nil
+		ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], machine1, node1, machine2, node2, prov, noTemplateProv)
+
+		// bind pods to node
+		ExpectManualBinding(ctx, env.Client, pods[0], node1)
+		ExpectManualBinding(ctx, env.Client, pods[1], node1)
+		ExpectManualBinding(ctx, env.Client, pods[2], node2)
+
+		// inform cluster state about nodes and machines
+		ExpectMakeInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node1, node2}, []*v1alpha5.Machine{machine1, machine2})
+
+		fakeClock.Step(10 * time.Minute)
+
+		var wg sync.WaitGroup
+		ExpectTriggerVerifyAction(&wg)
+		ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{})
+		wg.Wait()
+
+		// Cascade any deletion of the machine to the node
+		ExpectMachinesCascadeDeletion(ctx, env.Client, machine2)
+
+		// we don't need a new node, but we should evict everything off of node2, which only has a single pod
+		Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1))
+		Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1))
+		// and delete the old one
+		ExpectNotFound(ctx, env.Client, machine2, node2)
+	})
 	It("can delete nodes, when non-Karpenter capacity can fit pods", func() {
 		labels := map[string]string{
 			"app": "test",
4 changes: 4 additions & 0 deletions pkg/controllers/provisioning/provisioner.go
@@ -234,6 +234,10 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 		if err != nil {
 			return nil, fmt.Errorf("getting instance types, %w", err)
 		}
+		if len(instanceTypeOptions) == 0 {
+			logging.FromContext(ctx).With(lo.Ternary(nodePool.IsProvisioner, "provisioner", "nodepool"), nodePool.Name).Info("skipping, no resolved instance types found")
+			continue
+		}
 		instanceTypes[nodepoolutil.Key{Name: nodePool.Name, IsProvisioner: nodePool.IsProvisioner}] = append(instanceTypes[nodepoolutil.Key{Name: nodePool.Name, IsProvisioner: nodePool.IsProvisioner}], instanceTypeOptions...)

 		// Construct Topology Domains
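
For reference, the new log line picks its structured key with lo.Ternary from github.com/samber/lo, which returns its second argument when the condition is true and its third otherwise. A minimal runnable sketch (the pool name is illustrative):

	package main

	import (
		"fmt"

		"github.com/samber/lo"
	)

	func main() {
		for _, isProvisioner := range []bool{true, false} {
			// lo.Ternary(cond, a, b) is a compact if/else expression.
			key := lo.Ternary(isProvisioner, "provisioner", "nodepool")
			fmt.Printf("%s=%s\n", key, "example-pool")
		}
	}
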
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/scheduling/suite_test.go
@@ -108,7 +108,7 @@ var _ = BeforeEach(func() {
 	}}})
 	// reset instance types
 	newCP := fake.CloudProvider{}
-	cloudProvider.InstanceTypes, _ = newCP.GetInstanceTypes(context.Background(), nil)
+	cloudProvider.InstanceTypes, _ = newCP.GetInstanceTypes(ctx, nil)
 	cloudProvider.CreateCalls = nil
 	pscheduling.ResetDefaultStorageClass()
 })
13 changes: 12 additions & 1 deletion pkg/controllers/provisioning/suite_test.go
@@ -78,7 +78,7 @@ var _ = BeforeSuite(func() {
 	nodeController = informer.NewNodeController(env.Client, cluster)
 	prov = provisioning.NewProvisioner(env.Client, corev1.NewForConfigOrDie(env.Config), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster)
 	daemonsetController = informer.NewDaemonSetController(env.Client, cluster)
-	instanceTypes, _ := cloudProvider.GetInstanceTypes(context.Background(), nil)
+	instanceTypes, _ := cloudProvider.GetInstanceTypes(ctx, nil)
 	instanceTypeMap = map[string]*cloudprovider.InstanceType{}
 	for _, it := range instanceTypes {
 		instanceTypeMap[it.Name] = it
@@ -109,6 +109,17 @@ var _ = Describe("Provisioning", func() {
 		Expect(len(nodes.Items)).To(Equal(1))
 		ExpectScheduled(ctx, env.Client, pod)
 	})
+	It("should continue with provisioning when at least one provisioner doesn't have resolved instance types", func() {
+		provNotDefined := test.Provisioner()
+		provNotDefined.Spec.ProviderRef = nil
+		ExpectApplied(ctx, env.Client, test.Provisioner(), provNotDefined)
+		pod := test.UnschedulablePod()
+		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
+		nodes := &v1.NodeList{}
+		Expect(env.Client.List(ctx, nodes)).To(Succeed())
+		Expect(len(nodes.Items)).To(Equal(1))
+		ExpectScheduled(ctx, env.Client, pod)
+	})
 	It("should ignore provisioners that are deleting", func() {
 		provisioner := test.Provisioner()
 		ExpectApplied(ctx, env.Client, provisioner)
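
Note that the tests nil different fields: the deprovisioning tests clear Spec.Provider (the inline provider block), while the provisioning test above clears Spec.ProviderRef (the reference to a node template). A hypothetical miniature of that relationship — the types and resolution logic below are illustrative assumptions, not the v1alpha5 API:

	package main

	import "fmt"

	type providerRef struct{ Name string }

	// provisionerSpec sketches the two ways a provisioner can carry provider
	// configuration; with both nil, no instance types can be resolved.
	type provisionerSpec struct {
		Provider    *string      // stand-in for the inline raw provider block
		ProviderRef *providerRef // stand-in for the node-template reference
	}

	func resolvedInstanceTypes(spec provisionerSpec) []string {
		if spec.Provider == nil && spec.ProviderRef == nil {
			return nil // nothing to resolve against
		}
		return []string{"m5.large", "m5.xlarge"} // illustrative
	}

	func main() {
		// After this commit, an empty result is skipped instead of breaking
		// scheduling and deprovisioning for every other provisioner.
		fmt.Println(len(resolvedInstanceTypes(provisionerSpec{}))) // 0
	}
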
