Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add in cloudprovider disruption methods #1577

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions hack/kwok/requirements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ for Version in $(seq 0 1); do
yqVersion="$Version" yq eval '.spec.versions[env(yqVersion)].schema.openAPIV3Schema.properties.spec.properties.template.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [
{"message": "label domain \"karpenter.kwok.sh\" is restricted", "rule": "self in [\"karpenter.kwok.sh/instance-cpu\", \"karpenter.kwok.sh/instance-memory\", \"karpenter.kwok.sh/instance-family\", \"karpenter.kwok.sh/instance-size\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.kwok.sh\")"}]' -i kwok/charts/crds/karpenter.sh_nodepools.yaml
done


# Add ExampleReason in KwoK CloudProvider
yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.disruption.properties.budgets.items.properties.reasons.items.enum += [ "ExampleReason" ]' -i kwok/charts/crds/karpenter.sh_nodepools.yaml
7 changes: 5 additions & 2 deletions kwok/charts/crds/karpenter.sh_nodepools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,16 @@ spec:
description: |-
Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
Otherwise, this will apply to each reason defined.
allowed reasons are Underutilized, Empty, and Drifted.
allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons.
items:
description: DisruptionReason defines valid reasons for disruption budgets.
description: |-
DisruptionReason defines valid reasons for disruption budgets.
CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
enum:
- Underutilized
- Empty
- Drifted
- ExampleReason
type: string
type: array
schedule:
Expand Down
6 changes: 4 additions & 2 deletions pkg/apis/crds/karpenter.sh_nodepools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,11 @@ spec:
description: |-
Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
Otherwise, this will apply to each reason defined.
allowed reasons are Underutilized, Empty, and Drifted.
allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons.
items:
description: DisruptionReason defines valid reasons for disruption budgets.
description: |-
DisruptionReason defines valid reasons for disruption budgets.
CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
enum:
- Underutilized
- Empty
Expand Down
3 changes: 2 additions & 1 deletion pkg/apis/v1/nodepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ type Disruption struct {
type Budget struct {
// Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
// Otherwise, this will apply to each reason defined.
// allowed reasons are Underutilized, Empty, and Drifted.
// allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons.
// +optional
Reasons []DisruptionReason `json:"reasons,omitempty"`
// Nodes dictates the maximum number of NodeClaims owned by this NodePool
Expand Down Expand Up @@ -129,6 +129,7 @@ const (
)

// DisruptionReason defines valid reasons for disruption budgets.
// CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
// +kubebuilder:validation:Enum={Underutilized,Empty,Drifted}
type DisruptionReason string

Expand Down
6 changes: 5 additions & 1 deletion pkg/cloudprovider/fake/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ type CloudProvider struct {
CreatedNodeClaims map[string]*v1.NodeClaim
Drifted cloudprovider.DriftReason
NodeClassGroupVersionKind []schema.GroupVersionKind

disruptionReasons []v1.DisruptionReason
}

func NewCloudProvider() *CloudProvider {
Expand Down Expand Up @@ -95,6 +97,8 @@ func (c *CloudProvider) Reset() {
Kind: "",
},
}

c.disruptionReasons = []v1.DisruptionReason{"ExampleReason"}
}

func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v1.NodeClaim, error) {
Expand Down Expand Up @@ -236,7 +240,7 @@ func (c *CloudProvider) GetInstanceTypes(_ context.Context, np *v1.NodePool) ([]
}

func (c *CloudProvider) DisruptionReasons() []v1.DisruptionReason {
return nil
return c.disruptionReasons
}

func (c *CloudProvider) Delete(_ context.Context, nc *v1.NodeClaim) error {
Expand Down
127 changes: 127 additions & 0 deletions pkg/controllers/disruption/cloudprovider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
Copyright The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package disruption

import (
"context"
"errors"

"sigs.k8s.io/controller-runtime/pkg/client"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/cloudprovider"
disruptionevents "sigs.k8s.io/karpenter/pkg/controllers/disruption/events"
"sigs.k8s.io/karpenter/pkg/controllers/provisioning"
"sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
"sigs.k8s.io/karpenter/pkg/controllers/state"
"sigs.k8s.io/karpenter/pkg/events"
)

// CloudProvider is a subreconciler that deletes candidates according to cloud provider specific reasons.
// This should be methods that are
type CloudProvider struct {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not convinced that we need a separate method definition here. Can't we just combine this with the drift implementation so that drift is just another one of these "reasons" that we pull from the CloudProvider. I'm asking because this looks exactly the same as the drift implementation at this point.

cloudprovider.CloudProvider
kubeClient client.Client
recorder events.Recorder
cluster *state.Cluster
provisioner *provisioning.Provisioner
}

func NewCloudProvider(kubeClient client.Client, cluster *state.Cluster, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, provisioner *provisioning.Provisioner) *CloudProvider {
return &CloudProvider{
CloudProvider: cloudProvider,
kubeClient: kubeClient,
cluster: cluster,
provisioner: provisioner,
}
}

// ShouldDisrupt is a predicate used to filter candidates
func (cp *CloudProvider) ShouldDisrupt(ctx context.Context, c *Candidate) bool {
for _, reason := range cp.DisruptionReasons() {
if c.NodeClaim.StatusConditions().Get(string(reason)).IsTrue() {
return true
}
}
return false
}

// ComputeCommand generates a disruption command given candidates
func (cp *CloudProvider) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]map[v1.DisruptionReason]int, candidates ...*Candidate) (Command, scheduling.Results, error) {
// Do a quick check through the candidates to see if they're empty.
// For each candidate that is empty with a nodePool allowing its disruption
// add it to the existing command.
empty := make([]*Candidate, 0, len(candidates))
for _, candidate := range candidates {
if len(candidate.reschedulablePods) > 0 {
continue
}
// If there's disruptions allowed for the candidate's nodepool,
// add it to the list of candidates, and decrement the budget.
if disruptionBudgetMapping[candidate.nodePool.Name][cp.Reason()] > 0 {
empty = append(empty, candidate)
disruptionBudgetMapping[candidate.nodePool.Name][cp.Reason()]--
}
}
// Disrupt all empty CloudProvidered candidates, as they require no scheduling simulations.
if len(empty) > 0 {
return Command{
candidates: empty,
}, scheduling.Results{}, nil
}

for _, candidate := range candidates {
// If the disruption budget doesn't allow this candidate to be disrupted,
// continue to the next candidate. We don't need to decrement any budget
// counter since CloudProvider commands can only have one candidate.
if disruptionBudgetMapping[candidate.nodePool.Name][cp.Reason()] == 0 {
continue
}
// Check if we need to create any NodeClaims.
results, err := SimulateScheduling(ctx, cp.kubeClient, cp.cluster, cp.provisioner, candidate)
if err != nil {
// if a candidate is now deleting, just retry
if errors.Is(err, errCandidateDeleting) {
continue
}
return Command{}, scheduling.Results{}, err
}
// Emit an event that we couldn't reschedule the pods on the node.
if !results.AllNonPendingPodsScheduled() {
cp.recorder.Publish(disruptionevents.Blocked(candidate.Node, candidate.NodeClaim, results.NonPendingPodSchedulingErrors())...)
continue
}

return Command{
candidates: []*Candidate{candidate},
replacements: results.NewNodeClaims,
}, results, nil
}
return Command{}, scheduling.Results{}, nil
}

func (cp *CloudProvider) Reason() v1.DisruptionReason {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just use the actual reason that was passed by the CloudProvider?

return "CloudProviderReason"
}

func (cp *CloudProvider) Class() string {
return EventualDisruptionClass
}

func (cp *CloudProvider) ConsolidationType() string {
return ""
}
Loading
Loading