Termination now only evicts pods that don't tolerate Unschedulable #479

Merged: 4 commits, Jun 30, 2021
27 changes: 8 additions & 19 deletions pkg/controllers/allocation/filter.go
@@ -22,7 +22,6 @@ import (
"github.com/awslabs/karpenter/pkg/utils/functional"
"github.com/awslabs/karpenter/pkg/utils/pod"
"github.com/awslabs/karpenter/pkg/utils/ptr"
"go.uber.org/multierr"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -44,22 +43,22 @@ func (f *Filter) GetProvisionablePods(ctx context.Context, provisioner *v1alpha2

// 2. Filter pods that aren't provisionable
provisionable := []*v1.Pod{}
for _, pod := range pods.Items {
for _, p := range pods.Items {
if err := functional.ValidateAll(
func() error { return f.isUnschedulable(&pod) },
func() error { return f.matchesProvisioner(&pod, provisioner) },
func() error { return f.hasSupportedSchedulingConstraints(&pod) },
func() error { return f.toleratesTaints(&pod, provisioner) },
func() error { return f.withValidConstraints(ctx, &pod, provisioner) },
func() error { return f.isUnschedulable(&p) },
func() error { return f.matchesProvisioner(&p, provisioner) },
func() error { return f.hasSupportedSchedulingConstraints(&p) },
func() error { return pod.ToleratesTaints(&p.Spec, provisioner.Spec.Taints...) },
func() error { return f.withValidConstraints(ctx, &p, provisioner) },
); err != nil {
zap.S().Debugf("Ignored pod %s/%s when allocating for provisioner %s/%s, %s",
pod.Name, pod.Namespace,
p.Name, p.Namespace,
provisioner.Name, provisioner.Namespace,
err.Error(),
)
continue
}
provisionable = append(provisionable, ptr.Pod(pod))
provisionable = append(provisionable, ptr.Pod(p))
}
return provisionable, nil
}
@@ -102,16 +101,6 @@ func (f *Filter) matchesProvisioner(pod *v1.Pod, provisioner *v1alpha2.Provision
return fmt.Errorf("matched another provisioner, %s/%s", name, namespace)
}

func (f *Filter) toleratesTaints(p *v1.Pod, provisioner *v1alpha2.Provisioner) error {
var err error
for _, taint := range provisioner.Spec.Taints {
if !pod.ToleratesTaint(&p.Spec, taint) {
err = multierr.Append(err, fmt.Errorf("did not tolerate %s=%s:%s", taint.Key, taint.Value, taint.Effect))
}
}
return err
}

func (f *Filter) withValidConstraints(ctx context.Context, pod *v1.Pod, provisioner *v1alpha2.Provisioner) error {
if err := provisioner.Spec.Constraints.WithOverrides(pod).Validate(ctx); err != nil {
return fmt.Errorf("invalid constraints, %w", err)
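
For context, a minimal standalone sketch of what the consolidated taint check now does at this call site. The helper is mirrored inline so the snippet compiles on its own, and the example taint is illustrative rather than taken from the PR: a pod with no matching toleration gets back an aggregated error and is skipped by GetProvisionablePods.

package main

import (
	"fmt"

	"go.uber.org/multierr"
	v1 "k8s.io/api/core/v1"
)

// toleratesTaints mirrors the behavior of the pod.ToleratesTaints helper this
// PR introduces: nil when the pod spec tolerates every taint, otherwise an
// error aggregating each taint that is not tolerated.
func toleratesTaints(spec *v1.PodSpec, taints ...v1.Taint) (err error) {
	for _, taint := range taints {
		tolerated := false
		for _, toleration := range spec.Tolerations {
			if toleration.ToleratesTaint(&taint) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			err = multierr.Append(err, fmt.Errorf("did not tolerate %s=%s:%s", taint.Key, taint.Value, taint.Effect))
		}
	}
	return err
}

func main() {
	// Illustrative provisioner taint; any pod without a matching toleration
	// is now skipped by the filter with the error printed below.
	taints := []v1.Taint{{Key: "example.com/dedicated", Value: "gpu", Effect: v1.TaintEffectNoSchedule}}
	fmt.Println(toleratesTaints(&v1.PodSpec{}, taints...))
	// did not tolerate example.com/dedicated=gpu:NoSchedule
}
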
19 changes: 1 addition & 18 deletions pkg/controllers/reallocation/utilization.go
@@ -32,24 +32,6 @@ type Utilization struct {
kubeClient client.Client
}

func (u *Utilization) Reconcile(ctx context.Context, provisioner *v1alpha2.Provisioner) error {
// 1. Set TTL on TTLable Nodes
if err := u.markUnderutilized(ctx, provisioner); err != nil {
return fmt.Errorf("adding ttl and underutilized label, %w", err)
}

// 2. Remove TTL from Utilized Nodes
if err := u.clearUnderutilized(ctx, provisioner); err != nil {
return fmt.Errorf("removing ttl from node, %w", err)
}

// 3. Mark any Node past TTL as expired
if err := u.terminateExpired(ctx, provisioner); err != nil {
return fmt.Errorf("marking nodes terminable, %w", err)
}
return nil
}

// markUnderutilized adds a TTL to underutilized nodes
func (u *Utilization) markUnderutilized(ctx context.Context, provisioner *v1alpha2.Provisioner) error {
ttlable := []*v1.Node{}
@@ -129,6 +111,7 @@ func (u *Utilization) terminateExpired(ctx context.Context, provisioner *v1alpha
}

// 2. Delete node if past TTL
// This will kick off work for the termination controller to gracefully shut down the node.
for _, node := range nodes {
if utilsnode.IsPastTTL(node) {
if err := u.kubeClient.Delete(ctx, node); err != nil {
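
The comment added above relies on the finalizer hand-off between controllers: deleting the Node object only marks it for deletion while the Karpenter finalizer is present, and the termination controller reconciles on that state. A rough sketch of the condition it acts on; the finalizer value and the awaitingTermination helper are illustrative placeholders, not code from this PR.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Placeholder for v1alpha2.KarpenterFinalizer; the real constant lives in the
// karpenter API package and its value may differ.
const karpenterFinalizer = "karpenter.sh/termination"

// awaitingTermination is the state the termination controller acts on: the
// Node has been deleted (DeletionTimestamp is set) but is still pinned by the
// karpenter finalizer, so it must be cordoned, drained, and terminated before
// the finalizer is removed and the object actually disappears.
func awaitingTermination(node *v1.Node) bool {
	if node.DeletionTimestamp.IsZero() {
		return false
	}
	for _, f := range node.Finalizers {
		if f == karpenterFinalizer {
			return true
		}
	}
	return false
}

func main() {
	now := metav1.Now()
	node := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Name:              "node-1",
		Finalizers:        []string{karpenterFinalizer},
		DeletionTimestamp: &now,
	}}
	fmt.Println("awaiting termination:", awaitingTermination(node))
	// awaiting termination: true
}
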
2 changes: 1 addition & 1 deletion pkg/controllers/termination/controller.go
@@ -70,7 +70,7 @@ func (c *Controller) Reconcile(ctx context.Context, object client.Object) (recon
// 4. If fully drained, terminate the node
if drained {
if err := c.terminator.terminate(ctx, node); err != nil {
return reconcile.Result{}, fmt.Errorf("terminating nodes, %w", err)
return reconcile.Result{}, fmt.Errorf("terminating node %s, %w", node.Name, err)
}
}
return reconcile.Result{Requeue: !drained}, nil
23 changes: 23 additions & 0 deletions pkg/controllers/termination/suite_test.go
@@ -22,11 +22,13 @@ import (
"github.com/awslabs/karpenter/pkg/cloudprovider/fake"
"github.com/awslabs/karpenter/pkg/cloudprovider/registry"
"github.com/awslabs/karpenter/pkg/test"
v1 "k8s.io/api/core/v1"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"

. "github.com/awslabs/karpenter/pkg/test/expectations"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func TestAPIs(t *testing.T) {
@@ -78,5 +80,26 @@ var _ = Describe("Termination", func() {
Expect(env.Client.Delete(ctx, node)).To(Succeed())
ExpectNotFound(env.Client, node)
})
It("should not evict pods that tolerate unschedulable taint", func() {
node := test.NodeWith(test.NodeOptions{
Finalizers: []string{v1alpha2.KarpenterFinalizer},
Labels: map[string]string{
v1alpha2.ProvisionerNameLabelKey: "default",
v1alpha2.ProvisionerNamespaceLabelKey: "default",
},
})
pod := test.Pod(test.PodOptions{
NodeName: node.Name,
Tolerations: []v1.Toleration{{Key: v1.TaintNodeUnschedulable, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule}},
})
ExpectCreatedWithStatus(env.Client, node)
ExpectCreatedWithStatus(env.Client, pod)

pods := &v1.PodList{}
Expect(env.Client.Delete(ctx, node)).To(Succeed())
Expect(env.Client.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name})).To(Succeed())
Expect(pods.Items).To(HaveLen(1))
ExpectNotFound(env.Client, node)
})
})
})
53 changes: 38 additions & 15 deletions pkg/controllers/termination/terminate.go
@@ -40,9 +40,11 @@ type Terminator struct {

// cordon cordons a node
func (t *Terminator) cordon(ctx context.Context, node *v1.Node) error {
// 1. Check if node is already cordoned
if node.Spec.Unschedulable {
return nil
}
// 2. Cordon node
persisted := node.DeepCopy()
node.Spec.Unschedulable = true
if err := t.kubeClient.Patch(ctx, node, client.MergeFrom(persisted)); err != nil {
@@ -52,28 +54,33 @@ func (t *Terminator) cordon(ctx context.Context, node *v1.Node) error {
return nil
}

// drain evicts pods from the node and returns true when fully drained
// drain evicts pods from the node and returns true when all pods are evicted
func (t *Terminator) drain(ctx context.Context, node *v1.Node) (bool, error) {
// 1. Get pods on node
pods, err := t.getPods(ctx, node)
if err != nil {
return false, fmt.Errorf("listing pods for node %s, %w", node.Name, err)
}
// 2. Evict pods on node
empty := true
for _, p := range pods {
if !pod.IsOwnedByDaemonSet(p) {
empty = false
if err := t.coreV1Client.Pods(p.Namespace).Evict(ctx, &v1beta1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: p.Name,
},
}); err != nil {
zap.S().Debugf("Continuing after failing to evict pods from node %s, %s", node.Name, err.Error())
}
// 2. Separate pods as non-critical and critical
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
nonCritical := []*v1.Pod{}
critical := []*v1.Pod{}
for _, pod := range pods {
if pod.Spec.PriorityClassName == "system-cluster-critical" || pod.Spec.PriorityClassName == "system-node-critical" {
critical = append(critical, pod)
} else {
nonCritical = append(nonCritical, pod)
}
}
return empty, nil
// 3. Evict non-critical pods
if !t.evictPods(ctx, nonCritical) {
return false, nil
}
// 4. Evict critical pods once all non-critical pods are evicted
if !t.evictPods(ctx, critical) {
return false, nil
}
return true, nil
}

// terminate terminates the node then removes the finalizer to delete the node
@@ -92,7 +99,23 @@ func (t *Terminator) terminate(ctx context.Context, node *v1.Node) error {
return nil
}

// getPods returns a list of pods scheduled to a node
// evictPods returns true if there are no evictable pods
func (t *Terminator) evictPods(ctx context.Context, pods []*v1.Pod) bool {
empty := true
for _, p := range pods {
// If a pod tolerates the unschedulable taint, don't evict it as it could reschedule back onto the node
if err := pod.ToleratesTaints(&p.Spec, v1.Taint{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}); err != nil {
if err := t.coreV1Client.Pods(p.Namespace).Evict(ctx, &v1beta1.Eviction{ObjectMeta: metav1.ObjectMeta{Name: p.Name}}); err != nil {
// If an eviction fails, we need to eventually try again
zap.S().Debugf("Continuing after failing to evict pod %s from node %s, %s", p.Name, p.Spec.NodeName, err.Error())
empty = false
}
}
}
return empty
}

// getPods returns a list of pods scheduled to a node based on some filters
func (t *Terminator) getPods(ctx context.Context, node *v1.Node) ([]*v1.Pod, error) {
pods := &v1.PodList{}
if err := t.kubeClient.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
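
A standalone illustration of the two-phase drain order introduced above. The splitByCriticality helper is an illustrative stand-in for the inline partition in drain: everything that is not system-cluster-critical or system-node-critical is evicted first, and critical pods only once the rest are gone, following the graceful node shutdown ordering linked in the diff.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// splitByCriticality mirrors the partition drain now performs: pods in the
// system-cluster-critical and system-node-critical priority classes are set
// aside so they are only evicted after every other pod has been evicted.
func splitByCriticality(pods []*v1.Pod) (nonCritical, critical []*v1.Pod) {
	for _, p := range pods {
		switch p.Spec.PriorityClassName {
		case "system-cluster-critical", "system-node-critical":
			critical = append(critical, p)
		default:
			nonCritical = append(nonCritical, p)
		}
	}
	return nonCritical, critical
}

func main() {
	pods := []*v1.Pod{
		{Spec: v1.PodSpec{PriorityClassName: "system-node-critical"}},
		{Spec: v1.PodSpec{PriorityClassName: "high-priority"}},
		{Spec: v1.PodSpec{}},
	}
	nonCritical, critical := splitByCriticality(pods)
	fmt.Printf("evict first: %d, evict last: %d\n", len(nonCritical), len(critical))
	// evict first: 2, evict last: 1
}
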
36 changes: 15 additions & 21 deletions pkg/utils/pod/scheduling.go
@@ -15,6 +15,9 @@ limitations under the License.
package pod

import (
"fmt"

"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
)
@@ -31,7 +34,7 @@ func FailedToSchedule(pod *v1.Pod) bool {
// IsSchedulable returns true if the pod can schedule to the node
func IsSchedulable(pod *v1.PodSpec, node *v1.Node) bool {
// Tolerate Taints
if !ToleratesAllTaints(pod, node.Spec.Taints) {
if err := ToleratesTaints(pod, node.Spec.Taints...); err != nil {
return false
}
// Match Node Selector labels
@@ -42,31 +45,22 @@ func IsSchedulable(pod *v1.PodSpec, node *v1.Node) bool {
return true
}

// ToleratesAllTaints returns true if the pod tolerates all taints
func ToleratesAllTaints(pod *v1.PodSpec, taints []v1.Taint) bool {
// ToleratesTaints returns an error if the pod does not tolerate the taints
// https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#concepts
func ToleratesTaints(spec *v1.PodSpec, taints ...v1.Taint) (err error) {
for _, taint := range taints {
if !ToleratesTaint(pod, taint) {
return false
if !Tolerates(spec.Tolerations, taint) {
err = multierr.Append(err, fmt.Errorf("did not tolerate %s=%s:%s", taint.Key, taint.Value, taint.Effect))
}
}
return true
return err
}

// ToleratesTaint returns true if the pod tolerates the taint
// https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#concepts
func ToleratesTaint(pod *v1.PodSpec, taint v1.Taint) bool {
// Soft constraints are consider to be always tolerated.
if taint.Effect == v1.TaintEffectPreferNoSchedule {
return true
}
for _, toleration := range pod.Tolerations {
if toleration.Key == taint.Key {
if toleration.Operator == v1.TolerationOpExists {
return true
}
if toleration.Operator == v1.TolerationOpEqual && toleration.Value == taint.Value {
return true
}
// Tolerates returns true if one of the tolerations tolerate the taint
func Tolerates(tolerations []v1.Toleration, taint v1.Taint) bool {
for _, t := range tolerations {
if t.ToleratesTaint(&taint) {
return true
}
}
return false
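
Finally, a small usage sketch of the check the termination controller now relies on, using only upstream corev1 types so it runs without the karpenter packages: a pod carrying an Exists toleration for node.kubernetes.io/unschedulable:NoSchedule, like the one in the new test above, tolerates the cordon taint and is therefore not evicted during drain.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// The taint a cordoned node carries; v1.TaintNodeUnschedulable is
	// "node.kubernetes.io/unschedulable".
	unschedulable := v1.Taint{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}

	// A toleration like the one in the new termination test: such a pod
	// could reschedule right back onto the cordoned node, so evicting it
	// would be pointless.
	toleration := v1.Toleration{
		Key:      v1.TaintNodeUnschedulable,
		Operator: v1.TolerationOpExists,
		Effect:   v1.TaintEffectNoSchedule,
	}

	// Toleration.ToleratesTaint is the corev1 method the new Tolerates
	// helper delegates to.
	fmt.Println("tolerates cordon taint:", toleration.ToleratesTaint(&unschedulable))
	// tolerates cordon taint: true
}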