Check DoNotEvict after filtering evictable pods to ensure termination can complete. #1294

Merged · 6 commits · Feb 9, 2022
6 changes: 3 additions & 3 deletions pkg/controllers/termination/eviction.go
@@ -49,7 +49,7 @@ func NewEvictionQueue(ctx context.Context, coreV1Client corev1.CoreV1Interface)

coreV1Client: coreV1Client,
}
go queue.Start(ctx)
go queue.Start(logging.WithLogger(ctx, logging.FromContext(ctx).Named("eviction")))
return queue
}

@@ -92,11 +92,11 @@ func (e *EvictionQueue) evict(ctx context.Context, nn types.NamespacedName) bool
ObjectMeta: metav1.ObjectMeta{Name: nn.Name, Namespace: nn.Namespace},
})
if errors.IsInternalError(err) { // 500
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB misconfiguration error.", nn.String())
logging.FromContext(ctx).Errorf("Could not evict pod %s due to PDB misconfiguration error.", nn.String())
return false
}
if errors.IsTooManyRequests(err) { // 429
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB violation.", nn.String())
logging.FromContext(ctx).Debugf("Did not to evict pod %s due to PDB violation.", nn.String())
return false
}
if errors.IsNotFound(err) { // 404
36 changes: 33 additions & 3 deletions pkg/controllers/termination/suite_test.go
@@ -38,6 +38,7 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
. "knative.dev/pkg/logging/testing"
"knative.dev/pkg/ptr"
)

var ctx context.Context
@@ -160,6 +161,38 @@ var _ = Describe("Termination", func() {
ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node))
ExpectNotFound(ctx, env.Client, node)
})
It("should delete nodes that have do-not-evict on pods for which it does not apply", func() {
ExpectCreated(ctx, env.Client, node)
node = ExpectNodeExists(ctx, env.Client, node.Name)
pods := []*v1.Pod{
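// Pod with the do-not-evict annotation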
test.Pod(test.PodOptions{
NodeName: node.Name,
ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}},
}),
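// Pod that tolerates the unschedulable taint, so it is not considered evictable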
test.Pod(test.PodOptions{
NodeName: node.Name,
Tolerations: []v1.Toleration{{Operator: v1.TolerationOpExists}},
}),
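// Pod owned by the node itself, as a static mirror pod would be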
test.Pod(test.PodOptions{
NodeName: node.Name,
ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{Kind: "Node", APIVersion: "v1", Name: node.Name, UID: node.UID}}},
}),
}
for _, pod := range pods {
ExpectCreated(ctx, env.Client, pod)
}
// Trigger eviction
Expect(env.Client.Delete(ctx, node)).To(Succeed())
for _, pod := range pods {
Expect(env.Client.Delete(ctx, pod, &client.DeleteOptions{GracePeriodSeconds: ptr.Int64(30)})).To(Succeed())
}
ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node))
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Simulate stuck terminating
injectabletime.Now = func() time.Time { return time.Now().Add(1 * time.Minute) }
ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node))
ExpectNotFound(ctx, env.Client, node)
})
It("should fail to evict pods that violate a PDB", func() {
minAvailable := intstr.FromInt(1)
labelSelector := map[string]string{randomdata.SillyName(): randomdata.SillyName()}
@@ -186,9 +219,6 @@ var _ = Describe("Termination", func() {
// Expect node to exist and be draining
ExpectNodeDraining(env.Client, node.Name)

// Expect podNoEvict to fail eviction due to PDB
// ExpectNotEvicted(env.Client, evictionQueue, podNoEvict) // TODO(etarn) reenable this after upgrading testenv apiserver

// Delete pod to simulate successful eviction
ExpectDeleted(ctx, env.Client, podNoEvict)

44 changes: 17 additions & 27 deletions pkg/controllers/termination/terminate.go
@@ -56,29 +56,23 @@ func (t *Terminator) cordon(ctx context.Context, node *v1.Node) error {
}

// drain evicts pods from the node and returns true when all pods are evicted
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
func (t *Terminator) drain(ctx context.Context, node *v1.Node) (bool, error) {
// 1. Get pods on node
// Get evictable pods
pods, err := t.getPods(ctx, node)
if err != nil {
return false, fmt.Errorf("listing pods for node, %w", err)
}

// 2. Separate pods as non-critical and critical
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
// Skip node due to do-not-evict
for _, pod := range pods {
if val := pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey]; val == "true" {
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s has do-not-evict annotation", pod.Name)
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s/%s has do-not-evict annotation", pod.Namespace, pod.Name)
return false, nil
}
}

// 4. Get and evict pods
evictable := t.getEvictablePods(pods)
if len(evictable) == 0 {
return true, nil
}
t.evict(evictable)
return false, nil
// Enqueue for eviction
t.evict(pods)
return len(pods) == 0, nil
}

// terminate calls cloud provider delete then removes the finalizer to delete the node
@@ -100,33 +94,29 @@ func (t *Terminator) terminate(ctx context.Context, node *v1.Node) error {
return nil
}

// getPods returns a list of pods scheduled to a node based on some filters
// getPods returns a list of evictable pods for the node
func (t *Terminator) getPods(ctx context.Context, node *v1.Node) ([]*v1.Pod, error) {
pods := &v1.PodList{}
if err := t.KubeClient.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
podList := &v1.PodList{}
if err := t.KubeClient.List(ctx, podList, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
return nil, fmt.Errorf("listing pods on node, %w", err)
}
return ptr.PodListToSlice(pods), nil
}

func (t *Terminator) getEvictablePods(pods []*v1.Pod) []*v1.Pod {
evictable := []*v1.Pod{}
for _, p := range pods {
pods := []*v1.Pod{}
for _, p := range podList.Items {
// Ignore if unschedulable is tolerated, since they will reschedule
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(p) == nil {
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(ptr.Pod(p)) == nil {
continue
}
// Ignore if kubelet is partitioned and pods are beyond graceful termination window
if IsStuckTerminating(p) {
if IsStuckTerminating(ptr.Pod(p)) {
continue
}
// Ignore static mirror pods
if pod.IsOwnedByNode(p) {
if pod.IsOwnedByNode(ptr.Pod(p)) {
continue
}
evictable = append(evictable, p)
pods = append(pods, ptr.Pod(p))
}
return evictable
return pods, nil
}

func (t *Terminator) evict(pods []*v1.Pod) {
18 changes: 8 additions & 10 deletions website/content/en/preview/tasks/deprovisioning.md
@@ -24,16 +24,16 @@ There are both automated and manual ways of deprovisioning nodes provisioned by
Keep in mind that a small node expiry value results in higher churn in cluster activity. So, for example, if a cluster
brings up all nodes at once, all the pods on those nodes would fall into the same batching window on expiration.
{{% /alert %}}

* **Node deleted**: You could use `kubectl` to manually remove a single Karpenter node:

```bash
# Delete a specific node
kubectl delete node $NODE_NAME

# Delete all nodes owned by any provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name

# Delete all nodes owned by a specific provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name=$PROVISIONER_NAME
```
@@ -44,7 +44,7 @@ If the Karpenter controller is removed or fails, the finalizers on the nodes are

{{% alert title="Note" color="primary" %}}
By adding the finalizer, Karpenter improves the default Kubernetes process of node deletion.
When you run `kubectl delete node` on a node without a finalizer, the node is deleted without triggering the finalization logic. The instance will continue running in EC2, even though there is no longer a node object for it.
The kubelet isn’t watching for its own existence, so if a node is deleted the kubelet doesn’t terminate itself.
All the pod objects get deleted by a garbage collection process later, because the pods’ node is gone.
{{% /alert %}}
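As an editorial illustration (not part of this PR's diff), the finalizer mechanism looks roughly like this on a Karpenter-provisioned node; the node name is hypothetical and the finalizer value is an assumption rather than something taken from this change:

```yaml
# Sketch of a Karpenter-provisioned node; the name and finalizer value are assumed.
apiVersion: v1
kind: Node
metadata:
  name: ip-192-168-1-1.us-west-2.compute.internal
  finalizers:
    - karpenter.sh/termination  # deletion waits here until draining and instance termination finish
```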
@@ -56,7 +56,7 @@ There are a few cases where requesting to deprovision a Karpenter node will fail
### Disruption budgets

Karpenter respects Pod Disruption Budgets (PDBs) by using a backoff retry eviction strategy. Pods will never be forcibly deleted, so pods that fail to shut down will prevent a node from deprovisioning.
Kubernetes PDBs let you specify how much of a Deployment, ReplicationController, ReplicaSet, or StatefulSet must be protected from disruptions when pod eviction requests are made.

PDBs can be used to strike a balance by protecting the application's availability while still allowing a cluster administrator to manage the cluster.
Here is an example where the pods matching the label `myapp` will block node termination if evicting the pod would reduce the number of available pods below 4.
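The manifest itself is collapsed in this diff view; a minimal sketch along those lines (the PDB name and the `app` label key are assumptions) would be:

```yaml
# Minimal sketch: keep at least 4 pods labeled app: myapp available during voluntary disruptions.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: myapp-pdb
spec:
  minAvailable: 4
  selector:
    matchLabels:
      app: myapp
```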
@@ -78,11 +78,9 @@ Review what [disruptions are](https://kubernetes.io/docs/concepts/workloads/pods

### Pod set to do-not-evict

If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node.
However, if a `do-not-evict` pod is added to a node while the node is draining, the remaining pods will still evict, but that pod will block termination until it is removed.
In either case, the node will be cordoned to prevent additional work from scheduling.
If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node. This annotation will have no effect for static pods, pods that tolerate `NoSchedule`, or pods terminating past their graceful termination period.

That annotation is used for pods that you want to run on one node from start to finish without interruption.
This is useful for pods that you want to run from start to finish without interruption.
Examples might include a real-time, interactive game that you don't want to interrupt or a long batch job (such as you might have with machine learning) that would need to start over if it were interrupted.
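A minimal sketch of such a pod (the pod name, container name, and image are assumptions) might look like:

```yaml
# Sketch: this pod opts out of Karpenter-driven eviction via the do-not-evict annotation.
apiVersion: v1
kind: Pod
metadata:
  name: training-job            # hypothetical name
  annotations:
    karpenter.sh/do-not-evict: "true"
spec:
  containers:
    - name: worker              # hypothetical container
      image: example.com/training:latest
```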

If you want to terminate a `do-not-evict` pod, you can simply remove the annotation and the finalizer will delete the pod and continue the node deprovisioning process.
If you want to terminate a node with a `do-not-evict` pod, you can simply remove the annotation and the deprovisioning process will continue.