Check DoNotEvict after filtering evictable pods to ensure termination can complete.
ellistarn committed Feb 8, 2022
1 parent 4c1fb8a commit bb3e47b
Showing 3 changed files with 31 additions and 44 deletions.
8 changes: 4 additions & 4 deletions pkg/controllers/termination/eviction.go
@@ -49,12 +49,12 @@ func NewEvictionQueue(ctx context.Context, coreV1Client corev1.CoreV1Interface)

coreV1Client: coreV1Client,
}
go queue.Start(ctx)
go queue.Start(logging.WithLogger(ctx, logging.FromContext(ctx).Named("eviction")))
return queue
}

// Add adds pods to the EvictionQueue
func (e *EvictionQueue) Add(pods []*v1.Pod) {
func (e *EvictionQueue) Add(ctx context.Context, pods []*v1.Pod) {
for _, pod := range pods {
if nn := client.ObjectKeyFromObject(pod); !e.Set.Contains(nn) {
e.Set.Add(nn)
@@ -92,11 +92,11 @@ func (e *EvictionQueue) evict(ctx context.Context, nn types.NamespacedName) bool
ObjectMeta: metav1.ObjectMeta{Name: nn.Name, Namespace: nn.Namespace},
})
if errors.IsInternalError(err) { // 500
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB misconfiguration error.", nn.String())
logging.FromContext(ctx).Errorf("Could not evict pod %s due to PDB misconfiguration error.", nn.String())
return false
}
if errors.IsTooManyRequests(err) { // 429
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB violation.", nn.String())
logging.FromContext(ctx).Debugf("Did not evict pod %s due to PDB violation.", nn.String())
return false
}
if errors.IsNotFound(err) { // 404
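Taken together, the eviction.go changes do two things: the eviction goroutine now runs with a logger named `eviction`, and a 500 from the eviction API is logged at error level instead of debug. Below is a minimal sketch of that behavior, assuming the knative `logging` package and apimachinery error helpers already used in the diff; the helper names `newEvictionContext` and `classifyEvictionError` are illustrative, not Karpenter APIs.

```go
package main

import (
	"context"
	"fmt"

	"go.uber.org/zap"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"knative.dev/pkg/logging"
)

// newEvictionContext mirrors the change to queue.Start: the eviction goroutine gets a
// context whose logger is named "eviction", so its log lines are attributable.
func newEvictionContext(ctx context.Context) context.Context {
	return logging.WithLogger(ctx, logging.FromContext(ctx).Named("eviction"))
}

// classifyEvictionError mirrors the evict() error handling in the diff:
// 500 is surfaced as an error (likely overlapping or misconfigured PDBs),
// 429 is expected PDB back-pressure and is retried, 404 means the pod is already gone.
func classifyEvictionError(ctx context.Context, err error, pod string) (evicted bool) {
	log := logging.FromContext(ctx)
	switch {
	case err == nil:
		return true
	case apierrors.IsInternalError(err): // 500
		log.Errorf("Could not evict pod %s due to PDB misconfiguration error.", pod)
		return false
	case apierrors.IsTooManyRequests(err): // 429
		log.Debugf("Did not evict pod %s due to PDB violation.", pod)
		return false
	case apierrors.IsNotFound(err): // 404
		return true
	default:
		log.Errorf("Failed to evict pod %s, %s", pod, err)
		return false
	}
}

func main() {
	ctx := logging.WithLogger(context.Background(), zap.NewExample().Sugar())
	ctx = newEvictionContext(ctx)
	fmt.Println(classifyEvictionError(ctx, nil, "default/example")) // true
}
```

A 429 here is the API server enforcing a PodDisruptionBudget, which is why it is only retried rather than escalated.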
49 changes: 19 additions & 30 deletions pkg/controllers/termination/terminate.go
@@ -29,7 +29,6 @@ import (
"github.com/aws/karpenter/pkg/utils/functional"
"github.com/aws/karpenter/pkg/utils/injectabletime"
"github.com/aws/karpenter/pkg/utils/pod"
"github.com/aws/karpenter/pkg/utils/ptr"
)

type Terminator struct {
@@ -57,28 +56,22 @@ func (t *Terminator) cordon(ctx context.Context, node *v1.Node) error {

// drain evicts pods from the node and returns true when all pods are evicted
func (t *Terminator) drain(ctx context.Context, node *v1.Node) (bool, error) {
// 1. Get pods on node
// Get evictable pods
pods, err := t.getPods(ctx, node)
if err != nil {
return false, fmt.Errorf("listing pods for node, %w", err)
}

// 2. Separate pods as non-critical and critical
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
// Skip node due to do-not-evict
for _, pod := range pods {
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
if val := pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey]; val == "true" {
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s has do-not-evict annotation", pod.Name)
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s/%s has do-not-evict annotation", pod.Namespace, pod.Name)
return false, nil
}
}

// 4. Get and evict pods
evictable := t.getEvictablePods(pods)
if len(evictable) == 0 {
return true, nil
}
t.evict(evictable)
return false, nil
// Enqueue for eviction
t.evict(ctx, pods)
return len(pods) == 0, nil
}

// terminate calls cloud provider delete then removes the finalizer to delete the node
@@ -102,34 +95,30 @@ func (t *Terminator) terminate(ctx context.Context, node *v1.Node) error {

// getPods returns a list of pods scheduled to a node based on some filters
func (t *Terminator) getPods(ctx context.Context, node *v1.Node) ([]*v1.Pod, error) {
pods := &v1.PodList{}
if err := t.KubeClient.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
podList := &v1.PodList{}
if err := t.KubeClient.List(ctx, podList, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
return nil, fmt.Errorf("listing pods on node, %w", err)
}
return ptr.PodListToSlice(pods), nil
}

func (t *Terminator) getEvictablePods(pods []*v1.Pod) []*v1.Pod {
evictable := []*v1.Pod{}
for _, p := range pods {
pods := []*v1.Pod{}
for i := range podList.Items {
// Ignore if unschedulable is tolerated, since they will reschedule
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(p) == nil {
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(&podList.Items[i]) == nil {
continue
}
// Ignore if kubelet is partitioned and pods are beyond graceful termination window
if IsStuckTerminating(p) {
if IsStuckTerminating(&podList.Items[i]) {
continue
}
// Ignore static mirror pods
if pod.IsOwnedByNode(p) {
if pod.IsOwnedByNode(&podList.Items[i]) {
continue
}
evictable = append(evictable, p)
pods = append(pods, &podList.Items[i])
}
return evictable
return pods, nil
}

func (t *Terminator) evict(pods []*v1.Pod) {
func (t *Terminator) evict(ctx context.Context, pods []*v1.Pod) {
// 1. Prioritize noncritical pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
critical := []*v1.Pod{}
nonCritical := []*v1.Pod{}
@@ -145,9 +134,9 @@ func (t *Terminator) evict(pods []*v1.Pod) {
}
// 2. Evict critical pods if all noncritical are evicted
if len(nonCritical) == 0 {
t.EvictionQueue.Add(critical)
t.EvictionQueue.Add(ctx, critical)
} else {
t.EvictionQueue.Add(nonCritical)
t.EvictionQueue.Add(ctx, nonCritical)
}
}

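The net effect of the terminate.go changes is a reordering: `getPods` now returns only evictable pods (static mirror pods, pods tolerating the unschedulable taint, and pods stuck terminating are filtered up front), the do-not-evict check then runs on that filtered list, and `drain` reports success once nothing evictable remains. A condensed sketch under those assumptions; the predicate arguments stand in for Karpenter's real helpers (`Taints.Tolerates`, `IsStuckTerminating`, `pod.IsOwnedByNode`) and are not its actual API.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

const doNotEvictAnnotation = "karpenter.sh/do-not-evict"

// evictable mirrors the filtering that moved into getPods: pods tolerating the
// unschedulable taint, pods stuck terminating, and static (node-owned) pods are
// dropped before anything else looks at the list.
func evictable(pods []v1.Pod, toleratesUnschedulable, stuckTerminating, ownedByNode func(*v1.Pod) bool) []*v1.Pod {
	out := []*v1.Pod{}
	for i := range pods {
		p := &pods[i]
		if toleratesUnschedulable(p) || stuckTerminating(p) || ownedByNode(p) {
			continue
		}
		out = append(out, p)
	}
	return out
}

// drain mirrors the new ordering: do-not-evict is only honored for pods that are
// actually evictable, and the node counts as drained once nothing evictable remains.
func drain(pods []*v1.Pod, enqueue func([]*v1.Pod)) bool {
	for _, p := range pods {
		if p.Annotations[doNotEvictAnnotation] == "true" {
			return false
		}
	}
	enqueue(pods)
	return len(pods) == 0
}

func main() {
	staticPod := v1.Pod{}
	staticPod.Annotations = map[string]string{doNotEvictAnnotation: "true"}

	filtered := evictable([]v1.Pod{staticPod},
		func(*v1.Pod) bool { return false }, // toleratesUnschedulable
		func(*v1.Pod) bool { return false }, // stuckTerminating
		func(*v1.Pod) bool { return true },  // ownedByNode: static mirror pod
	)
	// Before this commit, the do-not-evict annotation on the static pod would have
	// blocked draining indefinitely; now the pod is filtered first and the node drains.
	fmt.Println(drain(filtered, func([]*v1.Pod) {})) // true
}
```

Printing `true` for a node whose only do-not-evict pod is static is exactly the case the commit message describes: termination can now complete.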
18 changes: 8 additions & 10 deletions website/content/en/preview/tasks/deprovisioning.md
@@ -24,16 +24,16 @@ There are both automated and manual ways of deprovisioning nodes provisioned by
Keep in mind that a small NodeExpiry results in a higher churn in cluster activity. So, for example, if a cluster
brings up all nodes at once, all the pods on those nodes would fall into the same batching window on expiration.
{{% /alert %}}

* **Node deleted**: You could use `kubectl` to manually remove a single Karpenter node:

```bash
# Delete a specific node
kubectl delete node $NODE_NAME

# Delete all nodes owned by any provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name

# Delete all nodes owned by a specific provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name=$PROVISIONER_NAME
```
@@ -44,7 +44,7 @@ If the Karpenter controller is removed or fails, the finalizers on the nodes are

{{% alert title="Note" color="primary" %}}
By adding the finalizer, Karpenter improves the default Kubernetes process of node deletion.
When you run `kubectl delete node` on a node without a finalizer, the node is deleted without triggering the finalization logic. The instance will continue running in EC2, even though there is no longer a node object for it.
The kubelet isn’t watching for its own existence, so if a node is deleted the kubelet doesn’t terminate itself.
All the pod objects get deleted by a garbage collection process later, because the pods’ node is gone.
{{% /alert %}}
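If you want to confirm that graceful termination will run before deleting a node object, you can inspect its finalizers. A small client-go sketch, assuming a local kubeconfig; the node name is a placeholder, and the specific finalizer value Karpenter registers is not shown here.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from the local kubeconfig (adjust the path for your environment).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	node, err := client.CoreV1().Nodes().Get(context.Background(), "my-node", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	// If no finalizers are present, `kubectl delete node` removes the object immediately
	// and the backing instance keeps running.
	fmt.Printf("finalizers on %s: %v\n", node.Name, node.Finalizers)
}
```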
Expand All @@ -56,7 +56,7 @@ There are a few cases where requesting to deprovision a Karpenter node will fail
### Disruption budgets

Karpenter respects Pod Disruption Budgets (PDBs) by using a backoff retry eviction strategy. Pods will never be forcibly deleted, so pods that fail to shut down will prevent a node from deprovisioning.
Kubernetes PDBs let you specify how much of a Deployment, ReplicationController, ReplicaSet, or StatefulSet must be protected from disruptions when pod eviction requests are made.

PDBs can be used to strike a balance by protecting the application's availability while still allowing a cluster administrator to manage the cluster.
Here is an example where the pods matching the label `myapp` will block node termination if evicting the pod would reduce the number of available pods below 4.
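The documentation's own example is YAML (collapsed in this diff view). As an equivalent illustration, here is a client-go sketch that creates a PDB keeping at least 4 pods available, assuming the pods carry the label `app: myapp`; the PDB name, namespace, and kubeconfig path are placeholders.

```go
package main

import (
	"context"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	minAvailable := intstr.FromInt(4)
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{Name: "myapp-pdb", Namespace: "default"},
		Spec: policyv1.PodDisruptionBudgetSpec{
			// Evictions that would drop ready `myapp` pods below 4 are rejected by the API server.
			MinAvailable: &minAvailable,
			Selector:     &metav1.LabelSelector{MatchLabels: map[string]string{"app": "myapp"}},
		},
	}

	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)
	if _, err := client.PolicyV1().PodDisruptionBudgets(pdb.Namespace).Create(
		context.Background(), pdb, metav1.CreateOptions{}); err != nil {
		panic(err)
	}
}
```

When an eviction would violate this budget, the API server answers with a 429, which is the `IsTooManyRequests` branch handled in eviction.go above.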
Expand All @@ -78,11 +78,9 @@ Review what [disruptions are](https://kubernetes.io/docs/concepts/workloads/pods
### Pod set to do-not-evict
If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node.
However, if a `do-not-evict` pod is added to a node while the node is draining, the remaining pods will still evict, but that pod will block termination until it is removed.
In either case, the node will be cordoned to prevent additional work from scheduling.
If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node. This annotation does not apply to static pods, pods that tolerate `NoSchedule`, or pods terminating past their graceful termination period.
That annotation is used for pods that you want to run on one node from start to finish without interruption.
This is useful for pods that you want to run from start to finish without interruption.
Examples might include a real-time, interactive game that you don't want to interrupt or a long batch job (such as you might have with machine learning) that would need to start over if it were interrupted.
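For reference, the annotation is set on the pod (usually through the owning workload's pod template), not on the node. A minimal sketch that prints such a manifest; the pod name and image are placeholders.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/yaml"
)

func main() {
	pod := corev1.Pod{
		TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"},
		ObjectMeta: metav1.ObjectMeta{
			Name: "training-job",
			// Karpenter will not drain a node while an evictable pod carrying this
			// annotation is running on it; remove the annotation to let the node terminate.
			Annotations: map[string]string{"karpenter.sh/do-not-evict": "true"},
		},
		Spec: corev1.PodSpec{
			RestartPolicy: corev1.RestartPolicyNever,
			Containers:    []corev1.Container{{Name: "train", Image: "example.com/train:latest"}},
		},
	}
	out, _ := yaml.Marshal(pod)
	fmt.Println(string(out)) // apply the printed manifest with kubectl if desired
}
```

Removing the annotation from the running pod lets the drain continue, as described below.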

If you want to terminate a `do-not-evict` pod, you can simply remove the annotation and the finalizer will delete the pod and continue the node deprovisioning process.
If you want to terminate a node with a `do-not-evict` pod, you can simply remove the annotation and the deprovisioning process will continue.
