Check DoNotEvict after filtering evictable pods to ensure termination can complete.
ellistarn committed Feb 8, 2022
1 parent 4c1fb8a commit bb3e47b
Showing 3 changed files with 31 additions and 44 deletions.
8 changes: 4 additions & 4 deletions pkg/controllers/termination/eviction.go
@@ -49,12 +49,12 @@ func NewEvictionQueue(ctx context.Context, coreV1Client corev1.CoreV1Interface)

coreV1Client: coreV1Client,
}
go queue.Start(ctx)
go queue.Start(logging.WithLogger(ctx, logging.FromContext(ctx).Named("eviction")))
return queue
}

// Add adds pods to the EvictionQueue
func (e *EvictionQueue) Add(pods []*v1.Pod) {
func (e *EvictionQueue) Add(ctx context.Context, pods []*v1.Pod) {
for _, pod := range pods {
if nn := client.ObjectKeyFromObject(pod); !e.Set.Contains(nn) {
e.Set.Add(nn)
@@ -92,11 +92,11 @@ func (e *EvictionQueue) evict(ctx context.Context, nn types.NamespacedName) bool
ObjectMeta: metav1.ObjectMeta{Name: nn.Name, Namespace: nn.Namespace},
})
if errors.IsInternalError(err) { // 500
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB misconfiguration error.", nn.String())
logging.FromContext(ctx).Errorf("Could not evict pod %s due to PDB misconfiguration error.", nn.String())
return false
}
if errors.IsTooManyRequests(err) { // 429
logging.FromContext(ctx).Debugf("Failed to evict pod %s due to PDB violation.", nn.String())
logging.FromContext(ctx).Debugf("Did not evict pod %s due to PDB violation.", nn.String())
return false
}
if errors.IsNotFound(err) { // 404
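Taken together, the eviction.go changes do two things: the eviction goroutine now runs with a logger named `eviction`, and a 500 from the eviction API is logged at error level instead of debug. Below is a minimal sketch of that behavior, assuming the knative `logging` package and apimachinery error helpers already used in the diff; the helper names `newEvictionContext` and `classifyEvictionError` are illustrative, not Karpenter APIs.

```go
package main

import (
	"context"
	"fmt"

	"go.uber.org/zap"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"knative.dev/pkg/logging"
)

// newEvictionContext mirrors the change to queue.Start: the eviction goroutine gets a
// context whose logger is named "eviction", so its log lines are attributable.
func newEvictionContext(ctx context.Context) context.Context {
	return logging.WithLogger(ctx, logging.FromContext(ctx).Named("eviction"))
}

// classifyEvictionError mirrors the evict() error handling in the diff:
// 500 is surfaced as an error (likely overlapping or misconfigured PDBs),
// 429 is expected PDB back-pressure and is retried, 404 means the pod is already gone.
func classifyEvictionError(ctx context.Context, err error, pod string) (evicted bool) {
	log := logging.FromContext(ctx)
	switch {
	case err == nil:
		return true
	case apierrors.IsInternalError(err): // 500
		log.Errorf("Could not evict pod %s due to PDB misconfiguration error.", pod)
		return false
	case apierrors.IsTooManyRequests(err): // 429
		log.Debugf("Did not evict pod %s due to PDB violation.", pod)
		return false
	case apierrors.IsNotFound(err): // 404
		return true
	default:
		log.Errorf("Failed to evict pod %s, %s", pod, err)
		return false
	}
}

func main() {
	ctx := logging.WithLogger(context.Background(), zap.NewExample().Sugar())
	ctx = newEvictionContext(ctx)
	fmt.Println(classifyEvictionError(ctx, nil, "default/example")) // true
}
```

A 429 here is the API server enforcing a PodDisruptionBudget, which is why it is only retried rather than escalated.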
49 changes: 19 additions & 30 deletions pkg/controllers/termination/terminate.go
@@ -29,7 +29,6 @@ import (
"github.com/aws/karpenter/pkg/utils/functional"
"github.com/aws/karpenter/pkg/utils/injectabletime"
"github.com/aws/karpenter/pkg/utils/pod"
"github.com/aws/karpenter/pkg/utils/ptr"
)

type Terminator struct {
@@ -57,28 +56,22 @@ func (t *Terminator) cordon(ctx context.Context, node *v1.Node) error {

// drain evicts pods from the node and returns true when all pods are evicted
func (t *Terminator) drain(ctx context.Context, node *v1.Node) (bool, error) {
// 1. Get pods on node
// Get evictable pods
pods, err := t.getPods(ctx, node)
if err != nil {
return false, fmt.Errorf("listing pods for node, %w", err)
}

// 2. Separate pods as non-critical and critical
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
// Skip node due to do-not-evict
for _, pod := range pods {
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
if val := pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey]; val == "true" {
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s has do-not-evict annotation", pod.Name)
logging.FromContext(ctx).Debugf("Unable to drain node, pod %s/%s has do-not-evict annotation", pod.Namespace, pod.Name)
return false, nil
}
}

// 4. Get and evict pods
evictable := t.getEvictablePods(pods)
if len(evictable) == 0 {
return true, nil
}
t.evict(evictable)
return false, nil
// Enqueue for eviction
t.evict(ctx, pods)
return len(pods) == 0, nil
}

// terminate calls cloud provider delete then removes the finalizer to delete the node
@@ -102,34 +95,30 @@ func (t *Terminator) terminate(ctx context.Context, node *v1.Node) error {

// getPods returns a list of pods scheduled to a node based on some filters
func (t *Terminator) getPods(ctx context.Context, node *v1.Node) ([]*v1.Pod, error) {
pods := &v1.PodList{}
if err := t.KubeClient.List(ctx, pods, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
podList := &v1.PodList{}
if err := t.KubeClient.List(ctx, podList, client.MatchingFields{"spec.nodeName": node.Name}); err != nil {
return nil, fmt.Errorf("listing pods on node, %w", err)
}
return ptr.PodListToSlice(pods), nil
}

func (t *Terminator) getEvictablePods(pods []*v1.Pod) []*v1.Pod {
evictable := []*v1.Pod{}
for _, p := range pods {
pods := []*v1.Pod{}
for i := range podList.Items {
// Ignore if unschedulable is tolerated, since they will reschedule
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(p) == nil {
if (v1alpha5.Taints{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}).Tolerates(&podList.Items[i]) == nil {
continue
}
// Ignore if kubelet is partitioned and pods are beyond graceful termination window
if IsStuckTerminating(p) {
if IsStuckTerminating(&podList.Items[i]) {
continue
}
// Ignore static mirror pods
if pod.IsOwnedByNode(p) {
if pod.IsOwnedByNode(&podList.Items[i]) {
continue
}
evictable = append(evictable, p)
pods = append(pods, &podList.Items[i])
}
return evictable
return pods, nil
}

func (t *Terminator) evict(pods []*v1.Pod) {
func (t *Terminator) evict(ctx context.Context, pods []*v1.Pod) {
// 1. Prioritize noncritical pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
critical := []*v1.Pod{}
nonCritical := []*v1.Pod{}
@@ -145,9 +134,9 @@ func (t *Terminator) evict(pods []*v1.Pod) {
}
// 2. Evict critical pods if all noncritical are evicted
if len(nonCritical) == 0 {
t.EvictionQueue.Add(critical)
t.EvictionQueue.Add(ctx, critical)
} else {
t.EvictionQueue.Add(nonCritical)
t.EvictionQueue.Add(ctx, nonCritical)
}
}

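The net effect of the terminate.go changes is a reordering: `getPods` now returns only evictable pods (static mirror pods, pods tolerating the unschedulable taint, and pods stuck terminating are filtered up front), the do-not-evict check then runs on that filtered list, and `drain` reports success once nothing evictable remains. A condensed sketch under those assumptions; the predicate arguments stand in for Karpenter's real helpers (`Taints.Tolerates`, `IsStuckTerminating`, `pod.IsOwnedByNode`) and are not its actual API.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

const doNotEvictAnnotation = "karpenter.sh/do-not-evict"

// evictable mirrors the filtering that moved into getPods: pods tolerating the
// unschedulable taint, pods stuck terminating, and static (node-owned) pods are
// dropped before anything else looks at the list.
func evictable(pods []v1.Pod, toleratesUnschedulable, stuckTerminating, ownedByNode func(*v1.Pod) bool) []*v1.Pod {
	out := []*v1.Pod{}
	for i := range pods {
		p := &pods[i]
		if toleratesUnschedulable(p) || stuckTerminating(p) || ownedByNode(p) {
			continue
		}
		out = append(out, p)
	}
	return out
}

// drain mirrors the new ordering: do-not-evict is only honored for pods that are
// actually evictable, and the node counts as drained once nothing evictable remains.
func drain(pods []*v1.Pod, enqueue func([]*v1.Pod)) bool {
	for _, p := range pods {
		if p.Annotations[doNotEvictAnnotation] == "true" {
			return false
		}
	}
	enqueue(pods)
	return len(pods) == 0
}

func main() {
	staticPod := v1.Pod{}
	staticPod.Annotations = map[string]string{doNotEvictAnnotation: "true"}

	filtered := evictable([]v1.Pod{staticPod},
		func(*v1.Pod) bool { return false }, // toleratesUnschedulable
		func(*v1.Pod) bool { return false }, // stuckTerminating
		func(*v1.Pod) bool { return true },  // ownedByNode: static mirror pod
	)
	// Before this commit, the do-not-evict annotation on the static pod would have
	// blocked draining indefinitely; now the pod is filtered first and the node drains.
	fmt.Println(drain(filtered, func([]*v1.Pod) {})) // true
}
```

Printing `true` for a node whose only do-not-evict pod is static is exactly the case the commit message describes: termination can now complete.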
18 changes: 8 additions & 10 deletions website/content/en/preview/tasks/deprovisioning.md
@@ -24,16 +24,16 @@ There are both automated and manual ways of deprovisioning nodes provisioned by
Keep in mind that a small NodeExpiry results in a higher churn in cluster activity. So, for example, if a cluster
brings up all nodes at once, all the pods on those nodes would fall into the same batching window on expiration.
{{% /alert %}}

* **Node deleted**: You could use `kubectl` to manually remove a single Karpenter node:

```bash
# Delete a specific node
kubectl delete node $NODE_NAME

# Delete all nodes owned by any provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name

# Delete all nodes owned by a specific provisioner
kubectl delete nodes -l karpenter.sh/provisioner-name=$PROVISIONER_NAME
```
@@ -44,7 +44,7 @@ If the Karpenter controller is removed or fails, the finalizers on the nodes are

{{% alert title="Note" color="primary" %}}
By adding the finalizer, Karpenter improves the default Kubernetes process of node deletion.
When you run `kubectl delete node` on a node without a finalizer, the node is deleted without triggering the finalization logic. The instance will continue running in EC2, even though there is no longer a node object for it.
The kubelet isn’t watching for its own existence, so if a node is deleted the kubelet doesn’t terminate itself.
All the pod objects get deleted by a garbage collection process later, because the pods’ node is gone.
{{% /alert %}}
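If you want to confirm that graceful termination will run before deleting a node object, you can inspect its finalizers. A small client-go sketch, assuming a local kubeconfig; the node name is a placeholder, and the specific finalizer value Karpenter registers is not shown here.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from the local kubeconfig (adjust the path for your environment).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	node, err := client.CoreV1().Nodes().Get(context.Background(), "my-node", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	// If no finalizers are present, `kubectl delete node` removes the object immediately
	// and the backing instance keeps running.
	fmt.Printf("finalizers on %s: %v\n", node.Name, node.Finalizers)
}
```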
Expand All @@ -56,7 +56,7 @@ There are a few cases where requesting to deprovision a Karpenter node will fail
### Disruption budgets

Karpenter respects Pod Disruption Budgets (PDBs) by using a backoff retry eviction strategy. Pods will never be forcibly deleted, so pods that fail to shut down will prevent a node from deprovisioning.
Kubernetes PDBs let you specify how much of a Deployment, ReplicationController, ReplicaSet, or StatefulSet must be protected from disruptions when pod eviction requests are made.

PDBs can be used to strike a balance by protecting the application's availability while still allowing a cluster administrator to manage the cluster.
Here is an example where the pods matching the label `myapp` will block node termination if evicting the pod would reduce the number of available pods below 4.
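The documentation's own example is YAML (collapsed in this diff view). As an equivalent illustration, here is a client-go sketch that creates a PDB keeping at least 4 pods available, assuming the pods carry the label `app: myapp`; the PDB name, namespace, and kubeconfig path are placeholders.

```go
package main

import (
	"context"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	minAvailable := intstr.FromInt(4)
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{Name: "myapp-pdb", Namespace: "default"},
		Spec: policyv1.PodDisruptionBudgetSpec{
			// Evictions that would drop ready `myapp` pods below 4 are rejected by the API server.
			MinAvailable: &minAvailable,
			Selector:     &metav1.LabelSelector{MatchLabels: map[string]string{"app": "myapp"}},
		},
	}

	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)
	if _, err := client.PolicyV1().PodDisruptionBudgets(pdb.Namespace).Create(
		context.Background(), pdb, metav1.CreateOptions{}); err != nil {
		panic(err)
	}
}
```

When an eviction would violate this budget, the API server answers with a 429, which is the `IsTooManyRequests` branch handled in eviction.go above.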
Expand All @@ -78,11 +78,9 @@ Review what [disruptions are](https://kubernetes.io/docs/concepts/workloads/pods
### Pod set to do-not-evict
If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node.
However, if a `do-not-evict` pod is added to a node while the node is draining, the remaining pods will still evict, but that pod will block termination until it is removed.
In either case, the node will be cordoned to prevent additional work from scheduling.
If a pod exists with the annotation `karpenter.sh/do-not-evict` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node. This annotation does not apply to static pods, pods that tolerate `NoSchedule`, or pods terminating past their graceful termination period.
That annotation is used for pods that you want to run on one node from start to finish without interruption.
This is useful for pods that you want to run from start to finish without interruption.
Examples might include a real-time, interactive game that you don't want to interrupt or a long batch job (such as you might have with machine learning) that would need to start over if it were interrupted.
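For reference, the annotation is set on the pod (usually through the owning workload's pod template), not on the node. A minimal sketch that prints such a manifest; the pod name and image are placeholders.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/yaml"
)

func main() {
	pod := corev1.Pod{
		TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"},
		ObjectMeta: metav1.ObjectMeta{
			Name: "training-job",
			// Karpenter will not drain a node while an evictable pod carrying this
			// annotation is running on it; remove the annotation to let the node terminate.
			Annotations: map[string]string{"karpenter.sh/do-not-evict": "true"},
		},
		Spec: corev1.PodSpec{
			RestartPolicy: corev1.RestartPolicyNever,
			Containers:    []corev1.Container{{Name: "train", Image: "example.com/train:latest"}},
		},
	}
	out, _ := yaml.Marshal(pod)
	fmt.Println(string(out)) // apply the printed manifest with kubectl if desired
}
```

Removing the annotation from the running pod lets the drain continue, as described below.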

If you want to terminate a `do-not-evict` pod, you can simply remove the annotation and the finalizer will delete the pod and continue the node deprovisioning process.
If you want to terminate a node with a `do-not-evict` pod, you can simply remove the annotation and the deprovisioning process will continue.
