From 20ac5db5be4432f8c2dfa3894fe94f76dcf95f87 Mon Sep 17 00:00:00 2001 From: William Hutcheson Date: Sun, 10 Apr 2022 18:47:14 +0100 Subject: [PATCH] Handle pod eviction errors correctly Currently any eviction error causes the draining of a node to stop and a new node to start draining. Eviction errors are common, expected occurrences, especially when PDBs are used in the cluster. Aborting a node's drain on any error slows down the entire node-draining process, because many of the pods further down the list could still be evicted successfully. This change separates recoverable from irrecoverable eviction errors and retries only the recoverable ones; irrecoverable errors fail the entire command. An important aspect of this is that the `evictPods` function now blocks until the node is drained or the process times out. This is required because the current implementation begins draining another node on the first eviction error. We would rather keep trying and eventually time out than make a bad situation worse by draining a new node. --- pkg/actions/nodegroup/drain.go | 17 +-- pkg/ctl/delete/nodegroup.go | 36 ++++--- pkg/ctl/delete/nodegroup_test.go | 2 +- pkg/ctl/drain/nodegroup.go | 40 +++---- pkg/ctl/drain/nodegroup_test.go | 2 +- pkg/drain/evictor/pod.go | 5 +- pkg/drain/nodegroup.go | 145 +++++++++++++++++-------- pkg/drain/nodegroup_test.go | 177 ++++++++++++++++++++++++++++++- 8 files changed, 326 insertions(+), 98 deletions(-) diff --git a/pkg/actions/nodegroup/drain.go b/pkg/actions/nodegroup/drain.go index 7afd04f22b..fc748a5874 100644 --- a/pkg/actions/nodegroup/drain.go +++ b/pkg/actions/nodegroup/drain.go @@ -9,19 +9,20 @@ import ( ) type DrainInput struct { - NodeGroups []eks.KubeNodeGroup - Plan bool - MaxGracePeriod time.Duration - NodeDrainWaitPeriod time.Duration - Undo bool - DisableEviction bool - Parallel int + NodeGroups []eks.KubeNodeGroup + Plan bool + MaxGracePeriod time.Duration + NodeDrainWaitPeriod time.Duration + PodEvictionWaitPeriod time.Duration + Undo bool + DisableEviction bool + Parallel int } func (m *Manager) Drain(input *DrainInput) error { if !input.Plan { for _, n := range input.NodeGroups { - nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.Undo, input.DisableEviction, input.Parallel) + nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.PodEvictionWaitPeriod, input.Undo, input.DisableEviction, input.Parallel) if err := nodeGroupDrainer.Drain(); err != nil { return err } diff --git a/pkg/ctl/delete/nodegroup.go b/pkg/ctl/delete/nodegroup.go index 19461ffe2e..67f6d80033 100644 --- a/pkg/ctl/delete/nodegroup.go +++ b/pkg/ctl/delete/nodegroup.go @@ -17,30 +17,31 @@ import ( ) func deleteNodeGroupCmd(cmd *cmdutils.Cmd) { - deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { - return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel) + deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { + return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, 
maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel) }) } -func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error) { +func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error) { cfg := api.NewClusterConfig() ng := api.NewNodeGroup() cmd.ClusterConfig = cfg var ( - updateAuthConfigMap bool - deleteNodeGroupDrain bool - onlyMissing bool - maxGracePeriod time.Duration - disableEviction bool - parallel int + updateAuthConfigMap bool + deleteNodeGroupDrain bool + onlyMissing bool + maxGracePeriod time.Duration + podEvictionWaitPeriod time.Duration + disableEviction bool + parallel int ) cmd.SetDescription("nodegroup", "Delete a nodegroup", "", "ng") cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error { cmd.NameArg = cmdutils.GetNameArg(args) - return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel) + return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel) } cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) { @@ -55,6 +56,8 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm fs.BoolVar(&deleteNodeGroupDrain, "drain", true, "Drain and cordon all nodes in the nodegroup before deletion") defaultMaxGracePeriod, _ := time.ParseDuration("10m") fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period") + defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s") + fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod") defaultDisableEviction := false fs.BoolVar(&disableEviction, "disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.") fs.IntVar(¶llel, "parallel", 1, "Number of nodes to drain in parallel. 
Max 25") @@ -67,7 +70,7 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true) } -func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { +func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { ngFilter := filter.NewNodeGroupFilter() if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil { @@ -128,11 +131,12 @@ func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap cmdutils.LogIntendedAction(cmd.Plan, "drain %d nodegroup(s) in cluster %q", len(allNodeGroups), cfg.Metadata.Name) drainInput := &nodegroup.DrainInput{ - NodeGroups: allNodeGroups, - Plan: cmd.Plan, - MaxGracePeriod: maxGracePeriod, - DisableEviction: disableEviction, - Parallel: parallel, + NodeGroups: allNodeGroups, + Plan: cmd.Plan, + MaxGracePeriod: maxGracePeriod, + PodEvictionWaitPeriod: podEvictionWaitPeriod, + DisableEviction: disableEviction, + Parallel: parallel, } err := nodeGroupManager.Drain(drainInput) if err != nil { diff --git a/pkg/ctl/delete/nodegroup_test.go b/pkg/ctl/delete/nodegroup_test.go index b46ec18019..fb42d8b3b5 100644 --- a/pkg/ctl/delete/nodegroup_test.go +++ b/pkg/ctl/delete/nodegroup_test.go @@ -22,7 +22,7 @@ var _ = Describe("delete", func() { cmd := newMockEmptyCmd(args...) count := 0 cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) { - deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { + deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName")) Expect(ng.Name).To(Equal("ng")) count++ diff --git a/pkg/ctl/drain/nodegroup.go b/pkg/ctl/drain/nodegroup.go index 87e719be82..d3ad4e5f70 100644 --- a/pkg/ctl/drain/nodegroup.go +++ b/pkg/ctl/drain/nodegroup.go @@ -15,30 +15,31 @@ import ( ) func drainNodeGroupCmd(cmd *cmdutils.Cmd) { - drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { - return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel) + drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { + return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel) }) } -func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error) { +func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd 
*cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error) { cfg := api.NewClusterConfig() ng := api.NewNodeGroup() cmd.ClusterConfig = cfg var ( - undo bool - onlyMissing bool - disableEviction bool - parallel int - maxGracePeriod time.Duration - nodeDrainWaitPeriod time.Duration + undo bool + onlyMissing bool + disableEviction bool + parallel int + maxGracePeriod time.Duration + nodeDrainWaitPeriod time.Duration + podEvictionWaitPeriod time.Duration ) cmd.SetDescription("nodegroup", "Cordon and drain a nodegroup", "", "ng") cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error { cmd.NameArg = cmdutils.GetNameArg(args) - return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel) + return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel) } cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) { @@ -52,6 +53,8 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd fs.BoolVar(&undo, "undo", false, "Uncordon the nodegroup") defaultMaxGracePeriod, _ := time.ParseDuration("10m") fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period") + defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s") + fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod") defaultDisableEviction := false fs.BoolVar(&disableEviction, "disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.") cmdutils.AddTimeoutFlag(fs, &cmd.ProviderConfig.WaitTimeout) @@ -62,7 +65,7 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true) } -func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { +func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { ngFilter := filter.NewNodeGroupFilter() if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil { @@ -125,13 +128,14 @@ func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bo allNodeGroups := cmdutils.ToKubeNodeGroups(cfg) drainInput := &nodegroup.DrainInput{ - NodeGroups: allNodeGroups, - Plan: cmd.Plan, - MaxGracePeriod: maxGracePeriod, - NodeDrainWaitPeriod: nodeDrainWaitPeriod, - Undo: undo, - DisableEviction: disableEviction, - Parallel: parallel, + NodeGroups: allNodeGroups, + Plan: cmd.Plan, + MaxGracePeriod: maxGracePeriod, + NodeDrainWaitPeriod: nodeDrainWaitPeriod, + PodEvictionWaitPeriod: podEvictionWaitPeriod, + Undo: undo, + DisableEviction: disableEviction, + Parallel: parallel, } return nodegroup.New(cfg, ctl, clientSet).Drain(drainInput) } diff --git a/pkg/ctl/drain/nodegroup_test.go b/pkg/ctl/drain/nodegroup_test.go index 103c160750..463c36e443 100644 --- a/pkg/ctl/drain/nodegroup_test.go +++ b/pkg/ctl/drain/nodegroup_test.go @@ -24,7 +24,7 @@ var _ = Describe("drain node group", func() { cmd 
:= newMockEmptyCmd(args...) count := 0 cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) { - drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { + drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName")) Expect(ng.Name).To(Equal("ng")) count++ diff --git a/pkg/drain/evictor/pod.go b/pkg/drain/evictor/pod.go index 031dad10c3..fcd517eef2 100644 --- a/pkg/drain/evictor/pod.go +++ b/pkg/drain/evictor/pod.go @@ -29,7 +29,6 @@ import ( const ( daemonSetFatal = "DaemonSet-managed Pods (use --ignore-daemonsets to ignore)" - daemonSetWarning = "ignoring DaemonSet-managed Pods" localStorageFatal = "Pods with local storage (use --delete-local-data to override)" localStorageWarning = "deleting Pods with local storage" unmanagedFatal = "Pods not managed by ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet (use --force to override)" @@ -204,7 +203,7 @@ func (d *Evictor) daemonSetFilter(pod corev1.Pod) PodDeleteStatus { if controllerRef.Name == ignoreDaemonSet.Name { switch ignoreDaemonSet.Namespace { case pod.Namespace, metav1.NamespaceAll: - return makePodDeleteStatusWithWarning(false, daemonSetWarning) + return makePodDeleteStatusSkip() } } } @@ -213,7 +212,7 @@ func (d *Evictor) daemonSetFilter(pod corev1.Pod) PodDeleteStatus { return makePodDeleteStatusWithError(daemonSetFatal) } - return makePodDeleteStatusWithWarning(false, daemonSetWarning) + return makePodDeleteStatusSkip() } func (d *Evictor) mirrorPodFilter(pod corev1.Pod) PodDeleteStatus { diff --git a/pkg/drain/nodegroup.go b/pkg/drain/nodegroup.go index 35f1b5b380..49eb28a54f 100644 --- a/pkg/drain/nodegroup.go +++ b/pkg/drain/nodegroup.go @@ -3,8 +3,10 @@ package drain import ( "context" "fmt" + "strings" "time" + "golang.org/x/sync/errgroup" "golang.org/x/sync/semaphore" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -27,6 +29,13 @@ import ( // retryDelay is how long is slept before retry after an error occurs during drainage const retryDelay = 5 * time.Second +var recoverablePodEvictionErrors = [...]string{ + "Cannot evict pod as it would violate the pod's disruption budget", + "TooManyRequests", + "NotFound", + "not found", +} + //go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 -generate //counterfeiter:generate -o fakes/fake_evictor.go . 
Evictor type Evictor interface { @@ -36,16 +45,17 @@ type Evictor interface { } type NodeGroupDrainer struct { - clientSet kubernetes.Interface - evictor Evictor - ng eks.KubeNodeGroup - waitTimeout time.Duration - nodeDrainWaitPeriod time.Duration - undo bool - parallel int + clientSet kubernetes.Interface + evictor Evictor + ng eks.KubeNodeGroup + waitTimeout time.Duration + nodeDrainWaitPeriod time.Duration + podEvictionWaitPeriod time.Duration + undo bool + parallel int } -func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, waitTimeout, maxGracePeriod, nodeDrainWaitPeriod time.Duration, undo, disableEviction bool, parallel int) NodeGroupDrainer { +func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, waitTimeout, maxGracePeriod, nodeDrainWaitPeriod time.Duration, podEvictionWaitPeriod time.Duration, undo, disableEviction bool, parallel int) NodeGroupDrainer { ignoreDaemonSets := []metav1.ObjectMeta{ { Namespace: "kube-system", @@ -73,13 +83,14 @@ func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, w } return NodeGroupDrainer{ - evictor: evictor.New(clientSet, maxGracePeriod, ignoreDaemonSets, disableEviction), - clientSet: clientSet, - ng: ng, - waitTimeout: waitTimeout, - nodeDrainWaitPeriod: nodeDrainWaitPeriod, - undo: undo, - parallel: parallel, + evictor: evictor.New(clientSet, maxGracePeriod, ignoreDaemonSets, disableEviction), + clientSet: clientSet, + ng: ng, + waitTimeout: waitTimeout, + nodeDrainWaitPeriod: nodeDrainWaitPeriod, + podEvictionWaitPeriod: podEvictionWaitPeriod, + undo: undo, + parallel: parallel, } } @@ -112,6 +123,7 @@ func (n *NodeGroupDrainer) Drain() error { parallelLimit := int64(n.parallel) sem := semaphore.NewWeighted(parallelLimit) logger.Info("starting parallel draining, max in-flight of %d", parallelLimit) + var evictErr error // loop until all nodes are drained to handle accidental scale-up // or any other changes in the ASG for { @@ -121,6 +133,9 @@ func (n *NodeGroupDrainer) Drain() error { waitForAllRoutinesToFinish(context.TODO(), sem, parallelLimit) return fmt.Errorf("timed out (after %s) waiting for nodegroup %q to be drained", n.waitTimeout, n.ng.NameString()) default: + if evictErr != nil { + return evictErr + } nodes, err := n.clientSet.CoreV1().Nodes().List(context.TODO(), listOptions) if err != nil { return err @@ -143,32 +158,37 @@ func (n *NodeGroupDrainer) Drain() error { logger.Debug("already drained: %v", mapToList(drainedNodes.Items())) logger.Debug("will drain: %v", newPendingNodes.List()) - for i, node := range newPendingNodes.List() { - if err := sem.Acquire(ctx, 1); err != nil { - logger.Critical("failed to acquire semaphore: %w", err) - } - go func(i int, node string) { + errorGroup, _ := errgroup.WithContext(ctx) + for _, node := range newPendingNodes.List() { + node := node + errorGroup.Go(func() error { + if err := sem.Acquire(ctx, 1); err != nil { + return errors.Wrapf(err, "failed to acquire semaphore") + } defer sem.Release(1) + + drainedNodes.Set(node, nil) logger.Debug("starting drain of node %s", node) - pending, err := n.evictPods(node) + err := n.evictPods(ctx, node) if err != nil { logger.Warning("pod eviction error (%q) on node %s", err, node) time.Sleep(retryDelay) - return + return err } - logger.Debug("%d pods to be evicted from %s", pending, node) - if pending == 0 { - drainedNodes.Set(node, nil) - } + drainedNodes.Set(node, nil) if n.nodeDrainWaitPeriod > 0 { logger.Debug("waiting for %.0f seconds before draining next node", 
n.nodeDrainWaitPeriod.Seconds()) time.Sleep(n.nodeDrainWaitPeriod) } - }(i, node) + return nil + }) } + // We need to loop even if this is an error to check whether the error was a + // context timeout or something else. This lets us log timeout errors consistently + evictErr = errorGroup.Wait() } } } @@ -204,29 +224,53 @@ func (n *NodeGroupDrainer) toggleCordon(cordon bool, nodes *corev1.NodeList) { logger.Debug("no need to %s node %q", cordonStatus(cordon), node.Name) } } - } -func (n *NodeGroupDrainer) evictPods(node string) (int, error) { - list, errs := n.evictor.GetPodsForEviction(node) - if len(errs) > 0 { - return 0, fmt.Errorf("errs: %v", errs) // TODO: improve formatting - } - if list == nil { - return 0, nil - } - if w := list.Warnings(); w != "" { - logger.Warning(w) - } - pods := list.Pods() - pending := len(pods) - for _, pod := range pods { - // TODO: handle API rate limiter error - if err := n.evictor.EvictOrDeletePod(pod); err != nil { - return pending, errors.Wrapf(err, "error evicting pod: %s/%s", pod.Namespace, pod.Name) +func (n *NodeGroupDrainer) evictPods(ctx context.Context, node string) error { + // Loop until context times out. We want to continually try to remove pods + // from the node as their eviction status changes. + previousReportTime := time.Now() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timed out (after %s) waiting for node %q to be drained", n.waitTimeout, node) + default: + list, errs := n.evictor.GetPodsForEviction(node) + if len(errs) > 0 { + return fmt.Errorf("errs: %v", errs) // TODO: improve formatting + } + if list == nil { + return nil + } + pods := list.Pods() + if len(pods) == 0 { + return nil + } + if w := list.Warnings(); w != "" { + logger.Warning(w) + } + // This log message is important but can be noisy. It's useful to only + // update on a node every minute. + if time.Now().Sub(previousReportTime) > time.Minute*1 && len(pods) > 0 { + logger.Warning("%d pods are unevictable from node %s", len(pods), node) + previousReportTime = time.Now() + } + logger.Debug("%d pods to be evicted from %s", len(pods), node) + failedEvictions := false + for _, pod := range pods { + if err := n.evictor.EvictOrDeletePod(pod); err != nil { + if !isEvictionErrorRecoverable(err) { + return errors.Wrapf(err, "unrecoverable error evicting pod: %s/%s", pod.Namespace, pod.Name) + } + logger.Debug("pod eviction failed recoverably: %q", err) + failedEvictions = true + } + } + if failedEvictions { + time.Sleep(n.podEvictionWaitPeriod) + } } } - return pending, nil } func cordonStatus(desired bool) string { @@ -235,3 +279,12 @@ func cordonStatus(desired bool) string { } return "uncordon" } + +func isEvictionErrorRecoverable(err error) bool { + for _, recoverableError := range recoverablePodEvictionErrors { + if strings.Contains(err.Error(), recoverableError) { + return true + } + } + return false +} diff --git a/pkg/drain/nodegroup_test.go b/pkg/drain/nodegroup_test.go index 32520ede9e..42863a6154 100644 --- a/pkg/drain/nodegroup_test.go +++ b/pkg/drain/nodegroup_test.go @@ -2,6 +2,7 @@ package drain_test import ( "context" + "errors" "fmt" "time" @@ -16,6 +17,7 @@ import ( . "github.com/onsi/gomega" . 
"github.com/onsi/ginkgo" + "github.com/weaveworks/eksctl/pkg/drain" "github.com/weaveworks/eksctl/pkg/eks/mocks" "k8s.io/client-go/kubernetes/fake" @@ -73,7 +75,7 @@ var _ = Describe("Drain", func() { }) It("does not error", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second*10, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second*10, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -117,7 +119,7 @@ var _ = Describe("Drain", func() { }) It("times out and errors", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*2, time.Second*0, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*2, time.Second*0, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -131,7 +133,7 @@ var _ = Describe("Drain", func() { }) It("errors", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second, time.Second, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second, time.Second, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -175,7 +177,7 @@ var _ = Describe("Drain", func() { }) It("does not error", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, false, true, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, true, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -205,7 +207,7 @@ var _ = Describe("Drain", func() { }) It("uncordons all the nodes", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, true, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, true, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -219,4 +221,169 @@ var _ = Describe("Drain", func() { Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(BeZero()) }) }) + + When("an eviction fails recoverably", func() { + var pod corev1.Pod + + BeforeEach(func() { + pod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + }, + } + + for i := 0; i < 2; i++ { + fakeEvictor.GetPodsForEvictionReturnsOnCall(i, &evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pod, + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + } + fakeEvictor.GetPodsForEvictionReturnsOnCall(2, nil, nil) + + fakeEvictor.EvictOrDeletePodReturnsOnCall(0, errors.New("Cannot evict pod as it would violate the pod's disruption budget")) + fakeEvictor.EvictOrDeletePodReturnsOnCall(1, nil) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("does not error", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, 
time.Second*0, false, false, 1) + nodeGroupDrainer.SetDrainer(fakeEvictor) + + err := nodeGroupDrainer.Drain() + Expect(err).NotTo(HaveOccurred()) + + Expect(fakeEvictor.GetPodsForEvictionCallCount()).To(Equal(3)) + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(Equal(2)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pod)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(1)).To(Equal(pod)) + }) + }) + + When("an eviction fails irrecoverably", func() { + var pod corev1.Pod + var evictionError error + + BeforeEach(func() { + pod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + Namespace: "ns-1", + }, + } + + fakeEvictor.GetPodsForEvictionReturns(&evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pod, + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + + evictionError = errors.New("error1") + fakeEvictor.EvictOrDeletePodReturns(evictionError) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("returns an error", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, false, 1) + nodeGroupDrainer.SetDrainer(fakeEvictor) + + err := nodeGroupDrainer.Drain() + Expect(err.Error()).To(ContainSubstring("unrecoverable error evicting pod: ns-1/pod-1")) + + Expect(fakeEvictor.GetPodsForEvictionCallCount()).To(Equal(1)) + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(Equal(1)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pod)) + }) + }) + + When("eviction fails recoverably with multiple pods", func() { + var pods []corev1.Pod + var evictionError error + + BeforeEach(func() { + pods = []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + Namespace: "ns-1", + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-2", + Namespace: "ns-2", + }, + }, + } + + fakeEvictor.GetPodsForEvictionReturns(&evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pods[0], + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + { + Pod: pods[1], + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + + evictionError = errors.New("Cannot evict pod as it would violate the pod's disruption budget") + fakeEvictor.EvictOrDeletePodReturns(evictionError) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("attempts to drain all pods", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, false, 1) + nodeGroupDrainer.SetDrainer(fakeEvictor) + + _ = nodeGroupDrainer.Drain() + + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(BeNumerically(">=", 2)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pods[0])) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(1)).To(Equal(pods[1])) + }) + }) })
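For reviewers who want to try the classification logic outside a cluster, here is a minimal, self-contained sketch of the substring-based retry behaviour this patch introduces. It is illustrative only: the `evictOnce` stub, the shortened wait period, and the printed messages are assumptions made for the example, not part of the patch, but the flow mirrors `evictPods` — recoverable failures (PDB violations, API throttling) are retried after `pod-eviction-wait-period`, while anything else aborts immediately.

```go
package main

import (
	"errors"
	"fmt"
	"strings"
	"time"
)

// Mirrors the recoverable patterns added in pkg/drain/nodegroup.go.
var recoverablePodEvictionErrors = []string{
	"Cannot evict pod as it would violate the pod's disruption budget",
	"TooManyRequests",
	"NotFound",
	"not found",
}

func isEvictionErrorRecoverable(err error) bool {
	for _, recoverable := range recoverablePodEvictionErrors {
		if strings.Contains(err.Error(), recoverable) {
			return true
		}
	}
	return false
}

// evictOnce is a hypothetical stand-in for Evictor.EvictOrDeletePod: it fails
// with a PDB violation on the first two attempts and then succeeds.
func evictOnce(attempt int) error {
	if attempt < 2 {
		return errors.New("Cannot evict pod as it would violate the pod's disruption budget")
	}
	return nil
}

func main() {
	// Shortened for the example; the flag default in this patch is 10s.
	const podEvictionWaitPeriod = 100 * time.Millisecond

	for attempt := 0; ; attempt++ {
		err := evictOnce(attempt)
		if err == nil {
			fmt.Println("pod evicted after", attempt+1, "attempt(s)")
			return
		}
		if !isEvictionErrorRecoverable(err) {
			fmt.Println("unrecoverable error, aborting drain:", err)
			return
		}
		fmt.Println("recoverable eviction error, retrying:", err)
		time.Sleep(podEvictionWaitPeriod)
	}
}
```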