diff --git a/pkg/actions/cluster/cluster.go b/pkg/actions/cluster/cluster.go index 6cc87011e5..a8f0d405c4 100644 --- a/pkg/actions/cluster/cluster.go +++ b/pkg/actions/cluster/cluster.go @@ -17,7 +17,7 @@ import ( type Cluster interface { Upgrade(ctx context.Context, dryRun bool) error - Delete(ctx context.Context, waitInterval time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error + Delete(ctx context.Context, waitInterval, podEvictionWaitPeriod time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error } func New(ctx context.Context, cfg *api.ClusterConfig, ctl *eks.ClusterProvider) (Cluster, error) { diff --git a/pkg/actions/cluster/delete.go b/pkg/actions/cluster/delete.go index 755ae53dca..30bc090991 100644 --- a/pkg/actions/cluster/delete.go +++ b/pkg/actions/cluster/delete.go @@ -166,7 +166,7 @@ func checkForUndeletedStacks(ctx context.Context, stackManager manager.StackMana } func drainAllNodeGroups(cfg *api.ClusterConfig, ctl *eks.ClusterProvider, clientSet kubernetes.Interface, allStacks []manager.NodeGroupStack, - disableEviction bool, parallel int, nodeGroupDrainer NodeGroupDrainer, vpcCniDeleter vpcCniDeleter) error { + disableEviction bool, parallel int, nodeGroupDrainer NodeGroupDrainer, vpcCniDeleter vpcCniDeleter, podEvictionWaitPeriod time.Duration) error { if len(allStacks) == 0 { return nil } @@ -181,10 +181,11 @@ func drainAllNodeGroups(cfg *api.ClusterConfig, ctl *eks.ClusterProvider, client logger.Info("will drain %d unmanaged nodegroup(s) in cluster %q", len(cfg.NodeGroups), cfg.Metadata.Name) drainInput := &nodegroup.DrainInput{ - NodeGroups: cmdutils.ToKubeNodeGroups(cfg), - MaxGracePeriod: ctl.Provider.WaitTimeout(), - DisableEviction: disableEviction, - Parallel: parallel, + NodeGroups: cmdutils.ToKubeNodeGroups(cfg), + MaxGracePeriod: ctl.Provider.WaitTimeout(), + DisableEviction: disableEviction, + PodEvictionWaitPeriod: podEvictionWaitPeriod, + Parallel: parallel, } if err := nodeGroupDrainer.Drain(drainInput); err != nil { return err diff --git a/pkg/actions/cluster/delete_test.go b/pkg/actions/cluster/delete_test.go index 88b752931a..7ca5b5bd77 100644 --- a/pkg/actions/cluster/delete_test.go +++ b/pkg/actions/cluster/delete_test.go @@ -1,6 +1,8 @@ package cluster_test import ( + "time" + "github.com/weaveworks/eksctl/pkg/actions/nodegroup" "github.com/weaveworks/eksctl/pkg/ctl/cmdutils" @@ -68,7 +70,7 @@ var _ = Describe("DrainAllNodeGroups", func() { vpcCniDeleterCalled++ } - err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, false, 1, mockedDrainer, vpcCniDeleter) + err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, false, 1, mockedDrainer, vpcCniDeleter, time.Second*0) Expect(err).NotTo(HaveOccurred()) mockedDrainer.AssertNumberOfCalls(GinkgoT(), "Drain", 1) Expect(vpcCniDeleterCalled).To(Equal(1)) @@ -97,7 +99,7 @@ var _ = Describe("DrainAllNodeGroups", func() { vpcCniDeleterCalled++ } - err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, true, 1, mockedDrainer, vpcCniDeleter) + err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, true, 1, mockedDrainer, vpcCniDeleter, time.Second*0) Expect(err).NotTo(HaveOccurred()) mockedDrainer.AssertNumberOfCalls(GinkgoT(), "Drain", 1) Expect(vpcCniDeleterCalled).To(Equal(1)) @@ -125,7 +127,7 @@ var _ = Describe("DrainAllNodeGroups", func() { vpcCniDeleterCalled++ } - err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, false, 1, 
mockedDrainer, vpcCniDeleter) + err := cluster.DrainAllNodeGroups(cfg, ctl, fakeClientSet, nodeGroupStacks, false, 1, mockedDrainer, vpcCniDeleter, time.Second*0) Expect(err).NotTo(HaveOccurred()) mockedDrainer.AssertNotCalled(GinkgoT(), "Drain") Expect(vpcCniDeleterCalled).To(Equal(0)) diff --git a/pkg/actions/cluster/owned.go b/pkg/actions/cluster/owned.go index 57d67a0187..fd54024645 100644 --- a/pkg/actions/cluster/owned.go +++ b/pkg/actions/cluster/owned.go @@ -65,7 +65,7 @@ func (c *OwnedCluster) Upgrade(ctx context.Context, dryRun bool) error { return nil } -func (c *OwnedCluster) Delete(ctx context.Context, _ time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error { +func (c *OwnedCluster) Delete(ctx context.Context, _, podEvictionWaitPeriod time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error { var ( clientSet kubernetes.Interface oidc *iamoidc.OpenIDConnectManager @@ -107,7 +107,7 @@ func (c *OwnedCluster) Delete(ctx context.Context, _ time.Duration, wait, force, } nodeGroupManager := c.newNodeGroupManager(c.cfg, c.ctl, clientSet) - if err := drainAllNodeGroups(c.cfg, c.ctl, clientSet, allStacks, disableNodegroupEviction, parallel, nodeGroupManager, attemptVpcCniDeletion); err != nil { + if err := drainAllNodeGroups(c.cfg, c.ctl, clientSet, allStacks, disableNodegroupEviction, parallel, nodeGroupManager, attemptVpcCniDeletion, podEvictionWaitPeriod); err != nil { if !force { return err } diff --git a/pkg/actions/cluster/owned_test.go b/pkg/actions/cluster/owned_test.go index a7f7d1a47b..2d0b2bfa8a 100644 --- a/pkg/actions/cluster/owned_test.go +++ b/pkg/actions/cluster/owned_test.go @@ -114,7 +114,7 @@ var _ = Describe("Delete", func() { return fakeClientSet, nil }) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(fakeStackManager.DeleteTasksForDeprecatedStacksCallCount()).To(Equal(1)) Expect(ranDeleteDeprecatedTasks).To(BeTrue()) @@ -189,7 +189,7 @@ var _ = Describe("Delete", func() { return mockedDrainer }) - err := c.Delete(context.Background(), time.Microsecond, false, true, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, true, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(fakeStackManager.DeleteTasksForDeprecatedStacksCallCount()).To(Equal(1)) Expect(ranDeleteDeprecatedTasks).To(BeFalse()) @@ -262,7 +262,7 @@ var _ = Describe("Delete", func() { return mockedDrainer }) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).To(MatchError(errorMessage)) Expect(fakeStackManager.DeleteTasksForDeprecatedStacksCallCount()).To(Equal(0)) Expect(ranDeleteDeprecatedTasks).To(BeFalse()) @@ -303,7 +303,7 @@ var _ = Describe("Delete", func() { c := cluster.NewOwnedCluster(cfg, ctl, nil, fakeStackManager) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(fakeStackManager.DeleteTasksForDeprecatedStacksCallCount()).To(Equal(1)) Expect(ranDeleteDeprecatedTasks).To(BeTrue()) diff --git a/pkg/actions/cluster/unowned.go b/pkg/actions/cluster/unowned.go index d94337a48e..c1a8a2a890 100644 --- a/pkg/actions/cluster/unowned.go +++ 
b/pkg/actions/cluster/unowned.go @@ -55,7 +55,7 @@ func (c *UnownedCluster) Upgrade(_ context.Context, dryRun bool) error { return nil } -func (c *UnownedCluster) Delete(ctx context.Context, waitInterval time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error { +func (c *UnownedCluster) Delete(ctx context.Context, waitInterval, podEvictionWaitPeriod time.Duration, wait, force, disableNodegroupEviction bool, parallel int) error { clusterName := c.cfg.Metadata.Name if err := c.checkClusterExists(clusterName); err != nil { @@ -80,7 +80,7 @@ func (c *UnownedCluster) Delete(ctx context.Context, waitInterval time.Duration, } nodeGroupManager := c.newNodeGroupManager(c.cfg, c.ctl, clientSet) - if err := drainAllNodeGroups(c.cfg, c.ctl, clientSet, allStacks, disableNodegroupEviction, parallel, nodeGroupManager, attemptVpcCniDeletion); err != nil { + if err := drainAllNodeGroups(c.cfg, c.ctl, clientSet, allStacks, disableNodegroupEviction, parallel, nodeGroupManager, attemptVpcCniDeletion, podEvictionWaitPeriod); err != nil { if !force { return err } diff --git a/pkg/actions/cluster/unowned_test.go b/pkg/actions/cluster/unowned_test.go index bc746224f1..8954bf6006 100644 --- a/pkg/actions/cluster/unowned_test.go +++ b/pkg/actions/cluster/unowned_test.go @@ -152,7 +152,7 @@ var _ = Describe("Delete", func() { return fakeClientSet, nil }) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(deleteCallCount).To(Equal(1)) Expect(unownedDeleteCallCount).To(Equal(1)) @@ -255,7 +255,7 @@ var _ = Describe("Delete", func() { return mockedDrainer }) - err := c.Delete(context.Background(), time.Microsecond, false, true, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, true, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(deleteCallCount).To(Equal(0)) Expect(unownedDeleteCallCount).To(Equal(0)) @@ -358,7 +358,7 @@ var _ = Describe("Delete", func() { return mockedDrainer }) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).To(MatchError(errorMessage)) Expect(deleteCallCount).To(Equal(0)) Expect(unownedDeleteCallCount).To(Equal(0)) @@ -422,7 +422,7 @@ var _ = Describe("Delete", func() { p.MockEKS().On("DeleteCluster", mock.Anything).Return(&awseks.DeleteClusterOutput{}, nil) c := cluster.NewUnownedCluster(cfg, ctl, fakeStackManager) - err := c.Delete(context.Background(), time.Microsecond, false, false, false, 1) + err := c.Delete(context.Background(), time.Microsecond, time.Second*0, false, false, false, 1) Expect(err).NotTo(HaveOccurred()) Expect(fakeStackManager.DeleteTasksForDeprecatedStacksCallCount()).To(Equal(1)) Expect(deleteCallCount).To(Equal(1)) diff --git a/pkg/actions/nodegroup/drain.go b/pkg/actions/nodegroup/drain.go index 7afd04f22b..fc748a5874 100644 --- a/pkg/actions/nodegroup/drain.go +++ b/pkg/actions/nodegroup/drain.go @@ -9,19 +9,20 @@ import ( ) type DrainInput struct { - NodeGroups []eks.KubeNodeGroup - Plan bool - MaxGracePeriod time.Duration - NodeDrainWaitPeriod time.Duration - Undo bool - DisableEviction bool - Parallel int + NodeGroups []eks.KubeNodeGroup + Plan bool + MaxGracePeriod time.Duration + NodeDrainWaitPeriod time.Duration + PodEvictionWaitPeriod time.Duration + Undo bool + DisableEviction bool + Parallel 
int } func (m *Manager) Drain(input *DrainInput) error { if !input.Plan { for _, n := range input.NodeGroups { - nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.Undo, input.DisableEviction, input.Parallel) + nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.PodEvictionWaitPeriod, input.Undo, input.DisableEviction, input.Parallel) if err := nodeGroupDrainer.Drain(); err != nil { return err } diff --git a/pkg/ctl/delete/cluster.go b/pkg/ctl/delete/cluster.go index 6d628d1b99..fbc903b23a 100644 --- a/pkg/ctl/delete/cluster.go +++ b/pkg/ctl/delete/cluster.go @@ -16,12 +16,12 @@ import ( ) func deleteClusterCmd(cmd *cmdutils.Cmd) { - deleteClusterWithRunFunc(cmd, func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, parallel int) error { - return doDeleteCluster(cmd, force, disableNodegroupEviction, parallel) + deleteClusterWithRunFunc(cmd, func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, podEvictionWaitPeriod time.Duration, parallel int) error { + return doDeleteCluster(cmd, force, disableNodegroupEviction, podEvictionWaitPeriod, parallel) }) } -func deleteClusterWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, parallel int) error) { +func deleteClusterWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, podEvictionWaitPeriod time.Duration, parallel int) error) { cfg := api.NewClusterConfig() cmd.ClusterConfig = cfg @@ -30,11 +30,12 @@ func deleteClusterWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, var ( force bool disableNodegroupEviction bool + podEvictionWaitPeriod time.Duration parallel int ) cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error { cmd.NameArg = cmdutils.GetNameArg(args) - return runFunc(cmd, force, disableNodegroupEviction, parallel) + return runFunc(cmd, force, disableNodegroupEviction, podEvictionWaitPeriod, parallel) } cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) { @@ -45,6 +46,8 @@ func deleteClusterWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, cmdutils.AddWaitFlag(fs, &cmd.Wait, "deletion of all resources") fs.BoolVar(&force, "force", false, "Force deletion to continue when errors occur") fs.BoolVar(&disableNodegroupEviction, "disable-nodegroup-eviction", false, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.") + defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s") + fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod") fs.IntVar(¶llel, "parallel", 1, "Number of nodes to drain in parallel. 
Max 25") cmdutils.AddConfigFileFlag(fs, &cmd.ClusterConfigFile) @@ -54,7 +57,7 @@ func deleteClusterWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true) } -func doDeleteCluster(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, parallel int) error { +func doDeleteCluster(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, podEvictionWaitPeriod time.Duration, parallel int) error { if err := cmdutils.NewMetadataLoader(cmd).Load(); err != nil { return err } @@ -89,5 +92,5 @@ func doDeleteCluster(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction boo // ProviderConfig.WaitTimeout is not respected by cluster.Delete, which means the operation will never time out. // When this is fixed, a deadline-based Context can be used here. - return cluster.Delete(context.TODO(), time.Second*20, cmd.Wait, force, disableNodegroupEviction, parallel) + return cluster.Delete(context.TODO(), time.Second*20, podEvictionWaitPeriod, cmd.Wait, force, disableNodegroupEviction, parallel) } diff --git a/pkg/ctl/delete/cluster_test.go b/pkg/ctl/delete/cluster_test.go index c73f54d939..5c5cbea800 100644 --- a/pkg/ctl/delete/cluster_test.go +++ b/pkg/ctl/delete/cluster_test.go @@ -1,6 +1,8 @@ package delete import ( + "time" + . "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo/extensions/table" . "github.com/onsi/gomega" @@ -18,7 +20,7 @@ var _ = Describe("delete cluster", func() { cmd := newMockEmptyCmd(args...) count := 0 cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) { - deleteClusterWithRunFunc(cmd, func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, parallel int) error { + deleteClusterWithRunFunc(cmd, func(cmd *cmdutils.Cmd, force bool, disableNodegroupEviction bool, podEvictionWaitPeriod time.Duration, parallel int) error { Expect(cmd.ClusterConfig.Metadata.Name).To(Equal(clusterName)) Expect(force).To(Equal(forceExpected)) Expect(disableNodegroupEviction).To(Equal(disableNodegroupEvictionExpected)) diff --git a/pkg/ctl/delete/nodegroup.go b/pkg/ctl/delete/nodegroup.go index af8e82e6e2..060516cf9f 100644 --- a/pkg/ctl/delete/nodegroup.go +++ b/pkg/ctl/delete/nodegroup.go @@ -18,30 +18,31 @@ import ( ) func deleteNodeGroupCmd(cmd *cmdutils.Cmd) { - deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { - return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel) + deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { + return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel) }) } -func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error) { +func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, 
disableEviction bool, parallel int) error) { cfg := api.NewClusterConfig() ng := api.NewNodeGroup() cmd.ClusterConfig = cfg var ( - updateAuthConfigMap bool - deleteNodeGroupDrain bool - onlyMissing bool - maxGracePeriod time.Duration - disableEviction bool - parallel int + updateAuthConfigMap bool + deleteNodeGroupDrain bool + onlyMissing bool + maxGracePeriod time.Duration + podEvictionWaitPeriod time.Duration + disableEviction bool + parallel int ) cmd.SetDescription("nodegroup", "Delete a nodegroup", "", "ng") cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error { cmd.NameArg = cmdutils.GetNameArg(args) - return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel) + return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel) } cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) { @@ -56,6 +57,8 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm fs.BoolVar(&deleteNodeGroupDrain, "drain", true, "Drain and cordon all nodes in the nodegroup before deletion") defaultMaxGracePeriod, _ := time.ParseDuration("10m") fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period") + defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s") + fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod") defaultDisableEviction := false fs.BoolVar(&disableEviction, "disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.") fs.IntVar(¶llel, "parallel", 1, "Number of nodes to drain in parallel. Max 25") @@ -68,7 +71,7 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true) } -func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { +func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { ngFilter := filter.NewNodeGroupFilter() if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil { @@ -130,11 +133,12 @@ func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap cmdutils.LogIntendedAction(cmd.Plan, "drain %d nodegroup(s) in cluster %q", len(allNodeGroups), cfg.Metadata.Name) drainInput := &nodegroup.DrainInput{ - NodeGroups: allNodeGroups, - Plan: cmd.Plan, - MaxGracePeriod: maxGracePeriod, - DisableEviction: disableEviction, - Parallel: parallel, + NodeGroups: allNodeGroups, + Plan: cmd.Plan, + MaxGracePeriod: maxGracePeriod, + PodEvictionWaitPeriod: podEvictionWaitPeriod, + DisableEviction: disableEviction, + Parallel: parallel, } err := nodeGroupManager.Drain(drainInput) if err != nil { diff --git a/pkg/ctl/delete/nodegroup_test.go b/pkg/ctl/delete/nodegroup_test.go index b46ec18019..fb42d8b3b5 100644 --- a/pkg/ctl/delete/nodegroup_test.go +++ b/pkg/ctl/delete/nodegroup_test.go @@ -22,7 +22,7 @@ var _ = Describe("delete", func() { cmd := newMockEmptyCmd(args...) 
count := 0 cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) { - deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error { + deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName")) Expect(ng.Name).To(Equal("ng")) count++ diff --git a/pkg/ctl/drain/nodegroup.go b/pkg/ctl/drain/nodegroup.go index 75c5ea80d5..d2c3df6152 100644 --- a/pkg/ctl/drain/nodegroup.go +++ b/pkg/ctl/drain/nodegroup.go @@ -16,30 +16,31 @@ import ( ) func drainNodeGroupCmd(cmd *cmdutils.Cmd) { - drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { - return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel) + drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { + return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel) }) } -func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error) { +func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error) { cfg := api.NewClusterConfig() ng := api.NewNodeGroup() cmd.ClusterConfig = cfg var ( - undo bool - onlyMissing bool - disableEviction bool - parallel int - maxGracePeriod time.Duration - nodeDrainWaitPeriod time.Duration + undo bool + onlyMissing bool + disableEviction bool + parallel int + maxGracePeriod time.Duration + nodeDrainWaitPeriod time.Duration + podEvictionWaitPeriod time.Duration ) cmd.SetDescription("nodegroup", "Cordon and drain a nodegroup", "", "ng") cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error { cmd.NameArg = cmdutils.GetNameArg(args) - return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel) + return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel) } cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) { @@ -53,6 +54,8 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd fs.BoolVar(&undo, "undo", false, "Uncordon the nodegroup") defaultMaxGracePeriod, _ := time.ParseDuration("10m") fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period") + defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s") + fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod") defaultDisableEviction := false fs.BoolVar(&disableEviction, 
"disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.") cmdutils.AddTimeoutFlag(fs, &cmd.ProviderConfig.WaitTimeout) @@ -63,7 +66,7 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true) } -func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { +func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { ngFilter := filter.NewNodeGroupFilter() if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil { @@ -126,13 +129,14 @@ func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bo allNodeGroups := cmdutils.ToKubeNodeGroups(cfg) drainInput := &nodegroup.DrainInput{ - NodeGroups: allNodeGroups, - Plan: cmd.Plan, - MaxGracePeriod: maxGracePeriod, - NodeDrainWaitPeriod: nodeDrainWaitPeriod, - Undo: undo, - DisableEviction: disableEviction, - Parallel: parallel, + NodeGroups: allNodeGroups, + Plan: cmd.Plan, + MaxGracePeriod: maxGracePeriod, + NodeDrainWaitPeriod: nodeDrainWaitPeriod, + PodEvictionWaitPeriod: podEvictionWaitPeriod, + Undo: undo, + DisableEviction: disableEviction, + Parallel: parallel, } return nodegroup.New(cfg, ctl, clientSet).Drain(drainInput) } diff --git a/pkg/ctl/drain/nodegroup_test.go b/pkg/ctl/drain/nodegroup_test.go index 103c160750..463c36e443 100644 --- a/pkg/ctl/drain/nodegroup_test.go +++ b/pkg/ctl/drain/nodegroup_test.go @@ -24,7 +24,7 @@ var _ = Describe("drain node group", func() { cmd := newMockEmptyCmd(args...) 
count := 0 cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) { - drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error { + drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error { Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName")) Expect(ng.Name).To(Equal("ng")) count++ diff --git a/pkg/drain/evictor/pod.go b/pkg/drain/evictor/pod.go index 031dad10c3..fcd517eef2 100644 --- a/pkg/drain/evictor/pod.go +++ b/pkg/drain/evictor/pod.go @@ -29,7 +29,6 @@ import ( const ( daemonSetFatal = "DaemonSet-managed Pods (use --ignore-daemonsets to ignore)" - daemonSetWarning = "ignoring DaemonSet-managed Pods" localStorageFatal = "Pods with local storage (use --delete-local-data to override)" localStorageWarning = "deleting Pods with local storage" unmanagedFatal = "Pods not managed by ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet (use --force to override)" @@ -204,7 +203,7 @@ func (d *Evictor) daemonSetFilter(pod corev1.Pod) PodDeleteStatus { if controllerRef.Name == ignoreDaemonSet.Name { switch ignoreDaemonSet.Namespace { case pod.Namespace, metav1.NamespaceAll: - return makePodDeleteStatusWithWarning(false, daemonSetWarning) + return makePodDeleteStatusSkip() } } } @@ -213,7 +212,7 @@ func (d *Evictor) daemonSetFilter(pod corev1.Pod) PodDeleteStatus { return makePodDeleteStatusWithError(daemonSetFatal) } - return makePodDeleteStatusWithWarning(false, daemonSetWarning) + return makePodDeleteStatusSkip() } func (d *Evictor) mirrorPodFilter(pod corev1.Pod) PodDeleteStatus { diff --git a/pkg/drain/nodegroup.go b/pkg/drain/nodegroup.go index 35f1b5b380..de13d602d7 100644 --- a/pkg/drain/nodegroup.go +++ b/pkg/drain/nodegroup.go @@ -5,7 +5,9 @@ import ( "fmt" "time" + "golang.org/x/sync/errgroup" "golang.org/x/sync/semaphore" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/weaveworks/eksctl/pkg/drain/evictor" @@ -36,16 +38,17 @@ type Evictor interface { } type NodeGroupDrainer struct { - clientSet kubernetes.Interface - evictor Evictor - ng eks.KubeNodeGroup - waitTimeout time.Duration - nodeDrainWaitPeriod time.Duration - undo bool - parallel int + clientSet kubernetes.Interface + evictor Evictor + ng eks.KubeNodeGroup + waitTimeout time.Duration + nodeDrainWaitPeriod time.Duration + podEvictionWaitPeriod time.Duration + undo bool + parallel int } -func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, waitTimeout, maxGracePeriod, nodeDrainWaitPeriod time.Duration, undo, disableEviction bool, parallel int) NodeGroupDrainer { +func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, waitTimeout, maxGracePeriod, nodeDrainWaitPeriod time.Duration, podEvictionWaitPeriod time.Duration, undo, disableEviction bool, parallel int) NodeGroupDrainer { ignoreDaemonSets := []metav1.ObjectMeta{ { Namespace: "kube-system", @@ -73,13 +76,14 @@ func NewNodeGroupDrainer(clientSet kubernetes.Interface, ng eks.KubeNodeGroup, w } return NodeGroupDrainer{ - evictor: evictor.New(clientSet, maxGracePeriod, ignoreDaemonSets, disableEviction), - clientSet: clientSet, - ng: ng, - waitTimeout: waitTimeout, - nodeDrainWaitPeriod: nodeDrainWaitPeriod, - 
undo: undo, - parallel: parallel, + evictor: evictor.New(clientSet, maxGracePeriod, ignoreDaemonSets, disableEviction), + clientSet: clientSet, + ng: ng, + waitTimeout: waitTimeout, + nodeDrainWaitPeriod: nodeDrainWaitPeriod, + podEvictionWaitPeriod: podEvictionWaitPeriod, + undo: undo, + parallel: parallel, } } @@ -112,6 +116,7 @@ func (n *NodeGroupDrainer) Drain() error { parallelLimit := int64(n.parallel) sem := semaphore.NewWeighted(parallelLimit) logger.Info("starting parallel draining, max in-flight of %d", parallelLimit) + var evictErr error // loop until all nodes are drained to handle accidental scale-up // or any other changes in the ASG for { @@ -121,6 +126,9 @@ func (n *NodeGroupDrainer) Drain() error { waitForAllRoutinesToFinish(context.TODO(), sem, parallelLimit) return fmt.Errorf("timed out (after %s) waiting for nodegroup %q to be drained", n.waitTimeout, n.ng.NameString()) default: + if evictErr != nil { + return evictErr + } nodes, err := n.clientSet.CoreV1().Nodes().List(context.TODO(), listOptions) if err != nil { return err @@ -143,32 +151,36 @@ func (n *NodeGroupDrainer) Drain() error { logger.Debug("already drained: %v", mapToList(drainedNodes.Items())) logger.Debug("will drain: %v", newPendingNodes.List()) - for i, node := range newPendingNodes.List() { - if err := sem.Acquire(ctx, 1); err != nil { - logger.Critical("failed to acquire semaphore: %w", err) - } - go func(i int, node string) { + g, ctx := errgroup.WithContext(ctx) + for _, node := range newPendingNodes.List() { + node := node + g.Go(func() error { + if err := sem.Acquire(ctx, 1); err != nil { + return errors.Wrapf(err, "failed to acquire semaphore") + } defer sem.Release(1) + + drainedNodes.Set(node, nil) logger.Debug("starting drain of node %s", node) - pending, err := n.evictPods(node) - if err != nil { + if err := n.evictPods(ctx, node); err != nil { logger.Warning("pod eviction error (%q) on node %s", err, node) time.Sleep(retryDelay) - return + return err } - logger.Debug("%d pods to be evicted from %s", pending, node) - if pending == 0 { - drainedNodes.Set(node, nil) - } + drainedNodes.Set(node, nil) if n.nodeDrainWaitPeriod > 0 { logger.Debug("waiting for %.0f seconds before draining next node", n.nodeDrainWaitPeriod.Seconds()) time.Sleep(n.nodeDrainWaitPeriod) } - }(i, node) + return nil + }) } + // We need to loop even if this is an error to check whether the error was a + // context timeout or something else. This lets us log timeout errors consistently. + evictErr = g.Wait() } } } @@ -204,29 +216,50 @@ func (n *NodeGroupDrainer) toggleCordon(cordon bool, nodes *corev1.NodeList) { logger.Debug("no need to %s node %q", cordonStatus(cordon), node.Name) } } - } -func (n *NodeGroupDrainer) evictPods(node string) (int, error) { - list, errs := n.evictor.GetPodsForEviction(node) - if len(errs) > 0 { - return 0, fmt.Errorf("errs: %v", errs) // TODO: improve formatting - } - if list == nil { - return 0, nil - } - if w := list.Warnings(); w != "" { - logger.Warning(w) - } - pods := list.Pods() - pending := len(pods) - for _, pod := range pods { - // TODO: handle API rate limiter error - if err := n.evictor.EvictOrDeletePod(pod); err != nil { - return pending, errors.Wrapf(err, "error evicting pod: %s/%s", pod.Namespace, pod.Name) +func (n *NodeGroupDrainer) evictPods(ctx context.Context, node string) error { + // Loop until context times out. We want to continually try to remove pods + // from the node as their eviction status changes.
+ previousReportTime := time.Now() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timed out (after %s) waiting for node %q to be drained", n.waitTimeout, node) + default: + list, errs := n.evictor.GetPodsForEviction(node) + if len(errs) > 0 { + return fmt.Errorf("errs: %v", errs) // TODO: improve formatting + } + if list == nil || len(list.Pods()) == 0 { + return nil + } + pods := list.Pods() + if w := list.Warnings(); w != "" { + logger.Warning(w) + } + // This log message is important but can be noisy, so log it for a given + // node at most once a minute. + if time.Since(previousReportTime) > time.Minute && len(pods) > 0 { + logger.Warning("%d pods are unevictable from node %s", len(pods), node) + previousReportTime = time.Now() + } + logger.Debug("%d pods to be evicted from %s", len(pods), node) + failedEvictions := false + for _, pod := range pods { + if err := n.evictor.EvictOrDeletePod(pod); err != nil { + if !isEvictionErrorRecoverable(err) { + return errors.Wrapf(err, "unrecoverable error evicting pod: %s/%s", pod.Namespace, pod.Name) + } + logger.Debug("recoverable pod eviction failure: %q", err) + failedEvictions = true + } + } + if failedEvictions { + time.Sleep(n.podEvictionWaitPeriod) + } + } } - return pending, nil } func cordonStatus(desired bool) string { @@ -235,3 +268,25 @@ func cordonStatus(desired bool) string { } return "uncordon" } + +func isEvictionErrorRecoverable(err error) bool { + var recoverableCheckerFuncs []func(error) bool + recoverableCheckerFuncs = append( + recoverableCheckerFuncs, + apierrors.IsGone, + apierrors.IsNotFound, + apierrors.IsResourceExpired, + apierrors.IsServerTimeout, + apierrors.IsServiceUnavailable, + apierrors.IsTimeout, + // IsTooManyRequests also captures PDB errors + apierrors.IsTooManyRequests, + ) + + for _, f := range recoverableCheckerFuncs { + if f(err) { + return true + } + } + return false +} diff --git a/pkg/drain/nodegroup_test.go b/pkg/drain/nodegroup_test.go index 32520ede9e..d3e68730f4 100644 --- a/pkg/drain/nodegroup_test.go +++ b/pkg/drain/nodegroup_test.go @@ -2,10 +2,12 @@ package drain_test import ( "context" + "errors" "fmt" "time" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "github.com/weaveworks/eksctl/pkg/drain/evictor" @@ -16,6 +18,7 @@ import ( . "github.com/onsi/gomega" . 
"github.com/onsi/ginkgo" + "github.com/weaveworks/eksctl/pkg/drain" "github.com/weaveworks/eksctl/pkg/eks/mocks" "k8s.io/client-go/kubernetes/fake" @@ -73,7 +76,7 @@ var _ = Describe("Drain", func() { }) It("does not error", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second*10, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second*10, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -117,7 +120,7 @@ var _ = Describe("Drain", func() { }) It("times out and errors", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*2, time.Second*0, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*2, time.Second*0, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -131,7 +134,7 @@ var _ = Describe("Drain", func() { }) It("errors", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second, time.Second, time.Second, false, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second, time.Second, time.Second, time.Second*0, false, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -175,7 +178,7 @@ var _ = Describe("Drain", func() { }) It("does not error", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, false, true, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, true, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -205,7 +208,7 @@ var _ = Describe("Drain", func() { }) It("uncordons all the nodes", func() { - nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, true, false, 1) + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, true, false, 1) nodeGroupDrainer.SetDrainer(fakeEvictor) err := nodeGroupDrainer.Drain() @@ -219,4 +222,169 @@ var _ = Describe("Drain", func() { Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(BeZero()) }) }) + + When("an eviction fails recoverably", func() { + var pod corev1.Pod + + BeforeEach(func() { + pod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + }, + } + + for i := 0; i < 2; i++ { + fakeEvictor.GetPodsForEvictionReturnsOnCall(i, &evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pod, + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + } + fakeEvictor.GetPodsForEvictionReturnsOnCall(2, nil, nil) + + fakeEvictor.EvictOrDeletePodReturnsOnCall(0, apierrors.NewTooManyRequestsError("error1")) + fakeEvictor.EvictOrDeletePodReturnsOnCall(1, nil) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("does not error", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, false, 1) + 
nodeGroupDrainer.SetDrainer(fakeEvictor) + + err := nodeGroupDrainer.Drain() + Expect(err).NotTo(HaveOccurred()) + + Expect(fakeEvictor.GetPodsForEvictionCallCount()).To(Equal(3)) + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(Equal(2)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pod)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(1)).To(Equal(pod)) + }) + }) + + When("an eviction fails irrecoverably", func() { + var pod corev1.Pod + var evictionError error + + BeforeEach(func() { + pod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + Namespace: "ns-1", + }, + } + + fakeEvictor.GetPodsForEvictionReturns(&evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pod, + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + + evictionError = errors.New("error1") + fakeEvictor.EvictOrDeletePodReturns(evictionError) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("returns an error", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, false, 1) + nodeGroupDrainer.SetDrainer(fakeEvictor) + + err := nodeGroupDrainer.Drain() + Expect(err.Error()).To(ContainSubstring("unrecoverable error evicting pod: ns-1/pod-1")) + + Expect(fakeEvictor.GetPodsForEvictionCallCount()).To(Equal(1)) + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(Equal(1)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pod)) + }) + }) + + When("eviction fails recoverably with multiple pods", func() { + var pods []corev1.Pod + var evictionError error + + BeforeEach(func() { + pods = []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + Namespace: "ns-1", + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-2", + Namespace: "ns-2", + }, + }, + } + + fakeEvictor.GetPodsForEvictionReturns(&evictor.PodDeleteList{ + Items: []evictor.PodDelete{ + { + Pod: pods[0], + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + { + Pod: pods[1], + Status: evictor.PodDeleteStatus{ + Delete: true, + }, + }, + }, + }, nil) + + evictionError = apierrors.NewTooManyRequestsError("error1") + fakeEvictor.EvictOrDeletePodReturns(evictionError) + + _, err := fakeClientSet.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + Spec: corev1.NodeSpec{ + Unschedulable: false, + }, + }, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + }) + + It("it attempts to drain all pods", func() { + nodeGroupDrainer := drain.NewNodeGroupDrainer(fakeClientSet, &mockNG, time.Second*10, time.Second, time.Second, time.Second*0, false, false, 1) + nodeGroupDrainer.SetDrainer(fakeEvictor) + + _ = nodeGroupDrainer.Drain() + + Expect(fakeEvictor.EvictOrDeletePodCallCount()).To(BeNumerically(">=", 2)) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(0)).To(Equal(pods[0])) + Expect(fakeEvictor.EvictOrDeletePodArgsForCall(1)).To(Equal(pods[1])) + }) + }) })
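
Illustrative usage (not part of the patch): a minimal sketch of how a caller might thread the new pod eviction wait period through nodegroup.DrainInput. The package name, the helper name drainWithEvictionRetry, and the hard-coded durations are assumptions made for the example; the types, fields, and import paths follow those used elsewhere in this diff.

package example // hypothetical caller, not part of this change

import (
	"time"

	"github.com/weaveworks/eksctl/pkg/actions/nodegroup"
	"github.com/weaveworks/eksctl/pkg/eks"
)

// drainWithEvictionRetry drains the given nodegroups; recoverable pod eviction
// failures (e.g. evictions blocked by a PodDisruptionBudget) are retried every
// PodEvictionWaitPeriod until the provider wait timeout expires.
func drainWithEvictionRetry(m *nodegroup.Manager, kubeNodeGroups []eks.KubeNodeGroup) error {
	return m.Drain(&nodegroup.DrainInput{
		NodeGroups:            kubeNodeGroups,
		MaxGracePeriod:        10 * time.Minute, // mirrors the --max-grace-period default
		PodEvictionWaitPeriod: 10 * time.Second, // mirrors the new --pod-eviction-wait-period default
		Parallel:              1,
	})
}

Evictions rejected because of a PodDisruptionBudget surface as 429 TooManyRequests responses, which isEvictionErrorRecoverable treats as recoverable, so the drainer keeps retrying them at this interval instead of aborting the drain.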