
Handle pod eviction errors correctly #5116

Merged · 10 commits · Apr 21, 2022
17 changes: 9 additions & 8 deletions pkg/actions/nodegroup/drain.go
@@ -9,19 +9,20 @@ import (
 )
 
 type DrainInput struct {
-    NodeGroups          []eks.KubeNodeGroup
-    Plan                bool
-    MaxGracePeriod      time.Duration
-    NodeDrainWaitPeriod time.Duration
-    Undo                bool
-    DisableEviction     bool
-    Parallel            int
+    NodeGroups            []eks.KubeNodeGroup
+    Plan                  bool
+    MaxGracePeriod        time.Duration
+    NodeDrainWaitPeriod   time.Duration
+    PodEvictionWaitPeriod time.Duration
+    Undo                  bool
+    DisableEviction       bool
+    Parallel              int
 }
 
 func (m *Manager) Drain(input *DrainInput) error {
     if !input.Plan {
         for _, n := range input.NodeGroups {
-            nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.Undo, input.DisableEviction, input.Parallel)
+            nodeGroupDrainer := drain.NewNodeGroupDrainer(m.clientSet, n, m.ctl.Provider.WaitTimeout(), input.MaxGracePeriod, input.NodeDrainWaitPeriod, input.PodEvictionWaitPeriod, input.Undo, input.DisableEviction, input.Parallel)
             if err := nodeGroupDrainer.Drain(); err != nil {
                 return err
             }
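The drainer itself is outside this diff, but the purpose of threading `PodEvictionWaitPeriod` through `NewNodeGroupDrainer` is to pause between failed eviction attempts instead of hot-looping on errors. A minimal sketch of that pattern against client-go's eviction API — `evictWithRetry` and its parameters are illustrative names under those assumptions, not eksctl's actual implementation:

```go
package drainsketch

import (
	"context"
	"time"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// evictWithRetry asks the API server to evict one pod, sleeping
// podEvictionWaitPeriod between attempts while the eviction is rejected
// (HTTP 429, typically a PodDisruptionBudget at its limit), until maxWait
// runs out. Hypothetical helper, not eksctl's code.
func evictWithRetry(ctx context.Context, clientSet kubernetes.Interface, namespace, name string, podEvictionWaitPeriod, maxWait time.Duration) error {
	deadline := time.Now().Add(maxWait)
	for {
		err := clientSet.CoreV1().Pods(namespace).Evict(ctx, &policyv1beta1.Eviction{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		})
		switch {
		case err == nil, apierrors.IsNotFound(err):
			return nil // evicted, or already gone
		case apierrors.IsTooManyRequests(err) && time.Now().Before(deadline):
			time.Sleep(podEvictionWaitPeriod) // blocked by a PDB: wait, then retry
		default:
			return err // a real error, or the deadline has passed
		}
	}
}
```

Note that `--disable-eviction` (see the flag definitions below) sidesteps this loop entirely: the drainer then deletes pods directly, ignoring PodDisruptionBudgets.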
36 changes: 20 additions & 16 deletions pkg/ctl/delete/nodegroup.go
@@ -17,30 +17,31 @@ import (
 )
 
 func deleteNodeGroupCmd(cmd *cmdutils.Cmd) {
-    deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error {
-        return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel)
+    deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
+        return doDeleteNodeGroup(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel)
     })
 }
 
-func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error) {
+func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error) {
     cfg := api.NewClusterConfig()
     ng := api.NewNodeGroup()
     cmd.ClusterConfig = cfg
 
     var (
-        updateAuthConfigMap  bool
-        deleteNodeGroupDrain bool
-        onlyMissing          bool
-        maxGracePeriod       time.Duration
-        disableEviction      bool
-        parallel             int
+        updateAuthConfigMap   bool
+        deleteNodeGroupDrain  bool
+        onlyMissing           bool
+        maxGracePeriod        time.Duration
+        podEvictionWaitPeriod time.Duration
+        disableEviction       bool
+        parallel              int
     )
 
     cmd.SetDescription("nodegroup", "Delete a nodegroup", "", "ng")
 
     cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error {
         cmd.NameArg = cmdutils.GetNameArg(args)
-        return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, disableEviction, parallel)
+        return runFunc(cmd, ng, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing, maxGracePeriod, podEvictionWaitPeriod, disableEviction, parallel)
     }
 
     cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) {
@@ -55,6 +56,8 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm
         fs.BoolVar(&deleteNodeGroupDrain, "drain", true, "Drain and cordon all nodes in the nodegroup before deletion")
         defaultMaxGracePeriod, _ := time.ParseDuration("10m")
         fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period")
+        defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s")
+        fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod")
         defaultDisableEviction := false
         fs.BoolVar(&disableEviction, "disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.")
         fs.IntVar(&parallel, "parallel", 1, "Number of nodes to drain in parallel. Max 25")
@@ -67,7 +70,7 @@ func deleteNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cm
     cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true)
 }
 
-func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error {
+func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
     ngFilter := filter.NewNodeGroupFilter()
 
     if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil {
@@ -128,11 +131,12 @@ func doDeleteNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, updateAuthConfigMap
             cmdutils.LogIntendedAction(cmd.Plan, "drain %d nodegroup(s) in cluster %q", len(allNodeGroups), cfg.Metadata.Name)
 
             drainInput := &nodegroup.DrainInput{
-                NodeGroups:      allNodeGroups,
-                Plan:            cmd.Plan,
-                MaxGracePeriod:  maxGracePeriod,
-                DisableEviction: disableEviction,
-                Parallel:        parallel,
+                NodeGroups:            allNodeGroups,
+                Plan:                  cmd.Plan,
+                MaxGracePeriod:        maxGracePeriod,
+                PodEvictionWaitPeriod: podEvictionWaitPeriod,
+                DisableEviction:       disableEviction,
+                Parallel:              parallel,
             }
             err := nodeGroupManager.Drain(drainInput)
             if err != nil {
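Since `--pod-eviction-wait-period` is a standard pflag `Duration`, it accepts any Go duration string (e.g. `eksctl delete nodegroup --pod-eviction-wait-period=30s ...`). A self-contained sketch of the same flag wiring, with the default written as a constant rather than via `time.ParseDuration`:

```go
package main

import (
	"fmt"
	"time"

	"github.com/spf13/pflag"
)

func main() {
	fs := pflag.NewFlagSet("delete-nodegroup", pflag.ContinueOnError)

	// Same flag as in the diff, with a 10s default.
	var podEvictionWaitPeriod time.Duration
	fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", 10*time.Second,
		"Duration to wait after failing to evict a pod")

	// Duration flags parse Go syntax such as "30s", "2m", or "1h30m".
	if err := fs.Parse([]string{"--pod-eviction-wait-period=2m"}); err != nil {
		panic(err)
	}
	fmt.Println(podEvictionWaitPeriod) // 2m0s
}
```

`10*time.Second` is equivalent to the `time.ParseDuration("10s")` used in the diff; `ParseDuration` only buys something when the default itself starts life as a string.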
2 changes: 1 addition & 1 deletion pkg/ctl/delete/nodegroup_test.go
@@ -22,7 +22,7 @@ var _ = Describe("delete", func() {
             cmd := newMockEmptyCmd(args...)
             count := 0
             cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) {
-                deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod time.Duration, disableEviction bool, parallel int) error {
+                deleteNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, updateAuthConfigMap, deleteNodeGroupDrain, onlyMissing bool, maxGracePeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
                     Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName"))
                     Expect(ng.Name).To(Equal("ng"))
                     count++
40 changes: 22 additions & 18 deletions pkg/ctl/drain/nodegroup.go
@@ -15,30 +15,31 @@ import (
 )
 
 func drainNodeGroupCmd(cmd *cmdutils.Cmd) {
-    drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error {
-        return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel)
+    drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
+        return doDrainNodeGroup(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel)
     })
 }
 
-func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error) {
+func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error) {
     cfg := api.NewClusterConfig()
     ng := api.NewNodeGroup()
     cmd.ClusterConfig = cfg
 
     var (
-        undo                bool
-        onlyMissing         bool
-        disableEviction     bool
-        parallel            int
-        maxGracePeriod      time.Duration
-        nodeDrainWaitPeriod time.Duration
+        undo                  bool
+        onlyMissing           bool
+        disableEviction       bool
+        parallel              int
+        maxGracePeriod        time.Duration
+        nodeDrainWaitPeriod   time.Duration
+        podEvictionWaitPeriod time.Duration
     )
 
     cmd.SetDescription("nodegroup", "Cordon and drain a nodegroup", "", "ng")
 
     cmd.CobraCommand.RunE = func(_ *cobra.Command, args []string) error {
         cmd.NameArg = cmdutils.GetNameArg(args)
-        return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, disableEviction, parallel)
+        return runFunc(cmd, ng, undo, onlyMissing, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod, disableEviction, parallel)
     }
 
     cmd.FlagSetGroup.InFlagSet("General", func(fs *pflag.FlagSet) {
@@ -52,6 +53,8 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd
         fs.BoolVar(&undo, "undo", false, "Uncordon the nodegroup")
         defaultMaxGracePeriod, _ := time.ParseDuration("10m")
         fs.DurationVar(&maxGracePeriod, "max-grace-period", defaultMaxGracePeriod, "Maximum pods termination grace period")
+        defaultPodEvictionWaitPeriod, _ := time.ParseDuration("10s")
+        fs.DurationVar(&podEvictionWaitPeriod, "pod-eviction-wait-period", defaultPodEvictionWaitPeriod, "Duration to wait after failing to evict a pod")
         defaultDisableEviction := false
         fs.BoolVar(&disableEviction, "disable-eviction", defaultDisableEviction, "Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution.")
         cmdutils.AddTimeoutFlag(fs, &cmd.ProviderConfig.WaitTimeout)
@@ -62,7 +65,7 @@ func drainNodeGroupWithRunFunc(cmd *cmdutils.Cmd, runFunc func(cmd *cmdutils.Cmd
     cmdutils.AddCommonFlagsForAWS(cmd.FlagSetGroup, &cmd.ProviderConfig, true)
 }
 
-func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error {
+func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
     ngFilter := filter.NewNodeGroupFilter()
 
     if err := cmdutils.NewDeleteAndDrainNodeGroupLoader(cmd, ng, ngFilter).Load(); err != nil {
@@ -125,13 +128,14 @@ func doDrainNodeGroup(cmd *cmdutils.Cmd, ng *api.NodeGroup, undo, onlyMissing bo
     allNodeGroups := cmdutils.ToKubeNodeGroups(cfg)
 
     drainInput := &nodegroup.DrainInput{
-        NodeGroups:          allNodeGroups,
-        Plan:                cmd.Plan,
-        MaxGracePeriod:      maxGracePeriod,
-        NodeDrainWaitPeriod: nodeDrainWaitPeriod,
-        Undo:                undo,
-        DisableEviction:     disableEviction,
-        Parallel:            parallel,
+        NodeGroups:            allNodeGroups,
+        Plan:                  cmd.Plan,
+        MaxGracePeriod:        maxGracePeriod,
+        NodeDrainWaitPeriod:   nodeDrainWaitPeriod,
+        PodEvictionWaitPeriod: podEvictionWaitPeriod,
+        Undo:                  undo,
+        DisableEviction:       disableEviction,
+        Parallel:              parallel,
     }
     return nodegroup.New(cfg, ctl, clientSet).Drain(drainInput)
 }
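Both pacing knobs now travel on `DrainInput`: `NodeDrainWaitPeriod` spaces out node-to-node progress, while `PodEvictionWaitPeriod` spaces out retries after a failed eviction. A sketch of driving the same path programmatically — the import paths and the `*eks.ClusterProvider`/`kubernetes.Interface` types are assumptions based on the eksctl repo layout, and `drainAll` is a hypothetical helper:

```go
package sketch

import (
	"time"

	"github.com/weaveworks/eksctl/pkg/actions/nodegroup"
	api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
	"github.com/weaveworks/eksctl/pkg/ctl/cmdutils"
	"github.com/weaveworks/eksctl/pkg/eks"
	"k8s.io/client-go/kubernetes"
)

// drainAll mirrors what doDrainNodeGroup assembles after filtering.
func drainAll(cfg *api.ClusterConfig, ctl *eks.ClusterProvider, clientSet kubernetes.Interface) error {
	drainInput := &nodegroup.DrainInput{
		NodeGroups:            cmdutils.ToKubeNodeGroups(cfg),
		MaxGracePeriod:        10 * time.Minute, // per-pod termination grace cap
		NodeDrainWaitPeriod:   30 * time.Second, // pause between nodes
		PodEvictionWaitPeriod: 10 * time.Second, // pause between eviction retries
		Parallel:              1,
	}
	return nodegroup.New(cfg, ctl, clientSet).Drain(drainInput)
}
```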
2 changes: 1 addition & 1 deletion pkg/ctl/drain/nodegroup_test.go
@@ -24,7 +24,7 @@ var _ = Describe("drain node group", func() {
             cmd := newMockEmptyCmd(args...)
             count := 0
             cmdutils.AddResourceCmd(cmdutils.NewGrouping(), cmd.parentCmd, func(cmd *cmdutils.Cmd) {
-                drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod time.Duration, disableEviction bool, parallel int) error {
+                drainNodeGroupWithRunFunc(cmd, func(cmd *cmdutils.Cmd, ng *v1alpha5.NodeGroup, undo, onlyMissing bool, maxGracePeriod, nodeDrainWaitPeriod, podEvictionWaitPeriod time.Duration, disableEviction bool, parallel int) error {
                     Expect(cmd.ClusterConfig.Metadata.Name).To(Equal("clusterName"))
                     Expect(ng.Name).To(Equal("ng"))
                     count++
5 changes: 2 additions & 3 deletions pkg/drain/evictor/pod.go
@@ -29,7 +29,6 @@ import (
 
 const (
     daemonSetFatal      = "DaemonSet-managed Pods (use --ignore-daemonsets to ignore)"
-    daemonSetWarning    = "ignoring DaemonSet-managed Pods"
     localStorageFatal   = "Pods with local storage (use --delete-local-data to override)"
     localStorageWarning = "deleting Pods with local storage"
     unmanagedFatal      = "Pods not managed by ReplicationController, ReplicaSet, Job, DaemonSet or StatefulSet (use --force to override)"
@@ -204,7 +203,7 @@ func (d *Evictor) daemonSetFilter(pod corev1.Pod) PodDeleteStatus {
             if controllerRef.Name == ignoreDaemonSet.Name {
                 switch ignoreDaemonSet.Namespace {
                 case pod.Namespace, metav1.NamespaceAll:
-                    return makePodDeleteStatusWithWarning(false, daemonSetWarning)
+                    return makePodDeleteStatusSkip()

Review comment — Contributor (author): This warning is super noisy and adds next to zero value. There may be a better way to disable it / filter it out, but this was the best I found.

Reply — Contributor: Yeah, that warning came from some eviction code copied over three years ago. I don't think we need it... :D

                 }
             }
         }
@@ -213,7 +212,7 @@ func (d *Evictor) mirrorPodFilter(pod corev1.Pod) PodDeleteStatus {
         return makePodDeleteStatusWithError(daemonSetFatal)
     }
 
-    return makePodDeleteStatusWithWarning(false, daemonSetWarning)
+    return makePodDeleteStatusSkip()
 }
 
 func (d *Evictor) mirrorPodFilter(pod corev1.Pod) PodDeleteStatus {
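The `makePodDeleteStatus*` constructors live outside this hunk. A rough sketch of the verdict type such filters return — the field names are guesses; only the constructor names appear in this diff — showing why returning a skip instead of a warning silences the per-pod log line:

```go
package evictor

// PodDeleteStatus is one filter's verdict on one pod: whether the drainer
// should delete it, and what (if anything) to report.
// NOTE: the fields below are assumptions for illustration.
type PodDeleteStatus struct {
	Delete bool   // include this pod in the drain's delete/evict set
	Reason string // message for the drainer to log
	Fatal  bool   // abort the drain rather than just warn
}

// makePodDeleteStatusSkip skips the pod silently: no deletion, no log output.
// Returning this for DaemonSet-managed pods is what removes the noisy
// "ignoring DaemonSet-managed Pods" warning discussed in the review thread.
func makePodDeleteStatusSkip() PodDeleteStatus {
	return PodDeleteStatus{}
}

// makePodDeleteStatusWithWarning records a verdict plus a message that the
// drainer logs once per matching pod per drain pass.
func makePodDeleteStatusWithWarning(del bool, reason string) PodDeleteStatus {
	return PodDeleteStatus{Delete: del, Reason: reason}
}
```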