Skip to content

Commit

Permalink
fix: delayed MHC replacement of unreachable nodes
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Shen <[email protected]>
  • Loading branch information
2 people authored and k8s-infra-cherrypick-robot committed Jun 17, 2024
1 parent 240ea95 commit dfc884a
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -671,8 +671,18 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
}

if noderefutil.IsNodeUnreachable(node) {
- // When the node is unreachable and some pods are not evicted for as long as this timeout, we ignore them.
- drainer.SkipWaitForDeleteTimeoutSeconds = 60 * 5 // 5 minutes
+ // Kubelet is unreachable, pods will never disappear.
+
+ // SkipWaitForDeleteTimeoutSeconds ensures the drain completes
+ // even if pod objects are not deleted.
+ drainer.SkipWaitForDeleteTimeoutSeconds = 1
+
+ // kube-apiserver sets the `deletionTimestamp` to a future date computed using the grace period.
+ // We are effectively waiting for GracePeriodSeconds + SkipWaitForDeleteTimeoutSeconds.
+ // Override the grace period of pods to reduce the time needed to skip them.
+ drainer.GracePeriodSeconds = 1
+
+ log.V(5).Info("Node is unreachable, draining will ignore gracePeriod. PDBs are still honored.")
}

if err := kubedrain.RunCordonOrUncordon(drainer, node, true); err != nil {
Expand Down

0 comments on commit dfc884a

Please sign in to comment.