chore: remove machine taint code (aws#762)
njtran authored Nov 6, 2023
1 parent cd7b46d commit c3db2ec
Showing 2 changed files with 24 additions and 53 deletions.
63 changes: 17 additions & 46 deletions pkg/controllers/disruption/controller.go
@@ -127,7 +127,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
 	// Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process
 	// while it progresses in memory. If Karpenter restarts during a disruption action, some nodes can be left tainted.
 	// Idempotently remove this taint from candidates before continuing.
-	if err := c.requireNodeClaimNoScheduleTaint(ctx, false, c.cluster.Nodes()...); err != nil {
+	if err := c.requireNoScheduleTaint(ctx, false, c.cluster.Nodes()...); err != nil {
 		return reconcile.Result{}, fmt.Errorf("removing taint from nodes, %w", err)
 	}
 
@@ -230,14 +230,14 @@ func (c *Controller) launchReplacementNodeClaims(ctx context.Context, m Method,
 	stateNodes := lo.Map(cmd.candidates, func(c *Candidate, _ int) *state.StateNode { return c.StateNode })
 
 	// taint the candidate nodes before we launch the replacements to prevent new pods from scheduling to the candidate nodes
-	if err := c.requireNoScheduleTaints(ctx, true, stateNodes...); err != nil {
+	if err := c.requireNoScheduleTaint(ctx, true, stateNodes...); err != nil {
 		return fmt.Errorf("cordoning nodes, %w", err)
 	}
 
 	nodeClaimKeys, err := c.provisioner.CreateNodeClaims(ctx, cmd.replacements, provisioning.WithReason(reason))
 	if err != nil {
 		// untaint the nodes as the launch may fail (e.g. ICE)
-		err = multierr.Append(err, c.requireNoScheduleTaints(ctx, false, stateNodes...))
+		err = multierr.Append(err, c.requireNoScheduleTaint(ctx, false, stateNodes...))
 		return err
 	}
 	if len(nodeClaimKeys) != len(cmd.replacements) {
@@ -264,7 +264,7 @@ func (c *Controller) launchReplacementNodeClaims(ctx context.Context, m Method,
 	})
 	if err = multierr.Combine(errs...); err != nil {
 		c.cluster.UnmarkForDeletion(candidateProviderIDs...)
-		return multierr.Combine(c.requireNoScheduleTaints(ctx, false, stateNodes...),
+		return multierr.Combine(c.requireNoScheduleTaint(ctx, false, stateNodes...),
 			fmt.Errorf("timed out checking node readiness, %w", err))
 	}
 	return nil
@@ -320,22 +320,16 @@ func (c *Controller) waitForDeletion(ctx context.Context, nodeClaim *v1beta1.Nod
 	}
 }
 
-// TODO remove this function when v1alpha5 APIs are no longer supported.
-// requireNoScheduleTaints will add NoSchedule Taints for Machines and NodeClaims.
-func (c *Controller) requireNoScheduleTaints(ctx context.Context, addTaint bool, nodes ...*state.StateNode) error {
-	nodeClaimErrs := c.requireNodeClaimNoScheduleTaint(ctx, addTaint, nodes...)
-	machineErrs := c.requireMachineUnschedulable(ctx, addTaint, nodes...)
-	return multierr.Combine(nodeClaimErrs, machineErrs)
-}
-
-// requireNodeClaimNoScheduleTaint will add/remove the karpenter.sh/disruption taint from the candidates.
+// requireNoScheduleTaint will add/remove the karpenter.sh/disruption:NoSchedule taint from the candidates.
 // This is used to enforce no taints at the beginning of disruption, and
 // to add/remove taints while executing a disruption action.
 // nolint:gocyclo
-func (c *Controller) requireNodeClaimNoScheduleTaint(ctx context.Context, addTaint bool, nodes ...*state.StateNode) error {
+func (c *Controller) requireNoScheduleTaint(ctx context.Context, addTaint bool, nodes ...*state.StateNode) error {
 	var multiErr error
 	for _, n := range nodes {
-		if n.Node == nil || (n.NodeClaim != nil && n.NodeClaim.IsMachine) {
+		// If the StateNode is Karpenter owned and only has a nodeclaim, or is not owned by
+		// Karpenter, thus having no nodeclaim, don't touch the node.
+		if n.Node == nil || n.NodeClaim == nil {
 			continue
 		}
 		node := &v1.Node{}
@@ -346,18 +340,24 @@ func (c *Controller) requireNodeClaimNoScheduleTaint(ctx context.Context, addTai
 		_, hasTaint := lo.Find(node.Spec.Taints, func(taint v1.Taint) bool {
 			return v1beta1.IsDisruptingTaint(taint)
 		})
-		// node is being deleted, so no need to remove taint as the node will be gone soon
+		// Node is being deleted, so no need to remove taint as the node will be gone soon.
+		// This ensures that the disruption controller doesn't modify taints that the Termination
+		// controller is also modifying
 		if hasTaint && !node.DeletionTimestamp.IsZero() {
 			continue
 		}
 		stored := node.DeepCopy()
 		// If the taint is present and we want to remove the taint, remove it.
 		if !addTaint {
 			node.Spec.Taints = lo.Reject(node.Spec.Taints, func(taint v1.Taint, _ int) bool {
-				return v1beta1.IsDisruptingTaint(taint)
+				return taint.Key == v1beta1.DisruptionTaintKey
 			})
 			// otherwise, add it.
 		} else if addTaint && !hasTaint {
+			// If the taint key is present (but with a different value or effect), remove it.
+			node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool {
+				return t.Key == v1beta1.DisruptionTaintKey
+			})
 			node.Spec.Taints = append(node.Spec.Taints, v1beta1.DisruptionNoScheduleTaint)
 		}
 		if !equality.Semantic.DeepEqual(stored, node) {
@@ -369,35 +369,6 @@ func (c *Controller) requireNodeClaimNoScheduleTai
 	return multiErr
 }
 
-// TODO remove this function when removing v1alpha5 APIs.
-// requireMachineUnschedulable will add/remove the node.kubernetes.io/unschedulable taint from the candidates.
-func (c *Controller) requireMachineUnschedulable(ctx context.Context, isUnschedulable bool, nodes ...*state.StateNode) error {
-	var multiErr error
-	for _, n := range nodes {
-		if n.Node == nil || (n.NodeClaim != nil && !n.NodeClaim.IsMachine) {
-			continue
-		}
-		node := &v1.Node{}
-		if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: n.Node.Name}, node); client.IgnoreNotFound(err) != nil {
-			multiErr = multierr.Append(multiErr, fmt.Errorf("getting node, %w", err))
-		}
-		// If the node already has the taint, continue to the next
-		unschedulable := node.Spec.Unschedulable
-		// node is being deleted, so no need to remove taint as the node will be gone soon
-		if unschedulable && !node.DeletionTimestamp.IsZero() {
-			continue
-		}
-		stored := node.DeepCopy()
-		node.Spec.Unschedulable = isUnschedulable
-		if !equality.Semantic.DeepEqual(stored, node) {
-			if err := c.kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
-				multiErr = multierr.Append(multiErr, fmt.Errorf("patching node %s, %w", node.Name, err))
-			}
-		}
-	}
-	return multiErr
-}
-
 func (c *Controller) recordRun(s string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
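
For reference, here is a minimal standalone sketch of the idempotent taint pattern the consolidated requireNoScheduleTaint applies: reject any taint carrying the karpenter.sh/disruption key (whatever its value or effect), optionally append the canonical taint, and patch only when the node actually changed. This is not the repository's code; the package name, the ensureTaint helper, and the inlined taint variable are illustrative assumptions, with field values taken from the karpenter.sh/disruption=disrupting:NoSchedule taint referenced in the diff's comments.

package taintsketch

import (
	"context"
	"fmt"

	"github.com/samber/lo"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Illustrative stand-in for v1beta1.DisruptionNoScheduleTaint.
var disruptionNoScheduleTaint = v1.Taint{
	Key:    "karpenter.sh/disruption",
	Value:  "disrupting",
	Effect: v1.TaintEffectNoSchedule,
}

// ensureTaint (hypothetical name) adds or removes the disruption taint idempotently.
func ensureTaint(ctx context.Context, kubeClient client.Client, node *v1.Node, addTaint bool) error {
	stored := node.DeepCopy()
	// Reject on the key alone so a stale taint with a different value or
	// effect is replaced rather than duplicated.
	node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool {
		return t.Key == disruptionNoScheduleTaint.Key
	})
	if addTaint {
		node.Spec.Taints = append(node.Spec.Taints, disruptionNoScheduleTaint)
	}
	// Patch only on an actual change, so repeated calls are no-ops.
	if !equality.Semantic.DeepEqual(stored, node) {
		if err := kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
			return fmt.Errorf("patching node %s, %w", node.Name, err)
		}
	}
	return nil
}

Rejecting by key before appending is what makes repeated calls safe: a half-applied action from a previous reconcile leaves at most one taint under the key.
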
14 changes: 7 additions & 7 deletions pkg/controllers/node/termination/terminator/terminator.go
@@ -45,16 +45,16 @@ func NewTerminator(clk clock.Clock, kubeClient client.Client, eq *Queue) *Termin
 	}
 }
 
-// Taint adds the karpenter.sh/disruption taint to a node with a NodeClaim
-// If the node is linked to a machine, it will use the node.kubernetes.io/unschedulable taint.
+// Taint idempotently adds the karpenter.sh/disruption taint to a node with a NodeClaim
 func (t *Terminator) Taint(ctx context.Context, node *v1.Node) error {
 	stored := node.DeepCopy()
 
-	if _, isMachine := node.Labels[v1alpha5.ProvisionerNameLabelKey]; isMachine {
-		node.Spec.Unschedulable = true
-	} else {
 	// If the taint already has the karpenter.sh/disruption=disrupting:NoSchedule taint, do nothing.
 	if _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
 		return v1beta1.IsDisruptingTaint(t)
 	}); !ok {
 		// If the taint key exists (but with a different value or effect), remove it.
 		node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool {
-			return v1beta1.IsDisruptingTaint(t)
+			return t.Key == v1beta1.DisruptionTaintKey
 		})
 		node.Spec.Taints = append(node.Spec.Taints, v1beta1.DisruptionNoScheduleTaint)
 	}
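
The predicate swap in the Reject call above is the subtle part of this hunk. Below is a sketch of the distinction, under the assumption, implied by the diff's comments, that v1beta1.IsDisruptingTaint matches more than the key and so would miss a taint whose value or effect has drifted; the names and field values here are illustrative, not the repository's code.

package taintsketch

import v1 "k8s.io/api/core/v1"

// Stand-in for v1beta1.DisruptionNoScheduleTaint, per the
// karpenter.sh/disruption=disrupting:NoSchedule comment in the diff.
var disruptionNoScheduleTaint = v1.Taint{
	Key:    "karpenter.sh/disruption",
	Value:  "disrupting",
	Effect: v1.TaintEffectNoSchedule,
}

// isDisruptingTaint models v1beta1.IsDisruptingTaint as a key+effect match;
// the exact predicate may differ, but the diff implies it is stricter than
// matching the key alone.
func isDisruptingTaint(t v1.Taint) bool {
	return t.Key == disruptionNoScheduleTaint.Key && t.Effect == disruptionNoScheduleTaint.Effect
}

// Rejecting with isDisruptingTaint would keep a stale taint such as
// karpenter.sh/disruption=foo:NoExecute, and the append would then create a
// second taint under the same key. Rejecting on the key alone removes every
// variant first, so Taint stays idempotent.
func normalizeTaints(taints []v1.Taint) []v1.Taint {
	out := make([]v1.Taint, 0, len(taints)+1)
	for _, t := range taints {
		if t.Key != disruptionNoScheduleTaint.Key {
			out = append(out, t)
		}
	}
	return append(out, disruptionNoScheduleTaint)
}
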
