diff --git a/pkg/controllers/disruption/controller.go b/pkg/controllers/disruption/controller.go index 7d21cd85f1..3bb5a0ae37 100644 --- a/pkg/controllers/disruption/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/utils/pretty" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" @@ -126,8 +127,8 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { return reconcile.Result{RequeueAfter: time.Second}, nil } - // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process - // while it progresses in memory. If Karpenter restarts during a disruption action, some nodes can be left tainted. + // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process while it progresses in memory. + // If Karpenter restarts or fails with an error during a disruption action, some nodes can be left tainted. // Idempotently remove this taint from candidates that are not in the orchestration queue before continuing. if err := state.RequireNoScheduleTaint(ctx, c.kubeClient, false, lo.Filter(c.cluster.Nodes(), func(s *state.StateNode, _ int) bool { return !c.queue.HasAny(s.ProviderID()) @@ -135,7 +136,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } - return reconcile.Result{}, fmt.Errorf("removing taint from nodes, %w", err) + return reconcile.Result{}, fmt.Errorf("removing taint %s from nodes, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) } // Attempt different disruption methods. We'll only let one method perform an action @@ -207,7 +208,7 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, }) // Cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes if err := state.RequireNoScheduleTaint(ctx, c.kubeClient, true, stateNodes...); err != nil { - return fmt.Errorf("tainting nodes (command-id: %s), %w", commandID, err) + return fmt.Errorf("tainting nodes with %s (command-id: %s), %w", pretty.Taint(v1.DisruptedNoScheduleTaint), commandID, err) } var nodeClaimNames []string diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index a9f67ad22d..def8e42c14 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -38,6 +38,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/utils/pretty" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" @@ -104,7 +105,7 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } - return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("tainting node with %s, %w", v1.DisruptedTaintKey, err)) + return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err)) } if err = c.terminator.Drain(ctx, node, nodeTerminationTime); err != nil { if !terminator.IsNodeDrainError(err) { @@ -237,6 +238,7 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error // We use client.StrategicMergeFrom here since the node object supports it and // a strategic merge patch represents the finalizer list as a keyed "set" so removing // an item from the list doesn't replace the full list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { return client.IgnoreNotFound(fmt.Errorf("removing finalizer, %w", err)) } diff --git a/pkg/controllers/nodeclaim/termination/controller.go b/pkg/controllers/nodeclaim/termination/controller.go index 3aa84c9b7e..0b032c2249 100644 --- a/pkg/controllers/nodeclaim/termination/controller.go +++ b/pkg/controllers/nodeclaim/termination/controller.go @@ -130,6 +130,7 @@ func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (rec // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch // can cause races due to the fact that it fully replaces the list on a change // Here, we are updating the finalizer list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 if err = c.kubeClient.Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil diff --git a/pkg/scheduling/taints.go b/pkg/scheduling/taints.go index 102d9ce627..55018e8f64 100644 --- a/pkg/scheduling/taints.go +++ b/pkg/scheduling/taints.go @@ -23,6 +23,7 @@ import ( "go.uber.org/multierr" corev1 "k8s.io/api/core/v1" cloudproviderapi "k8s.io/cloud-provider/api" + "sigs.k8s.io/karpenter/pkg/utils/pretty" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" ) @@ -49,7 +50,7 @@ func (ts Taints) Tolerates(pod *corev1.Pod) (errs error) { tolerates = tolerates || t.ToleratesTaint(&taint) } if !tolerates { - errs = multierr.Append(errs, fmt.Errorf("did not tolerate %s=%s:%s", taint.Key, taint.Value, taint.Effect)) + errs = multierr.Append(errs, fmt.Errorf("did not tolerate %s", pretty.Taint(ts[i]))) } } return errs diff --git a/pkg/utils/pretty/pretty.go b/pkg/utils/pretty/pretty.go index d419a71fe9..b67d40bc38 100644 --- a/pkg/utils/pretty/pretty.go +++ b/pkg/utils/pretty/pretty.go @@ -24,6 +24,7 @@ import ( "golang.org/x/exp/constraints" "golang.org/x/exp/slices" + v1 "k8s.io/api/core/v1" ) func Concise(o interface{}) string { @@ -77,3 +78,7 @@ func Map[K constraints.Ordered, V any](values map[K]V, maxItems int) string { } return buf.String() } + +func Taint(t v1.Taint) string { + return fmt.Sprintf("%s=%s:%s", t.Key, t.Value, t.Effect) +}