From e4ea0d4f927de539e602f7e3c813010b4755279a Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 17 Sep 2024 10:50:43 -0700 Subject: [PATCH] PR comments --- kwok/charts/crds/karpenter.sh_nodepools.yaml | 2 ++ pkg/apis/crds/karpenter.sh_nodepools.yaml | 2 ++ pkg/controllers/disruption/controller.go | 10 ++++++---- pkg/controllers/node/termination/controller.go | 5 ++++- pkg/controllers/nodeclaim/termination/controller.go | 1 + pkg/scheduling/taints.go | 4 +++- pkg/utils/pretty/pretty.go | 8 ++++++++ 7 files changed, 26 insertions(+), 6 deletions(-) diff --git a/kwok/charts/crds/karpenter.sh_nodepools.yaml b/kwok/charts/crds/karpenter.sh_nodepools.yaml index f22bbccb52..434c596165 100644 --- a/kwok/charts/crds/karpenter.sh_nodepools.yaml +++ b/kwok/charts/crds/karpenter.sh_nodepools.yaml @@ -71,6 +71,8 @@ spec: from a combination of nodepool and pod scheduling constraints. properties: disruption: + default: + consolidateAfter: 0s description: Disruption contains the parameters that relate to Karpenter's disruption logic properties: budgets: diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 2f85035059..5cf036d643 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -71,6 +71,8 @@ spec: from a combination of nodepool and pod scheduling constraints. properties: disruption: + default: + consolidateAfter: 0s description: Disruption contains the parameters that relate to Karpenter's disruption logic properties: budgets: diff --git a/pkg/controllers/disruption/controller.go b/pkg/controllers/disruption/controller.go index 7d21cd85f1..b809ccad73 100644 --- a/pkg/controllers/disruption/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -36,6 +36,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/utils/pretty" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/controllers/disruption/orchestration" @@ -126,8 +128,8 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { return reconcile.Result{RequeueAfter: time.Second}, nil } - // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process - // while it progresses in memory. If Karpenter restarts during a disruption action, some nodes can be left tainted. + // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process while it progresses in memory. + // If Karpenter restarts or fails with an error during a disruption action, some nodes can be left tainted. // Idempotently remove this taint from candidates that are not in the orchestration queue before continuing. if err := state.RequireNoScheduleTaint(ctx, c.kubeClient, false, lo.Filter(c.cluster.Nodes(), func(s *state.StateNode, _ int) bool { return !c.queue.HasAny(s.ProviderID()) @@ -135,7 +137,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } - return reconcile.Result{}, fmt.Errorf("removing taint from nodes, %w", err) + return reconcile.Result{}, fmt.Errorf("removing taint %s from nodes, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) } // Attempt different disruption methods. We'll only let one method perform an action @@ -207,7 +209,7 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, }) // Cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes if err := state.RequireNoScheduleTaint(ctx, c.kubeClient, true, stateNodes...); err != nil { - return fmt.Errorf("tainting nodes (command-id: %s), %w", commandID, err) + return fmt.Errorf("tainting nodes with %s (command-id: %s), %w", pretty.Taint(v1.DisruptedNoScheduleTaint), commandID, err) } var nodeClaimNames []string diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index a9f67ad22d..b8e49a3931 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -39,6 +39,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/utils/pretty" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" @@ -104,7 +106,7 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } - return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("tainting node with %s, %w", v1.DisruptedTaintKey, err)) + return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err)) } if err = c.terminator.Drain(ctx, node, nodeTerminationTime); err != nil { if !terminator.IsNodeDrainError(err) { @@ -237,6 +239,7 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error // We use client.StrategicMergeFrom here since the node object supports it and // a strategic merge patch represents the finalizer list as a keyed "set" so removing // an item from the list doesn't replace the full list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { return client.IgnoreNotFound(fmt.Errorf("removing finalizer, %w", err)) } diff --git a/pkg/controllers/nodeclaim/termination/controller.go b/pkg/controllers/nodeclaim/termination/controller.go index 3aa84c9b7e..0b032c2249 100644 --- a/pkg/controllers/nodeclaim/termination/controller.go +++ b/pkg/controllers/nodeclaim/termination/controller.go @@ -130,6 +130,7 @@ func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (rec // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch // can cause races due to the fact that it fully replaces the list on a change // Here, we are updating the finalizer list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 if err = c.kubeClient.Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil diff --git a/pkg/scheduling/taints.go b/pkg/scheduling/taints.go index 102d9ce627..b6bd0d6ab9 100644 --- a/pkg/scheduling/taints.go +++ b/pkg/scheduling/taints.go @@ -24,6 +24,8 @@ import ( corev1 "k8s.io/api/core/v1" cloudproviderapi "k8s.io/cloud-provider/api" + "sigs.k8s.io/karpenter/pkg/utils/pretty" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" ) @@ -49,7 +51,7 @@ func (ts Taints) Tolerates(pod *corev1.Pod) (errs error) { tolerates = tolerates || t.ToleratesTaint(&taint) } if !tolerates { - errs = multierr.Append(errs, fmt.Errorf("did not tolerate %s=%s:%s", taint.Key, taint.Value, taint.Effect)) + errs = multierr.Append(errs, fmt.Errorf("did not tolerate %s", pretty.Taint(taint))) } } return errs diff --git a/pkg/utils/pretty/pretty.go b/pkg/utils/pretty/pretty.go index d419a71fe9..fbc7b3bf94 100644 --- a/pkg/utils/pretty/pretty.go +++ b/pkg/utils/pretty/pretty.go @@ -24,6 +24,7 @@ import ( "golang.org/x/exp/constraints" "golang.org/x/exp/slices" + v1 "k8s.io/api/core/v1" ) func Concise(o interface{}) string { @@ -77,3 +78,10 @@ func Map[K constraints.Ordered, V any](values map[K]V, maxItems int) string { } return buf.String() } + +func Taint(t v1.Taint) string { + if t.Value == "" { + return fmt.Sprintf("%s:%s", t.Key, t.Effect) + } + return fmt.Sprintf("%s=%s:%s", t.Key, t.Value, t.Effect) +}