diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 9ac8f57234a1..b6e0cee8d443 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -20,7 +20,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/cloudprovider" - "github.com/aws/karpenter-core/pkg/controllers/deprovisioning" + "github.com/aws/karpenter-core/pkg/controllers/disruption" "github.com/aws/karpenter-core/pkg/controllers/leasegarbagecollection" metricsnode "github.com/aws/karpenter-core/pkg/controllers/metrics/node" metricspod "github.com/aws/karpenter-core/pkg/controllers/metrics/pod" @@ -55,7 +55,7 @@ func NewControllers( return []controller.Controller{ p, evictionQueue, - deprovisioning.NewController(clock, kubeClient, p, cloudProvider, recorder, cluster), + disruption.NewController(clock, kubeClient, p, cloudProvider, recorder, cluster), provisioning.NewController(kubeClient, p, recorder), nodepoolhash.NewProvisionerController(kubeClient), informer.NewDaemonSetController(kubeClient, cluster), diff --git a/pkg/controllers/deprovisioning/consolidation.go b/pkg/controllers/disruption/consolidation.go similarity index 81% rename from pkg/controllers/deprovisioning/consolidation.go rename to pkg/controllers/disruption/consolidation.go index c67e4b095cf0..94ec1a124ae2 100644 --- a/pkg/controllers/deprovisioning/consolidation.go +++ b/pkg/controllers/disruption/consolidation.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" @@ -29,11 +29,10 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" - "github.com/aws/karpenter-core/pkg/metrics" "github.com/aws/karpenter-core/pkg/scheduling" ) @@ -64,12 +63,7 @@ func makeConsolidation(clock clock.Clock, cluster *state.Cluster, kubeClient cli } } -// string is the string representation of the deprovisioner -func (c *consolidation) String() string { - return metrics.ConsolidationReason -} - -// sortAndFilterCandidates orders deprovisionable candidates by the disruptionCost, removing any that we already know won't +// sortAndFilterCandidates orders candidates by the disruptionCost, removing any that we already know won't // be viable consolidation options. 
func (c *consolidation) sortAndFilterCandidates(ctx context.Context, candidates []*Candidate) ([]*Candidate, error) { candidates, err := filterCandidates(ctx, c.kubeClient, c.recorder, candidates) @@ -93,15 +87,15 @@ func (c *consolidation) markConsolidated() { c.lastConsolidationState = c.cluster.ConsolidationState() } -// ShouldDeprovision is a predicate used to filter deprovisionable candidates -func (c *consolidation) ShouldDeprovision(_ context.Context, cn *Candidate) bool { +// ShouldDisrupt is a predicate used to filter candidates +func (c *consolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool { if cn.Annotations()[v1alpha5.DoNotConsolidateNodeAnnotationKey] == "true" { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(cn.Node, cn.NodeClaim, fmt.Sprintf("%s annotation exists", v1alpha5.DoNotConsolidateNodeAnnotationKey))...) + c.recorder.Publish(disruptionevents.Unconsolidatable(cn.Node, cn.NodeClaim, fmt.Sprintf("%s annotation exists", v1alpha5.DoNotConsolidateNodeAnnotationKey))...) return false } if cn.nodePool.Spec.Disruption.ConsolidationPolicy != v1beta1.ConsolidationPolicyWhenUnderutilized || (cn.nodePool.Spec.Disruption.ConsolidateAfter != nil && cn.nodePool.Spec.Disruption.ConsolidateAfter.Duration == nil) { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(cn.Node, cn.NodeClaim, fmt.Sprintf("%s %q has consolidation disabled", lo.Ternary(cn.nodePool.IsProvisioner, "Provisioner", "NodePool"), cn.nodePool.Name))...) + c.recorder.Publish(disruptionevents.Unconsolidatable(cn.Node, cn.NodeClaim, fmt.Sprintf("%s %q has consolidation disabled", lo.Ternary(cn.nodePool.IsProvisioner, "Provisioner", "NodePool"), cn.nodePool.Name))...) return false } return true @@ -125,7 +119,7 @@ func (c *consolidation) computeConsolidation(ctx context.Context, candidates ... if !results.AllNonPendingPodsScheduled() { // This method is used by multi-node consolidation as well, so we'll only report in the single node case if len(candidates) == 1 { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, results.NonPendingPodSchedulingErrors())...) + c.recorder.Publish(disruptionevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, results.NonPendingPodSchedulingErrors())...) } return Command{}, nil } @@ -140,7 +134,7 @@ func (c *consolidation) computeConsolidation(ctx context.Context, candidates ... // we're not going to turn a single node into multiple candidates if len(results.NewNodeClaims) != 1 { if len(candidates) == 1 { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, fmt.Sprintf("Can't remove without creating %d candidates", len(results.NewNodeClaims)))...) + c.recorder.Publish(disruptionevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, fmt.Sprintf("Can't remove without creating %d candidates", len(results.NewNodeClaims)))...) } return Command{}, nil } @@ -154,7 +148,7 @@ func (c *consolidation) computeConsolidation(ctx context.Context, candidates ... results.NewNodeClaims[0].InstanceTypeOptions = filterByPrice(results.NewNodeClaims[0].InstanceTypeOptions, results.NewNodeClaims[0].Requirements, candidatePrice) if len(results.NewNodeClaims[0].InstanceTypeOptions) == 0 { if len(candidates) == 1 { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, "Can't replace with a cheaper node")...) 
+ c.recorder.Publish(disruptionevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, "Can't replace with a cheaper node")...) } // no instance types remain after filtering by price return Command{}, nil @@ -173,7 +167,7 @@ func (c *consolidation) computeConsolidation(ctx context.Context, candidates ... if allExistingAreSpot && results.NewNodeClaims[0].Requirements.Get(v1beta1.CapacityTypeLabelKey).Has(v1beta1.CapacityTypeSpot) { if len(candidates) == 1 { - c.recorder.Publish(deprovisioningevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, "Can't replace a spot node with a spot node")...) + c.recorder.Publish(disruptionevents.Unconsolidatable(candidates[0].Node, candidates[0].NodeClaim, "Can't replace a spot node with a spot node")...) } return Command{}, nil } diff --git a/pkg/controllers/deprovisioning/controller.go b/pkg/controllers/disruption/controller.go similarity index 77% rename from pkg/controllers/deprovisioning/controller.go rename to pkg/controllers/disruption/controller.go index a5723d78da83..aa3f294cb26d 100644 --- a/pkg/controllers/deprovisioning/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" @@ -35,7 +35,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" @@ -44,20 +44,19 @@ import ( nodeclaimutil "github.com/aws/karpenter-core/pkg/utils/nodeclaim" ) -// Controller is the deprovisioning controller. type Controller struct { - kubeClient client.Client - cluster *state.Cluster - provisioner *provisioning.Provisioner - recorder events.Recorder - clock clock.Clock - cloudProvider cloudprovider.CloudProvider - deprovisioners []Deprovisioner - mu sync.Mutex - lastRun map[string]time.Time + kubeClient client.Client + cluster *state.Cluster + provisioner *provisioning.Provisioner + recorder events.Recorder + clock clock.Clock + cloudProvider cloudprovider.CloudProvider + methods []Method + mu sync.Mutex + lastRun map[string]time.Time } -// pollingPeriod that we inspect cluster to look for opportunities to deprovision +// pollingPeriod that we inspect cluster to look for opportunities to disrupt const pollingPeriod = 10 * time.Second var errCandidateDeleting = fmt.Errorf("candidate is deleting") @@ -86,7 +85,7 @@ func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provi recorder: recorder, cloudProvider: cp, lastRun: map[string]time.Time{}, - deprovisioners: []Deprovisioner{ + methods: []Method{ // Expire any NodeClaims that must be deleted, allowing their pods to potentially land on currently NewExpiration(clk, kubeClient, cluster, provisioner, recorder), // Terminate any NodeClaims that have drifted from provisioning specifications, allowing the pods to reschedule. 
@@ -104,7 +103,7 @@ func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provi } func (c *Controller) Name() string { - return "deprovisioning" + return "disruption" } func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Builder { @@ -115,7 +114,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc // this won't catch if the reconcile loop hangs forever, but it will catch other issues c.logAbnormalRuns(ctx) defer c.logAbnormalRuns(ctx) - c.recordRun("deprovisioning-loop") + c.recordRun("disruption-loop") // We need to ensure that our internal cluster state mechanism is synced before we proceed // with making any scheduling decision off of our state nodes. Otherwise, we have the potential to make @@ -125,76 +124,84 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc return reconcile.Result{RequeueAfter: time.Second}, nil } - // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the deprovisioning process - // while it progresses in memory. If Karpenter restarts during a deprovisioning action, some nodes can be left tainted. + // Karpenter taints nodes with a karpenter.sh/disruption taint as part of the disruption process + // while it progresses in memory. If Karpenter restarts during a disruption action, some nodes can be left tainted. // Idempotently remove this taint from candidates before continuing. if err := c.requireNodeClaimNoScheduleTaint(ctx, false, c.cluster.Nodes()...); err != nil { return reconcile.Result{}, fmt.Errorf("removing taint from nodes, %w", err) } - // Attempt different deprovisioning methods. We'll only let one method perform an action - for _, d := range c.deprovisioners { - c.recordRun(fmt.Sprintf("%T", d)) - success, err := c.deprovision(ctx, d) + // Attempt different disruption methods. 
We'll only let one method perform an action + for _, m := range c.methods { + c.recordRun(fmt.Sprintf("%T", m)) + success, err := c.disrupt(ctx, m) if err != nil { - return reconcile.Result{}, fmt.Errorf("deprovisioning via %q, %w", d, err) + return reconcile.Result{}, fmt.Errorf("disrupting via %q, %w", m.Type(), err) } if success { return reconcile.Result{RequeueAfter: controller.Immediately}, nil } } - // All deprovisioners did nothing, so return nothing to do + // All methods did nothing, so return nothing to do return reconcile.Result{RequeueAfter: pollingPeriod}, nil } -func (c *Controller) deprovision(ctx context.Context, deprovisioner Deprovisioner) (bool, error) { - defer metrics.Measure(deprovisioningDurationHistogram.WithLabelValues(deprovisioner.String()))() - candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.recorder, c.clock, c.cloudProvider, deprovisioner.ShouldDeprovision) +func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, error) { + defer metrics.Measure(deprovisioningDurationHistogram.WithLabelValues(disruption.Type()))() + defer metrics.Measure(disruptionEvaluationDurationHistogram.With(map[string]string{ + methodLabel: disruption.Type(), + consolidationTypeLabel: disruption.ConsolidationType(), + }))() + candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.recorder, c.clock, c.cloudProvider, disruption.ShouldDisrupt) if err != nil { return false, fmt.Errorf("determining candidates, %w", err) } - // If there are no candidates, move to the next deprovisioner + // If there are no candidates, move to the next disruption if len(candidates) == 0 { return false, nil } - // Determine the deprovisioning action - cmd, err := deprovisioner.ComputeCommand(ctx, candidates...) + // Determine the disruption action + cmd, err := disruption.ComputeCommand(ctx, candidates...) if err != nil { - return false, fmt.Errorf("computing deprovisioning decision, %w", err) + return false, fmt.Errorf("computing disruption decision, %w", err) } if cmd.Action() == NoOpAction { return false, nil } - // Attempt to deprovision - if err := c.executeCommand(ctx, deprovisioner, cmd); err != nil { - return false, fmt.Errorf("deprovisioning candidates, %w", err) + // Attempt to disrupt + if err := c.executeCommand(ctx, disruption, cmd); err != nil { + return false, fmt.Errorf("disrupting candidates, %w", err) } return true, nil } -func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, command Command) error { +func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command) error { deprovisioningActionsPerformedCounter.With(map[string]string{ - // TODO: make this just command.Action() since we've added the deprovisioner as its own label. - actionLabel: fmt.Sprintf("%s/%s", d, command.Action()), - deprovisionerLabel: d.String(), + actionLabel: fmt.Sprintf("%s/%s", m, cmd.Action()), + deprovisionerLabel: m.Type(), }).Inc() - logging.FromContext(ctx).Infof("deprovisioning via %s %s", d, command) + disruptionActionsPerformedCounter.With(map[string]string{ + actionLabel: string(cmd.Action()), + methodLabel: m.Type(), + consolidationTypeLabel: m.ConsolidationType(), + }).Inc() + logging.FromContext(ctx).Infof("disrupting via %s %s", m, cmd) - reason := fmt.Sprintf("%s/%s", d, command.Action()) - if command.Action() == ReplaceAction { - if err := c.launchReplacementNodeClaims(ctx, command, reason); err != nil { - // If we failed to launch the replacement, don't deprovision. 
If this is some permanent failure, + reason := fmt.Sprintf("%s/%s", m.Type(), cmd.Action()) + if cmd.Action() == ReplaceAction { + if err := c.launchReplacementNodeClaims(ctx, m, cmd); err != nil { + // If we failed to launch the replacement, don't disrupt. If this is some permanent failure, // we don't want to disrupt workloads with no way to provision new NodeClaims for them. return fmt.Errorf("launching replacement, %w", err) } } - for _, candidate := range command.candidates { - c.recorder.Publish(deprovisioningevents.Terminating(candidate.Node, candidate.NodeClaim, reason)...) + for _, candidate := range cmd.candidates { + c.recorder.Publish(disruptionevents.Terminating(candidate.Node, candidate.NodeClaim, reason)...) if err := nodeclaimutil.Delete(ctx, c.kubeClient, candidate.NodeClaim); err != nil { if !errors.IsNotFound(err) { @@ -205,9 +212,9 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman nodeclaimutil.TerminatedCounter(candidate.NodeClaim, reason).Inc() } - // We wait for NodeClaims to delete to ensure we don't start another round of deprovisioning until this node is fully - // deleted. - for _, oldCandidate := range command.candidates { + // We wait for NodeClaims to delete to ensure we don't start another round of disruption + // until this node is fully deleted. + for _, oldCandidate := range cmd.candidates { c.waitForDeletion(ctx, oldCandidate.NodeClaim) } return nil @@ -215,28 +222,30 @@ func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, comman // launchReplacementNodeClaims launches replacement NodeClaims and blocks until it is ready // nolint:gocyclo -func (c *Controller) launchReplacementNodeClaims(ctx context.Context, action Command, reason string) error { +func (c *Controller) launchReplacementNodeClaims(ctx context.Context, m Method, cmd Command) error { + reason := fmt.Sprintf("%s/%s", m.Type(), cmd.Action()) defer metrics.Measure(deprovisioningReplacementNodeInitializedHistogram)() + defer metrics.Measure(disruptionReplacementNodeClaimInitializedHistogram)() - stateNodes := lo.Map(action.candidates, func(c *Candidate, _ int) *state.StateNode { return c.StateNode }) + stateNodes := lo.Map(cmd.candidates, func(c *Candidate, _ int) *state.StateNode { return c.StateNode }) // taint the candidate nodes before we launch the replacements to prevent new pods from scheduling to the candidate nodes if err := c.requireNoScheduleTaints(ctx, true, stateNodes...); err != nil { return fmt.Errorf("cordoning nodes, %w", err) } - nodeClaimKeys, err := c.provisioner.CreateNodeClaims(ctx, action.replacements, provisioning.WithReason(reason)) + nodeClaimKeys, err := c.provisioner.CreateNodeClaims(ctx, cmd.replacements, provisioning.WithReason(reason)) if err != nil { // untaint the nodes as the launch may fail (e.g. 
ICE) err = multierr.Append(err, c.requireNoScheduleTaints(ctx, false, stateNodes...)) return err } - if len(nodeClaimKeys) != len(action.replacements) { + if len(nodeClaimKeys) != len(cmd.replacements) { // shouldn't ever occur since a partially failed CreateNodeClaims should return an error - return fmt.Errorf("expected %d replacements, got %d", len(action.replacements), len(nodeClaimKeys)) + return fmt.Errorf("expected %d replacements, got %d", len(cmd.replacements), len(nodeClaimKeys)) } - candidateProviderIDs := lo.Map(action.candidates, func(c *Candidate, _ int) string { return c.ProviderID() }) + candidateProviderIDs := lo.Map(cmd.candidates, func(c *Candidate, _ int) string { return c.ProviderID() }) // We have the new NodeClaims created at the API server so mark the old NodeClaims for deletion c.cluster.MarkForDeletion(candidateProviderIDs...) @@ -246,6 +255,10 @@ func (c *Controller) launchReplacementNodeClaims(ctx context.Context, action Com // other transient error if err := c.waitForReadiness(ctx, nodeClaimKeys[i], reason); err != nil { deprovisioningReplacementNodeLaunchFailedCounter.WithLabelValues(reason).Inc() + disruptionReplacementNodeClaimFailedCounter.With(map[string]string{ + methodLabel: m.Type(), + consolidationTypeLabel: m.ConsolidationType(), + }).Inc() errs[i] = err } }) @@ -257,7 +270,7 @@ func (c *Controller) launchReplacementNodeClaims(ctx context.Context, action Com return nil } -// TODO @njtran: Allow to bypass this check for certain deprovisioners +// TODO @njtran: Allow to bypass this check for certain methods func (c *Controller) waitForReadiness(ctx context.Context, key nodeclaimutil.Key, reason string) error { // Wait for the NodeClaim to be initialized var once sync.Once @@ -273,11 +286,11 @@ func (c *Controller) waitForReadiness(ctx context.Context, key nodeclaimutil.Key return fmt.Errorf("getting %s, %w", lo.Ternary(key.IsMachine, "machine", "nodeclaim"), err) } once.Do(func() { - c.recorder.Publish(deprovisioningevents.Launching(nodeClaim, reason)) + c.recorder.Publish(disruptionevents.Launching(nodeClaim, reason)) }) if !nodeClaim.StatusConditions().GetCondition(v1beta1.Initialized).IsTrue() { - // make the user aware of why deprovisioning is paused - c.recorder.Publish(deprovisioningevents.WaitingOnReadiness(nodeClaim)) + // make the user aware of why disruption is paused + c.recorder.Publish(disruptionevents.WaitingOnReadiness(nodeClaim)) return fmt.Errorf("node is not initialized") } return nil @@ -294,8 +307,8 @@ func (c *Controller) waitForDeletion(ctx context.Context, nodeClaim *v1beta1.Nod if errors.IsNotFound(nerr) { return nil } - // make the user aware of why deprovisioning is paused - c.recorder.Publish(deprovisioningevents.WaitingOnDeletion(nc)) + // make the user aware of why disruption is paused + c.recorder.Publish(disruptionevents.WaitingOnDeletion(nc)) if nerr != nil { return fmt.Errorf("expected to be not found, %w", nerr) } @@ -316,8 +329,8 @@ func (c *Controller) requireNoScheduleTaints(ctx context.Context, addTaint bool, } // requireNodeClaimNoScheduleTaint will add/remove the karpenter.sh/disruption taint from the candidates. -// This is used to enforce no taints at the beginning of deprovisioning, and -// to add/remove taints while executing a deprovisioning action. +// This is used to enforce no taints at the beginning of disruption, and +// to add/remove taints while executing a disruption action. 
// nolint:gocyclo func (c *Controller) requireNodeClaimNoScheduleTaint(ctx context.Context, addTaint bool, nodes ...*state.StateNode) error { var multiErr error diff --git a/pkg/controllers/deprovisioning/drift.go b/pkg/controllers/disruption/drift.go similarity index 80% rename from pkg/controllers/deprovisioning/drift.go rename to pkg/controllers/disruption/drift.go index f58f513fd99b..1c585a4dc261 100644 --- a/pkg/controllers/deprovisioning/drift.go +++ b/pkg/controllers/disruption/drift.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" @@ -26,7 +26,7 @@ import ( "github.com/samber/lo" "github.com/aws/karpenter-core/pkg/apis/v1beta1" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" @@ -51,8 +51,8 @@ func NewDrift(kubeClient client.Client, cluster *state.Cluster, provisioner *pro } } -// ShouldDeprovision is a predicate used to filter deprovisionable candidates -func (d *Drift) ShouldDeprovision(ctx context.Context, c *Candidate) bool { +// ShouldDisrupt is a predicate used to filter candidates +func (d *Drift) ShouldDisrupt(ctx context.Context, c *Candidate) bool { return options.FromContext(ctx).FeatureGates.Drift && c.NodeClaim.StatusConditions().GetCondition(v1beta1.Drifted).IsTrue() } @@ -70,15 +70,19 @@ func (d *Drift) filterAndSortCandidates(ctx context.Context, candidates []*Candi return candidates, nil } -// ComputeCommand generates a deprovisioning command given deprovisionable candidates +// ComputeCommand generates a disruption command given candidates func (d *Drift) ComputeCommand(ctx context.Context, candidates ...*Candidate) (Command, error) { candidates, err := d.filterAndSortCandidates(ctx, candidates) if err != nil { return Command{}, err } - deprovisioningEligibleMachinesGauge.WithLabelValues(d.String()).Set(float64(len(candidates))) + deprovisioningEligibleMachinesGauge.WithLabelValues(d.Type()).Set(float64(len(candidates))) + disruptionEligibleNodesGauge.With(map[string]string{ + methodLabel: d.Type(), + consolidationTypeLabel: d.ConsolidationType(), + }).Set(float64(len(candidates))) - // Deprovision all empty drifted candidates, as they require no scheduling simulations. + // Disrupt all empty drifted candidates, as they require no scheduling simulations. if empty := lo.Filter(candidates, func(c *Candidate, _ int) bool { return len(c.pods) == 0 }); len(empty) > 0 { @@ -100,7 +104,7 @@ func (d *Drift) ComputeCommand(ctx context.Context, candidates ...*Candidate) (C // Log when all pods can't schedule, as the command will get executed immediately. if !results.AllNonPendingPodsScheduled() { logging.FromContext(ctx).With(lo.Ternary(candidate.NodeClaim.IsMachine, "machine", "nodeclaim"), candidate.NodeClaim.Name, "node", candidate.Node.Name).Debugf("cannot terminate since scheduling simulation failed to schedule all pods %s", results.NonPendingPodSchedulingErrors()) - d.recorder.Publish(deprovisioningevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...) 
+ d.recorder.Publish(disruptionevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...) continue } if len(results.NewNodeClaims) == 0 { @@ -116,7 +120,10 @@ func (d *Drift) ComputeCommand(ctx context.Context, candidates ...*Candidate) (C return Command{}, nil } -// String is the string representation of the deprovisioner -func (d *Drift) String() string { +func (d *Drift) Type() string { return metrics.DriftReason } + +func (d *Drift) ConsolidationType() string { + return "" +} diff --git a/pkg/controllers/deprovisioning/emptiness.go b/pkg/controllers/disruption/emptiness.go similarity index 75% rename from pkg/controllers/deprovisioning/emptiness.go rename to pkg/controllers/disruption/emptiness.go index b1bc63863b66..3a8c8bb68a45 100644 --- a/pkg/controllers/deprovisioning/emptiness.go +++ b/pkg/controllers/disruption/emptiness.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" @@ -36,8 +36,8 @@ func NewEmptiness(clk clock.Clock) *Emptiness { } } -// ShouldDeprovision is a predicate used to filter deprovisionable candidates -func (e *Emptiness) ShouldDeprovision(_ context.Context, c *Candidate) bool { +// ShouldDisrupt is a predicate used to filter candidates +func (e *Emptiness) ShouldDisrupt(_ context.Context, c *Candidate) bool { return c.nodePool.Spec.Disruption.ConsolidateAfter != nil && c.nodePool.Spec.Disruption.ConsolidateAfter.Duration != nil && c.nodePool.Spec.Disruption.ConsolidationPolicy == v1beta1.ConsolidationPolicyWhenEmpty && @@ -45,19 +45,26 @@ func (e *Emptiness) ShouldDeprovision(_ context.Context, c *Candidate) bool { !e.clock.Now().Before(c.NodeClaim.StatusConditions().GetCondition(v1beta1.Empty).LastTransitionTime.Inner.Add(*c.nodePool.Spec.Disruption.ConsolidateAfter.Duration)) } -// ComputeCommand generates a deprovisioning command given deprovisionable candidates +// ComputeCommand generates a disruption command given candidates func (e *Emptiness) ComputeCommand(_ context.Context, candidates ...*Candidate) (Command, error) { emptyCandidates := lo.Filter(candidates, func(cn *Candidate, _ int) bool { return cn.NodeClaim.DeletionTimestamp.IsZero() && len(cn.pods) == 0 }) - deprovisioningEligibleMachinesGauge.WithLabelValues(e.String()).Set(float64(len(candidates))) + deprovisioningEligibleMachinesGauge.WithLabelValues(e.Type()).Set(float64(len(candidates))) + disruptionEligibleNodesGauge.With(map[string]string{ + methodLabel: e.Type(), + consolidationTypeLabel: e.ConsolidationType(), + }).Set(float64(len(candidates))) return Command{ candidates: emptyCandidates, }, nil } -// string is the string representation of the deprovisioner -func (e *Emptiness) String() string { +func (e *Emptiness) Type() string { return metrics.EmptinessReason } + +func (e *Emptiness) ConsolidationType() string { + return "" +} diff --git a/pkg/controllers/deprovisioning/emptynodeconsolidation.go b/pkg/controllers/disruption/emptynodeconsolidation.go similarity index 84% rename from pkg/controllers/deprovisioning/emptynodeconsolidation.go rename to pkg/controllers/disruption/emptynodeconsolidation.go index 16675bcdf2dd..a80cfe2f105b 100644 --- a/pkg/controllers/deprovisioning/emptynodeconsolidation.go +++ b/pkg/controllers/disruption/emptynodeconsolidation.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning +package disruption import ( "context" @@ -28,6 +28,7 @@ import ( "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" + "github.com/aws/karpenter-core/pkg/metrics" ) // EmptyNodeConsolidation is the consolidation controller that performs multi-nodeclaim consolidation of entirely empty nodes @@ -40,7 +41,7 @@ func NewEmptyNodeConsolidation(clk clock.Clock, cluster *state.Cluster, kubeClie return &EmptyNodeConsolidation{consolidation: makeConsolidation(clk, cluster, kubeClient, provisioner, cp, recorder)} } -// ComputeCommand generates a deprovisioning command given deprovisionable NodeClaims +// ComputeCommand generates a disruption command given candidates func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, candidates ...*Candidate) (Command, error) { if c.isConsolidated() { return Command{}, nil @@ -49,7 +50,11 @@ func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, candidates if err != nil { return Command{}, fmt.Errorf("sorting candidates, %w", err) } - deprovisioningEligibleMachinesGauge.WithLabelValues(c.String()).Set(float64(len(candidates))) + deprovisioningEligibleMachinesGauge.WithLabelValues(c.Type()).Set(float64(len(candidates))) + disruptionEligibleNodesGauge.With(map[string]string{ + methodLabel: c.Type(), + consolidationTypeLabel: c.ConsolidationType(), + }).Set(float64(len(candidates))) // select the entirely empty NodeClaims emptyCandidates := lo.Filter(candidates, func(n *Candidate, _ int) bool { return len(n.pods) == 0 }) @@ -71,7 +76,7 @@ func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, candidates return Command{}, errors.New("interrupted") case <-c.clock.After(consolidationTTL): } - validationCandidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.recorder, c.clock, c.cloudProvider, c.ShouldDeprovision) + validationCandidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.recorder, c.clock, c.cloudProvider, c.ShouldDisrupt) if err != nil { logging.FromContext(ctx).Errorf("computing validation candidates %s", err) return Command{}, err @@ -91,3 +96,11 @@ func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, candidates } return cmd, nil } + +func (c *EmptyNodeConsolidation) Type() string { + return metrics.ConsolidationReason +} + +func (c *EmptyNodeConsolidation) ConsolidationType() string { + return "empty" +} diff --git a/pkg/controllers/deprovisioning/events/events.go b/pkg/controllers/disruption/events/events.go similarity index 64% rename from pkg/controllers/deprovisioning/events/events.go rename to pkg/controllers/disruption/events/events.go index 72ee8395aa08..bc5e01ccdcfb 100644 --- a/pkg/controllers/deprovisioning/events/events.go +++ b/pkg/controllers/disruption/events/events.go @@ -41,7 +41,7 @@ func Launching(nodeClaim *v1beta1.NodeClaim, reason string) events.Event { return events.Event{ InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, - Reason: "DeprovisioningLaunching", + Reason: "DisruptionLaunching", Message: fmt.Sprintf("Launching NodeClaim: %s", cases.Title(language.Und, cases.NoLower).String(reason)), DedupeValues: []string{string(nodeClaim.UID), reason}, } @@ -61,8 +61,8 @@ func WaitingOnReadiness(nodeClaim *v1beta1.NodeClaim) events.Event { return events.Event{ InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, - Reason: "DeprovisioningWaitingReadiness", - Message: "Waiting on readiness to continue 
deprovisioning", + Reason: "DisruptionWaitingReadiness", + Message: "Waiting on readiness to continue disruption", DedupeValues: []string{string(nodeClaim.UID)}, } } @@ -81,47 +81,75 @@ func WaitingOnDeletion(nodeClaim *v1beta1.NodeClaim) events.Event { return events.Event{ InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, - Reason: "DeprovisioningWaitingDeletion", - Message: "Waiting on deletion to continue deprovisioning", + Reason: "DisruptionWaitingDeletion", + Message: "Waiting on deletion to continue disruption", DedupeValues: []string{string(nodeClaim.UID)}, } } func Terminating(node *v1.Node, nodeClaim *v1beta1.NodeClaim, reason string) []events.Event { - evts := []events.Event{ + if nodeClaim.IsMachine { + machine := machineutil.NewFromNodeClaim(nodeClaim) + return []events.Event{ + { + InvolvedObject: node, + Type: v1.EventTypeNormal, + Reason: "DeprovisioningTerminating", + Message: fmt.Sprintf("Deprovisioning Node: %s", cases.Title(language.Und, cases.NoLower).String(reason)), + DedupeValues: []string{string(node.UID), reason}, + }, + { + InvolvedObject: machine, + Type: v1.EventTypeNormal, + Reason: "DeprovisioningTerminating", + Message: fmt.Sprintf("Deprovisioning Machine: %s", cases.Title(language.Und, cases.NoLower).String(reason)), + DedupeValues: []string{string(machine.UID), reason}, + }, + } + } + return []events.Event{ { InvolvedObject: node, Type: v1.EventTypeNormal, - Reason: "DeprovisioningTerminating", - Message: fmt.Sprintf("Deprovisioning Node: %s", cases.Title(language.Und, cases.NoLower).String(reason)), + Reason: "DisruptionTerminating", + Message: fmt.Sprintf("Disrupting Node: %s", cases.Title(language.Und, cases.NoLower).String(reason)), DedupeValues: []string{string(node.UID), reason}, }, - } - if nodeClaim.IsMachine { - machine := machineutil.NewFromNodeClaim(nodeClaim) - evts = append(evts, events.Event{ - InvolvedObject: machine, - Type: v1.EventTypeNormal, - Reason: "DeprovisioningTerminating", - Message: fmt.Sprintf("Deprovisioning Machine: %s", cases.Title(language.Und, cases.NoLower).String(reason)), - DedupeValues: []string{string(machine.UID), reason}, - }) - } else { - evts = append(evts, events.Event{ + { InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, - Reason: "DeprovisioningTerminating", - Message: fmt.Sprintf("Deprovisioning NodeClaim: %s", cases.Title(language.Und, cases.NoLower).String(reason)), + Reason: "DisruptionTerminating", + Message: fmt.Sprintf("Disrupting NodeClaim: %s", cases.Title(language.Und, cases.NoLower).String(reason)), DedupeValues: []string{string(nodeClaim.UID), reason}, - }) + }, } - return evts } // Unconsolidatable is an event that informs the user that a Machine/Node combination cannot be consolidated // due to the state of the Machine/Node or due to some state of the pods that are scheduled to the Machine/Node func Unconsolidatable(node *v1.Node, nodeClaim *v1beta1.NodeClaim, reason string) []events.Event { - evts := []events.Event{ + if nodeClaim.IsMachine { + machine := machineutil.NewFromNodeClaim(nodeClaim) + return []events.Event{ + { + InvolvedObject: node, + Type: v1.EventTypeNormal, + Reason: "Unconsolidatable", + Message: reason, + DedupeValues: []string{string(node.UID)}, + DedupeTimeout: time.Minute * 15, + }, + { + InvolvedObject: machine, + Type: v1.EventTypeNormal, + Reason: "Unconsolidatable", + Message: reason, + DedupeValues: []string{string(machine.UID)}, + DedupeTimeout: time.Minute * 15, + }, + } + } + return []events.Event{ { InvolvedObject: node, Type: v1.EventTypeNormal, @@ -130,59 
+158,53 @@ func Unconsolidatable(node *v1.Node, nodeClaim *v1beta1.NodeClaim, reason string DedupeValues: []string{string(node.UID)}, DedupeTimeout: time.Minute * 15, }, - } - if nodeClaim.IsMachine { - machine := machineutil.NewFromNodeClaim(nodeClaim) - evts = append(evts, events.Event{ - InvolvedObject: machine, - Type: v1.EventTypeNormal, - Reason: "Unconsolidatable", - Message: reason, - DedupeValues: []string{string(machine.UID)}, - DedupeTimeout: time.Minute * 15, - }) - } else { - evts = append(evts, events.Event{ + { InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, Reason: "Unconsolidatable", Message: reason, DedupeValues: []string{string(nodeClaim.UID)}, DedupeTimeout: time.Minute * 15, - }) + }, } - return evts } // Blocked is an event that informs the user that a Machine/Node combination is blocked on deprovisioning // due to the state of the Machine/Node or due to some state of the pods that are scheduled to the Machine/Node func Blocked(node *v1.Node, nodeClaim *v1beta1.NodeClaim, reason string) []events.Event { - evts := []events.Event{ + if nodeClaim.IsMachine { + machine := machineutil.NewFromNodeClaim(nodeClaim) + return []events.Event{ + { + InvolvedObject: node, + Type: v1.EventTypeNormal, + Reason: "DeprovisioningBlocked", + Message: fmt.Sprintf("Cannot deprovision Node: %s", reason), + DedupeValues: []string{string(node.UID)}, + }, + { + InvolvedObject: machine, + Type: v1.EventTypeNormal, + Reason: "DeprovisioningBlocked", + Message: fmt.Sprintf("Cannot deprovision Machine: %s", reason), + DedupeValues: []string{string(machine.UID)}, + }, + } + } + return []events.Event{ { InvolvedObject: node, Type: v1.EventTypeNormal, - Reason: "DeprovisioningBlocked", - Message: fmt.Sprintf("Cannot deprovision Node: %s", reason), + Reason: "DisruptionBlocked", + Message: fmt.Sprintf("Cannot disrupt Node: %s", reason), DedupeValues: []string{string(node.UID)}, }, - } - if nodeClaim.IsMachine { - machine := machineutil.NewFromNodeClaim(nodeClaim) - evts = append(evts, events.Event{ - InvolvedObject: machine, - Type: v1.EventTypeNormal, - Reason: "DeprovisioningBlocked", - Message: fmt.Sprintf("Cannot deprovision Machine: %s", reason), - DedupeValues: []string{string(machine.UID)}, - }) - } else { - evts = append(evts, events.Event{ + { InvolvedObject: nodeClaim, Type: v1.EventTypeNormal, - Reason: "DeprovisioningBlocked", - Message: fmt.Sprintf("Cannot deprovision NodeClaim: %s", reason), + Reason: "DisruptionBlocked", + Message: fmt.Sprintf("Cannot disrupt NodeClaim: %s", reason), DedupeValues: []string{string(nodeClaim.UID)}, - }) + }, } - return evts } diff --git a/pkg/controllers/deprovisioning/expiration.go b/pkg/controllers/disruption/expiration.go similarity index 81% rename from pkg/controllers/deprovisioning/expiration.go rename to pkg/controllers/disruption/expiration.go index fb720f28e168..f84165780e43 100644 --- a/pkg/controllers/deprovisioning/expiration.go +++ b/pkg/controllers/disruption/expiration.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning +package disruption import ( "context" @@ -28,7 +28,7 @@ import ( "github.com/samber/lo" "github.com/aws/karpenter-core/pkg/apis/v1beta1" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" @@ -55,8 +55,8 @@ } } -// ShouldDeprovision is a predicate used to filter deprovisionable candidates -func (e *Expiration) ShouldDeprovision(_ context.Context, c *Candidate) bool { +// ShouldDisrupt is a predicate used to filter candidates +func (e *Expiration) ShouldDisrupt(_ context.Context, c *Candidate) bool { return c.nodePool.Spec.Disruption.ExpireAfter.Duration != nil && c.NodeClaim.StatusConditions().GetCondition(v1beta1.Expired).IsTrue() } @@ -74,15 +74,19 @@ func (e *Expiration) filterAndSortCandidates(ctx context.Context, candidates []* return candidates, nil } -// ComputeCommand generates a deprovisioning command given deprovisionable candidates +// ComputeCommand generates a disruption command given candidates func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...*Candidate) (Command, error) { candidates, err := e.filterAndSortCandidates(ctx, candidates) if err != nil { return Command{}, fmt.Errorf("filtering candidates, %w", err) } - deprovisioningEligibleMachinesGauge.WithLabelValues(e.String()).Set(float64(len(candidates))) + deprovisioningEligibleMachinesGauge.WithLabelValues(e.Type()).Set(float64(len(candidates))) + disruptionEligibleNodesGauge.With(map[string]string{ + methodLabel: e.Type(), + consolidationTypeLabel: e.ConsolidationType(), + }).Set(float64(len(candidates))) - // Deprovision all empty expired candidates, as they require no scheduling simulations. + // Disrupt all empty expired candidates, as they require no scheduling simulations. if empty := lo.Filter(candidates, func(c *Candidate, _ int) bool { return len(c.pods) == 0 }); len(empty) > 0 { @@ -104,7 +108,7 @@ func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...*Candidat // Log when all pods can't schedule, as the command will get executed immediately. if !results.AllNonPendingPodsScheduled() { logging.FromContext(ctx).With(lo.Ternary(candidate.NodeClaim.IsMachine, "machine", "nodeclaim"), candidate.NodeClaim.Name, "node", candidate.Node.Name).Debugf("cannot terminate since scheduling simulation failed to schedule all pods, %s", results.NonPendingPodSchedulingErrors()) - e.recorder.Publish(deprovisioningevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...) + e.recorder.Publish(disruptionevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...)
continue } @@ -117,7 +121,10 @@ func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...*Candidat return Command{}, nil } -// String is the string representation of the deprovisioner -func (e *Expiration) String() string { +func (e *Expiration) Type() string { return metrics.ExpirationReason } + +func (e *Expiration) ConsolidationType() string { + return "" +} diff --git a/pkg/controllers/deprovisioning/helpers.go b/pkg/controllers/disruption/helpers.go similarity index 93% rename from pkg/controllers/deprovisioning/helpers.go rename to pkg/controllers/disruption/helpers.go index 54ac6c3ceb68..9908d82a9ac2 100644 --- a/pkg/controllers/deprovisioning/helpers.go +++ b/pkg/controllers/disruption/helpers.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" @@ -30,7 +30,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning" pscheduling "github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling" "github.com/aws/karpenter-core/pkg/controllers/state" @@ -50,15 +50,15 @@ func filterCandidates(ctx context.Context, kubeClient client.Client, recorder ev // filter out nodes that can't be terminated nodes = lo.Filter(nodes, func(cn *Candidate, _ int) bool { if !cn.Node.DeletionTimestamp.IsZero() { - recorder.Publish(deprovisioningevents.Blocked(cn.Node, cn.NodeClaim, "Node in the process of deletion")...) + recorder.Publish(disruptionevents.Blocked(cn.Node, cn.NodeClaim, "Node in the process of deletion")...) return false } if pdb, ok := pdbs.CanEvictPods(cn.pods); !ok { - recorder.Publish(deprovisioningevents.Blocked(cn.Node, cn.NodeClaim, fmt.Sprintf("PDB %q prevents pod evictions", pdb))...) + recorder.Publish(disruptionevents.Blocked(cn.Node, cn.NodeClaim, fmt.Sprintf("PDB %q prevents pod evictions", pdb))...) return false } if p, ok := hasDoNotDisruptPod(cn); ok { - recorder.Publish(deprovisioningevents.Blocked(cn.Node, cn.NodeClaim, fmt.Sprintf("Pod %q has do not evict annotation", client.ObjectKeyFromObject(p)))...) + recorder.Publish(disruptionevents.Blocked(cn.Node, cn.NodeClaim, fmt.Sprintf("Pod %q has do not evict annotation", client.ObjectKeyFromObject(p)))...) 
return false } return true @@ -173,7 +173,7 @@ func disruptionCost(ctx context.Context, pods []*v1.Pod) float64 { return cost } -// GetCandidates returns nodes that appear to be currently deprovisionable based off of their nodePool +// GetCandidates returns nodes that appear to be currently disruptable based off of their nodePool func GetCandidates(ctx context.Context, cluster *state.Cluster, kubeClient client.Client, recorder events.Recorder, clk clock.Clock, cloudProvider cloudprovider.CloudProvider, shouldDeprovision CandidateFilter) ([]*Candidate, error) { nodePoolMap, nodePoolToInstanceTypesMap, err := buildNodePoolMap(ctx, kubeClient, cloudProvider) if err != nil { @@ -183,7 +183,7 @@ func GetCandidates(ctx context.Context, cluster *state.Cluster, kubeClient clien cn, e := NewCandidate(ctx, kubeClient, recorder, clk, n, nodePoolMap, nodePoolToInstanceTypesMap) return cn, e == nil }) - // Filter only the valid candidates that we should deprovision + // Filter only the valid candidates that we should disrupt return lo.Filter(candidates, func(c *Candidate, _ int) bool { return shouldDeprovision(ctx, c) }), nil } diff --git a/pkg/controllers/deprovisioning/machine_consolidation_test.go b/pkg/controllers/disruption/machine_consolidation_test.go similarity index 96% rename from pkg/controllers/deprovisioning/machine_consolidation_test.go rename to pkg/controllers/disruption/machine_consolidation_test.go index b7147a115efa..9871e220f935 100644 --- a/pkg/controllers/deprovisioning/machine_consolidation_test.go +++ b/pkg/controllers/disruption/machine_consolidation_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning_test +package disruption_test import ( "fmt" @@ -40,7 +40,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/deprovisioning" + "github.com/aws/karpenter-core/pkg/controllers/disruption" "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/scheduling" "github.com/aws/karpenter-core/pkg/test" @@ -105,7 +105,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -125,7 +125,7 @@ var _ = Describe("Machine/Consolidation", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine1, machine2) @@ -187,7 +187,7 @@ var _ = Describe("Machine/Consolidation", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node1}, []*v1alpha5.Machine{machine1}) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // we don't need any new nodes and consolidation should 
notice the huge pending pod that needs the large // node to schedule, which prevents the large expensive node from being replaced @@ -246,7 +246,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -319,7 +319,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -404,7 +404,7 @@ var _ = Describe("Machine/Consolidation", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // we didn't create a new machine or delete the old one Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -498,7 +498,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -580,7 +580,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -675,7 +675,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -770,7 +770,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -845,7 +845,7 @@ var _ = Describe("Machine/Consolidation", func() { }, }, }) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], provisioner) @@ -864,7 +864,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, 
env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -939,7 +939,7 @@ var _ = Describe("Machine/Consolidation", func() { }, }, }) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], provisioner) @@ -958,7 +958,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1055,7 +1055,7 @@ var _ = Describe("Machine/Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Expect to not create or delete more machines @@ -1165,7 +1165,7 @@ var _ = Describe("Machine/Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Expect to not create or delete more machines @@ -1228,7 +1228,7 @@ var _ = Describe("Machine/Consolidation", func() { var consolidationFinished atomic.Bool go func() { defer GinkgoRecover() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) consolidationFinished.Store(true) }() wg.Wait() @@ -1334,7 +1334,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1381,7 +1381,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1435,7 +1435,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1500,7 +1500,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1553,7 +1553,7 @@ var _ 
= Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1604,7 +1604,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1637,7 +1637,7 @@ var _ = Describe("Machine/Consolidation", func() { BlockOwnerDeletion: ptr.Bool(true), }, }}}) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], provisioner) @@ -1655,7 +1655,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1688,7 +1688,7 @@ var _ = Describe("Machine/Consolidation", func() { BlockOwnerDeletion: ptr.Bool(true), }, }}}) - // Block this pod from being deprovisioned with karpenter.sh/do-not-disrupt + // Block this pod from being disrupted with karpenter.sh/do-not-disrupt pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], provisioner) @@ -1706,7 +1706,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1753,7 +1753,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1799,7 +1799,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // shouldn't delete the node @@ -1948,7 +1948,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() ExpectMachinesCascadeDeletion(ctx, env.Client, consolidatableMachine) @@ -2005,7 +2005,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // 
Cascade any deletion of the machine to the node @@ -2061,7 +2061,7 @@ var _ = Describe("Machine/Consolidation", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // No node can be deleted as it would cause one of the three pods to go pending Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(2)) @@ -2072,7 +2072,7 @@ var _ = Describe("Machine/Consolidation", func() { "app": "test", } - // this invalid provisioner should not be enough to stop all deprovisioning + // this invalid provisioner should not be enough to stop all disruption badProvisioner := &v1alpha5.Provisioner{ ObjectMeta: metav1.ObjectMeta{ Name: "bad-provisioner", @@ -2109,7 +2109,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -2175,7 +2175,7 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -2259,7 +2259,7 @@ var _ = Describe("Machine/Consolidation", func() { go func() { defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) }() // wait for the controller to block on the validation timeout @@ -2325,10 +2325,10 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() - // wait for the deprovisioningController to block on the validation timeout + // wait for the disruptionController to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) // controller should be blocking during the timeout Expect(finished.Load()).To(BeFalse()) @@ -2389,10 +2389,10 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() - // wait for the deprovisioningController to block on the validation timeout + // wait for the disruptionController to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) // controller should be blocking during the timeout Expect(finished.Load()).To(BeFalse()) @@ -2449,7 +2449,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2514,7 +2514,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + 
ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2579,7 +2579,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2633,7 +2633,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2684,7 +2684,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2735,7 +2735,7 @@ var _ = Describe("Machine/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2840,11 +2840,11 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // advance the clock so that the timeout expires - fakeClock.Step(deprovisioning.MultiNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.MultiNodeConsolidationTimeoutDuration) // wait for the controller to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) @@ -2932,13 +2932,13 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // advance the clock so that the timeout expires for multi-machine - fakeClock.Step(deprovisioning.MultiNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.MultiNodeConsolidationTimeoutDuration) // advance the clock so that the timeout expires for single-machine - fakeClock.Step(deprovisioning.SingleNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.SingleNodeConsolidationTimeoutDuration) ExpectTriggerVerifyAction(&wg) @@ -3043,7 +3043,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -3110,7 +3110,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -3168,7 
+3168,7 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3225,7 +3225,7 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3301,7 +3301,7 @@ var _ = Describe("Machine/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3434,7 +3434,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -3549,7 +3549,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -3634,7 +3634,7 @@ var _ = Describe("Machine/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // our nodes are already the cheapest available, so we can't replace them. If we delete, it would @@ -3712,7 +3712,7 @@ var _ = Describe("Machine/Consolidation", func() { ExpectTriggerVerifyAction(&wg) go func() { defer GinkgoRecover() - _, _ = deprovisioningController.Reconcile(ctx, reconcile.Request{}) + _, _ = disruptionController.Reconcile(ctx, reconcile.Request{}) }() wg.Wait() @@ -3798,7 +3798,7 @@ var _ = Describe("Machine/Consolidation", func() { // consolidation shouldn't trigger additional actions fakeClock.Step(10 * time.Minute) - result, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + result, err := disruptionController.Reconcile(ctx, reconcile.Request{}) Expect(err).ToNot(HaveOccurred()) Expect(result.RequeueAfter).To(BeNumerically(">", 0)) }) diff --git a/pkg/controllers/deprovisioning/machine_drift_test.go b/pkg/controllers/disruption/machine_drift_test.go similarity index 92% rename from pkg/controllers/deprovisioning/machine_drift_test.go rename to pkg/controllers/disruption/machine_drift_test.go index 5df69e7943c6..5675059c42e0 100644 --- a/pkg/controllers/deprovisioning/machine_drift_test.go +++ b/pkg/controllers/disruption/machine_drift_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning_test +package disruption_test import ( "sync" @@ -75,7 +75,7 @@ var _ = Describe("Machine/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Expect to not create or delete more machines @@ -127,11 +127,11 @@ var _ = Describe("Machine/Drift", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node2}, []*v1alpha5.Machine{machine2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() ExpectMachinesCascadeDeletion(ctx, env.Client, machine, machine2) @@ -150,7 +150,7 @@ var _ = Describe("Machine/Drift", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -163,7 +163,7 @@ var _ = Describe("Machine/Drift", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -184,7 +184,7 @@ var _ = Describe("Machine/Drift", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -205,7 +205,7 @@ var _ = Describe("Machine/Drift", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -221,7 +221,7 @@ var _ = Describe("Machine/Drift", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -237,7 +237,7 @@ var _ = 
Describe("Machine/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -248,7 +248,7 @@ var _ = Describe("Machine/Drift", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, node) }) - It("should deprovision all empty drifted nodes in parallel", func() { + It("should disrupt all empty drifted nodes in parallel", func() { machines, nodes := test.MachinesAndNodes(100, v1alpha5.Machine{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -279,7 +279,7 @@ var _ = Describe("Machine/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -321,11 +321,11 @@ var _ = Describe("Machine/Drift", func() { fakeClock.Step(10 * time.Minute) - // deprovisioning won't delete the old machine until the new machine is ready + // disruption won't delete the old machine until the new machine is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -421,11 +421,11 @@ var _ = Describe("Machine/Drift", func() { fakeClock.Step(10 * time.Minute) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 3) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -486,18 +486,18 @@ var _ = Describe("Machine/Drift", func() { ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], machine, node, machine2, node2, provisioner) - // bind pods to node so that they're not empty and don't deprovision in parallel. + // bind pods to node so that they're not empty and don't disrupt in parallel. 
ExpectManualBinding(ctx, env.Client, pods[0], node) ExpectManualBinding(ctx, env.Client, pods[1], node2) // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node, node2}, []*v1alpha5.Machine{machine, machine2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node diff --git a/pkg/controllers/deprovisioning/machine_emptiness_test.go b/pkg/controllers/disruption/machine_emptiness_test.go similarity index 92% rename from pkg/controllers/deprovisioning/machine_emptiness_test.go rename to pkg/controllers/disruption/machine_emptiness_test.go index c5e39ed51280..7387c025d185 100644 --- a/pkg/controllers/deprovisioning/machine_emptiness_test.go +++ b/pkg/controllers/disruption/machine_emptiness_test.go @@ -13,7 +13,7 @@ limitations under the License. */ // nolint:gosec -package deprovisioning_test +package disruption_test import ( "sync" @@ -71,7 +71,7 @@ var _ = Describe("Machine/Emptiness", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -88,7 +88,7 @@ var _ = Describe("Machine/Emptiness", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -102,7 +102,7 @@ var _ = Describe("Machine/Emptiness", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -123,7 +123,7 @@ var _ = Describe("Machine/Emptiness", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -144,7 +144,7 @@ var _ = Describe("Machine/Emptiness", func() { // inform cluster state about nodes 
and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -160,7 +160,7 @@ var _ = Describe("Machine/Emptiness", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) diff --git a/pkg/controllers/deprovisioning/machine_expiration_test.go b/pkg/controllers/disruption/machine_expiration_test.go similarity index 93% rename from pkg/controllers/deprovisioning/machine_expiration_test.go rename to pkg/controllers/disruption/machine_expiration_test.go index 85db1a26d2c8..ce81d9b48af3 100644 --- a/pkg/controllers/deprovisioning/machine_expiration_test.go +++ b/pkg/controllers/disruption/machine_expiration_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning_test +package disruption_test import ( "sync" @@ -76,7 +76,7 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -90,7 +90,7 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -111,7 +111,7 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -132,7 +132,7 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines 
Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -184,11 +184,11 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node2}, []*v1alpha5.Machine{machine2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() ExpectMachinesCascadeDeletion(ctx, env.Client, machine, machine2) @@ -207,7 +207,7 @@ var _ = Describe("Machine/Expiration", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more machines Expect(ExpectMachines(ctx, env.Client)).To(HaveLen(1)) @@ -221,7 +221,7 @@ var _ = Describe("Machine/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -232,7 +232,7 @@ var _ = Describe("Machine/Expiration", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, node) }) - It("should deprovision all empty expired nodes in parallel", func() { + It("should disrupt all empty expired nodes in parallel", func() { machines, nodes := test.MachinesAndNodes(100, v1alpha5.Machine{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -263,7 +263,7 @@ var _ = Describe("Machine/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -321,18 +321,18 @@ var _ = Describe("Machine/Expiration", func() { ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], machine, machine2, node, node2, provisioner) - // bind pods to node so that they're not empty and don't deprovision in parallel. + // bind pods to node so that they're not empty and don't disrupt in parallel. 
ExpectManualBinding(ctx, env.Client, pods[0], node) ExpectManualBinding(ctx, env.Client, pods[1], node2) // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node, node2}, []*v1alpha5.Machine{machine, machine2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -376,11 +376,11 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -431,7 +431,7 @@ var _ = Describe("Machine/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectNewMachinesDeleted(ctx, env.Client, &wg, 1) - _, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + _, err := disruptionController.Reconcile(ctx, reconcile.Request{}) Expect(err).To(HaveOccurred()) wg.Wait() @@ -516,11 +516,11 @@ var _ = Describe("Machine/Expiration", func() { // inform cluster state about nodes and machines ExpectMakeNodesAndMachinesInitializedAndStateUpdated(ctx, env.Client, nodeStateController, machineStateController, []*v1.Node{node}, []*v1alpha5.Machine{machine}) - // deprovisioning won't delete the old machine until the new machine is ready + // disruption won't delete the old machine until the new machine is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 3) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the machine to the node diff --git a/pkg/controllers/deprovisioning/metrics.go b/pkg/controllers/disruption/metrics.go similarity index 55% rename from pkg/controllers/deprovisioning/metrics.go rename to pkg/controllers/disruption/metrics.go index 6ed04b63052f..49cc310b234f 100644 --- a/pkg/controllers/deprovisioning/metrics.go +++ b/pkg/controllers/disruption/metrics.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
 */

-package deprovisioning
+package disruption

 import (
 	"github.com/prometheus/client_golang/prometheus"
@@ -23,14 +23,19 @@ import (

 func init() {
 	crmetrics.Registry.MustRegister(deprovisioningDurationHistogram, deprovisioningReplacementNodeInitializedHistogram, deprovisioningActionsPerformedCounter,
-		deprovisioningEligibleMachinesGauge, deprovisioningReplacementNodeLaunchFailedCounter, deprovisioningConsolidationTimeoutsCounter)
+		deprovisioningEligibleMachinesGauge, deprovisioningReplacementNodeLaunchFailedCounter, deprovisioningConsolidationTimeoutsCounter,
+		disruptionEvaluationDurationHistogram, disruptionReplacementNodeClaimInitializedHistogram, disruptionReplacementNodeClaimFailedCounter,
+		disruptionActionsPerformedCounter, disruptionEligibleNodesGauge, disruptionConsolidationTimeoutTotalCounter)
 }

 const (
 	deprovisioningSubsystem = "deprovisioning"
 	deprovisionerLabel      = "deprovisioner"
-	actionLabel             = "action"
-	consolidationType       = "consolidation_type"
+
+	disruptionSubsystem    = "disruption"
+	actionLabel            = "action"
+	methodLabel            = "method"
+	consolidationTypeLabel = "consolidation_type"

 	multiMachineConsolidationLabelValue  = "multi-machine"
 	singleMachineConsolidationLabelValue = "single-machine"
@@ -45,7 +50,8 @@ var (
 			Help:      "Duration of the deprovisioning evaluation process in seconds.",
 			Buckets:   metrics.DurationBuckets(),
 		},
-		[]string{"method"})
+		[]string{"method"},
+	)
 	deprovisioningReplacementNodeInitializedHistogram = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
 			Namespace: metrics.Namespace,
@@ -79,7 +85,7 @@ var (
 			Name:      "consolidation_timeouts",
 			Help:      "Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.",
 		},
-		[]string{consolidationType},
+		[]string{consolidationTypeLabel},
 	)
 	deprovisioningReplacementNodeLaunchFailedCounter = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
@@ -91,3 +97,61 @@ var (
 		[]string{deprovisionerLabel},
 	)
 )
+
+var (
+	disruptionEvaluationDurationHistogram = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "evaluation_duration_seconds",
+			Help:      "Duration of the disruption evaluation process in seconds.",
+			Buckets:   metrics.DurationBuckets(),
+		},
+		[]string{methodLabel, consolidationTypeLabel},
+	)
+	disruptionReplacementNodeClaimInitializedHistogram = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "replacement_nodeclaim_initialized_seconds",
+			Help:      "Amount of time required for a replacement nodeclaim to become initialized.",
+			Buckets:   metrics.DurationBuckets(),
+		},
+	)
+	disruptionReplacementNodeClaimFailedCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "replacement_nodeclaim_failures_total",
+			Help:      "The number of times that Karpenter failed to launch a replacement node for disruption. Labeled by disruption type.",
+		},
+		[]string{methodLabel, consolidationTypeLabel},
+	)
+	disruptionActionsPerformedCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "actions_performed_total",
+			Help:      "Number of disruption methods performed. Labeled by disruption type.",
+		},
+		[]string{actionLabel, methodLabel, consolidationTypeLabel},
+	)
+	disruptionEligibleNodesGauge = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "eligible_nodes",
+			Help:      "Number of nodes eligible for disruption by Karpenter. Labeled by disruption type.",
+		},
+		[]string{methodLabel, consolidationTypeLabel},
+	)
+	disruptionConsolidationTimeoutTotalCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: disruptionSubsystem,
+			Name:      "consolidation_timeouts_total",
+			Help:      "Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.",
+		},
+		[]string{consolidationTypeLabel},
+	)
+)
diff --git a/pkg/controllers/deprovisioning/multinodeconsolidation.go b/pkg/controllers/disruption/multinodeconsolidation.go
similarity index 92%
rename from pkg/controllers/deprovisioning/multinodeconsolidation.go
rename to pkg/controllers/disruption/multinodeconsolidation.go
index ba39a812b706..6dcbaa1738d9 100644
--- a/pkg/controllers/deprovisioning/multinodeconsolidation.go
+++ b/pkg/controllers/disruption/multinodeconsolidation.go
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package deprovisioning
+package disruption

 import (
 	"context"
@@ -31,6 +31,7 @@ import (
 	"github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling"
 	"github.com/aws/karpenter-core/pkg/controllers/state"
 	"github.com/aws/karpenter-core/pkg/events"
+	"github.com/aws/karpenter-core/pkg/metrics"
 )

 const MultiNodeConsolidationTimeoutDuration = 1 * time.Minute
@@ -52,7 +53,11 @@ func (m *MultiNodeConsolidation) ComputeCommand(ctx context.Context, candidates
 	if err != nil {
 		return Command{}, fmt.Errorf("sorting candidates, %w", err)
 	}
-	deprovisioningEligibleMachinesGauge.WithLabelValues(m.String()).Set(float64(len(candidates)))
+	deprovisioningEligibleMachinesGauge.WithLabelValues(m.Type()).Set(float64(len(candidates)))
+	disruptionEligibleNodesGauge.With(map[string]string{
+		methodLabel:            m.Type(),
+		consolidationTypeLabel: m.ConsolidationType(),
+	}).Set(float64(len(candidates)))

 	// Only consider a maximum batch of 100 NodeClaims to save on computation.
 	// This could be further configurable in the future.
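Reviewer note: the hunk above double-publishes during the migration window, bumping the legacy deprovisioning gauge and the new disruption gauge keyed by both method and consolidation type. Below is a minimal sketch of how any other disruption method could report into the same new metrics; it is illustrative only, not part of this diff, and the helper names and example label values are assumptions.

// Illustrative sketch only (not code from this PR): helpers in package disruption
// that publish the new metrics defined in metrics.go above. Example label values
// such as "multi" or "single" are assumptions based on the surrounding hunks.
func reportEligibleNodes(method, consolidationType string, count int) {
	disruptionEligibleNodesGauge.With(map[string]string{
		methodLabel:            method,            // e.g. the value returned by Type()
		consolidationTypeLabel: consolidationType, // e.g. "multi", "single", or "" for non-consolidation methods
	}).Set(float64(count))
}

func reportConsolidationTimeout(consolidationType string) {
	disruptionConsolidationTimeoutTotalCounter.WithLabelValues(consolidationType).Inc()
}

The same labeling pattern appears in the next hunk, where the timeout counter in firstNConsolidationOption is now keyed by ConsolidationType() instead of the legacy multi-machine constant.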
@@ -100,8 +105,8 @@ func (m *MultiNodeConsolidation) firstNConsolidationOption(ctx context.Context, // binary search to find the maximum number of NodeClaims we can terminate for min <= max { if m.clock.Now().After(timeout) { - // TODO @joinnis: Change this to multiNodeConsolidationLabelValue when migrating deprovisioningConsolidationTimeoutsCounter.WithLabelValues(multiMachineConsolidationLabelValue).Inc() + disruptionConsolidationTimeoutTotalCounter.WithLabelValues(m.ConsolidationType()).Inc() if lastSavedCommand.candidates == nil { logging.FromContext(ctx).Debugf("failed to find a multi-node consolidation after timeout, last considered batch had %d", (min+max)/2) } else { @@ -187,3 +192,11 @@ func filterOutSameType(newNodeClaim *scheduling.NodeClaim, consolidate []*Candid return filterByPrice(newNodeClaim.InstanceTypeOptions, newNodeClaim.Requirements, maxPrice) } + +func (m *MultiNodeConsolidation) Type() string { + return metrics.ConsolidationReason +} + +func (m *MultiNodeConsolidation) ConsolidationType() string { + return "multi" +} diff --git a/pkg/controllers/deprovisioning/nodeclaim_consolidation_test.go b/pkg/controllers/disruption/nodeclaim_consolidation_test.go similarity index 96% rename from pkg/controllers/deprovisioning/nodeclaim_consolidation_test.go rename to pkg/controllers/disruption/nodeclaim_consolidation_test.go index 42ae309f0c72..0536512548c5 100644 --- a/pkg/controllers/deprovisioning/nodeclaim_consolidation_test.go +++ b/pkg/controllers/disruption/nodeclaim_consolidation_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning_test +package disruption_test import ( "fmt" @@ -40,7 +40,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/deprovisioning" + "github.com/aws/karpenter-core/pkg/controllers/disruption" "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/scheduling" "github.com/aws/karpenter-core/pkg/test" @@ -107,7 +107,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -127,7 +127,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the nodeclaim to the node ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim, nodeClaim2) @@ -189,7 +189,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // we don't need any new nodes and consolidation should notice the huge pending pod that needs the 
large // node to schedule, which prevents the large expensive node from being replaced @@ -234,7 +234,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -292,7 +292,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -358,7 +358,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // we didn't create a new nodeclaim or delete the old one Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -435,7 +435,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -503,7 +503,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -581,7 +581,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -659,7 +659,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -717,7 +717,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { }, }, }) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodePool) @@ -736,7 +736,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, 
env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -794,7 +794,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { }, }, }) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodePool) @@ -813,7 +813,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -910,7 +910,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Expect to not create or delete more nodeclaims @@ -1021,7 +1021,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Expect to not create or delete more nodeclaims @@ -1072,7 +1072,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var consolidationFinished atomic.Bool go func() { defer GinkgoRecover() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) consolidationFinished.Store(true) }() wg.Wait() @@ -1179,7 +1179,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1226,7 +1226,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1280,7 +1280,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1342,7 +1342,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node 
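Most of the churn in this test file is the same mechanical handle rename from deprovisioningController to disruptionController. For orientation, the recurring test skeleton these hunks touch looks roughly like the sketch below; it is illustrative only, reusing the suite's helpers and fixtures exactly as they appear in the neighboring hunks, and the It description and specific assertions are assumptions.

// Illustrative sketch (not part of this PR) of the common test shape: advance the
// clock, let the disruption controller reconcile, then assert the cascade deletion
// of the replaced NodeClaim.
It("replaces an underutilized nodeclaim once its replacement is ready", func() {
	ExpectApplied(ctx, env.Client, rs, pods[0], nodeClaim, node, nodePool)
	ExpectManualBinding(ctx, env.Client, pods[0], node)

	// inform cluster state about nodes and nodeclaims
	ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim})

	fakeClock.Step(10 * time.Minute)

	// disruption won't delete the old nodeclaim until the new nodeclaim is ready
	var wg sync.WaitGroup
	ExpectTriggerVerifyAction(&wg)
	ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1)
	ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{})
	wg.Wait()

	// Cascade any deletion of the nodeclaim to the node
	ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim)
	Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1))
	Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1))
	ExpectNotFound(ctx, env.Client, nodeClaim, node)
})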
@@ -1395,7 +1395,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1446,7 +1446,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1479,7 +1479,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { BlockOwnerDeletion: ptr.Bool(true), }, }}}) - // Block this pod from being deprovisioned with karpenter.sh/do-not-evict + // Block this pod from being disrupted with karpenter.sh/do-not-evict pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodePool) @@ -1497,7 +1497,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1530,7 +1530,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { BlockOwnerDeletion: ptr.Bool(true), }, }}}) - // Block this pod from being deprovisioned with karpenter.sh/do-not-disrupt + // Block this pod from being disrupted with karpenter.sh/do-not-disrupt pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"}) ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodePool) @@ -1548,7 +1548,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -1595,7 +1595,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1641,7 +1641,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // shouldn't delete the node @@ -1790,7 +1790,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() ExpectNodeClaimsCascadeDeletion(ctx, env.Client, consolidatableNodeClaim) @@ -1847,7 +1847,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, 
disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -1903,7 +1903,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) // No node can be deleted as it would cause one of the three pods to go pending Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(2)) @@ -1914,7 +1914,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { "app": "test", } - // this invalid node pool should not be enough to stop all deprovisioning + // this invalid node pool should not be enough to stop all disruption badNodePool := &v1beta1.NodePool{ ObjectMeta: metav1.ObjectMeta{ Name: "bad-nodepool", @@ -1961,7 +1961,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the machine to the node @@ -2027,7 +2027,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -2111,7 +2111,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { go func() { defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) }() // wait for the controller to block on the validation timeout @@ -2178,10 +2178,10 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() - // wait for the deprovisioningController to block on the validation timeout + // wait for the disruptionController to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) // controller should be blocking during the timeout Expect(finished.Load()).To(BeFalse()) @@ -2242,10 +2242,10 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() - // wait for the deprovisioningController to block on the validation timeout + // wait for the disruptionController to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) // controller should be blocking during the timeout Expect(finished.Load()).To(BeFalse()) @@ -2285,7 +2285,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2333,7 +2333,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - 
ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2381,7 +2381,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2435,7 +2435,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2486,7 +2486,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2537,7 +2537,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { wg.Add(1) go func() { defer wg.Done() - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -2642,11 +2642,11 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // advance the clock so that the timeout expires - fakeClock.Step(deprovisioning.MultiNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.MultiNodeConsolidationTimeoutDuration) // wait for the controller to block on the validation timeout Eventually(fakeClock.HasWaiters, time.Second*10).Should(BeTrue()) @@ -2734,13 +2734,13 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // advance the clock so that the timeout expires for multi-nodeClaim - fakeClock.Step(deprovisioning.MultiNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.MultiNodeConsolidationTimeoutDuration) // advance the clock so that the timeout expires for single-nodeClaim - fakeClock.Step(deprovisioning.SingleNodeConsolidationTimeoutDuration) + fakeClock.Step(disruption.SingleNodeConsolidationTimeoutDuration) ExpectTriggerVerifyAction(&wg) @@ -2845,7 +2845,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -2912,7 +2912,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, 
disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -2970,7 +2970,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3027,7 +3027,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3103,7 +3103,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() // wait for the controller to block on the validation timeout @@ -3236,7 +3236,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -3351,7 +3351,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeclaim to the node @@ -3436,7 +3436,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // our nodes are already the cheapest available, so we can't replace them. 
If we delete, it would @@ -3492,7 +3492,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { ExpectTriggerVerifyAction(&wg) go func() { defer GinkgoRecover() - _, _ = deprovisioningController.Reconcile(ctx, reconcile.Request{}) + _, _ = disruptionController.Reconcile(ctx, reconcile.Request{}) }() wg.Wait() @@ -3577,7 +3577,7 @@ var _ = Describe("NodeClaim/Consolidation", func() { // consolidation shouldn't trigger additional actions fakeClock.Step(10 * time.Minute) - result, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + result, err := disruptionController.Reconcile(ctx, reconcile.Request{}) Expect(err).ToNot(HaveOccurred()) Expect(result.RequeueAfter).To(BeNumerically(">", 0)) }) diff --git a/pkg/controllers/deprovisioning/nodeclaim_drift_test.go b/pkg/controllers/disruption/nodeclaim_drift_test.go similarity index 92% rename from pkg/controllers/deprovisioning/nodeclaim_drift_test.go rename to pkg/controllers/disruption/nodeclaim_drift_test.go index fbb2630eb7fb..43e0ca48a7a9 100644 --- a/pkg/controllers/deprovisioning/nodeclaim_drift_test.go +++ b/pkg/controllers/disruption/nodeclaim_drift_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning_test +package disruption_test import ( "sync" @@ -82,7 +82,7 @@ var _ = Describe("NodeClaim/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Expect to not create or delete more nodeclaims @@ -134,11 +134,11 @@ var _ = Describe("NodeClaim/Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node2}, []*v1beta1.NodeClaim{nodeClaim2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim, nodeClaim2) @@ -157,7 +157,7 @@ var _ = Describe("NodeClaim/Drift", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -170,7 +170,7 @@ var _ = Describe("NodeClaim/Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -191,7 +191,7 @@ var _ = Describe("NodeClaim/Drift", func() { // inform cluster state about nodes and nodeclaims 
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -212,7 +212,7 @@ var _ = Describe("NodeClaim/Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -228,7 +228,7 @@ var _ = Describe("NodeClaim/Drift", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -244,7 +244,7 @@ var _ = Describe("NodeClaim/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -255,7 +255,7 @@ var _ = Describe("NodeClaim/Drift", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, nodeClaim, node) }) - It("should deprovision all empty drifted nodes in parallel", func() { + It("should disrupt all empty drifted nodes in parallel", func() { nodeClaims, nodes := test.NodeClaimsAndNodes(100, v1beta1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -286,7 +286,7 @@ var _ = Describe("NodeClaim/Drift", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -328,11 +328,11 @@ var _ = Describe("NodeClaim/Drift", func() { fakeClock.Step(10 * time.Minute) - // deprovisioning won't delete the old nodeClaim until the new nodeClaim is ready + // disruption won't delete the old nodeClaim until the new nodeClaim is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -428,11 +428,11 @@ var _ = Describe("NodeClaim/Drift", func() { fakeClock.Step(10 * time.Minute) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 3) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + 
ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -493,18 +493,18 @@ var _ = Describe("NodeClaim/Drift", func() { ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], nodeClaim, node, nodeClaim2, node2, nodePool) - // bind pods to node so that they're not empty and don't deprovision in parallel. + // bind pods to node so that they're not empty and don't disrupt in parallel. ExpectManualBinding(ctx, env.Client, pods[0], node) ExpectManualBinding(ctx, env.Client, pods[1], node2) // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node, node2}, []*v1beta1.NodeClaim{nodeClaim, nodeClaim2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node diff --git a/pkg/controllers/deprovisioning/nodeclaim_emptiness_test.go b/pkg/controllers/disruption/nodeclaim_emptiness_test.go similarity index 92% rename from pkg/controllers/deprovisioning/nodeclaim_emptiness_test.go rename to pkg/controllers/disruption/nodeclaim_emptiness_test.go index a794f5b209d8..8a65149b5be9 100644 --- a/pkg/controllers/deprovisioning/nodeclaim_emptiness_test.go +++ b/pkg/controllers/disruption/nodeclaim_emptiness_test.go @@ -13,7 +13,7 @@ limitations under the License. 
*/ // nolint:gosec -package deprovisioning_test +package disruption_test import ( "sync" @@ -76,7 +76,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the nodeClaim to the node ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim) @@ -93,7 +93,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -107,7 +107,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -128,7 +128,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -149,7 +149,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -165,7 +165,7 @@ var _ = Describe("NodeClaim/Emptiness", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) diff --git a/pkg/controllers/deprovisioning/nodeclaim_expiration_test.go b/pkg/controllers/disruption/nodeclaim_expiration_test.go similarity index 93% rename from pkg/controllers/deprovisioning/nodeclaim_expiration_test.go rename to pkg/controllers/disruption/nodeclaim_expiration_test.go index 229513d21b32..e5cd9dcb4e5a 100644 --- a/pkg/controllers/deprovisioning/nodeclaim_expiration_test.go +++ 
b/pkg/controllers/disruption/nodeclaim_expiration_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning_test +package disruption_test import ( "sync" @@ -78,7 +78,7 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -92,7 +92,7 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -113,7 +113,7 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -134,7 +134,7 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -186,11 +186,11 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node2}, []*v1beta1.NodeClaim{nodeClaim2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim, nodeClaim2) @@ -209,7 +209,7 @@ var _ = Describe("NodeClaim/Expiration", func() { fakeClock.Step(10 * time.Minute) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // 
Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -223,7 +223,7 @@ var _ = Describe("NodeClaim/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -234,7 +234,7 @@ var _ = Describe("NodeClaim/Expiration", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, nodeClaim, node) }) - It("should deprovision all empty expired nodes in parallel", func() { + It("should disrupt all empty expired nodes in parallel", func() { nodeClaims, nodes := test.NodeClaimsAndNodes(100, v1beta1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -265,7 +265,7 @@ var _ = Describe("NodeClaim/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -323,18 +323,18 @@ var _ = Describe("NodeClaim/Expiration", func() { ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], nodeClaim, nodeClaim2, node, node2, nodePool) - // bind pods to node so that they're not empty and don't deprovision in parallel. + // bind pods to node so that they're not empty and don't disrupt in parallel. ExpectManualBinding(ctx, env.Client, pods[0], node) ExpectManualBinding(ctx, env.Client, pods[1], node2) // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node, node2}, []*v1beta1.NodeClaim{nodeClaim, nodeClaim2}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -377,11 +377,11 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - // deprovisioning won't delete the old node until the new node is ready + // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node @@ -432,7 +432,7 @@ var _ = Describe("NodeClaim/Expiration", func() { var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectNewNodeClaimsDeleted(ctx, env.Client, &wg, 1) - _, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + _, err := disruptionController.Reconcile(ctx, reconcile.Request{}) Expect(err).To(HaveOccurred()) 
wg.Wait() @@ -517,11 +517,11 @@ var _ = Describe("NodeClaim/Expiration", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) - // deprovisioning won't delete the old nodeClaim until the new nodeClaim is ready + // disruption won't delete the old nodeClaim until the new nodeClaim is ready var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 3) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) wg.Wait() // Cascade any deletion of the nodeClaim to the node diff --git a/pkg/controllers/deprovisioning/pdblimits.go b/pkg/controllers/disruption/pdblimits.go similarity index 99% rename from pkg/controllers/deprovisioning/pdblimits.go rename to pkg/controllers/disruption/pdblimits.go index 62a5ac89ffb0..197b908a05e5 100644 --- a/pkg/controllers/deprovisioning/pdblimits.go +++ b/pkg/controllers/disruption/pdblimits.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "context" diff --git a/pkg/controllers/deprovisioning/singlenodeconsolidation.go b/pkg/controllers/disruption/singlenodeconsolidation.go similarity index 82% rename from pkg/controllers/deprovisioning/singlenodeconsolidation.go rename to pkg/controllers/disruption/singlenodeconsolidation.go index 317ec8c6812d..e1f7071ba885 100644 --- a/pkg/controllers/deprovisioning/singlenodeconsolidation.go +++ b/pkg/controllers/disruption/singlenodeconsolidation.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning +package disruption import ( "context" @@ -27,6 +27,7 @@ import ( "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" + "github.com/aws/karpenter-core/pkg/metrics" ) const SingleNodeConsolidationTimeoutDuration = 3 * time.Minute @@ -41,7 +42,7 @@ func NewSingleNodeConsolidation(clk clock.Clock, cluster *state.Cluster, kubeCli return &SingleNodeConsolidation{consolidation: makeConsolidation(clk, cluster, kubeClient, provisioner, cp, recorder)} } -// ComputeCommand generates a deprovisioning command given deprovisionable NodeClaims +// ComputeCommand generates a disruption command given candidates // nolint:gocyclo func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, candidates ...*Candidate) (Command, error) { if s.isConsolidated() { @@ -51,7 +52,11 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, candidates if err != nil { return Command{}, fmt.Errorf("sorting candidates, %w", err) } - deprovisioningEligibleMachinesGauge.WithLabelValues(s.String()).Set(float64(len(candidates))) + deprovisioningEligibleMachinesGauge.WithLabelValues(s.Type()).Set(float64(len(candidates))) + disruptionEligibleNodesGauge.With(map[string]string{ + methodLabel: s.Type(), + consolidationTypeLabel: s.ConsolidationType(), + }).Set(float64(len(candidates))) v := NewValidation(consolidationTTL, s.clock, s.cluster, s.kubeClient, s.provisioner, s.cloudProvider, s.recorder) @@ -60,8 +65,8 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, candidates // binary search to find the maximum number of NodeClaims we can terminate for i, candidate := range candidates { if s.clock.Now().After(timeout) { - // TODO @joinnis: Change this to singleNodeClaimConsolidationLabelValue when migrating deprovisioningConsolidationTimeoutsCounter.WithLabelValues(singleMachineConsolidationLabelValue).Inc() + deprovisioningConsolidationTimeoutsCounter.WithLabelValues(s.ConsolidationType()).Inc() logging.FromContext(ctx).Debugf("abandoning single-node consolidation due to timeout after evaluating %d candidates", i) return Command{}, nil } @@ -88,3 +93,11 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, candidates s.markConsolidated() return Command{}, nil } + +func (s *SingleNodeConsolidation) Type() string { + return metrics.ConsolidationReason +} + +func (s *SingleNodeConsolidation) ConsolidationType() string { + return "single" +} diff --git a/pkg/controllers/deprovisioning/suite_test.go b/pkg/controllers/disruption/suite_test.go similarity index 94% rename from pkg/controllers/deprovisioning/suite_test.go rename to pkg/controllers/disruption/suite_test.go index 171a53c9f11b..ab52bbcaac9d 100644 --- a/pkg/controllers/deprovisioning/suite_test.go +++ b/pkg/controllers/disruption/suite_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning_test +package disruption_test import ( "context" @@ -41,7 +41,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/deprovisioning" + "github.com/aws/karpenter-core/pkg/controllers/disruption" "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/controllers/state/informer" @@ -58,7 +58,7 @@ import ( var ctx context.Context var env *test.Environment var cluster *state.Cluster -var deprovisioningController *deprovisioning.Controller +var disruptionController *disruption.Controller var prov *provisioning.Provisioner var cloudProvider *fake.CloudProvider var nodeStateController controller.Controller @@ -88,7 +88,7 @@ var _ = BeforeSuite(func() { nodeClaimStateController = informer.NewNodeClaimController(env.Client, cluster) recorder = test.NewEventRecorder() prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), recorder, cloudProvider, cluster) - deprovisioningController = deprovisioning.NewController(fakeClock, env.Client, prov, cloudProvider, recorder, cluster) + disruptionController = disruption.NewController(fakeClock, env.Client, prov, cloudProvider, recorder, cluster) }) var _ = AfterSuite(func() { @@ -217,7 +217,7 @@ var _ = Describe("Disruption Taints", func() { replacementInstance, } }) - It("should remove taints from NodeClaims that were left tainted from a previous deprovisioning action", func() { + It("should remove taints from NodeClaims that were left tainted from a previous disruption action", func() { pod := test.Pod(test.PodOptions{ ResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ @@ -240,13 +240,13 @@ var _ = Describe("Disruption Taints", func() { go func() { defer wg.Done() ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileSucceeded(ctx, disruptionController, client.ObjectKey{}) }() wg.Wait() nodeClaimNode = ExpectNodeExists(ctx, env.Client, nodeClaimNode.Name) Expect(nodeClaimNode.Spec.Taints).ToNot(ContainElement(v1beta1.DisruptionNoScheduleTaint)) }) - It("should add and remove taints from NodeClaims that fail to deprovision", func() { + It("should add and remove taints from NodeClaims that fail to disrupt", func() { nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized pod := test.Pod(test.PodOptions{ ResourceRequirements: v1.ResourceRequirements{ @@ -268,7 +268,7 @@ var _ = Describe("Disruption Taints", func() { go func() { defer wg.Done() ExpectTriggerVerifyAction(&wg) - ExpectReconcileFailed(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileFailed(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -293,7 +293,7 @@ var _ = Describe("Disruption Taints", func() { nodeClaimNode = ExpectNodeExists(ctx, env.Client, nodeClaimNode.Name) Expect(nodeClaimNode.Spec.Taints).ToNot(ContainElement(v1beta1.DisruptionNoScheduleTaint)) }) - It("should add and remove taints from Machines that fail to deprovision", func() { + It("should add and remove taints from Machines that fail to disrupt", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} pod := 
test.Pod(test.PodOptions{ ResourceRequirements: v1.ResourceRequirements{ @@ -317,7 +317,7 @@ var _ = Describe("Disruption Taints", func() { go func() { defer wg.Done() ExpectTriggerVerifyAction(&wg) - ExpectReconcileFailed(ctx, deprovisioningController, client.ObjectKey{}) + ExpectReconcileFailed(ctx, disruptionController, client.ObjectKey{}) }() // Iterate in a loop until we get to the validation action @@ -343,7 +343,7 @@ var _ = Describe("Disruption Taints", func() { }) }) -var _ = Describe("Combined/Deprovisioning", func() { +var _ = Describe("Combined/Disruption", func() { var provisioner *v1alpha5.Provisioner var machine *v1alpha5.Machine var nodePool *v1beta1.NodePool @@ -387,7 +387,7 @@ var _ = Describe("Combined/Deprovisioning", func() { }, }) }) - It("should deprovision all empty Machine and NodeClaims in parallel (Emptiness)", func() { + It("should disrupt all empty Machine and NodeClaims in parallel (Emptiness)", func() { provisioner.Spec.TTLSecondsAfterEmpty = lo.ToPtr[int64](30) nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenEmpty nodePool.Spec.Disruption.ConsolidateAfter = &v1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)} @@ -403,7 +403,7 @@ var _ = Describe("Combined/Deprovisioning", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -418,7 +418,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, nodeClaim, machineNode, nodeClaimNode) }) - It("should deprovision all empty Machine and NodeClaims in parallel (Expiration)", func() { + It("should disrupt all empty Machine and NodeClaims in parallel (Expiration)", func() { provisioner.Spec.TTLSecondsUntilExpired = lo.ToPtr[int64](30) nodePool.Spec.Disruption.ExpireAfter = v1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)} machine.StatusConditions().MarkTrue(v1alpha5.MachineExpired) @@ -433,7 +433,7 @@ var _ = Describe("Combined/Deprovisioning", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -448,7 +448,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, nodeClaim, machineNode, nodeClaimNode) }) - It("should deprovision all empty Machine and NodeClaims in parallel (Drift)", func() { + It("should disrupt all empty Machine and NodeClaims in parallel (Drift)", func() { machine.StatusConditions().MarkTrue(v1alpha5.MachineDrifted) nodeClaim.StatusConditions().MarkTrue(v1beta1.Drifted) @@ -462,7 +462,7 @@ var _ = Describe("Combined/Deprovisioning", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node 
ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -477,7 +477,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, nodeClaim, machineNode, nodeClaimNode) }) - It("should deprovision all empty Machine and NodeClaims in parallel (Consolidation)", func() { + It("should disrupt all empty Machine and NodeClaims in parallel (Consolidation)", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized ExpectApplied(ctx, env.Client, provisioner, nodePool, machine, nodeClaim, machineNode, nodeClaimNode) @@ -490,7 +490,7 @@ var _ = Describe("Combined/Deprovisioning", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -505,7 +505,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(0)) ExpectNotFound(ctx, env.Client, machine, nodeClaim, machineNode, nodeClaimNode) }) - It("should deprovision a Machine and replace with a cheaper NodeClaim", func() { + It("should disrupt a Machine and replace with a cheaper NodeClaim", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -592,7 +592,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machine to the node ExpectMachinesCascadeDeletion(ctx, env.Client, machine) @@ -603,7 +603,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) ExpectNotFound(ctx, env.Client, machine, machineNode) }) - It("should deprovision multiple Machines and replace with a single cheaper NodeClaim", func() { + It("should disrupt multiple Machines and replace with a single cheaper NodeClaim", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -706,7 +706,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machines to the nodes ExpectMachinesCascadeDeletion(ctx, env.Client, machines...) @@ -719,7 +719,7 @@ var _ = Describe("Combined/Deprovisioning", func() { ExpectNotFound(ctx, env.Client, lo.Map(machines, func(m *v1alpha5.Machine, _ int) client.Object { return m })...) 
ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) }) - It("should deprovision a NodeClaim and replace with a cheaper Machine", func() { + It("should disrupt a NodeClaim and replace with a cheaper Machine", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -806,7 +806,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the nodeclaim to the node ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim) @@ -817,7 +817,7 @@ var _ = Describe("Combined/Deprovisioning", func() { Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) ExpectNotFound(ctx, env.Client, nodeClaim, nodeClaimNode) }) - It("should deprovision multiple NodeClaims and replace with a single cheaper Machine", func() { + It("should disrupt multiple NodeClaims and replace with a single cheaper Machine", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -920,7 +920,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the nodeclaims to the nodes ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaims...) @@ -933,7 +933,7 @@ var _ = Describe("Combined/Deprovisioning", func() { ExpectNotFound(ctx, env.Client, lo.Map(nodeClaims, func(nc *v1beta1.NodeClaim, _ int) client.Object { return nc })...) ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) }) - It("should deprovision Machines and NodeClaims to consolidate pods onto a single NodeClaim", func() { + It("should disrupt Machines and NodeClaims to consolidate pods onto a single NodeClaim", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -1069,7 +1069,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machines to the nodes ExpectMachinesCascadeDeletion(ctx, env.Client, machines...) @@ -1084,7 +1084,7 @@ var _ = Describe("Combined/Deprovisioning", func() { ExpectNotFound(ctx, env.Client, lo.Map(nodeClaims, func(nc *v1beta1.NodeClaim, _ int) client.Object { return nc })...) 
ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) }) - It("should deprovision Machines and NodeClaims to consolidate pods onto a single Machine", func() { + It("should disrupt Machines and NodeClaims to consolidate pods onto a single Machine", func() { provisioner.Spec.Consolidation = &v1alpha5.Consolidation{Enabled: lo.ToPtr(true)} nodePool.Spec.Disruption.ConsolidationPolicy = v1beta1.ConsolidationPolicyWhenUnderutilized currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{ @@ -1220,7 +1220,7 @@ var _ = Describe("Combined/Deprovisioning", func() { wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) ExpectMakeNewMachinesReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Cascade any deletion of the machines to the nodes ExpectMachinesCascadeDeletion(ctx, env.Client, machines...) @@ -1249,7 +1249,7 @@ var _ = Describe("Combined/Deprovisioning", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} ExpectTriggerVerifyAction(&wg) - ExpectReconcileSucceeded(ctx, deprovisioningController, types.NamespacedName{}) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) // Expect that the expired nodeclaim is not gone Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -1263,11 +1263,11 @@ var _ = Describe("Combined/Deprovisioning", func() { var _ = Describe("Pod Eviction Cost", func() { const standardPodCost = 1.0 It("should have a standard disruptionCost for a pod with no priority or disruptionCost specified", func() { - cost := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{}) + cost := disruption.GetPodEvictionCost(ctx, &v1.Pod{}) Expect(cost).To(BeNumerically("==", standardPodCost)) }) It("should have a higher disruptionCost for a pod with a positive deletion disruptionCost", func() { - cost := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost := disruption.GetPodEvictionCost(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ v1.PodDeletionCost: "100", }}, @@ -1275,7 +1275,7 @@ var _ = Describe("Pod Eviction Cost", func() { Expect(cost).To(BeNumerically(">", standardPodCost)) }) It("should have a lower disruptionCost for a pod with a positive deletion disruptionCost", func() { - cost := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost := disruption.GetPodEvictionCost(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ v1.PodDeletionCost: "-100", }}, @@ -1283,17 +1283,17 @@ var _ = Describe("Pod Eviction Cost", func() { Expect(cost).To(BeNumerically("<", standardPodCost)) }) It("should have higher costs for higher deletion costs", func() { - cost1 := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost1 := disruption.GetPodEvictionCost(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ v1.PodDeletionCost: "101", }}, }) - cost2 := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost2 := disruption.GetPodEvictionCost(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ v1.PodDeletionCost: "100", }}, }) - cost3 := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost3 := disruption.GetPodEvictionCost(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ v1.PodDeletionCost: "99", }}, @@ -1302,13 +1302,13 @@ var _ = Describe("Pod Eviction Cost", func() { Expect(cost2).To(BeNumerically(">", cost3)) 
}) It("should have a higher disruptionCost for a pod with a higher priority", func() { - cost := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost := disruption.GetPodEvictionCost(ctx, &v1.Pod{ Spec: v1.PodSpec{Priority: ptr.Int32(1)}, }) Expect(cost).To(BeNumerically(">", standardPodCost)) }) It("should have a lower disruptionCost for a pod with a lower priority", func() { - cost := deprovisioning.GetPodEvictionCost(ctx, &v1.Pod{ + cost := disruption.GetPodEvictionCost(ctx, &v1.Pod{ Spec: v1.PodSpec{Priority: ptr.Int32(-1)}, }) Expect(cost).To(BeNumerically("<", standardPodCost)) diff --git a/pkg/controllers/deprovisioning/types.go b/pkg/controllers/disruption/types.go similarity index 85% rename from pkg/controllers/deprovisioning/types.go rename to pkg/controllers/disruption/types.go index 146266d4e70e..600e21a28479 100644 --- a/pkg/controllers/deprovisioning/types.go +++ b/pkg/controllers/disruption/types.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package deprovisioning +package disruption import ( "bytes" @@ -27,7 +27,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/cloudprovider" - deprovisioningevents "github.com/aws/karpenter-core/pkg/controllers/deprovisioning/events" + disruptionevents "github.com/aws/karpenter-core/pkg/controllers/disruption/events" "github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" @@ -35,15 +35,16 @@ import ( nodepoolutil "github.com/aws/karpenter-core/pkg/utils/nodepool" ) -type Deprovisioner interface { - ShouldDeprovision(context.Context, *Candidate) bool +type Method interface { + ShouldDisrupt(context.Context, *Candidate) bool ComputeCommand(context.Context, ...*Candidate) (Command, error) - String() string + Type() string + ConsolidationType() string } type CandidateFilter func(context.Context, *Candidate) bool -// Candidate is a state.StateNode that we are considering for deprovisioning along with extra information to be used in +// Candidate is a state.StateNode that we are considering for disruption along with extra information to be used in // making that determination type Candidate struct { *state.StateNode @@ -71,7 +72,7 @@ func NewCandidate(ctx context.Context, kubeClient client.Client, recorder events return nil, fmt.Errorf("state node isn't initialized") } if _, ok := node.Annotations()[v1beta1.DoNotDisruptAnnotationKey]; ok { - recorder.Publish(deprovisioningevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Disruption is blocked with the %q annotation", v1beta1.DoNotDisruptAnnotationKey))...) + recorder.Publish(disruptionevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Disruption is blocked with the %q annotation", v1beta1.DoNotDisruptAnnotationKey))...) return nil, fmt.Errorf("disruption is blocked through the %q annotation", v1beta1.DoNotDisruptAnnotationKey) } // check whether the node has all the labels we need @@ -80,7 +81,7 @@ func NewCandidate(ctx context.Context, kubeClient client.Client, recorder events v1.LabelTopologyZone, } { if _, ok := node.Labels()[label]; !ok { - recorder.Publish(deprovisioningevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Required label %q doesn't exist", label))...) 
+ recorder.Publish(disruptionevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Required label %q doesn't exist", label))...) return nil, fmt.Errorf("state node doesn't have required label %q", label) } } @@ -92,18 +93,18 @@ func NewCandidate(ctx context.Context, kubeClient client.Client, recorder events instanceTypeMap := nodePoolToInstanceTypesMap[ownerKey] // skip any candidates where we can't determine the nodePool if nodePool == nil || instanceTypeMap == nil { - recorder.Publish(deprovisioningevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Owning %s %q not found", lo.Ternary(ownerKey.IsProvisioner, "provisioner", "nodepool"), ownerKey.Name))...) + recorder.Publish(disruptionevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Owning %s %q not found", lo.Ternary(ownerKey.IsProvisioner, "provisioner", "nodepool"), ownerKey.Name))...) return nil, fmt.Errorf("%s %q can't be resolved for state node", lo.Ternary(ownerKey.IsProvisioner, "provisioner", "nodepool"), ownerKey.Name) } instanceType := instanceTypeMap[node.Labels()[v1.LabelInstanceTypeStable]] // skip any candidates that we can't determine the instance of if instanceType == nil { - recorder.Publish(deprovisioningevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Instance type %q not found", node.Labels()[v1.LabelInstanceTypeStable]))...) + recorder.Publish(disruptionevents.Blocked(node.Node, node.NodeClaim, fmt.Sprintf("Instance type %q not found", node.Labels()[v1.LabelInstanceTypeStable]))...) return nil, fmt.Errorf("instance type '%s' can't be resolved", node.Labels()[v1.LabelInstanceTypeStable]) } // skip the node if it is nominated by a recent provisioning pass to be the target of a pending pod. if node.Nominated() { - recorder.Publish(deprovisioningevents.Blocked(node.Node, node.NodeClaim, "Nominated for a pending pod")...) + recorder.Publish(disruptionevents.Blocked(node.Node, node.NodeClaim, "Nominated for a pending pod")...) return nil, fmt.Errorf("state node is nominated for a pending pod") } pods, err := node.Pods(ctx, kubeClient) diff --git a/pkg/controllers/deprovisioning/validation.go b/pkg/controllers/disruption/validation.go similarity index 95% rename from pkg/controllers/deprovisioning/validation.go rename to pkg/controllers/disruption/validation.go index 4b26a5b3b869..b81ad700fe32 100644 --- a/pkg/controllers/deprovisioning/validation.go +++ b/pkg/controllers/disruption/validation.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deprovisioning +package disruption import ( "context" @@ -73,7 +73,7 @@ func (v *Validation) IsValid(ctx context.Context, cmd Command) (bool, error) { case <-v.clock.After(waitDuration): } } - validationCandidates, err := GetCandidates(ctx, v.cluster, v.kubeClient, v.recorder, v.clock, v.cloudProvider, v.ShouldDeprovision) + validationCandidates, err := GetCandidates(ctx, v.cluster, v.kubeClient, v.recorder, v.clock, v.cloudProvider, v.ShouldDisrupt) if err != nil { return false, fmt.Errorf("constructing validation candidates, %w", err) } @@ -102,15 +102,15 @@ func (v *Validation) IsValid(ctx context.Context, cmd Command) (bool, error) { return isValid, nil } -// ShouldDeprovision is a predicate used to filter deprovisionable candidates -func (v *Validation) ShouldDeprovision(_ context.Context, c *Candidate) bool { +// ShouldDisrupt is a predicate used to filter candidates +func (v *Validation) ShouldDisrupt(_ context.Context, c *Candidate) bool { if c.Annotations()[v1alpha5.DoNotConsolidateNodeAnnotationKey] == "true" { return false } return c.nodePool.Spec.Disruption.ConsolidationPolicy == v1beta1.ConsolidationPolicyWhenUnderutilized } -// ValidateCommand validates a command for a deprovisioner +// ValidateCommand validates a command for a Method func (v *Validation) ValidateCommand(ctx context.Context, cmd Command, candidates []*Candidate) (bool, error) { // None of the chosen candidate are valid for execution, so retry if len(candidates) == 0 { diff --git a/pkg/controllers/nodeclaim/consistency/termination.go b/pkg/controllers/nodeclaim/consistency/termination.go index fdca3a6610d8..94b9411eef6a 100644 --- a/pkg/controllers/nodeclaim/consistency/termination.go +++ b/pkg/controllers/nodeclaim/consistency/termination.go @@ -22,7 +22,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/apis/v1beta1" - "github.com/aws/karpenter-core/pkg/controllers/deprovisioning" + "github.com/aws/karpenter-core/pkg/controllers/disruption" nodeutils "github.com/aws/karpenter-core/pkg/utils/node" ) @@ -42,7 +42,7 @@ func (t *Termination) Check(ctx context.Context, node *v1.Node, nodeClaim *v1bet if nodeClaim.DeletionTimestamp.IsZero() { return nil, nil } - pdbs, err := deprovisioning.NewPDBLimits(ctx, t.kubeClient) + pdbs, err := disruption.NewPDBLimits(ctx, t.kubeClient) if err != nil { return nil, err }