From ca088d26c28f6ec6033ca907f477cd3a70277081 Mon Sep 17 00:00:00 2001 From: Maria Oparka Date: Sun, 2 Apr 2023 21:19:14 +0200 Subject: [PATCH] Move MaxNodeProvisionTime to NodeGroupAutoscalingOptions --- .../cloudprovider/gce/gce_manager.go | 3 + .../cloudprovider/gce/gce_manager_test.go | 4 + .../clusterstate/clusterstate.go | 33 +++++-- .../clusterstate/clusterstate_test.go | 64 +++++++----- .../max_node_provision_time_provider.go | 54 ++++++++++ .../config/autoscaling_options.go | 4 +- cluster-autoscaler/config/const.go | 2 + .../core/scaledown/actuation/actuator_test.go | 4 +- .../actuation/delete_in_batch_test.go | 2 +- .../core/scaledown/legacy/legacy_test.go | 22 ++--- .../scaleup/orchestrator/orchestrator_test.go | 16 +-- cluster-autoscaler/core/static_autoscaler.go | 35 ++++--- .../core/static_autoscaler_test.go | 99 ++++++++++--------- cluster-autoscaler/main.go | 4 +- .../node_group_config_processor.go | 14 +++ .../node_group_config_processor_test.go | 14 +++ 16 files changed, 258 insertions(+), 116 deletions(-) create mode 100644 cluster-autoscaler/clusterstate/max_node_provision_time_provider.go diff --git a/cluster-autoscaler/cloudprovider/gce/gce_manager.go b/cluster-autoscaler/cloudprovider/gce/gce_manager.go index ba8d6e9d7491..348674a25d26 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_manager.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_manager.go @@ -575,6 +575,9 @@ func (m *gceManagerImpl) GetMigOptions(mig Mig, defaults config.NodeGroupAutosca if opt, ok := getDurationOption(options, migRef.Name, config.DefaultScaleDownUnreadyTimeKey); ok { defaults.ScaleDownUnreadyTime = opt } + if opt, ok := getDurationOption(options, migRef.Name, config.DefaultMaxNodeProvisionTimeKey); ok { + defaults.MaxNodeProvisionTime = opt + } return &defaults } diff --git a/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go b/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go index 53dfdf89b71a..b37b1ba005e4 100644 --- a/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go +++ b/cluster-autoscaler/cloudprovider/gce/gce_manager_test.go @@ -1520,6 +1520,7 @@ func TestGetMigOptions(t *testing.T) { ScaleDownGpuUtilizationThreshold: 0.2, ScaleDownUnneededTime: time.Second, ScaleDownUnreadyTime: time.Minute, + MaxNodeProvisionTime: 15 * time.Minute, } cases := []struct { @@ -1539,12 +1540,14 @@ func TestGetMigOptions(t *testing.T) { config.DefaultScaleDownUtilizationThresholdKey: "0.7", config.DefaultScaleDownUnneededTimeKey: "1h", config.DefaultScaleDownUnreadyTimeKey: "30m", + config.DefaultMaxNodeProvisionTimeKey: "60m", }, expected: &config.NodeGroupAutoscalingOptions{ ScaleDownGpuUtilizationThreshold: 0.6, ScaleDownUtilizationThreshold: 0.7, ScaleDownUnneededTime: time.Hour, ScaleDownUnreadyTime: 30 * time.Minute, + MaxNodeProvisionTime: 60 * time.Minute, }, }, { @@ -1558,6 +1561,7 @@ func TestGetMigOptions(t *testing.T) { ScaleDownUtilizationThreshold: defaultOptions.ScaleDownUtilizationThreshold, ScaleDownUnneededTime: time.Minute, ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime, + MaxNodeProvisionTime: 15 * time.Minute, }, }, { diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 86721cddedce..9665d83a431e 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -45,6 +45,11 @@ const ( MaxNodeStartupTime = 15 * time.Minute ) +type maxNodeProvisionTimeProvider interface { + // GetMaxNodeProvisionTime returns MaxNodeProvisionTime 
value that should be used for the given NodeGroup. + GetMaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error) +} + // ScaleUpRequest contains information about the requested node group scale up. type ScaleUpRequest struct { // NodeGroup is the node group to be scaled up. @@ -76,8 +81,6 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int - // Maximum time CA waits for node to be provisioned - MaxNodeProvisionTime time.Duration } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -132,6 +135,7 @@ type ClusterStateRegistry struct { previousCloudProviderNodeInstances map[string][]cloudprovider.Instance cloudProviderNodeInstancesCache *utils.CloudProviderNodeInstancesCache interrupt chan struct{} + maxNodeProvisionTimeProvider maxNodeProvisionTimeProvider // scaleUpFailures contains information about scale-up failures for each node group. It should be // cleared periodically to avoid unnecessary accumulation. @@ -139,7 +143,7 @@ type ClusterStateRegistry struct { } // NewClusterStateRegistry creates new ClusterStateRegistry. -func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff) *ClusterStateRegistry { +func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, maxNodeProvisionTimeProvider maxNodeProvisionTimeProvider) *ClusterStateRegistry { emptyStatus := &api.ClusterAutoscalerStatus{ ClusterwideConditions: make([]api.ClusterAutoscalerCondition, 0), NodeGroupStatuses: make([]api.NodeGroupStatus, 0), @@ -163,6 +167,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C cloudProviderNodeInstancesCache: utils.NewCloudProviderNodeInstancesCache(cloudProvider), interrupt: make(chan struct{}), scaleUpFailures: make(map[string][]ScaleUpFailure), + maxNodeProvisionTimeProvider: maxNodeProvisionTimeProvider, } } @@ -188,14 +193,25 @@ func (csr *ClusterStateRegistry) RegisterOrUpdateScaleUp(nodeGroup cloudprovider csr.registerOrUpdateScaleUpNoLock(nodeGroup, delta, currentTime) } +// MaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for the given NodeGroup. 
+func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error) { + return csr.maxNodeProvisionTimeProvider.GetMaxNodeProvisionTime(nodeGroup) +} + func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) { + maxNodeProvisionTime, err := csr.maxNodeProvisionTimeProvider.GetMaxNodeProvisionTime(nodeGroup) + if err != nil { + klog.Warningf("Couldn't update scale up request: failed to get maxNodeProvisionTime for node group %s: %v", nodeGroup.Id(), err) + return + } + scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()] if !found && delta > 0 { scaleUpRequest = &ScaleUpRequest{ NodeGroup: nodeGroup, Increase: delta, Time: currentTime, - ExpectedAddTime: currentTime.Add(csr.config.MaxNodeProvisionTime), + ExpectedAddTime: currentTime.Add(maxNodeProvisionTime), } csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest return @@ -217,7 +233,7 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr if delta > 0 { // if we are actually adding new nodes shift Time and ExpectedAddTime scaleUpRequest.Time = currentTime - scaleUpRequest.ExpectedAddTime = currentTime.Add(csr.config.MaxNodeProvisionTime) + scaleUpRequest.ExpectedAddTime = currentTime.Add(maxNodeProvisionTime) } } @@ -589,7 +605,12 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) { continue } perNgCopy := perNodeGroup[nodeGroup.Id()] - if unregistered.UnregisteredSince.Add(csr.config.MaxNodeProvisionTime).Before(currentTime) { + maxNodeProvisionTime, err := csr.maxNodeProvisionTimeProvider.GetMaxNodeProvisionTime(nodeGroup) + if err != nil { + klog.Warningf("Failed to get maxNodeProvisionTime for node %s in node group %s: %v", unregistered.Node.Name, nodeGroup.Id(), err) + continue + } + if unregistered.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) { perNgCopy.LongUnregistered = append(perNgCopy.LongUnregistered, unregistered.Node.Name) total.LongUnregistered = append(total.LongUnregistered, unregistered.Node.Name) } else { diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 020b17640e40..d0147009f126 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -72,8 +72,8 @@ func TestOKWithScaleUp(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: time.Minute, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(time.Minute)) clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 4, time.Now()) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) @@ -114,8 +114,8 @@ func TestEmptyOK(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: time.Minute, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{}, nil, now.Add(-5*time.Second)) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) @@ -155,7 +155,7 @@ func TestOKOneUnreadyNode(t *testing.T) { clusterstate := NewClusterStateRegistry(provider,
ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) @@ -193,7 +193,8 @@ func TestNodeWithoutNodeGroupDontCrash(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{noNgNode}, nil, now) assert.NoError(t, err) assert.Empty(t, clusterstate.GetScaleUpFailures()) @@ -220,7 +221,8 @@ func TestOKOneUnreadyNodeWithScaleDownCandidate(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) clusterstate.UpdateScaleDownCandidates([]*apiv1.Node{ng1_1}, now) @@ -285,7 +287,8 @@ func TestMissingNodes(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) @@ -327,7 +330,8 @@ func TestTooManyUnready(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) assert.False(t, clusterstate.IsClusterHealthy()) @@ -356,7 +360,8 @@ func TestUnreadyLongAfterCreation(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) assert.Equal(t, 1, len(clusterstate.GetClusterReadiness().Unready)) @@ -388,7 +393,8 @@ func TestNotStarted(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) assert.NoError(t, err) assert.Equal(t, 1, len(clusterstate.GetClusterReadiness().NotStarted)) @@ -425,8 +431,7 @@ func TestExpiredScaleUp(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 2 * time.Minute, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), 
NewStaticMaxNodeProvisionTimeProvider(2*time.Minute)) clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 4, now.Add(-3*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) assert.NoError(t, err) @@ -451,7 +456,8 @@ func TestRegisterScaleDown(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) now := time.Now() @@ -520,7 +526,8 @@ func TestUpcomingNodes(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1, ng3_1, ng4_1, ng5_1, ng5_2}, nil, now) assert.NoError(t, err) assert.Empty(t, clusterstate.GetScaleUpFailures()) @@ -567,7 +574,8 @@ func TestTaintBasedNodeDeletion(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2}, nil, now) assert.NoError(t, err) assert.Empty(t, clusterstate.GetScaleUpFailures()) @@ -588,7 +596,8 @@ func TestIncorrectSize(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) now := time.Now() clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now.Add(-5*time.Minute)) incorrect := clusterstate.incorrectNodeGroupSizes["ng1"] @@ -624,8 +633,8 @@ func TestUnregisteredNodes(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(10*time.Second)) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, time.Now().Add(-time.Minute)) assert.NoError(t, err) @@ -674,8 +683,8 @@ func TestCloudProviderDeletedNodes(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(10*time.Second)) now.Add(time.Minute) err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, noNgNode}, nil, now) @@ -876,8 +885,8 @@ func TestScaleUpBackoff(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 120 * time.Second, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(120*time.Second)) // After failed scale-up, node group should be still healthy, but should backoff from scale-ups clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-180*time.Second)) @@ -944,7 +953,8 
@@ func TestGetClusterSize(t *testing.T) { clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - }, fakeLogRecorder, newBackoff()) + }, fakeLogRecorder, newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) // There are 2 actual nodes in 2 node groups with target sizes of 5 and 1. clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1, notAutoscaledNode}, nil, now) @@ -988,10 +998,10 @@ func TestUpdateScaleUp(t *testing.T) { ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: 10, OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, }, fakeLogRecorder, - newBackoff()) + newBackoff(), + NewStaticMaxNodeProvisionTimeProvider(10*time.Second)) clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 100, now) assert.Equal(t, clusterstate.scaleUpRequests["ng1"].Increase, 100) @@ -1029,7 +1039,7 @@ func TestScaleUpFailures(t *testing.T) { fakeClient := &fake.Clientset{} fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff()) + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, now) clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, now) diff --git a/cluster-autoscaler/clusterstate/max_node_provision_time_provider.go b/cluster-autoscaler/clusterstate/max_node_provision_time_provider.go new file mode 100644 index 000000000000..2c37b9c0f911 --- /dev/null +++ b/cluster-autoscaler/clusterstate/max_node_provision_time_provider.go @@ -0,0 +1,54 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clusterstate + +import ( + "time" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/autoscaler/cluster-autoscaler/context" + "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupconfig" +) + +// NewDefaultMaxNodeProvisionTimeProvider returns the default maxNodeProvisionTimeProvider which uses the NodeGroupConfigProcessor. +func NewDefaultMaxNodeProvisionTimeProvider(context *context.AutoscalingContext, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) maxNodeProvisionTimeProvider { + return &defaultMaxNodeProvisionTimeProvider{context: context, nodeGroupConfigProcessor: nodeGroupConfigProcessor} +} + +type defaultMaxNodeProvisionTimeProvider struct { + context *context.AutoscalingContext + nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor +} + +// GetMaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for the given NodeGroup.
+func (p *defaultMaxNodeProvisionTimeProvider) GetMaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error) { + return p.nodeGroupConfigProcessor.GetMaxNodeProvisionTime(p.context, nodeGroup) +} + +// NewStaticMaxNodeProvisionTimeProvider returns a static maxNodeProvisionTimeProvider which returns a constant MaxNodeProvisionTime for every NodeGroup. Can be used for convenient testing. +func NewStaticMaxNodeProvisionTimeProvider(maxNodeProvisionTime time.Duration) maxNodeProvisionTimeProvider { + return &staticMaxNodeProvisionTimeProvider{maxNodeProvisionTime} +} + +type staticMaxNodeProvisionTimeProvider struct { + staticMaxNodeProvisionTime time.Duration +} + +// GetMaxNodeProvisionTime returns constant MaxNodeProvisionTime value that should be used for every NodeGroup. +func (p *staticMaxNodeProvisionTimeProvider) GetMaxNodeProvisionTime(cloudprovider.NodeGroup) (time.Duration, error) { + return p.staticMaxNodeProvisionTime, nil +} diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 7c85e93cddb3..5dc336e45f36 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -44,6 +44,8 @@ type NodeGroupAutoscalingOptions struct { ScaleDownUnneededTime time.Duration // ScaleDownUnreadyTime represents how long an unready node should be unneeded before it is eligible for scale down ScaleDownUnreadyTime time.Duration + // Maximum time CA waits for node to be provisioned + MaxNodeProvisionTime time.Duration } // GCEOptions contain autoscaling options specific to GCE cloud provider. @@ -120,8 +122,6 @@ type AutoscalingOptions struct { // MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before // removing the node from cloud provider.
MaxGracefulTerminationSec int - // Maximum time CA waits for node to be provisioned - MaxNodeProvisionTime time.Duration // MaxTotalUnreadyPercentage is the maximum percentage of unready nodes after which CA halts operations MaxTotalUnreadyPercentage float64 // OkTotalUnreadyCount is the number of allowed unready nodes, irrespective of max-total-unready-percentage diff --git a/cluster-autoscaler/config/const.go b/cluster-autoscaler/config/const.go index 5e4873d7e078..532b41a37d69 100644 --- a/cluster-autoscaler/config/const.go +++ b/cluster-autoscaler/config/const.go @@ -30,4 +30,6 @@ const ( DefaultScaleDownUnneededTimeKey = "scaledownunneededtime" // DefaultScaleDownUnreadyTimeKey identifies ScaleDownUnreadyTime autoscaling option DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime" + // DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option + DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime" ) diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go index 4a64e076a3f4..6c12da493270 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go @@ -829,7 +829,7 @@ func TestStartDeletion(t *testing.T) { if err != nil { t.Fatalf("Couldn't set up autoscaling context: %v", err) } - csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff()) + csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) for _, node := range tc.emptyNodes { err := ctx.ClusterSnapshot.AddNodeWithPods(node, tc.pods[node.Name]) if err != nil { @@ -1078,7 +1078,7 @@ func TestStartDeletionInBatchBasic(t *testing.T) { if err != nil { t.Fatalf("Couldn't set up autoscaling context: %v", err) } - csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff()) + csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) ndt := deletiontracker.NewNodeDeletionTracker(0) actuator := Actuator{ ctx: &ctx, clusterState: csr, nodeDeletionTracker: ndt, diff --git a/cluster-autoscaler/core/scaledown/actuation/delete_in_batch_test.go b/cluster-autoscaler/core/scaledown/actuation/delete_in_batch_test.go index d4bda2f0ef30..d60328e3e707 100644 --- a/cluster-autoscaler/core/scaledown/actuation/delete_in_batch_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/delete_in_batch_test.go @@ -161,7 +161,7 @@ func TestRemove(t *testing.T) { }) ctx, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, fakeClient, nil, provider, nil, nil) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, fakeLogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, fakeLogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) if err != nil { t.Fatalf("Couldn't set up autoscaling context: %v", err) } diff --git a/cluster-autoscaler/core/scaledown/legacy/legacy_test.go b/cluster-autoscaler/core/scaledown/legacy/legacy_test.go index 2f542f9867cc..173df9fdbacb 100644 --- 
a/cluster-autoscaler/core/scaledown/legacy/legacy_test.go +++ b/cluster-autoscaler/core/scaledown/legacy/legacy_test.go @@ -146,7 +146,7 @@ func TestFindUnneededNodes(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd allNodes := []*apiv1.Node{n1, n2, n3, n4, n5, n7, n8, n9} @@ -277,7 +277,7 @@ func TestFindUnneededGPUNodes(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd allNodes := []*apiv1.Node{n1, n2, n3} @@ -392,7 +392,7 @@ func TestFindUnneededWithPerNodeGroupThresholds(t *testing.T) { context, err := NewScaleTestAutoscalingContext(globalOptions, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, allNodes, allPods) @@ -475,7 +475,7 @@ func TestPodsWithPreemptionsFindUnneededNodes(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd @@ -539,7 +539,7 @@ func TestFindUnneededMaxCandidates(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := 
newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd @@ -623,7 +623,7 @@ func TestFindUnneededEmptyNodes(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd @@ -680,7 +680,7 @@ func TestFindUnneededNodePool(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) sd := wrapper.sd clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, pods) @@ -771,7 +771,7 @@ func TestScaleDown(t *testing.T) { assert.NoError(t, err) nodes := []*apiv1.Node{n1, n2} - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, []*apiv1.Pod{p1, p2}) autoscalererr = wrapper.UpdateClusterState(nodes, nodes, nil, time.Now().Add(-5*time.Minute)) @@ -1028,7 +1028,7 @@ func simpleScaleDownEmpty(t *testing.T, config *ScaleTestConfig) { context, err := NewScaleTestAutoscalingContext(config.Options, fakeClient, registry, provider, nil, nil) assert.NoError(t, err) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, config.NodeDeletionTracker) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, []*apiv1.Pod{}) autoscalererr = wrapper.UpdateClusterState(nodes, nodes, nil, time.Now().Add(-5*time.Minute)) @@ -1123,7 +1123,7 @@ func TestNoScaleDownUnready(t *testing.T) { nodes := []*apiv1.Node{n1, n2} // N1 is unready so it requires a bigger unneeded time. 
- clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, []*apiv1.Pod{p2}) autoscalererr = wrapper.UpdateClusterState(nodes, nodes, nil, time.Now().Add(-5*time.Minute)) @@ -1237,7 +1237,7 @@ func TestScaleDownNoMove(t *testing.T) { nodes := []*apiv1.Node{n1, n2} - clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) wrapper := newWrapperForTesting(&context, clusterStateRegistry, nil) clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, nodes, []*apiv1.Pod{p1, p2}) autoscalererr = wrapper.UpdateClusterState(nodes, nodes, nil, time.Now().Add(-5*time.Minute)) diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go index 4dc246ac25d1..8aea1437b41d 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go @@ -534,7 +534,7 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleTestConfig) *ScaleTestResul context.ExpanderStrategy = expander nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, nil, now) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) extraPods := make([]*apiv1.Pod, 0, len(config.ExtraPods)) @@ -696,7 +696,7 @@ func TestScaleUpUnhealthy(t *testing.T) { nodes := []*apiv1.Node{n1, n2} nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, nil, now) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) p3 := BuildTestPod("p-new", 550, 0) @@ -739,7 +739,7 @@ func TestScaleUpNoHelp(t *testing.T) { nodes := []*apiv1.Node{n1} nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, nil, now) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, 
clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) p3 := BuildTestPod("p-new", 500, 0) @@ -808,7 +808,7 @@ func TestScaleUpBalanceGroups(t *testing.T) { assert.NoError(t, err) nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, nil, now) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) pods := make([]*apiv1.Pod, 0) @@ -870,7 +870,7 @@ func TestScaleUpAutoprovisionedNodeGroup(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, fakeClient, listers, provider, nil, nil) assert.NoError(t, err) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) processors := NewTestProcessors(&context) processors.NodeGroupListProcessor = &MockAutoprovisioningNodeGroupListProcessor{T: t} @@ -925,7 +925,7 @@ func TestScaleUpBalanceAutoprovisionedNodeGroups(t *testing.T) { context, err := NewScaleTestAutoscalingContext(options, fakeClient, listers, provider, nil, nil) assert.NoError(t, err) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) processors := NewTestProcessors(&context) processors.NodeGroupListProcessor = &MockAutoprovisioningNodeGroupListProcessor{T: t} @@ -986,7 +986,7 @@ func TestScaleUpToMeetNodeGroupMinSize(t *testing.T) { nodes := []*apiv1.Node{n1, n2} nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&context, nodes, []*appsv1.DaemonSet{}, nil, time.Now()) processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) suOrchestrator := New() @@ -1059,7 +1059,7 @@ func TestAuthError(t *testing.T) { nodeGroup.On("IncreaseSize", 0).Return(errors.NewAutoscalerError(errors.AutoscalerErrorType("abcd"), "")) processors := NewTestProcessors(&context) - clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff(), 
clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) suOrchestrator := New() suOrchestrator.Initialize(&context, processors, clusterStateRegistry, nil) scaleUpOrchestrator := suOrchestrator.(*ScaleUpOrchestrator) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 284e53fe1404..844ec4c0bd64 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -158,7 +158,6 @@ func NewStaticAutoscaler( clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, OkTotalUnreadyCount: opts.OkTotalUnreadyCount, - MaxNodeProvisionTime: opts.MaxNodeProvisionTime, } ignoredTaints := make(taints.TaintKeySet) @@ -167,7 +166,7 @@ func NewStaticAutoscaler( ignoredTaints[taintKey] = true } - clusterStateRegistry := clusterstate.NewClusterStateRegistry(autoscalingContext.CloudProvider, clusterStateConfig, autoscalingContext.LogRecorder, backoff) + clusterStateRegistry := clusterstate.NewClusterStateRegistry(autoscalingContext.CloudProvider, clusterStateConfig, autoscalingContext.LogRecorder, backoff, clusterstate.NewDefaultMaxNodeProvisionTimeProvider(autoscalingContext, processors.NodeGroupConfigProcessor)) processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry) deleteOptions := simulator.NewNodeDeleteOptions(opts) @@ -707,7 +706,11 @@ func fixNodeGroupSize(context *context.AutoscalingContext, clusterStateRegistry if incorrectSize == nil { continue } - if incorrectSize.FirstObserved.Add(context.MaxNodeProvisionTime).Before(currentTime) { + maxNodeProvisionTime, err := clusterStateRegistry.MaxNodeProvisionTime(nodeGroup) + if err != nil { + return false, fmt.Errorf("failed to retrieve maxNodeProvisionTime for nodeGroup %s", nodeGroup.Id()) + } + if incorrectSize.FirstObserved.Add(maxNodeProvisionTime).Before(currentTime) { delta := incorrectSize.CurrentSize - incorrectSize.ExpectedSize if delta < 0 { klog.V(0).Infof("Decreasing size of %s, expected=%d current=%d delta=%d", nodeGroup.Id(), @@ -729,17 +732,23 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) { removedAny := false for _, unregisteredNode := range unregisteredNodes { - if unregisteredNode.UnregisteredSince.Add(context.MaxNodeProvisionTime).Before(currentTime) { + nodeGroup, err := context.CloudProvider.NodeGroupForNode(unregisteredNode.Node) + if err != nil { + klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err) + return removedAny, err + } + if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { + klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name) + continue + } + + maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup) + if err != nil { + return removedAny, fmt.Errorf("failed to retrieve maxNodeProvisionTime for node %s in nodeGroup %s", unregisteredNode.Node.Name, nodeGroup.Id()) + } + + if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) { klog.V(0).Infof("Removing unregistered node %v", unregisteredNode.Node.Name) - nodeGroup, err := context.CloudProvider.NodeGroupForNode(unregisteredNode.Node) - if err != nil { - klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err) - return removedAny, err - } - if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { - 
klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name) - continue - } size, err := nodeGroup.TargetSize() if err != nil { klog.Warningf("Failed to get node group size; unregisteredNode=%v; nodeGroup=%v; err=%v", unregisteredNode.Node.Name, nodeGroup.Id(), err) diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 20628cf6b36f..8857af9313e2 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -200,6 +200,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { ScaleDownUnneededTime: time.Minute, ScaleDownUnreadyTime: time.Minute, ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, EnforceNodeGroupMinSize: true, @@ -221,12 +222,10 @@ func TestStaticAutoscalerRunOnce(t *testing.T) { context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } - processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, nil) @@ -414,6 +413,7 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) { ScaleDownUnneededTime: time.Minute, ScaleDownUnreadyTime: time.Minute, ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, ScaleDownEnabled: true, @@ -440,10 +440,9 @@ func TestStaticAutoscalerRunOnceWithAutoprovisionedEnabled(t *testing.T) { context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 0, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 0, } - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) suOrchestrator := orchestrator.New() @@ -564,13 +563,13 @@ func TestStaticAutoscalerRunOnceWithALongUnregisteredNode(t *testing.T) { ScaleDownUnneededTime: time.Minute, ScaleDownUnreadyTime: time.Minute, ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, - EstimatorName: estimator.BinpackingEstimatorName, - ScaleDownEnabled: true, - MaxNodesTotal: 10, - MaxCoresTotal: 10, - MaxMemoryTotal: 100000, - MaxNodeProvisionTime: 10 * time.Second, + EstimatorName: estimator.BinpackingEstimatorName, + ScaleDownEnabled: true, + MaxNodesTotal: 10, + MaxCoresTotal: 10, + MaxMemoryTotal: 100000, } processorCallbacks := newStaticAutoscalerProcessorCallbacks() @@ -585,10 +584,9 @@ func TestStaticAutoscalerRunOnceWithALongUnregisteredNode(t *testing.T) { 
context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) // broken node detected as unregistered nodes := []*apiv1.Node{n1} @@ -723,6 +721,7 @@ func TestStaticAutoscalerRunOncePodsWithPriorities(t *testing.T) { ScaleDownUnneededTime: time.Minute, ScaleDownUtilizationThreshold: 0.5, ScaleDownUnreadyTime: time.Minute, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, ScaleDownEnabled: true, @@ -745,12 +744,11 @@ func TestStaticAutoscalerRunOncePodsWithPriorities(t *testing.T) { context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) suOrchestrator := orchestrator.New() suOrchestrator.Initialize(&context, processors, clusterState, nil) @@ -859,6 +857,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnBinPackingEstimator(t *testing.T) options := config.AutoscalingOptions{ NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, ScaleDownEnabled: false, @@ -880,12 +879,11 @@ func TestStaticAutoscalerRunOnceWithFilteringOnBinPackingEstimator(t *testing.T) context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) autoscaler := &StaticAutoscaler{ @@ -959,6 +957,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t * options := config.AutoscalingOptions{ NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, ScaleDownEnabled: false, @@ -980,12 +979,11 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t * context.ListerRegistry = listerRegistry clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - 
MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } processors := NewTestProcessors(&context) - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime)) sdPlanner, sdActuator := newScaleDownPlannerAndActuator(t, &context, processors, clusterState) autoscaler := &StaticAutoscaler{ @@ -1024,6 +1022,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { ScaleDownUnneededTime: time.Minute, ScaleDownUnreadyTime: time.Minute, ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, }, EstimatorName: estimator.BinpackingEstimatorName, ScaleDownEnabled: true, @@ -1038,11 +1037,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { assert.NoError(t, err) clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - OkTotalUnreadyCount: 1, - MaxNodeProvisionTime: 10 * time.Second, + OkTotalUnreadyCount: 1, } - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + staticMaxNodeProvisionTimeProvider := clusterstate.NewStaticMaxNodeProvisionTimeProvider(options.NodeGroupDefaults.MaxNodeProvisionTime) + + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), staticMaxNodeProvisionTimeProvider) autoscaler := &StaticAutoscaler{ AutoscalingContext: &context, clusterStateRegistry: clusterState, @@ -1060,6 +1060,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { nodeGroupA.On("TargetSize").Return(5, nil) nodeGroupA.On("Id").Return("A") nodeGroupA.On("DeleteNodes", mock.Anything).Return(nil) + nodeGroupA.On("GetOptions", options.NodeGroupDefaults).Return(&options.NodeGroupDefaults, nil) nodeGroupA.On("Nodes").Return([]cloudprovider.Instance{ { Id: "A1", @@ -1120,6 +1121,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { nodeGroupB.On("TargetSize").Return(5, nil) nodeGroupB.On("Id").Return("B") nodeGroupB.On("DeleteNodes", mock.Anything).Return(nil) + nodeGroupB.On("GetOptions", options.NodeGroupDefaults).Return(&options.NodeGroupDefaults, nil) nodeGroupB.On("Nodes").Return([]cloudprovider.Instance{ { Id: "B1", @@ -1257,6 +1259,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { nodeGroupC.On("TargetSize").Return(1, nil) nodeGroupC.On("Id").Return("C") nodeGroupC.On("DeleteNodes", mock.Anything).Return(nil) + nodeGroupC.On("GetOptions", options.NodeGroupDefaults).Return(&options.NodeGroupDefaults, nil) nodeGroupC.On("Nodes").Return([]cloudprovider.Instance{ { Id: "C1", @@ -1277,7 +1280,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { return false }, nil) - clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff()) + clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), staticMaxNodeProvisionTimeProvider) clusterState.RefreshCloudProviderNodeInstancesCache() autoscaler.clusterStateRegistry = clusterState @@ -1407,7 +1410,7 @@ func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) { // Create CSR with unhealthy cluster protection effectively disabled, to guarantee we reach the tested logic. 
csrConfig := clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: nodeGroupCount * unreadyNodesCount} - csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff()) + csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff(), clusterstate.NewStaticMaxNodeProvisionTimeProvider(15*time.Minute)) // Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test. actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), simulator.NodeDeleteOptions{}) @@ -1491,20 +1494,24 @@ func TestRemoveFixNodeTargetSize(t *testing.T) { fakeClient := &fake.Clientset{} fakeLogRecorder, _ := clusterstate_utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - }, fakeLogRecorder, NewBackoff()) - err := clusterState.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now.Add(-time.Hour)) - assert.NoError(t, err) context := &context.AutoscalingContext{ AutoscalingOptions: config.AutoscalingOptions{ - MaxNodeProvisionTime: 45 * time.Minute, + NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ + MaxNodeProvisionTime: 45 * time.Minute, + }, }, CloudProvider: provider, } + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + }, fakeLogRecorder, NewBackoff(), + clusterstate.NewStaticMaxNodeProvisionTimeProvider(context.AutoscalingOptions.NodeGroupDefaults.MaxNodeProvisionTime)) + err := clusterState.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now.Add(-time.Hour)) + assert.NoError(t, err) + // Nothing should be fixed. The incorrect size state is not old enough. 
removed, err := fixNodeGroupSize(context, clusterState, now.Add(-50*time.Minute)) assert.NoError(t, err) @@ -1537,19 +1544,23 @@ func TestRemoveOldUnregisteredNodes(t *testing.T) { fakeClient := &fake.Clientset{} fakeLogRecorder, _ := clusterstate_utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - }, fakeLogRecorder, NewBackoff()) - err := clusterState.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now.Add(-time.Hour)) - assert.NoError(t, err) context := &context.AutoscalingContext{ AutoscalingOptions: config.AutoscalingOptions{ - MaxNodeProvisionTime: 45 * time.Minute, + NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ + MaxNodeProvisionTime: 45 * time.Minute, + }, }, CloudProvider: provider, } + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + }, fakeLogRecorder, NewBackoff(), + clusterstate.NewStaticMaxNodeProvisionTimeProvider(context.AutoscalingOptions.NodeGroupDefaults.MaxNodeProvisionTime)) + err := clusterState.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now.Add(-time.Hour)) + assert.NoError(t, err) + unregisteredNodes := clusterState.GetUnregisteredNodes() assert.Equal(t, 1, len(unregisteredNodes)) diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index e364dae3974e..0c44cf908c4f 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -148,7 +148,7 @@ var ( maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. 
After this is exceeded, CA halts operations") okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage") scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there 0 ready nodes.") - maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "Maximum time CA waits for node to be provisioned") + maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group") maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up") nodeGroupsFlag = multiStringFlag( "nodes", @@ -257,6 +257,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold, ScaleDownUnneededTime: *scaleDownUnneededTime, ScaleDownUnreadyTime: *scaleDownUnreadyTime, + MaxNodeProvisionTime: *maxNodeProvisionTime, }, CloudConfig: *cloudConfig, CloudProviderName: *cloudProviderFlag, @@ -274,7 +275,6 @@ func createAutoscalingOptions() config.AutoscalingOptions { MaxBulkSoftTaintTime: *maxBulkSoftTaintTime, MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag, MaxGracefulTerminationSec: *maxGracefulTerminationFlag, - MaxNodeProvisionTime: *maxNodeProvisionTime, MaxPodEvictionTime: *maxPodEvictionTime, MaxNodesTotal: *maxNodesTotal, MaxCoresTotal: maxCoresTotal, diff --git a/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor.go b/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor.go index 74ce0efbce68..5d5c80aed3ac 100644 --- a/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor.go +++ b/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor.go @@ -33,6 +33,8 @@ type NodeGroupConfigProcessor interface { GetScaleDownUtilizationThreshold(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (float64, error) // GetScaleDownGpuUtilizationThreshold returns ScaleDownGpuUtilizationThreshold value that should be used for a given NodeGroup. GetScaleDownGpuUtilizationThreshold(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (float64, error) + // GetMaxNodeProvisionTime return MaxNodeProvisionTime value that should be used for a given NodeGroup. + GetMaxNodeProvisionTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) // CleanUp cleans up processor's internal structures. CleanUp() } @@ -91,6 +93,18 @@ func (p *DelegatingNodeGroupConfigProcessor) GetScaleDownGpuUtilizationThreshold return ngConfig.ScaleDownGpuUtilizationThreshold, nil } +// GetMaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for a given NodeGroup. +func (p *DelegatingNodeGroupConfigProcessor) GetMaxNodeProvisionTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) { + ngConfig, err := nodeGroup.GetOptions(context.NodeGroupDefaults) + if err != nil && err != cloudprovider.ErrNotImplemented { + return time.Duration(0), err + } + if ngConfig == nil || err == cloudprovider.ErrNotImplemented { + return context.NodeGroupDefaults.MaxNodeProvisionTime, nil + } + return ngConfig.MaxNodeProvisionTime, nil +} + // CleanUp cleans up processor's internal structures. 
func (p *DelegatingNodeGroupConfigProcessor) CleanUp() { } diff --git a/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor_test.go b/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor_test.go index aabb1838ceed..4046424afd58 100644 --- a/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor_test.go +++ b/cluster-autoscaler/processors/nodegroupconfig/node_group_config_processor_test.go @@ -48,12 +48,14 @@ func TestDelegatingNodeGroupConfigProcessor(t *testing.T) { ScaleDownUnreadyTime: 4 * time.Minute, ScaleDownGpuUtilizationThreshold: 0.6, ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 15 * time.Minute, } ngOpts := &config.NodeGroupAutoscalingOptions{ ScaleDownUnneededTime: 10 * time.Minute, ScaleDownUnreadyTime: 11 * time.Minute, ScaleDownGpuUtilizationThreshold: 0.85, ScaleDownUtilizationThreshold: 0.75, + MaxNodeProvisionTime: 60 * time.Minute, } testUnneededTime := func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) { @@ -96,17 +98,29 @@ func TestDelegatingNodeGroupConfigProcessor(t *testing.T) { } assert.Equal(t, res, results[w]) } + testMaxNodeProvisionTime := func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) { + res, err := p.GetMaxNodeProvisionTime(c, ng) + assert.Equal(t, err, we) + results := map[Want]time.Duration{ + NIL: time.Duration(0), + GLOBAL: 15 * time.Minute, + NG: 60 * time.Minute, + } + assert.Equal(t, res, results[w]) + } funcs := map[string]func(*testing.T, DelegatingNodeGroupConfigProcessor, *context.AutoscalingContext, cloudprovider.NodeGroup, Want, error){ "ScaleDownUnneededTime": testUnneededTime, "ScaleDownUnreadyTime": testUnreadyTime, "ScaleDownUtilizationThreshold": testUtilizationThreshold, "ScaleDownGpuUtilizationThreshold": testGpuThreshold, + "MaxNodeProvisionTime": testMaxNodeProvisionTime, "MultipleOptions": func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) { testUnneededTime(t, p, c, ng, w, we) testUnreadyTime(t, p, c, ng, w, we) testUtilizationThreshold(t, p, c, ng, w, we) testGpuThreshold(t, p, c, ng, w, we) + testMaxNodeProvisionTime(t, p, c, ng, w, we) }, "RepeatingTheSameCallGivesConsistentResults": func(t *testing.T, p DelegatingNodeGroupConfigProcessor, c *context.AutoscalingContext, ng cloudprovider.NodeGroup, w Want, we error) { testUnneededTime(t, p, c, ng, w, we)