Merge pull request kubernetes#4700 from mongodb-forks/cluster-autoscaler-release-1.21-nodegroup-minmax

[cluster-autoscaler] backport kubernetes#4022 Publish node group min/max metrics into 1.21
Shubham82 · Apr 25, 2022 · 0ad1731 · 0ad1731
2 parents da4baf7 + 482cb1e
commit 0ad1731
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 3 deletions.
diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
@@ -673,6 +673,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
 | `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...> | ""
 | `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.<br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]`<br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`<br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`<br> Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.<br>Can be used multiple times | ""
+| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
 | `estimator` | Type of resource estimator to be used in scale up | binpacking
 | `expander` | Type of node group expander to be used in scale up.  | random
 | `write-status-configmap` | Should CA write status information to a configmap  | true

diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go
@@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
 }
 
 func TestAuthError(t *testing.T) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(false)
 	context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
 	assert.NoError(t, err)
 

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
@@ -258,6 +258,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
+	// Update node groups min/max after cloud provider refresh
+	for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
+		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
+		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
+	}
+
 	nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
 	// Initialize cluster state to ClusterSnapshot
 	if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {

diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
@@ -180,6 +180,8 @@ var (
 	cordonNodeBeforeTerminate          = flag.Bool("cordon-node-before-terminating", false, "Should CA cordon nodes before terminating during downscale process")
 	daemonSetEvictionForEmptyNodes     = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
 	userAgent                          = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
+
+	emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
 )
 
 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -339,7 +341,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
 }
 
 func run(healthCheck *metrics.HealthCheck) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(*emitPerNodeGroupMetrics)
 
 	autoscaler, err := buildAutoscaler()
 	if err != nil {

diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
@@ -138,6 +138,22 @@ var (
 		},
 	)
 
+	nodesGroupMinNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_min_count",
+			Help:      "Minimum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
+	nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_max_count",
+			Help:      "Maximum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -282,7 +298,7 @@ var (
 )
 
 // RegisterAll registers all metrics.
-func RegisterAll() {
+func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(clusterSafeToAutoscale)
 	legacyregistry.MustRegister(nodesCount)
 	legacyregistry.MustRegister(nodeGroupsCount)
@@ -305,6 +321,11 @@ func RegisterAll() {
 	legacyregistry.MustRegister(napEnabled)
 	legacyregistry.MustRegister(nodeGroupCreationCount)
 	legacyregistry.MustRegister(nodeGroupDeletionCount)
+
+	if emitPerNodeGroupMetrics {
+		legacyregistry.MustRegister(nodesGroupMinNodes)
+		legacyregistry.MustRegister(nodesGroupMaxNodes)
+	}
 }
 
 // UpdateDurationFromStart records the duration of the step identified by the
@@ -364,6 +385,16 @@ func UpdateMaxNodesCount(nodesCount int) {
 	maxNodesCount.Set(float64(nodesCount))
 }
 
+// UpdateNodeGroupMin sets the node_group_min_count gauge for the given node group to minNodes. NOTE(review): the gauge is only registered when RegisterAll receives emitPerNodeGroupMetrics=true; confirm that Set is harmless otherwise.
+func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
+	nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes)) // nodeGroup becomes the value of the "node_group" label
+}
+
+// UpdateNodeGroupMax sets the node_group_max_count gauge for the given node group to maxNodes. NOTE(review): the gauge is only registered when RegisterAll receives emitPerNodeGroupMetrics=true; confirm that Set is harmless otherwise.
+func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
+	nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes)) // nodeGroup becomes the value of the "node_group" label
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {

diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
@@ -0,0 +1,44 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestDisabledPerNodeGroupMetrics(t *testing.T) { // Intended to check that RegisterAll(false) leaves both per-node-group gauges uncreated.
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.") // Skipped unconditionally, so none of the statements below run.
+	RegisterAll(false)
+	assert.False(t, nodesGroupMinNodes.IsCreated())
+	assert.False(t, nodesGroupMaxNodes.IsCreated())
+}
+
+func TestEnabledPerNodeGroupMetrics(t *testing.T) { // Intended to check that RegisterAll(true) creates both per-node-group gauges and that the Update helpers set them.
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.") // Skipped unconditionally, so none of the statements below run.
+	RegisterAll(true)
+	assert.True(t, nodesGroupMinNodes.IsCreated())
+	assert.True(t, nodesGroupMaxNodes.IsCreated())
+
+	UpdateNodeGroupMin("foo", 2)
+	UpdateNodeGroupMax("foo", 100)
+
+	assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo")))) // read back the value stored under node_group="foo"
+	assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo")))) // read back the value stored under node_group="foo"
+}