feat: add node group health and back off metrics #6396

Merged
5 changes: 5 additions & 0 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
@@ -267,6 +267,11 @@ const (
InstanceDeleting InstanceState = 3
)

const (
// UnknownErrorCode means that the cloud provider has not provided an error code.
UnknownErrorCode = "unknown"

Member:
Why is this in cloudprovider package?

Contributor Author:
Modifications done and PR submitted

)

// InstanceErrorInfo provides information about error condition on instance
type InstanceErrorInfo struct {
// ErrorClass tells what is class of error on instance
37 changes: 37 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -116,6 +116,9 @@ type ScaleUpFailure struct {
Time time.Time
}

// BackoffReasonStatus contains information about backoff status and reason
type BackoffReasonStatus map[string]int

Member:
Since the values here will always be either 0 or 1, maybe map[string]bool would make more sense?
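
For illustration, a minimal sketch of the bool-valued variant this suggests (hypothetical, not part of this diff):

// BackoffReasonStatus contains information about backoff status and reason;
// true means the node group is currently backed off for that error code.
type BackoffReasonStatus map[string]bool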

Contributor Author:
Modifications done and PR submitted


// ClusterStateRegistry is a structure to keep track the current state of the cluster.
type ClusterStateRegistry struct {
sync.Mutex
@@ -132,6 +135,7 @@ type ClusterStateRegistry struct {
unregisteredNodes map[string]UnregisteredNode
deletedNodes map[string]struct{}
candidatesForScaleDown map[string][]string
backoffReasonStatus map[string]BackoffReasonStatus
backoff backoff.Backoff
lastStatus *api.ClusterAutoscalerStatus
lastScaleDownUpdateTime time.Time
@@ -168,6 +172,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
unregisteredNodes: make(map[string]UnregisteredNode),
deletedNodes: make(map[string]struct{}),
candidatesForScaleDown: make(map[string][]string),
backoffReasonStatus: make(map[string]BackoffReasonStatus),
backoff: backoff,
lastStatus: utils.EmptyClusterAutoscalerStatus(),
logRecorder: logRecorder,
@@ -462,6 +467,38 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// UpdateSafeScaleUpMetricsForNodeGroup queries the health status and backoff situation of the node group and updates metrics
func (csr *ClusterStateRegistry) UpdateSafeScaleUpMetricsForNodeGroup(now time.Time) {

Member:
Both the ForNodeGroup suffix and the comment above suggest this is about a specific node group, while in fact it iterates over all of them.

Contributor Author:
Modifications done and PR submitted

for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
if !nodeGroup.Exist() {
continue
}
metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
csr.updateNodeGroupBackoffStatusMetrics(nodeGroup.Id(), backoffStatus)
}
}

// updateNodeGroupBackoffStatusMetrics updates metrics with the backoff status and reason of the node group
func (csr *ClusterStateRegistry) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
backoffReasonStatus := make(BackoffReasonStatus)

Member:
nit: Do you actually need to allocate this map every time? Zeroing the existing one would have the exact same effect, right? You're iterating over it anyway.
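
A sketch of the in-place variant this nit points at, reusing the stored map instead of allocating a new one on every call (illustrative only; names match the surrounding diff):

backoffReasonStatus, ok := csr.backoffReasonStatus[nodeGroup]
if !ok {
	backoffReasonStatus = make(BackoffReasonStatus)
	csr.backoffReasonStatus[nodeGroup] = backoffReasonStatus
}
// Zero the reasons reported last time instead of allocating a fresh map.
for reason := range backoffReasonStatus {
	backoffReasonStatus[reason] = 0
}
if backoffStatus.IsBackedOff {
	errorCode := backoffStatus.ErrorInfo.ErrorCode
	if errorCode == "" {
		errorCode = cloudprovider.UnknownErrorCode
	}
	backoffReasonStatus[errorCode] = 1
}
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, backoffReasonStatus)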

Contributor Author:
Modifications done and PR submitted

if oldStatus, ok := csr.backoffReasonStatus[nodeGroup]; ok {
for reason := range oldStatus {
backoffReasonStatus[reason] = 0
}
}
if backoffStatus.IsBackedOff {
errorCode := backoffStatus.ErrorInfo.ErrorCode
if errorCode == "" {
// prevent error code from being empty.
errorCode = cloudprovider.UnknownErrorCode
}
backoffReasonStatus[errorCode] = 1
}
csr.backoffReasonStatus[nodeGroup] = backoffReasonStatus
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, backoffReasonStatus)
}

// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
1 change: 1 addition & 0 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -406,6 +406,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
if err != nil {
klog.Errorf("AutoscalingStatusProcessor error: %v.", err)
}
a.clusterStateRegistry.UpdateSafeScaleUpMetricsForNodeGroup(currentTime)

Member:
This would better fit as an AutoscalingStatusProcessor implementation - all the relevant params are already passed in the Process call above. Let's update this function:

func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
return &NoOpAutoscalingStatusProcessor{}
}

to return a new processor that does the metric update. The new processor would also be a better place to keep the backoffReasonStatus map - clusterStateRegistry should be responsible for tracking the cluster state, not for metrics handling.
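
For concreteness, a rough sketch of a processor along these lines (editorial illustration only; the processor name is made up and the Process signature is assumed to match the AutoscalingStatusProcessor interface referenced above):

// metricsAutoscalingStatusProcessor reports node group health and backoff
// metrics after each autoscaling iteration, keeping the per-group
// backoffReasonStatus bookkeeping here instead of in ClusterStateRegistry.
type metricsAutoscalingStatusProcessor struct {
	backoffReasonStatus map[string]clusterstate.BackoffReasonStatus
}

func (p *metricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
	for _, nodeGroup := range context.CloudProvider.NodeGroups() {
		if !nodeGroup.Exist() {
			continue
		}
		metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
		// The backoff status for the group would be queried from csr here and
		// reported via metrics.UpdateNodeGroupBackOffStatus, mirroring the diff above.
	}
	return nil
}

func (p *metricsAutoscalingStatusProcessor) CleanUp() {}

// NewDefaultAutoscalingStatusProcessor would then return the metrics-reporting
// processor instead of the no-op one.
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
	return &metricsAutoscalingStatusProcessor{
		backoffReasonStatus: make(map[string]clusterstate.BackoffReasonStatus),
	}
}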

Contributor Author:
Modifications done and PR submitted

}()

// Check if there are any nodes that failed to register in Kubernetes
38 changes: 38 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
@@ -215,6 +215,22 @@ var (
}, []string{"node_group"},
)

nodesGroupHealthiness = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_healthiness",
Help: "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group"},
)

nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_backoff_status",
Help: "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group", "reason"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
@@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
legacyregistry.MustRegister(nodesGroupTargetSize)
legacyregistry.MustRegister(nodesGroupHealthiness)
legacyregistry.MustRegister(nodeGroupBackOffStatus)
}
}

@@ -543,6 +561,26 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
}
}

// UpdateNodeGroupHealthStatus records whether node group is healthy enough for autoscaling
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
if healthy {
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(1)
} else {
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(0)
}
}

// UpdateNodeGroupBackOffStatus records whether node group is backed off from autoscaling and the reason why
func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]int) {
if len(backoffReasonStatus) == 0 {
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
} else {
for reason, status := range backoffReasonStatus {
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(float64(status))
}
}
}
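
As a usage illustration (hypothetical node group name and error code; caNamespace is assumed to expand to the usual cluster_autoscaler metric prefix):

// A healthy node group that is currently backed off because of a quota error.
UpdateNodeGroupHealthStatus("example-group", true)
UpdateNodeGroupBackOffStatus("example-group", map[string]int{"QUOTA_EXCEEDED": 1})

// Resulting series on the metrics endpoint:
//   cluster_autoscaler_node_group_healthiness{node_group="example-group"} 1
//   cluster_autoscaler_node_group_backoff_status{node_group="example-group",reason="QUOTA_EXCEEDED"} 1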

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {