feat: add node group health and back off metrics #6396

Merged
12 changes: 12 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -456,6 +456,18 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// UpdateSafeScaleUpMetricsForNodeGroup queries the health status and backoff situation of the node group and updates metrics
func (csr *ClusterStateRegistry) UpdateSafeScaleUpMetricsForNodeGroup(now time.Time) {
Member:

Both the ForNodeGroup suffix and the comment above suggest this is about a specific node group, while in fact it iterates over all of them.

Contributor Author:

Modifications done and PR submitted

	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
		if !nodeGroup.Exist() {
			continue
		}
		metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
		backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
		metrics.UpdateNodeGroupBackOffStatus(nodeGroup.Id(), backoffStatus.IsBackedOff, backoffStatus.ErrorInfo.ErrorCode)
	}
}
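
// Editorial note, illustrative only (not part of this diff): assuming caNamespace is
// "cluster_autoscaler" and taking "ng-1" and the error code "QUOTA_EXCEEDED" as example
// values, the calls above would surface under /metrics roughly as:
//
//	cluster_autoscaler_node_group_healthiness{node_group="ng-1"} 1
//	cluster_autoscaler_node_group_backoff_status{node_group="ng-1",reason="QUOTA_EXCEEDED"} 1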

// IsNodeGroupSafeToScaleUp returns information about node group safety to be scaled up now.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
1 change: 1 addition & 0 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -405,6 +405,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
if err != nil {
klog.Errorf("AutoscalingStatusProcessor error: %v.", err)
}
a.clusterStateRegistry.UpdateSafeScaleUpMetricsForNodeGroup(currentTime)
Member:

This would better fit as an AutoscalingStatusProcessor implementation - all the relevant params are already passed in the Process call above. Let's update this function:

func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
	return &NoOpAutoscalingStatusProcessor{}
}

to return a new processor that does the metric update. The new processor would also be a better place to keep the backoffReasonStatus map - clusterStateRegistry should be responsible for tracking the cluster state, not for metrics handling.
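
Editorial note: a minimal sketch of the suggested processor, assuming the AutoscalingStatusProcessor interface takes the autoscaling context, the ClusterStateRegistry and the current time (as the Process call in RunOnce suggests), and assuming some exported way to read backoff status from the registry. All names below are hypothetical, not the PR's final code:

package status

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
	"k8s.io/autoscaler/cluster-autoscaler/context"
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
)

// nodeGroupMetricsProcessor emits per-node-group health and backoff metrics at the end
// of every loop, keeping metrics handling out of ClusterStateRegistry.
type nodeGroupMetricsProcessor struct{}

// NewDefaultAutoscalingStatusProcessor returns the metric-emitting processor instead of
// the previous no-op implementation.
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
	return &nodeGroupMetricsProcessor{}
}

// Process records health and backoff metrics for every existing node group.
func (p *nodeGroupMetricsProcessor) Process(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
	for _, nodeGroup := range ctx.CloudProvider.NodeGroups() {
		if !nodeGroup.Exist() {
			continue
		}
		metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
		// BackoffStatusForNodeGroup is a hypothetical exported accessor: the registry's
		// backoff field is unexported, so some exported lookup would be needed here.
		backoffStatus := csr.BackoffStatusForNodeGroup(nodeGroup, now)
		metrics.UpdateNodeGroupBackOffStatus(nodeGroup.Id(), backoffStatus.IsBackedOff, backoffStatus.ErrorInfo.ErrorCode)
	}
	return nil
}

// CleanUp is part of the AutoscalingStatusProcessor interface.
func (p *nodeGroupMetricsProcessor) CleanUp() {}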

Contributor Author:

Modifications done and PR submitted

}()

// Check if there are any nodes that failed to register in Kubernetes
36 changes: 36 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
@@ -208,6 +208,22 @@ var (
}, []string{"node_group"},
)

	nodesGroupHealthiness = k8smetrics.NewGaugeVec(
		&k8smetrics.GaugeOpts{
			Namespace: caNamespace,
			Name:      "node_group_healthiness",
			Help:      "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
		}, []string{"node_group"},
	)

	nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
		&k8smetrics.GaugeOpts{
			Namespace: caNamespace,
			Name:      "node_group_backoff_status",
			Help:      "Whether or not node group is backed off from autoscaling. 1 if it is, 0 otherwise.",
		}, []string{"node_group", "reason"},
	)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
@@ -431,6 +447,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
		legacyregistry.MustRegister(nodesGroupMinNodes)
		legacyregistry.MustRegister(nodesGroupMaxNodes)
		legacyregistry.MustRegister(nodesGroupTargetSize)
		legacyregistry.MustRegister(nodesGroupHealthiness)
		legacyregistry.MustRegister(nodeGroupBackOffStatus)
	}
}

@@ -536,6 +554,24 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
}
}

// UpdateNodeGroupHealthStatus records whether the node group is healthy enough for autoscaling
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
	if healthy {
		nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(1)
	} else {
		nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(0)
	}
}

// UpdateNodeGroupBackOffStatus records whether the node group is backed off from autoscaling, and the reason
func UpdateNodeGroupBackOffStatus(nodeGroup string, backOff bool, reason string) {
	if backOff {
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(1)
Member:

This will not work the way you want: once the backoff is over, the value of 1 will keep on being emitted for the old reason. Consider:

  1. UpdateNodeGroupBackOffStatus("foo", true, "reason")

    Under /metrics endpoint you'll get something like:

    node_group_backoff_status{node_group=foo,reason=reason} 1
    
  2. UpdateNodeGroupBackOffStatus("foo", false, "")

    Under /metrics endpoint you'll get something like:

    node_group_backoff_status{node_group=foo,reason=reason} 1
    node_group_backoff_status{node_group=foo,reason=""} 0
    

What you may want to do instead is to use a string metric with value equal to the backoff reason.

Contributor Author:

I agree with the suggestion and will make the necessary modifications as soon as possible.

Contributor Author:

Modifications done and PR submitted.

Member:

Apologies for not being clear - I think the current version of the code will have exactly the same issue: I expect reason to be an empty string every time backoff is false. If you want to track backoff reasons like this, you need to keep track of the last reason value and explicitly clear it when backoff becomes false. Alternatively, you could implement a custom collector that will only emit metrics with up-to-date reason. This would be the cleanest solution. With the existing code, in the scenario from my previous comment, after backoff is over, you get:

node_group_backoff_status{node_group=foo,reason=reason} 1
node_group_backoff_status{node_group=foo,reason=""} 0

If you instead track nodegroup -> reason mapping and clear the last reason you will instead get:

node_group_backoff_status{node_group=foo,reason=reason} 0

If you decide to write a custom collector, once backoff is over, you could simply get empty output:

Contributor Author:

Thank you for raising this issue, and you’re right, I misunderstood it. It is indeed not expected behavior that the previous nodeGroup backoff metric continues to be reported as 1 after the backoff is over. I will take some time to reconsider how to address this and submit the code changes as soon as possible.

Contributor Author:

Modifications done and PR submitted.

Contributor Author:

@x13n Please review my changes

	} else {
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(0)
	}
}

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {