feat: add node group health and back off metrics #6396

Merged
5 changes: 5 additions & 0 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
@@ -267,6 +267,11 @@ const (
InstanceDeleting InstanceState = 3
)

const (
// UnknownErrorCode means that the cloud provider has not provided an error code.
UnknownErrorCode = "unknown"

Member:
Why is this in cloudprovider package?

Contributor Author:
Modifications done and PR submitted

)

// InstanceErrorInfo provides information about error condition on instance
type InstanceErrorInfo struct {
// ErrorClass tells what is class of error on instance
37 changes: 37 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -116,6 +116,9 @@ type ScaleUpFailure struct {
Time time.Time
}

// BackoffReasonStatus contains information about backoff status and reason
type BackoffReasonStatus map[string]int

Member:
Since the values here will always be either 0 or 1, maybe map[string]bool would make more sense?
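
For illustration, a minimal sketch of the bool-valued variant this suggests (hypothetical, not part of this diff):

// BackoffReasonStatus contains information about backoff status and reason;
// true means the node group is currently backed off for that error code.
type BackoffReasonStatus map[string]bool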

Contributor Author:
Modifications done and PR submitted


// ClusterStateRegistry is a structure to keep track the current state of the cluster.
type ClusterStateRegistry struct {
sync.Mutex
@@ -132,6 +135,7 @@ type ClusterStateRegistry struct {
unregisteredNodes map[string]UnregisteredNode
deletedNodes map[string]struct{}
candidatesForScaleDown map[string][]string
backoffReasonStatus map[string]BackoffReasonStatus
backoff backoff.Backoff
lastStatus *api.ClusterAutoscalerStatus
lastScaleDownUpdateTime time.Time
@@ -168,6 +172,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
unregisteredNodes: make(map[string]UnregisteredNode),
deletedNodes: make(map[string]struct{}),
candidatesForScaleDown: make(map[string][]string),
backoffReasonStatus: make(map[string]BackoffReasonStatus),
backoff: backoff,
lastStatus: utils.EmptyClusterAutoscalerStatus(),
logRecorder: logRecorder,
@@ -462,6 +467,38 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// UpdateSafeScaleUpMetricsForNodeGroup queries the health status and backoff situation of the node group and updates metrics
func (csr *ClusterStateRegistry) UpdateSafeScaleUpMetricsForNodeGroup(now time.Time) {

Member:
Both the ForNodeGroup suffix and the comment above suggest this is about a specific node group, while in fact it iterates over all of them.

Contributor Author:
Modifications done and PR submitted

for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
if !nodeGroup.Exist() {
continue
}
metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
csr.updateNodeGroupBackoffStatusMetrics(nodeGroup.Id(), backoffStatus)
}
}

// updateNodeGroupBackoffStatusMetrics updates metrics with the backoff status and reason of the node group
func (csr *ClusterStateRegistry) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
backoffReasonStatus := make(BackoffReasonStatus)

Member:
nit: Do you actually need to allocate this map every time? Zeroing the existing one would have the exact same effect, right? You're iterating over it anyway.
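
A sketch of the in-place variant this nit points at, reusing the stored map instead of allocating a new one on every call (illustrative only; names match the surrounding diff):

backoffReasonStatus, ok := csr.backoffReasonStatus[nodeGroup]
if !ok {
	backoffReasonStatus = make(BackoffReasonStatus)
	csr.backoffReasonStatus[nodeGroup] = backoffReasonStatus
}
// Zero the reasons reported last time instead of allocating a fresh map.
for reason := range backoffReasonStatus {
	backoffReasonStatus[reason] = 0
}
if backoffStatus.IsBackedOff {
	errorCode := backoffStatus.ErrorInfo.ErrorCode
	if errorCode == "" {
		errorCode = cloudprovider.UnknownErrorCode
	}
	backoffReasonStatus[errorCode] = 1
}
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, backoffReasonStatus)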

Contributor Author:
Modifications done and PR submitted

if oldStatus, ok := csr.backoffReasonStatus[nodeGroup]; ok {
for reason := range oldStatus {
backoffReasonStatus[reason] = 0
}
}
if backoffStatus.IsBackedOff {
errorCode := backoffStatus.ErrorInfo.ErrorCode
if errorCode == "" {
// prevent error code from being empty.
errorCode = cloudprovider.UnknownErrorCode
}
backoffReasonStatus[errorCode] = 1
}
csr.backoffReasonStatus[nodeGroup] = backoffReasonStatus
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, backoffReasonStatus)
}

// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
1 change: 1 addition & 0 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -406,6 +406,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
if err != nil {
klog.Errorf("AutoscalingStatusProcessor error: %v.", err)
}
a.clusterStateRegistry.UpdateSafeScaleUpMetricsForNodeGroup(currentTime)

Member:
This would better fit as an AutoscalingStatusProcessor implementation - all the relevant params are already passed in the Process call above. Let's update this function:

func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
return &NoOpAutoscalingStatusProcessor{}
}

to return a new processor that does the metric update. The new processor would also be a better place to keep the backoffReasonStatus map - clusterStateRegistry should be responsible for tracking the cluster state, not for metrics handling.
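
For concreteness, a rough sketch of a processor along these lines (editorial illustration only; the processor name is made up and the Process signature is assumed to match the AutoscalingStatusProcessor interface referenced above):

// metricsAutoscalingStatusProcessor reports node group health and backoff
// metrics after each autoscaling iteration, keeping the per-group
// backoffReasonStatus bookkeeping here instead of in ClusterStateRegistry.
type metricsAutoscalingStatusProcessor struct {
	backoffReasonStatus map[string]clusterstate.BackoffReasonStatus
}

func (p *metricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
	for _, nodeGroup := range context.CloudProvider.NodeGroups() {
		if !nodeGroup.Exist() {
			continue
		}
		metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
		// The backoff status for the group would be queried from csr here and
		// reported via metrics.UpdateNodeGroupBackOffStatus, mirroring the diff above.
	}
	return nil
}

func (p *metricsAutoscalingStatusProcessor) CleanUp() {}

// NewDefaultAutoscalingStatusProcessor would then return the metrics-reporting
// processor instead of the no-op one.
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
	return &metricsAutoscalingStatusProcessor{
		backoffReasonStatus: make(map[string]clusterstate.BackoffReasonStatus),
	}
}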

Contributor Author:
Modifications done and PR submitted

}()

// Check if there are any nodes that failed to register in Kubernetes
38 changes: 38 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
@@ -215,6 +215,22 @@ var (
}, []string{"node_group"},
)

nodesGroupHealthiness = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_healthiness",
Help: "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group"},
)

nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_backoff_status",
Help: "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group", "reason"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
@@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
legacyregistry.MustRegister(nodesGroupTargetSize)
legacyregistry.MustRegister(nodesGroupHealthiness)
legacyregistry.MustRegister(nodeGroupBackOffStatus)
}
}

@@ -543,6 +561,26 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
}
}

// UpdateNodeGroupHealthStatus records whether node group is healthy enough for autoscaling
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
if healthy {
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(1)
} else {
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(0)
}
}

// UpdateNodeGroupBackOffStatus records whether node group is backed off from autoscaling and the reason why
func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]int) {
if len(backoffReasonStatus) == 0 {
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
} else {
for reason, status := range backoffReasonStatus {
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(float64(status))
}
}
}
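
As a usage illustration (hypothetical node group name and error code; caNamespace is assumed to expand to the usual cluster_autoscaler metric prefix):

// A healthy node group that is currently backed off because of a quota error.
UpdateNodeGroupHealthStatus("example-group", true)
UpdateNodeGroupBackOffStatus("example-group", map[string]int{"QUOTA_EXCEEDED": 1})

// Resulting series on the metrics endpoint:
//   cluster_autoscaler_node_group_healthiness{node_group="example-group"} 1
//   cluster_autoscaler_node_group_backoff_status{node_group="example-group",reason="QUOTA_EXCEEDED"} 1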

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {