Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat:add node group health and back off metrics #6396

Merged
5 changes: 5 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// BackoffStatusForNodeGroup reports the backoff status of nodeGroup as of now.
// It looks up the node info registered under the group's ID and hands it,
// together with the group and timestamp, to the registry's backoff tracker.
func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status {
	nodeInfo := csr.nodeInfosForGroups[nodeGroup.Id()]
	return csr.backoff.BackoffStatus(nodeGroup, nodeInfo, now)
}

// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
Expand Down
42 changes: 42 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,22 @@ var (
}, []string{"node_group"},
)

// nodesGroupHealthiness is a gauge vector keyed by the "node_group" label.
// UpdateNodeGroupHealthStatus sets it to 1 for a healthy node group and 0 otherwise.
nodesGroupHealthiness = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_healthiness",
Help: "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group"},
)

// nodeGroupBackOffStatus is a gauge vector keyed by the "node_group" and
// "reason" labels. UpdateNodeGroupBackOffStatus sets a series to 1 while the
// node group is backed off for that reason and to 0 otherwise.
nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_backoff_status",
Help: "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
}, []string{"node_group", "reason"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Expand Down Expand Up @@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
legacyregistry.MustRegister(nodesGroupTargetSize)
legacyregistry.MustRegister(nodesGroupHealthiness)
legacyregistry.MustRegister(nodeGroupBackOffStatus)
}
}

Expand Down Expand Up @@ -543,6 +561,30 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
}
}

// UpdateNodeGroupHealthStatus records whether the given node group is healthy
// enough for autoscaling: the gauge is set to 1 when healthy and 0 otherwise.
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
	var value float64
	if healthy {
		value = 1
	}
	nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(value)
}

// UpdateNodeGroupBackOffStatus records, per backoff reason, whether the given
// node group is currently backed off. Each entry of backoffReasonStatus maps a
// reason to its state: true sets the gauge to 1, false sets it to 0. When the
// map is empty, a single series with an empty reason label is set to 0.
func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool) {
	if len(backoffReasonStatus) == 0 {
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
		return
	}
	for reason, backedOff := range backoffReasonStatus {
		var value float64
		if backedOff {
			value = 1
		}
		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(value)
	}
}

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {

// NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.
// The default is a MetricsAutoscalingStatusProcessor whose per-node-group
// backoff-reason map is allocated up front so it can be written to immediately.
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
	// Only one return may remain: a preceding NoOp return would make this
	// one unreachable and leave the metrics processor unused.
	return &MetricsAutoscalingStatusProcessor{
		backoffReasonStatus: make(map[string]BackoffReasonStatus),
	}
}

// NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
"time"

"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
)

// unknownErrorCode is the reason recorded for a backoff whose error info
// carries an empty error code, i.e. the cloud provider did not supply one.
const unknownErrorCode = "unknown"

// BackoffReasonStatus contains information about backoff status and reason.
// Keys are backoff reasons (error codes); a value of true means the node group
// is currently backed off for that reason, false means it is not.
type BackoffReasonStatus map[string]bool

// MetricsAutoscalingStatusProcessor is used to update metrics after each autoscaling iteration.
type MetricsAutoscalingStatusProcessor struct {
// backoffReasonStatus maps a node group ID to the backoff reasons recorded
// for that group so far; entries are kept between iterations so a reason
// that no longer applies can be reported as false.
backoffReasonStatus map[string]BackoffReasonStatus
}

// Process walks every node group reported by the cloud provider and, for each
// group that exists, publishes its health metric and its backoff metrics as of
// now. Node groups that do not exist are ignored. It always returns nil.
func (p *MetricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
	groups := context.CloudProvider.NodeGroups()
	for _, group := range groups {
		if group.Exist() {
			id := group.Id()
			metrics.UpdateNodeGroupHealthStatus(id, csr.IsNodeGroupHealthy(id))
			p.updateNodeGroupBackoffStatusMetrics(id, csr.BackoffStatusForNodeGroup(group, now))
		}
	}
	return nil
}

// CleanUp is a no-op for this processor; it performs no work when called.
func (p *MetricsAutoscalingStatusProcessor) CleanUp() {}

// updateNodeGroupBackoffStatusMetrics updates metrics about backoff situation and reason of the node group.
//
// Every reason previously recorded for nodeGroup is first reset to false. If
// backoffStatus reports the group as backed off, the reason taken from its
// error code (or unknownErrorCode when the code is empty) is set to true. The
// resulting reason map is then published through the metrics package.
//
// The processor's internal map is allocated on demand, so a zero-value
// MetricsAutoscalingStatusProcessor does not panic on its first update.
func (p *MetricsAutoscalingStatusProcessor) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
	if p.backoffReasonStatus == nil {
		// Writing to a nil map panics; make the zero value usable.
		p.backoffReasonStatus = make(map[string]BackoffReasonStatus)
	}

	reasons, ok := p.backoffReasonStatus[nodeGroup]
	if !ok {
		reasons = make(BackoffReasonStatus)
		p.backoffReasonStatus[nodeGroup] = reasons
	}
	// Clear earlier reasons so that ones which no longer apply are reported as false.
	for reason := range reasons {
		reasons[reason] = false
	}

	if backoffStatus.IsBackedOff {
		errorCode := backoffStatus.ErrorInfo.ErrorCode
		if errorCode == "" {
			// prevent error code from being empty.
			errorCode = unknownErrorCode
		}
		reasons[errorCode] = true
	}
	metrics.UpdateNodeGroupBackOffStatus(nodeGroup, reasons)
}
Loading