diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index b8ba1d7e1b0f..2240d3f22883 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -462,6 +462,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
 	metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
 }
 
+// BackoffStatusForNodeGroup queries the backoff status of the node group.
+func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status {
+	return csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
+}
+
 // NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
 func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
 	isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index bfb4d2c432bf..12bfc34b931f 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -215,6 +215,22 @@ var (
 		}, []string{"node_group"},
 	)
 
+	nodesGroupHealthiness = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_healthiness",
+			Help:      "Whether or not the node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
+		}, []string{"node_group"},
+	)
+
+	nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_backoff_status",
+			Help:      "Whether or not the node group is backed off from autoscaling. 1 if it is, 0 otherwise.",
+		}, []string{"node_group", "reason"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 		legacyregistry.MustRegister(nodesGroupMinNodes)
 		legacyregistry.MustRegister(nodesGroupMaxNodes)
 		legacyregistry.MustRegister(nodesGroupTargetSize)
+		legacyregistry.MustRegister(nodesGroupHealthiness)
+		legacyregistry.MustRegister(nodeGroupBackOffStatus)
 	}
 }
 
@@ -543,6 +561,30 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
 	}
 }
 
+// UpdateNodeGroupHealthStatus records whether a node group is healthy enough for autoscaling.
+func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
+	if healthy {
+		nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(1)
+	} else {
+		nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(0)
+	}
+}
+
+// UpdateNodeGroupBackOffStatus records whether a node group is backed off from autoscaling, per backoff reason.
+func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool) {
+	if len(backoffReasonStatus) == 0 {
+		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
+	} else {
+		for reason, backoff := range backoffReasonStatus {
+			if backoff {
+				nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(1)
+			} else {
+				nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(0)
+			}
+		}
+	}
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {
diff --git a/cluster-autoscaler/processors/status/autoscaling_status_processor.go b/cluster-autoscaler/processors/status/autoscaling_status_processor.go
index 07d0357306cc..0493433b8ca7 100644
--- a/cluster-autoscaler/processors/status/autoscaling_status_processor.go
+++ b/cluster-autoscaler/processors/status/autoscaling_status_processor.go
@@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {
 
 // NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.
 func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
-	return &NoOpAutoscalingStatusProcessor{}
+	return &MetricsAutoscalingStatusProcessor{
+		backoffReasonStatus: make(map[string]BackoffReasonStatus),
+	}
 }
 
 // NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.
diff --git a/cluster-autoscaler/processors/status/metrics_autoscaling_status_processor.go b/cluster-autoscaler/processors/status/metrics_autoscaling_status_processor.go
new file mode 100644
index 000000000000..816dccd3ec42
--- /dev/null
+++ b/cluster-autoscaler/processors/status/metrics_autoscaling_status_processor.go
@@ -0,0 +1,77 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package status
+
+import (
+	"time"
+
+	"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
+	"k8s.io/autoscaler/cluster-autoscaler/context"
+	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
+)
+
+const (
+	// unknownErrorCode means that the cloud provider has not provided an error code.
+	unknownErrorCode = "unknown"
+)
+
+// BackoffReasonStatus maps a backoff reason to whether the node group is currently backed off for that reason.
+type BackoffReasonStatus map[string]bool
+
+// MetricsAutoscalingStatusProcessor is used to update metrics after each autoscaling iteration.
+type MetricsAutoscalingStatusProcessor struct {
+	backoffReasonStatus map[string]BackoffReasonStatus
+}
+
+// Process queries the health status and backoff situation of all node groups and updates metrics after each autoscaling iteration.
+func (p *MetricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
+	for _, nodeGroup := range context.CloudProvider.NodeGroups() {
+		if !nodeGroup.Exist() {
+			continue
+		}
+		metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
+		backoffStatus := csr.BackoffStatusForNodeGroup(nodeGroup, now)
+		p.updateNodeGroupBackoffStatusMetrics(nodeGroup.Id(), backoffStatus)
+	}
+	return nil
+}
+
+// CleanUp cleans up the processor's internal structures.
+func (p *MetricsAutoscalingStatusProcessor) CleanUp() {
+}
+
+// updateNodeGroupBackoffStatusMetrics updates the backoff status metric, and its reason, for the given node group.
+func (p *MetricsAutoscalingStatusProcessor) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
+	if _, ok := p.backoffReasonStatus[nodeGroup]; ok {
+		for reason := range p.backoffReasonStatus[nodeGroup] {
+			p.backoffReasonStatus[nodeGroup][reason] = false
+		}
+	} else {
+		p.backoffReasonStatus[nodeGroup] = make(BackoffReasonStatus)
+	}
+
+	if backoffStatus.IsBackedOff {
+		errorCode := backoffStatus.ErrorInfo.ErrorCode
+		if errorCode == "" {
+			// Prevent the error code from being empty.
+			errorCode = unknownErrorCode
+		}
+		p.backoffReasonStatus[nodeGroup][errorCode] = true
+	}
+	metrics.UpdateNodeGroupBackOffStatus(nodeGroup, p.backoffReasonStatus[nodeGroup])
+}
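
Note on the per-reason backoff gauge: once a reason has been observed for a node group, updateNodeGroupBackoffStatusMetrics keeps exporting it with value 0 after the backoff clears, rather than letting the series disappear. Below is a minimal, self-contained Go sketch of that reset-then-set behaviour; it is not code from this patch, and the names simulateIteration and backoffReasonStatus are illustrative only.

package main

import "fmt"

// backoffReasonStatus mirrors the per-node-group map kept by the processor:
// every reason ever observed stays in the map, so recovered reasons are still
// reported, just with value false (exported as 0).
type backoffReasonStatus map[string]bool

// simulateIteration (hypothetical helper) reproduces the reset-then-set logic
// of updateNodeGroupBackoffStatusMetrics for a single node group.
func simulateIteration(status backoffReasonStatus, backedOff bool, errorCode string) {
	// Reset every previously seen reason before recording the current state.
	for reason := range status {
		status[reason] = false
	}
	if backedOff {
		if errorCode == "" {
			errorCode = "unknown" // mirrors unknownErrorCode
		}
		status[errorCode] = true
	}
	// In the real processor this is where UpdateNodeGroupBackOffStatus would
	// translate the map into gauge values (1 for true, 0 for false).
	fmt.Println(status)
}

func main() {
	status := make(backoffReasonStatus)
	simulateIteration(status, true, "QUOTA_EXCEEDED") // map[QUOTA_EXCEEDED:true]
	simulateIteration(status, false, "")              // map[QUOTA_EXCEEDED:false] - reason kept, value drops to 0
	simulateIteration(status, true, "")               // map[QUOTA_EXCEEDED:false unknown:true]
}

Keeping recovered reasons exported at 0 means queries and alerts on node_group_backoff_status see an explicit transition from 1 to 0 instead of a series that silently vanishes.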