-
Notifications
You must be signed in to change notification settings - Fork 4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add node group health and backoff metrics #6396
Changes from 6 commits
044c03d
1255c95
89241e4
849e9e7
23843ad
ae0ab53
5773f50
68e661f
4b9d4b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -116,6 +116,9 @@ type ScaleUpFailure struct { | |
Time time.Time | ||
} | ||
|
||
// BackoffReasonStatus contains information about backoff status and reason | ||
type BackoffReasonStatus map[string]int | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the values here will always be either 0 or 1, maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Modifications done and PR submitted |
||
|
||
// ClusterStateRegistry is a structure to keep track the current state of the cluster. | ||
type ClusterStateRegistry struct { | ||
sync.Mutex | ||
|
@@ -132,6 +135,7 @@ type ClusterStateRegistry struct { | |
unregisteredNodes map[string]UnregisteredNode | ||
deletedNodes map[string]struct{} | ||
candidatesForScaleDown map[string][]string | ||
backoffReasonStatus map[string]BackoffReasonStatus | ||
backoff backoff.Backoff | ||
lastStatus *api.ClusterAutoscalerStatus | ||
lastScaleDownUpdateTime time.Time | ||
|
@@ -168,6 +172,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C | |
unregisteredNodes: make(map[string]UnregisteredNode), | ||
deletedNodes: make(map[string]struct{}), | ||
candidatesForScaleDown: make(map[string][]string), | ||
backoffReasonStatus: make(map[string]BackoffReasonStatus), | ||
backoff: backoff, | ||
lastStatus: utils.EmptyClusterAutoscalerStatus(), | ||
logRecorder: logRecorder, | ||
|
@@ -462,6 +467,38 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() { | |
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned) | ||
} | ||
|
||
// UpdateSafeScaleUpMetricsForNodeGroup queries the health status and backoff situation of the node group and updates metrics | ||
func (csr *ClusterStateRegistry) UpdateSafeScaleUpMetricsForNodeGroup(now time.Time) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Modifications done and PR submitted |
||
for _, nodeGroup := range csr.cloudProvider.NodeGroups() { | ||
if !nodeGroup.Exist() { | ||
continue | ||
} | ||
metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id())) | ||
backoffStatus := csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now) | ||
csr.updateNodeGroupBackoffStatusMetrics(nodeGroup.Id(), backoffStatus) | ||
} | ||
} | ||
|
||
// updateNodeGroupBackoffStatusMetrics updates the stored backoff reason status and the metrics for the node group | ||
func (csr *ClusterStateRegistry) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) { | ||
backoffReasonStatus := make(BackoffReasonStatus) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Do you actually need to allocate this map every time? Zeroing the existing one would have the exact same effect, right? You're iterating over it anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Modifications done and PR submitted |
||
if oldStatus, ok := csr.backoffReasonStatus[nodeGroup]; ok { | ||
for reason := range oldStatus { | ||
backoffReasonStatus[reason] = 0 | ||
} | ||
} | ||
if backoffStatus.IsBackedOff { | ||
errorCode := backoffStatus.ErrorInfo.ErrorCode | ||
if errorCode == "" { | ||
// prevent error code from being empty. | ||
errorCode = cloudprovider.UnknownErrorCode | ||
} | ||
backoffReasonStatus[errorCode] = 1 | ||
} | ||
csr.backoffReasonStatus[nodeGroup] = backoffReasonStatus | ||
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, backoffReasonStatus) | ||
} | ||
|
||
// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now. | ||
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety { | ||
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id()) | ||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -406,6 +406,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr | |||||||
if err != nil { | ||||||||
klog.Errorf("AutoscalingStatusProcessor error: %v.", err) | ||||||||
} | ||||||||
a.clusterStateRegistry.UpdateSafeScaleUpMetricsForNodeGroup(currentTime) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would better fit as an autoscaler/cluster-autoscaler/processors/status/autoscaling_status_processor.go Lines 33 to 35 in a3a29cf
to return a new processor that does the metric update. The new processor would also be a better place to keep the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Modifications done and PR submitted |
||||||||
}() | ||||||||
|
||||||||
// Check if there are any nodes that failed to register in Kubernetes | ||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this in
cloudprovider
package?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Modifications done and PR submitted