From 986fe3ae205383d646200a0bebd52a72a9f88890 Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Mon, 31 May 2021 15:55:28 +0200 Subject: [PATCH] Metric for CloudProvider.Refresh() duration This function can take a variable amount of time due to various conditions (e.g. many nodegroup changes causing forced refreshes, cache time-to-live expiries, ...). Monitoring that duration is useful to diagnose those variations, and to uncover external issues (e.g. throttling from the cloud provider) affecting cluster-autoscaler. --- cluster-autoscaler/core/static_autoscaler.go | 2 ++ cluster-autoscaler/metrics/metrics.go | 1 + 2 files changed, 3 insertions(+) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 0e3dd869fce1..cfbf2c249f54 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -260,7 +260,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError } // Call CloudProvider.Refresh before any other calls to cloud provider. + refreshStart := time.Now() err = a.AutoscalingContext.CloudProvider.Refresh() + metrics.UpdateDurationFromStart(metrics.CloudProviderRefresh, refreshStart) if err != nil { klog.Errorf("Failed to refresh cloud provider config: %v", err) return errors.ToAutoscalerError(errors.CloudProviderError, err) diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 9580ee3344c0..41b7fb7a9d97 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -90,6 +90,7 @@ const ( FindUnneeded FunctionLabel = "findUnneeded" UpdateState FunctionLabel = "updateClusterState" FilterOutSchedulable FunctionLabel = "filterOutSchedulable" + CloudProviderRefresh FunctionLabel = "cloudProviderRefresh" Main FunctionLabel = "main" Poll FunctionLabel = "poll" Reconfigure FunctionLabel = "reconfigure"