From 6eca014c7e6b056d32cc20a301252604aba98548 Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Thu, 3 Sep 2020 08:57:59 +0200 Subject: [PATCH] Azure: serve stale on ongoing throttling k8s Azure clients keeps tracks of previous HTTP 429 and Retry-After cool down periods. On subsequent calls, they will notice the ongoing throttling window and will return a synthetic errors (without HTTPStatusCode) rather than submitting a throttled request to the ARM API: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/vendor/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/azure_vmssvmclient.go#L154-L158 https://github.com/kubernetes/autoscaler/blob/a5ed2cc3fe0aabd92c7758e39f1a9c9fe3bd6505/cluster-autoscaler/vendor/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go#L118-L123 Some CA components can cope with a temporarily outdated object view when throttled. They call in to `isAzureRequestsThrottled()` on clients errors to return stale objects from cache (if any) and extend the object's refresh period (if any). But this only works for the first API call (returning HTTP 429). Next calls in the same throttling window (per Retry-After header) won't be identified as throttled by `isAzureRequestsThrottled` due to their nul `HTTPStatusCode`. This can makes the CA panic during startup due a failing cache init, when more than one VMSS call hits throttling. We've seen this causing early restarts loops, re-scanning every VMSS due to cold cache on start, keeping the subscription throttled. Practically this change allows the 3 call sites (`scaleSet.Nodes()`, `scaleSet.getCurSize()`, and `AgentPool.getVirtualMachinesFromCache()`) to serve from cache (and extend the object's next refresh deadline) as they would do on the first HTTP 429 hit, rather than returning an error. --- cluster-autoscaler/cloudprovider/azure/azure_util.go | 7 ++++++- cluster-autoscaler/cloudprovider/azure/azure_util_test.go | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go index 980b36183e48..f231965ea0de 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go @@ -664,12 +664,17 @@ func convertResourceGroupNameToLower(resourceID string) (string, error) { return strings.Replace(resourceID, resourceGroup, strings.ToLower(resourceGroup), 1), nil } -// isAzureRequestsThrottled returns true when the err is http.StatusTooManyRequests (429). +// isAzureRequestsThrottled returns true when the err is http.StatusTooManyRequests (429), +// and when err shows the requests was not executed due to an ongoing throttling period. func isAzureRequestsThrottled(rerr *retry.Error) bool { klog.V(6).Infof("isAzureRequestsThrottled: starts for error %v", rerr) if rerr == nil { return false } + if rerr.HTTPStatusCode == 0 && rerr.RetryAfter.After(time.Now()) { + return true + } + return rerr.HTTPStatusCode == http.StatusTooManyRequests } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go index 812bb1446056..56094a00fd0c 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go @@ -20,6 +20,7 @@ import ( "fmt" "net/http" "testing" + "time" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute" "github.com/stretchr/testify/assert" @@ -270,6 +271,13 @@ func TestIsAzureRequestsThrottled(t *testing.T) { }, expected: true, }, + { + desc: "Nul HTTP code and non-expired Retry-After should return true", + rerr: &retry.Error{ + RetryAfter: time.Now().Add(time.Hour), + }, + expected: true, + }, } for _, test := range tests {