From 6eca014c7e6b056d32cc20a301252604aba98548 Mon Sep 17 00:00:00 2001
From: Benjamin Pineau <benjamin.pineau@datadoghq.com>
Date: Thu, 3 Sep 2020 08:57:59 +0200
Subject: [PATCH] Azure: serve stale on ongoing throttling

k8s Azure clients keeps tracks of previous HTTP 429 and Retry-After cool
down periods. On subsequent calls, they will notice the ongoing throttling
window and will return a synthetic errors (without HTTPStatusCode) rather
than submitting a throttled request to the ARM API:
https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/vendor/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/azure_vmssvmclient.go#L154-L158
https://github.com/kubernetes/autoscaler/blob/a5ed2cc3fe0aabd92c7758e39f1a9c9fe3bd6505/cluster-autoscaler/vendor/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go#L118-L123

Some CA components can cope with a temporarily outdated object view
when throttled. They call in to `isAzureRequestsThrottled()` on clients
errors to return stale objects from cache (if any) and extend the object's
refresh period (if any).

But this only works for the first API call (returning HTTP 429).
Next calls in the same throttling window (per Retry-After header)
won't be identified as throttled by `isAzureRequestsThrottled` due
to their nul `HTTPStatusCode`.

This can makes the CA panic during startup due a failing cache init, when
more than one VMSS call hits throttling. We've seen this causing early
restarts loops, re-scanning every VMSS due to cold cache on start,
keeping the subscription throttled.

Practically this change allows the 3 call sites (`scaleSet.Nodes()`,
`scaleSet.getCurSize()`, and `AgentPool.getVirtualMachinesFromCache()`) to
serve from cache (and extend the object's next refresh deadline) as they
would do on the first HTTP 429 hit, rather than returning an error.
---
 cluster-autoscaler/cloudprovider/azure/azure_util.go      | 7 ++++++-
 cluster-autoscaler/cloudprovider/azure/azure_util_test.go | 8 ++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go
index 980b36183e48..f231965ea0de 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_util.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go
@@ -664,12 +664,17 @@ func convertResourceGroupNameToLower(resourceID string) (string, error) {
 	return strings.Replace(resourceID, resourceGroup, strings.ToLower(resourceGroup), 1), nil
 }
 
-// isAzureRequestsThrottled returns true when the err is http.StatusTooManyRequests (429).
+// isAzureRequestsThrottled returns true when the err is http.StatusTooManyRequests (429),
+// and when err shows the requests was not executed due to an ongoing throttling period.
 func isAzureRequestsThrottled(rerr *retry.Error) bool {
 	klog.V(6).Infof("isAzureRequestsThrottled: starts for error %v", rerr)
 	if rerr == nil {
 		return false
 	}
 
+	if rerr.HTTPStatusCode == 0 && rerr.RetryAfter.After(time.Now()) {
+		return true
+	}
+
 	return rerr.HTTPStatusCode == http.StatusTooManyRequests
 }
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go
index 812bb1446056..56094a00fd0c 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"net/http"
 	"testing"
+	"time"
 
 	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
 	"github.com/stretchr/testify/assert"
@@ -270,6 +271,13 @@ func TestIsAzureRequestsThrottled(t *testing.T) {
 			},
 			expected: true,
 		},
+		{
+			desc: "Nul HTTP code and non-expired Retry-After should return true",
+			rerr: &retry.Error{
+				RetryAfter: time.Now().Add(time.Hour),
+			},
+			expected: true,
+		},
 	}
 
 	for _, test := range tests {