From 1e35781b5a227c9959cd333ee8b14eda977205c4 Mon Sep 17 00:00:00 2001
From: Benjamin Pineau
Date: Wed, 19 Aug 2020 19:08:04 +0200
Subject: [PATCH] Azure: optional jitter on initial VMSS VM cache refresh

On (re)start, cluster-autoscaler refreshes all VMSS instances caches at
once, and sets those caches' TTL to 5 minutes. All VMSS VM List calls
(for the VMSS discovered at boot) will then continuously hit the ARM API
at the same time, potentially causing regular throttling bursts.

Exposing an optional jitter, subtracted from the first scheduled refresh
delay, splays those calls (except for the very first one, at start),
while keeping the predictable refresh interval (max. 5 minutes, unless
the VMSS changed) after the first refresh.
---
 .../cloudprovider/azure/README.md             | 35 ++++++++++++++++--
 .../cloudprovider/azure/azure_manager.go      | 20 ++++++++++
 .../cloudprovider/azure/azure_manager_test.go | 26 ++++++++-----
 .../cloudprovider/azure/azure_scale_set.go    | 37 ++++++++++++++-----
 4 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md
index c7aca5cfdaa6..fbb63ab2ccd1 100644
--- a/cluster-autoscaler/cloudprovider/azure/README.md
+++ b/cluster-autoscaler/cloudprovider/azure/README.md
@@ -128,6 +128,35 @@ To run a cluster autoscaler pod on a master node, the deployment should tolerate
 To run a cluster autoscaler pod with Azure managed service identity (MSI), use [cluster-autoscaler-vmss-msi.yaml](examples/cluster-autoscaler-vmss-msi.yaml) instead.
+
+#### Azure API Throttling
+
+Azure imposes hard limits on the number of read and write requests against Azure APIs, *per subscription, per region*. Running many clusters in a single subscription, or running a single large, dynamic cluster in a subscription, can exceed the number of calls permitted within a given time window for a particular category of requests. See the following documents for more detail on Azure API throttling in general:
+
+- https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling
+- https://docs.microsoft.com/en-us/azure/virtual-machines/troubleshooting/troubleshooting-throttling-errors
+
+Given its dynamic nature, the cluster autoscaler can trigger those rate limits on the subscription. This in turn can affect other components running in the cluster that depend on Azure APIs, such as kube-controller-manager.
+
+When using K8s versions older than v1.18, we recommend using at least **v1.17.5, v1.16.9, or v1.15.12**, which include various cloud-provider improvements that reduce the number of API calls during scale-down operations.
+
+Likewise, for cluster-autoscaler versions older than 1.18, we recommend using at least **v1.17.2, v1.16.5, or v1.15.6**.
+
+In addition, cluster-autoscaler exposes an `AZURE_VMSS_CACHE_TTL` environment variable, which controls the rate of `GetVMScaleSet` calls. By default this is 15 seconds, but setting it to a higher value such as 60 seconds can protect against API throttling. The caches are proactively incremented and decremented on scale-up and scale-down operations, so a higher value has no noticeable impact on performance.
+**Note that the value is in seconds.**
+
+| Config Name | Default | Environment Variable | Cloud Config File |
+| ----------- | ------- | -------------------- | ----------------- |
+| VmssCacheTTL | 15 | AZURE_VMSS_CACHE_TTL | vmssCacheTTL |
+
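+For example, the TTL can be raised via the `env` section of the cluster-autoscaler deployment (an illustrative snippet, not a complete manifest; the value is in seconds and must be a quoted string):
+
+```yaml
+env:
+  - name: AZURE_VMSS_CACHE_TTL
+    value: "60"
+```
+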
+The `AZURE_VMSS_VMS_CACHE_TTL` environment variable controls the rate of `GetScaleSetVms` (VMSS VM List) calls. The default value is 300 seconds.
+A configurable jitter (`AZURE_VMSS_VMS_CACHE_JITTER` environment variable, default 0) expresses the maximum number of seconds that will be subtracted from that initial VMSS cache TTL after a new VMSS is discovered by the cluster-autoscaler: this can prevent a dogpile effect on clusters having many VMSS. For instance, with a 300s TTL and a 120s jitter, a newly discovered VMSS gets its first cache refresh after 180 to 300 seconds, then every 300 seconds.
+
+| Config Name | Default | Environment Variable | Cloud Config File |
+| ----------- | ------- | -------------------- | ----------------- |
+| vmssVmsCacheTTL | 300 | AZURE_VMSS_VMS_CACHE_TTL | vmssVmsCacheTTL |
+| vmssVmsCacheJitter | 0 | AZURE_VMSS_VMS_CACHE_JITTER | vmssVmsCacheJitter |
+
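+Both settings can also be provided through the cloud config file, for example (an illustrative fragment showing only the relevant keys):
+
+```json
+{
+  "vmssVmsCacheTTL": 240,
+  "vmssVmsCacheJitter": 120
+}
+```
+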
+When using K8s 1.18 or higher, it is also recommended to configure backoff and retries on the client, as described [here](#rate-limit-and-back-off-retries).
+
 ### Standard deployment
 
 Prerequisites:
@@ -148,7 +177,7 @@ Make a copy of [cluster-autoscaler-standard-master.yaml](examples/cluster-autosc
 
 In the `cluster-autoscaler` spec, find the `image:` field and replace `{{ ca_version }}` with a specific cluster autoscaler release.
 
-Below that, in the `command:` section, update the `--nodes=` arguments to reference your node limits and node pool name (tips: node pool name is NOT availability set name, e.g., the corresponding node pool name of the availability set
+Below that, in the `command:` section, update the `--nodes=` arguments to reference your node limits and node pool name (tip: the node pool name is NOT the availability set name; e.g., the corresponding node pool name of the availability set
 `agentpool1-availabilitySet-xxxxxxxx` would be `agentpool1`). For example, if node pool "k8s-nodepool-1" should scale from 1 to 10 nodes:
 
 ```yaml
@@ -198,7 +227,7 @@ az aks create \
 
 #### AKS + Availability Set
 
-The CLI based deployment only support VMSS and manual deployment is needed if availability set is used.
+The CLI-based deployment only supports VMSS; manual deployment is needed if an availability set is used.
 
 Prerequisites:
@@ -210,7 +239,7 @@ Prerequisites:
    kubectl get nodes --show-labels
    ```
 
-Make a copy of [cluster-autoscaler-aks.yaml](examples/cluster-autoscaler-aks.yaml). Fill in the placeholder values for
+Make a copy of [cluster-autoscaler-aks.yaml](examples/cluster-autoscaler-aks.yaml). Fill in the placeholder values for
 the `cluster-autoscaler-azure` secret data by base64-encoding each of your Azure credential fields.
 
 - ClientID: ``
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go
index 436cfc82045d..387e502e2e53 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go
@@ -136,6 +136,12 @@ type Config struct {
 	// VMSS metadata cache TTL in seconds, only applies for vmss type
 	VmssCacheTTL int64 `json:"vmssCacheTTL" yaml:"vmssCacheTTL"`
 
+	// VMSS instances cache TTL in seconds, only applies for vmss type
+	VmssVmsCacheTTL int64 `json:"vmssVmsCacheTTL" yaml:"vmssVmsCacheTTL"`
+
+	// Jitter in seconds subtracted from the VMSS cache TTL before the first refresh
+	VmssVmsCacheJitter int `json:"vmssVmsCacheJitter" yaml:"vmssVmsCacheJitter"`
+
 	// number of latest deployments that will not be deleted
 	MaxDeploymentsCount int64 `json:"maxDeploymentsCount" yaml:"maxDeploymentsCount"`
 
@@ -296,6 +302,20 @@ func CreateAzureManager(configReader io.Reader, discoveryOpts cloudprovider.Node
 		}
 	}
 
+	if vmssVmsCacheTTL := os.Getenv("AZURE_VMSS_VMS_CACHE_TTL"); vmssVmsCacheTTL != "" {
+		cfg.VmssVmsCacheTTL, err = strconv.ParseInt(vmssVmsCacheTTL, 10, 0)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse AZURE_VMSS_VMS_CACHE_TTL %q: %v", vmssVmsCacheTTL, err)
+		}
+	}
+
+	if vmssVmsCacheJitter := os.Getenv("AZURE_VMSS_VMS_CACHE_JITTER"); vmssVmsCacheJitter != "" {
+		cfg.VmssVmsCacheJitter, err = strconv.Atoi(vmssVmsCacheJitter)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse AZURE_VMSS_VMS_CACHE_JITTER %q: %v", vmssVmsCacheJitter, err)
+		}
+	}
+
 	if threshold := os.Getenv("AZURE_MAX_DEPLOYMENT_COUNT"); threshold != "" {
 		cfg.MaxDeploymentsCount, err = strconv.ParseInt(threshold, 10, 0)
 		if err != nil {
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
index fe64212e95c1..96ddbdd574dd 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
@@ -42,6 +42,8 @@ const validAzureCfg = `{
 	"routeTableName": "fakeName",
 	"primaryAvailabilitySetName": "fakeName",
 	"vmssCacheTTL": 60,
+	"vmssVmsCacheTTL": 240,
+	"vmssVmsCacheJitter": 120,
 	"maxDeploymentsCount": 8,
 	"cloudProviderRateLimit": false,
 	"routeRateLimit": {
@@ -65,6 +67,8 @@ func TestCreateAzureManagerValidConfig(t *testing.T) {
 		AADClientID:         "fakeId",
 		AADClientSecret:     "fakeId",
 		VmssCacheTTL:        60,
+		VmssVmsCacheTTL:     240,
+		VmssVmsCacheJitter:  120,
 		MaxDeploymentsCount: 8,
 		CloudProviderRateLimitConfig: CloudProviderRateLimitConfig{
 			RateLimitConfig: azclients.RateLimitConfig{
@@ -218,11 +222,12 @@ func TestListScalesets(t *testing.T) {
 				azureRef: azureRef{
 					Name: vmssName,
 				},
-				minSize:           5,
-				maxSize:           50,
-				manager:           manager,
-				curSize:           3,
-				sizeRefreshPeriod: defaultVmssSizeRefreshPeriod,
+				minSize:                5,
+				maxSize:                50,
+				manager:                manager,
+				curSize:                3,
+				sizeRefreshPeriod:      defaultVmssSizeRefreshPeriod,
+				instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
 			}},
 		},
 		{
@@ -324,11 +329,12 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
 		azureRef: azureRef{
 			Name: vmssName,
 		},
-		minSize:           minVal,
-		maxSize:           maxVal,
-		manager:           manager,
-		curSize:           3,
-		sizeRefreshPeriod: defaultVmssSizeRefreshPeriod,
+		minSize:                minVal,
+		maxSize:                maxVal,
+		manager:                manager,
+		curSize:                3,
+		sizeRefreshPeriod:      defaultVmssSizeRefreshPeriod,
+		instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
 	}}
 	assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs),
 		"expected %#v, but found: %#v", expectedAsgs, asgs)
 }
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
index 869fdde02238..a2517c933bd1 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
@@ -43,10 +43,10 @@ import (
 )
 
 var (
-	defaultVmssSizeRefreshPeriod = 15 * time.Second
-	vmssInstancesRefreshPeriod   = 5 * time.Minute
-	vmssContextTimeout           = 3 * time.Minute
-	vmssSizeMutex                sync.Mutex
+	defaultVmssSizeRefreshPeriod      = 15 * time.Second
+	defaultVmssInstancesRefreshPeriod = 5 * time.Minute
+	vmssContextTimeout                = 3 * time.Minute
+	vmssSizeMutex                     sync.Mutex
 )
 
 var scaleSetStatusCache struct {
@@ -83,6 +83,9 @@ type ScaleSet struct {
 	lastSizeRefresh   time.Time
 	sizeRefreshPeriod time.Duration
 
+	instancesRefreshPeriod time.Duration
+	instancesRefreshJitter int
+
 	instanceMutex       sync.Mutex
 	instanceCache       []cloudprovider.Instance
 	lastInstanceRefresh time.Time
@@ -98,6 +101,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
 		maxSize: spec.MaxSize,
 		manager: az,
 		curSize: curSize,
+
+		instancesRefreshJitter: az.config.VmssVmsCacheJitter,
 	}
 
 	if az.config.VmssCacheTTL != 0 {
@@ -106,6 +111,12 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
 		scaleSet.sizeRefreshPeriod = defaultVmssSizeRefreshPeriod
 	}
 
+	if az.config.VmssVmsCacheTTL != 0 {
+		scaleSet.instancesRefreshPeriod = time.Duration(az.config.VmssVmsCacheTTL) * time.Second
+	} else {
+		scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod
+	}
+
 	return scaleSet, nil
 }
 
@@ -683,25 +694,33 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
 	defer scaleSet.instanceMutex.Unlock()
 
 	if int64(len(scaleSet.instanceCache)) == curSize &&
-		scaleSet.lastInstanceRefresh.Add(vmssInstancesRefreshPeriod).After(time.Now()) {
+		scaleSet.lastInstanceRefresh.Add(scaleSet.instancesRefreshPeriod).After(time.Now()) {
 		klog.V(4).Infof("Nodes: returns with curSize %d", curSize)
 		return scaleSet.instanceCache, nil
 	}
 
 	klog.V(4).Infof("Nodes: starts to get VMSS VMs")
+
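+	// For a scale set with no instances refresh recorded yet (a newly
+	// discovered VMSS), backdate the refresh timestamp by a random
+	// 0..instancesRefreshJitter seconds: its first cache expiry then lands
+	// ahead of its peers', splaying subsequent VMSS VM List calls.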
"expected %#v, but found: %#v", expectedAsgs, asgs) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 869fdde02238..a2517c933bd1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -43,10 +43,10 @@ import ( ) var ( - defaultVmssSizeRefreshPeriod = 15 * time.Second - vmssInstancesRefreshPeriod = 5 * time.Minute - vmssContextTimeout = 3 * time.Minute - vmssSizeMutex sync.Mutex + defaultVmssSizeRefreshPeriod = 15 * time.Second + defaultVmssInstancesRefreshPeriod = 5 * time.Minute + vmssContextTimeout = 3 * time.Minute + vmssSizeMutex sync.Mutex ) var scaleSetStatusCache struct { @@ -83,6 +83,9 @@ type ScaleSet struct { lastSizeRefresh time.Time sizeRefreshPeriod time.Duration + instancesRefreshPeriod time.Duration + instancesRefreshJitter int + instanceMutex sync.Mutex instanceCache []cloudprovider.Instance lastInstanceRefresh time.Time @@ -98,6 +101,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( maxSize: spec.MaxSize, manager: az, curSize: curSize, + + instancesRefreshJitter: az.config.VmssVmsCacheJitter, } if az.config.VmssCacheTTL != 0 { @@ -106,6 +111,12 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( scaleSet.sizeRefreshPeriod = defaultVmssSizeRefreshPeriod } + if az.config.VmssVmsCacheTTL != 0 { + scaleSet.instancesRefreshPeriod = time.Duration(az.config.VmssVmsCacheTTL) * time.Second + } else { + scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod + } + return scaleSet, nil } @@ -683,25 +694,33 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) { defer scaleSet.instanceMutex.Unlock() if int64(len(scaleSet.instanceCache)) == curSize && - scaleSet.lastInstanceRefresh.Add(vmssInstancesRefreshPeriod).After(time.Now()) { + scaleSet.lastInstanceRefresh.Add(scaleSet.instancesRefreshPeriod).After(time.Now()) { klog.V(4).Infof("Nodes: returns with curSize %d", curSize) return scaleSet.instanceCache, nil } klog.V(4).Infof("Nodes: starts to get VMSS VMs") + + lastRefresh := time.Now() + if scaleSet.lastInstanceRefresh.IsZero() && scaleSet.instancesRefreshJitter > 0 { + // new VMSS: spread future refreshs + splay := rand.New(rand.NewSource(time.Now().UnixNano())).Intn(scaleSet.instancesRefreshJitter + 1) + lastRefresh = time.Now().Add(-time.Second * time.Duration(splay)) + } + vms, rerr := scaleSet.GetScaleSetVms() if rerr != nil { if isAzureRequestsThrottled(rerr) { - // Log a warning and update the instance refresh time so that it would retry after next vmssInstancesRefreshPeriod. + // Log a warning and update the instance refresh time so that it would retry after next scaleSet.instanceRefreshPeriod. klog.Warningf("GetScaleSetVms() is throttled with message %v, would return the cached instances", rerr) - scaleSet.lastInstanceRefresh = time.Now() + scaleSet.lastInstanceRefresh = lastRefresh return scaleSet.instanceCache, nil } return nil, rerr.Error() } scaleSet.instanceCache = buildInstanceCache(vms) - scaleSet.lastInstanceRefresh = time.Now() + scaleSet.lastInstanceRefresh = lastRefresh klog.V(4).Infof("Nodes: returns") return scaleSet.instanceCache, nil } @@ -766,7 +785,7 @@ func instanceStatusFromVM(vm compute.VirtualMachineScaleSetVM) *cloudprovider.In func (scaleSet *ScaleSet) invalidateInstanceCache() { scaleSet.instanceMutex.Lock() // Set the instanceCache as outdated. 
-	scaleSet.lastInstanceRefresh = time.Now().Add(-1 * vmssInstancesRefreshPeriod)
+	scaleSet.lastInstanceRefresh = time.Now().Add(-1 * scaleSet.instancesRefreshPeriod)
 	scaleSet.instanceMutex.Unlock()
 }