From e5ebbf61107a8cb208855f624c8b8ba4769fa661 Mon Sep 17 00:00:00 2001 From: k8s-infra-cherrypick-robot <90416843+k8s-infra-cherrypick-robot@users.noreply.github.com> Date: Fri, 6 Sep 2024 11:52:25 -0700 Subject: [PATCH] [cluster-autoscaler-release-1.31] Dynamic listing of SKUs (#7254) * Dynamic listing of SKUs (master) * re-add enableDynamicInstanceList * update README.md --------- Co-authored-by: Rachel Gregory --- .../cloudprovider/azure/azure_cache.go | 60 ++++++++++--------- .../cloudprovider/azure/azure_config.go | 2 +- .../cloudprovider/azure/azure_manager.go | 6 ++ 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index c8334ac78771..14ce9254e2c5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -18,6 +18,7 @@ package azure import ( "context" + "errors" "reflect" "regexp" "strings" @@ -96,7 +97,7 @@ type azureCache struct { unownedInstances map[azureRef]bool autoscalingOptions map[azureRef]map[string]string - skus map[string]*skewer.Cache + skus *skewer.Cache } func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) { @@ -113,17 +114,19 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*az instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup), unownedInstances: make(map[azureRef]bool), autoscalingOptions: make(map[azureRef]map[string]string), - skus: make(map[string]*skewer.Cache), - } - - if config.EnableDynamicInstanceList { - cache.skus[config.Location] = &skewer.Cache{} + skus: &skewer.Cache{}, // populated iff config.EnableDynamicInstanceList } if err := cache.regenerate(); err != nil { klog.Errorf("Error while regenerating Azure cache: %v", err) } + if config.EnableDynamicInstanceList { + if err := cache.fetchSKUCache(config.Location); err != nil { + klog.Errorf("Error while populating SKU list: %v", err) + } + } + return cache, nil } @@ -186,21 +189,11 @@ func (m *azureCache) regenerate() error { newAutoscalingOptions[ref] = options } - newSkuCache := make(map[string]*skewer.Cache) - for location := range m.skus { - cache, err := m.fetchSKUs(context.Background(), location) - if err != nil { - return err - } - newSkuCache[location] = cache - } - m.mutex.Lock() defer m.mutex.Unlock() m.instanceToNodeGroup = newInstanceToNodeGroupCache m.autoscalingOptions = newAutoscalingOptions - m.skus = newSkuCache // Reset unowned instances cache. m.unownedInstances = make(map[azureRef]bool) @@ -208,6 +201,18 @@ func (m *azureCache) regenerate() error { return nil } +func (m *azureCache) fetchSKUCache(location string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + cache, err := m.fetchSKUs(context.Background(), location) + if err != nil { + return err + } + m.skus = cache + return nil +} + // fetchAzureResources retrieves and updates the cached Azure resources. // // This function performs the following: @@ -359,27 +364,26 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool { } func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) { + if location == "" { + return nil, errors.New("location not specified") + } + return skewer.NewCache(ctx, skewer.WithLocation(location), skewer.WithResourceClient(m.azClient.skuClient), ) } +// HasVMSKUS returns true if the cache has any VM SKUs. Can be used to judge success of loading. +func (m *azureCache) HasVMSKUs() bool { + // not nil or empty (using the only exposed semi-efficient way to check) + return !(m.skus == nil || m.skus.Equal(&skewer.Cache{})) +} + func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) { m.mutex.Lock() defer m.mutex.Unlock() - - cache, ok := m.skus[location] - if !ok { - var err error - cache, err = m.fetchSKUs(ctx, location) - if err != nil { - klog.V(1).Infof("Failed to instantiate cache, err: %v", err) - return skewer.SKU{}, err - } - m.skus[location] = cache - } - + cache := m.skus return cache.Get(ctx, skuName, skewer.VirtualMachines, location) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 49d0bdf32d4d..98a416ed98af 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -86,7 +86,7 @@ type Config struct { // EnableForceDelete defines whether to enable force deletion on the APIs EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` - // EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check + // (DEPRECATED, DO NOT USE) EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"` // (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index 6c0f62a099fa..ad9a5d836f02 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -110,6 +110,11 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi } manager.azureCache = cache + if !manager.azureCache.HasVMSKUs() { + klog.Warning("No VM SKU info loaded, using only static SKU list") + cfg.EnableDynamicInstanceList = false + } + specs, err := ParseLabelAutoDiscoverySpecs(discoveryOpts) if err != nil { return nil, err @@ -128,6 +133,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi Cap: 10 * time.Minute, } + // skuCache will already be created at this step by newAzureCache() err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) { return manager.forceRefresh() })