Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cluster-autoscaler-release-1.31] Dynamic listing of SKUs #7254

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 32 additions & 28 deletions cluster-autoscaler/cloudprovider/azure/azure_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"context"
"errors"
"reflect"
"regexp"
"strings"
Expand Down Expand Up @@ -96,7 +97,7 @@ type azureCache struct {
unownedInstances map[azureRef]bool

autoscalingOptions map[azureRef]map[string]string
skus map[string]*skewer.Cache
skus *skewer.Cache
}

func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) {
Expand All @@ -113,17 +114,19 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*az
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
unownedInstances: make(map[azureRef]bool),
autoscalingOptions: make(map[azureRef]map[string]string),
skus: make(map[string]*skewer.Cache),
}

if config.EnableDynamicInstanceList {
cache.skus[config.Location] = &skewer.Cache{}
skus: &skewer.Cache{}, // populated iff config.EnableDynamicInstanceList
}

if err := cache.regenerate(); err != nil {
klog.Errorf("Error while regenerating Azure cache: %v", err)
}

if config.EnableDynamicInstanceList {
if err := cache.fetchSKUCache(config.Location); err != nil {
klog.Errorf("Error while populating SKU list: %v", err)
}
}

return cache, nil
}

Expand Down Expand Up @@ -186,28 +189,30 @@ func (m *azureCache) regenerate() error {
newAutoscalingOptions[ref] = options
}

newSkuCache := make(map[string]*skewer.Cache)
for location := range m.skus {
cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
newSkuCache[location] = cache
}

m.mutex.Lock()
defer m.mutex.Unlock()

m.instanceToNodeGroup = newInstanceToNodeGroupCache
m.autoscalingOptions = newAutoscalingOptions
m.skus = newSkuCache

// Reset unowned instances cache.
m.unownedInstances = make(map[azureRef]bool)

return nil
}

func (m *azureCache) fetchSKUCache(location string) error {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
m.skus = cache
return nil
}

// fetchAzureResources retrieves and updates the cached Azure resources.
//
// This function performs the following:
Expand Down Expand Up @@ -359,27 +364,26 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool {
}

func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) {
if location == "" {
return nil, errors.New("location not specified")
}

return skewer.NewCache(ctx,
skewer.WithLocation(location),
skewer.WithResourceClient(m.azClient.skuClient),
)
}

// HasVMSKUS returns true if the cache has any VM SKUs. Can be used to judge success of loading.
func (m *azureCache) HasVMSKUs() bool {
// not nil or empty (using the only exposed semi-efficient way to check)
return !(m.skus == nil || m.skus.Equal(&skewer.Cache{}))
}

func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, ok := m.skus[location]
if !ok {
var err error
cache, err = m.fetchSKUs(ctx, location)
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return skewer.SKU{}, err
}
m.skus[location] = cache
}

cache := m.skus
return cache.Get(ctx, skuName, skewer.VirtualMachines, location)
}

Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ type Config struct {
// EnableForceDelete defines whether to enable force deletion on the APIs
EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"`

// EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
// (DEPRECATED, DO NOT USE) EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"`

// (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info
Expand Down
6 changes: 6 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi
}
manager.azureCache = cache

if !manager.azureCache.HasVMSKUs() {
klog.Warning("No VM SKU info loaded, using only static SKU list")
cfg.EnableDynamicInstanceList = false
}

specs, err := ParseLabelAutoDiscoverySpecs(discoveryOpts)
if err != nil {
return nil, err
Expand All @@ -128,6 +133,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi
Cap: 10 * time.Minute,
}

// skuCache will already be created at this step by newAzureCache()
err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) {
return manager.forceRefresh()
})
Expand Down
Loading