Skip to content

Commit

Permalink
feat: HasInstance implementation
Browse files Browse the repository at this point in the history
fix: ci lint

refactor: cloudprovider HasInstance should use case sensitive instance group lookup

feat: using NodeGroupForNode for the azure provider has instance implementation

fix: ci lint

fix: ci

wip: HasInstance impl based on a fork of NodeGroupForNode

refactor: refactoring tests and sharing validation between ngfornode + hasinstance

test: removing commented out test

ci: fix formatting

fix: properly handling unmanaged nodes

fix: removing unused helper

fix: using provider id to get scaleset name

fix: putting items into the set as lowercase to avoid mismatch in lookup

Update cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go

Co-authored-by: Alex Leites <[email protected]>

fix: always fallback if we are unsure an instance is in the cache

test: adding back in TestNodeGroupForNodeWithNoProviderID

refactor: removing log lines since this will spam for non-autoscaled nodes

refactor: removing dead code

fix: simplifying logic since we no longer distinguish on delete

Update cluster-autoscaler/cloudprovider/azure/azure_cache.go

test: renaming test to better reflect the scenario

Update cluster-autoscaler/cloudprovider/azure/azure_cache.go

Co-authored-by: Alex Leites <[email protected]>

refactor: removing shared helpers since they are no longer shared and didnt make any sense in the first place

refactor: removing unused code and adding a test for has instance happy path

Update cluster-autoscaler/cloudprovider/azure/azure_cache.go

ci: lint

Update cluster-autoscaler/cloudprovider/azure/azure_cache.go

Co-authored-by: Alex Leites <[email protected]>
  • Loading branch information
Bryce-Soghigian and tallaxes committed Aug 2, 2024
1 parent fb80743 commit 230b51a
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 49 deletions.
28 changes: 28 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,14 @@ func (m *azureCache) regenerate() error {
return nil
}

// fetchAzureResources retrieves and updates the cached Azure resources.
//
// This function performs the following:
// - Fetches and updates the list of Virtual Machine Scale Sets (VMSS) in the specified resource group.
// - Fetches and updates the list of Virtual Machines (VMs) and identifies the node pools they belong to.
// - Maintains a set of VMs pools and VMSS resources which helps the Cluster Autoscaler (CAS) operate on mixed node pools.
//
// Returns an error if any of the Azure API calls fail.
func (m *azureCache) fetchAzureResources() error {
m.mutex.Lock()
defer m.mutex.Unlock()
Expand Down Expand Up @@ -290,6 +298,7 @@ func (m *azureCache) Register(nodeGroup cloudprovider.NodeGroup) bool {
}

klog.V(4).Infof("Registering Node Group %q", nodeGroup.Id())

m.registeredNodeGroups = append(m.registeredNodeGroups, nodeGroup)
m.invalidateUnownedInstanceCache()
return true
Expand Down Expand Up @@ -358,6 +367,25 @@ func (m *azureCache) getAutoscalingOptions(ref azureRef) map[string]string {
return m.autoscalingOptions[ref]
}

// HasInstance returns if a given instance exists in the azure cache
func (m *azureCache) HasInstance(providerID string) (bool, error) {
m.mutex.Lock()
defer m.mutex.Unlock()
resourceID, err := convertResourceGroupNameToLower(providerID)
if err != nil {
// Most likely an invalid resource id, we should return an error
// most of these shouldn't make it here do to higher level
// validation in the HasInstance azure.cloudprovider function
return false, err
}

if m.getInstanceFromCache(resourceID) != nil {
return true, nil
}
// couldn't find instance in the cache, assume it's deleted
return false, cloudprovider.ErrNotImplemented
}

// FindForInstance returns node group of the given Instance
func (m *azureCache) FindForInstance(instance *azureRef, vmType string) (cloudprovider.NodeGroup, error) {
m.mutex.Lock()
Expand Down
26 changes: 23 additions & 3 deletions cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ limitations under the License.
package azure

import (
"fmt"
"io"
"os"
"strings"

apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -114,9 +116,27 @@ func (azure *AzureCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovid
return azure.azureManager.GetNodeGroupForInstance(ref)
}

// HasInstance returns whether a given node has a corresponding instance in this cloud provider
func (azure *AzureCloudProvider) HasInstance(*apiv1.Node) (bool, error) {
return true, cloudprovider.ErrNotImplemented
// HasInstance returns whether a given node has a corresponding instance in this cloud provider.
//
// Used to prevent undercount of existing VMs (taint-based overcount of deleted VMs),
// and so should not return false, nil (no instance) if uncertain; return error instead.
// (Think "has instance for sure, else error".) Returning an error causes fallback to taint-based
// determination; use ErrNotImplemented for silent fallback, any other error will be logged.
//
// Expected behavior (should work for VMSS Uniform/Flex, and VMs):
// - exists : return true, nil
// - !exists : return *, ErrNotImplemented (could use custom error for autoscaled nodes)
// - unimplemented case : return *, ErrNotImplemented
// - any other error : return *, error
func (azure *AzureCloudProvider) HasInstance(node *apiv1.Node) (bool, error) {
if node.Spec.ProviderID == "" {
return false, fmt.Errorf("ProviderID for node: %s is empty, skipped", node.Name)
}

if !strings.HasPrefix(node.Spec.ProviderID, "azure://") {
return false, fmt.Errorf("invalid azure ProviderID prefix for node: %s, skipped", node.Name)
}
return azure.azureManager.azureCache.HasInstance(node.Spec.ProviderID)
}

// Pricing returns pricing model for this cloud provider or error if not available.
Expand Down
213 changes: 167 additions & 46 deletions cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package azure

import (
"fmt"
"testing"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" //nolint SA1019 - deprecated package
Expand All @@ -27,6 +28,8 @@ import (
"github.com/stretchr/testify/assert"

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmclient/mockvmclient"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient"
Expand Down Expand Up @@ -119,62 +122,180 @@ func TestNodeGroups(t *testing.T) {
assert.Equal(t, len(provider.NodeGroups()), 1)
}

func TestHasInstance(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

provider := newTestProvider(t)
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMClient := mockvmclient.NewMockInterface(ctrl)
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachinesClient = mockVMClient
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient

// Simulate node groups and instances
expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform)
expectedVMSSVMs := newTestVMSSVMList(3)

mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes()
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()

// Register node groups
assert.Equal(t, len(provider.NodeGroups()), 0)
registered := provider.azureManager.RegisterNodeGroup(
newTestScaleSet(provider.azureManager, "test-asg"),
)
provider.azureManager.explicitlyConfigured["test-asg"] = true
assert.True(t, registered)

assert.Equal(t, len(provider.NodeGroups()), 1)

// Refresh cache
provider.azureManager.forceRefresh()

// Test HasInstance for a node from the VMSS pool
node := newApiNode(compute.Uniform, 0)
hasInstance, err := provider.azureManager.azureCache.HasInstance(node.Spec.ProviderID)
assert.True(t, hasInstance)
assert.NoError(t, err)

}

func TestUnownedInstancesFallbackToDeletionTaint(t *testing.T) {
// VMSS Instances that belong to a VMSS on the cluster but do not belong to a registered ASG
// should return err unimplemented for HasInstance
ctrl := gomock.NewController(t)
defer ctrl.Finish()
provider := newTestProvider(t)
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMClient := mockvmclient.NewMockInterface(ctrl)
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachinesClient = mockVMClient
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient

// // Simulate VMSS instances
unregisteredVMSSInstance := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "unregistered-vmss-node",
},
Spec: apiv1.NodeSpec{
ProviderID: "azure:///subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/unregistered-vmss-instance-id/virtualMachines/0",
},
}
// Mock responses to simulate that the instance belongs to a VMSS not in any registered ASG
expectedVMSSVMs := newTestVMSSVMList(1)
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "unregistered-vmss-instance-id", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()

// Call HasInstance and check the result
hasInstance, err := provider.azureManager.azureCache.HasInstance(unregisteredVMSSInstance.Spec.ProviderID)
assert.False(t, hasInstance)
assert.Equal(t, cloudprovider.ErrNotImplemented, err)
}

func TestHasInstanceProviderIDErrorValidation(t *testing.T) {
provider := newTestProvider(t)
// Test case: Node with an empty ProviderID
nodeWithoutValidProviderID := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Spec: apiv1.NodeSpec{
ProviderID: "",
},
}
_, err := provider.HasInstance(nodeWithoutValidProviderID)
assert.Equal(t, "ProviderID for node: test-node is empty, skipped", err.Error())

// Test cases: Nodes with invalid ProviderID prefixes
invalidProviderIDs := []string{
"aazure://",
"kubemark://",
"kwok://",
"incorrect!",
}

for _, providerID := range invalidProviderIDs {
invalidProviderIDNode := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Spec: apiv1.NodeSpec{
ProviderID: providerID,
},
}
_, err := provider.HasInstance(invalidProviderIDNode)
assert.Equal(t, "invalid azure ProviderID prefix for node: test-node, skipped", err.Error())
}
}

func TestNodeGroupForNode(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible}
orchestrationModes := []compute.OrchestrationMode{compute.Uniform, compute.Flexible}

expectedVMSSVMs := newTestVMSSVMList(3)
expectedVMs := newTestVMList(3)

for _, orchMode := range orchestrationModes {
expectedScaleSets := newTestVMSSList(3, testASG, "eastus", orchMode)
provider := newTestProvider(t)
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil)
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient

mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient

mockVMClient := mockvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachinesClient = mockVMClient

if orchMode == compute.Uniform {
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup,
testASG, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
} else {
provider.azureManager.config.EnableVmssFlex = true
mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), testASG).Return(expectedVMs, nil).AnyTimes()
}
t.Run(fmt.Sprintf("OrchestrationMode_%v", orchMode), func(t *testing.T) {
expectedScaleSets := newTestVMSSList(3, testASG, "eastus", orchMode)
provider := newTestProvider(t)
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil)
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient

registered := provider.azureManager.RegisterNodeGroup(
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient

mockVMClient := mockvmclient.NewMockInterface(ctrl)
provider.azureManager.azClient.virtualMachinesClient = mockVMClient

if orchMode == compute.Uniform {
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup,
testASG, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
} else {
provider.azureManager.config.EnableVmssFlex = true
mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), testASG).Return(expectedVMs, nil).AnyTimes()
}

registered := provider.azureManager.RegisterNodeGroup(
newTestScaleSet(provider.azureManager, testASG))
provider.azureManager.explicitlyConfigured[testASG] = true
assert.True(t, registered)
assert.Equal(t, len(provider.NodeGroups()), 1)

node := newApiNode(orchMode, 0)
// refresh cache
err := provider.azureManager.forceRefresh()
assert.NoError(t, err)
group, err := provider.NodeGroupForNode(node)
assert.NoError(t, err)
assert.NotNil(t, group, "Group should not be nil")
assert.Equal(t, group.Id(), testASG)
assert.Equal(t, group.MinSize(), 1)
assert.Equal(t, group.MaxSize(), 5)

// test node in cluster that is not in a group managed by cluster autoscaler
nodeNotInGroup := &apiv1.Node{
Spec: apiv1.NodeSpec{
ProviderID: azurePrefix + "/subscriptions/subscripion/resourceGroups/test-resource-group/providers/" +
"Microsoft.Compute/virtualMachines/test-instance-id-not-in-group",
},
}
group, err = provider.NodeGroupForNode(nodeNotInGroup)
assert.NoError(t, err)
assert.Nil(t, group)
provider.azureManager.explicitlyConfigured[testASG] = true
assert.True(t, registered)
assert.Equal(t, len(provider.NodeGroups()), 1)

node := newApiNode(orchMode, 0)
// refresh cache
err := provider.azureManager.forceRefresh()
assert.NoError(t, err)
group, err := provider.NodeGroupForNode(node)
assert.NoError(t, err)
assert.NotNil(t, group, "Group should not be nil")
assert.Equal(t, group.Id(), testASG)
assert.Equal(t, group.MinSize(), 1)
assert.Equal(t, group.MaxSize(), 5)

hasInstance, err := provider.HasInstance(node)
assert.True(t, hasInstance)
assert.NoError(t, err)

// test node in cluster that is not in a group managed by cluster autoscaler
nodeNotInGroup := &apiv1.Node{
Spec: apiv1.NodeSpec{
ProviderID: "azure:///subscriptions/subscription/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachineScaleSets/test/virtualMachines/test-instance-id-not-in-group",
},
}
group, err = provider.NodeGroupForNode(nodeNotInGroup)
assert.NoError(t, err)
assert.Nil(t, group)

hasInstance, err = provider.HasInstance(nodeNotInGroup)
assert.False(t, hasInstance)
assert.Error(t, err)
assert.Equal(t, err, cloudprovider.ErrNotImplemented)
})
}
}

Expand Down

0 comments on commit 230b51a

Please sign in to comment.