feat: HasInstance implementation

fix: ci lint refactor: cloudprovider HasInstance should use case sensitive instance group lookup feat: using NodeGroupForNode for the azure provider has instance implementation fix: ci lint fix: ci wip: HasInstance impl based on a fork of NodeGroupForNode refactor: refactoring tests and sharing validation between ngfornode + hasinstance test: removing commented out test ci: fix formatting fix: properly handling unmanaged nodes fix: removing unused helper fix: using provider id to get scaleset name fix: putting items into the set as lowercase to avoid mismatch in lookup Update cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go Co-authored-by: Alex Leites <[email protected]> fix: always fallback if we are unsure an instance is in the cache test: adding back in TestNodeGroupForNodeWithNoProviderID refactor: removing log lines since this will spam for non-autoscaled nodes refactor: removing dead code fix: simplifying logic since we no longer distinguish on delete Update cluster-autoscaler/cloudprovider/azure/azure_cache.go test: renaming test to better reflect the scenario Update cluster-autoscaler/cloudprovider/azure/azure_cache.go Co-authored-by: Alex Leites <[email protected]> refactor: removing shared helpers since they are no longer shared and didnt make any sense in the first place refactor: removing unused code and adding a test for has instance happy path Update cluster-autoscaler/cloudprovider/azure/azure_cache.go ci: lint Update cluster-autoscaler/cloudprovider/azure/azure_cache.go Co-authored-by: Alex Leites <[email protected]>
kubernetes · Aug 2, 2024 · 230b51a · 230b51a
1 parent fb80743
commit 230b51a
Show file tree

Hide file tree

Showing 3 changed files with 218 additions and 49 deletions.
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go
@@ -197,6 +197,14 @@ func (m *azureCache) regenerate() error {
 	return nil
 }
 
+// fetchAzureResources retrieves and updates the cached Azure resources.
+//
+// This function performs the following:
+// - Fetches and updates the list of Virtual Machine Scale Sets (VMSS) in the specified resource group.
+// - Fetches and updates the list of Virtual Machines (VMs) and identifies the node pools they belong to.
+// - Maintains a set of VMs pools and VMSS resources which helps the Cluster Autoscaler (CAS) operate on mixed node pools.
+//
+// Returns an error if any of the Azure API calls fail.
 func (m *azureCache) fetchAzureResources() error {
 	m.mutex.Lock()
 	defer m.mutex.Unlock()
@@ -290,6 +298,7 @@ func (m *azureCache) Register(nodeGroup cloudprovider.NodeGroup) bool {
 	}
 
 	klog.V(4).Infof("Registering Node Group %q", nodeGroup.Id())
+
 	m.registeredNodeGroups = append(m.registeredNodeGroups, nodeGroup)
 	m.invalidateUnownedInstanceCache()
 	return true
@@ -358,6 +367,25 @@ func (m *azureCache) getAutoscalingOptions(ref azureRef) map[string]string {
 	return m.autoscalingOptions[ref]
 }
 
+// HasInstance returns if a given instance exists in the azure cache
+func (m *azureCache) HasInstance(providerID string) (bool, error) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	resourceID, err := convertResourceGroupNameToLower(providerID)
+	if err != nil {
+		// Most likely an invalid resource id, we should return an error
+		// most of these shouldn't make it here do to higher level
+		// validation in the HasInstance azure.cloudprovider function
+		return false, err
+	}
+
+	if m.getInstanceFromCache(resourceID) != nil {
+		return true, nil
+	}
+	// couldn't find instance in the cache, assume it's deleted
+	return false, cloudprovider.ErrNotImplemented
+}
+
 // FindForInstance returns node group of the given Instance
 func (m *azureCache) FindForInstance(instance *azureRef, vmType string) (cloudprovider.NodeGroup, error) {
 	m.mutex.Lock()

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go
@@ -17,8 +17,10 @@ limitations under the License.
 package azure
 
 import (
+	"fmt"
 	"io"
 	"os"
+	"strings"
 
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -114,9 +116,27 @@ func (azure *AzureCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovid
 	return azure.azureManager.GetNodeGroupForInstance(ref)
 }
 
-// HasInstance returns whether a given node has a corresponding instance in this cloud provider
-func (azure *AzureCloudProvider) HasInstance(*apiv1.Node) (bool, error) {
-	return true, cloudprovider.ErrNotImplemented
+// HasInstance returns whether a given node has a corresponding instance in this cloud provider.
+//
+// Used to prevent undercount of existing VMs (taint-based overcount of deleted VMs),
+// and so should not return false, nil (no instance) if uncertain; return error instead.
+// (Think "has instance for sure, else error".) Returning an error causes fallback to taint-based
+// determination; use ErrNotImplemented for silent fallback, any other error will be logged.
+//
+// Expected behavior (should work for VMSS Uniform/Flex, and VMs):
+// -  exists            : return true, nil
+// - !exists            : return *,    ErrNotImplemented (could use custom error for autoscaled nodes)
+// - unimplemented case : return *,    ErrNotImplemented
+// - any other error    : return *,    error
+func (azure *AzureCloudProvider) HasInstance(node *apiv1.Node) (bool, error) {
+	if node.Spec.ProviderID == "" {
+		return false, fmt.Errorf("ProviderID for node: %s is empty, skipped", node.Name)
+	}
+
+	if !strings.HasPrefix(node.Spec.ProviderID, "azure://") {
+		return false, fmt.Errorf("invalid azure ProviderID prefix for node: %s, skipped", node.Name)
+	}
+	return azure.azureManager.azureCache.HasInstance(node.Spec.ProviderID)
 }
 
 // Pricing returns pricing model for this cloud provider or error if not available.

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package azure
 
 import (
+	"fmt"
 	"testing"
 
 	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"     //nolint SA1019 - deprecated package
@@ -27,6 +28,8 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	apiv1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmclient/mockvmclient"
 	"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient"
@@ -119,62 +122,180 @@ func TestNodeGroups(t *testing.T) {
 	assert.Equal(t, len(provider.NodeGroups()), 1)
 }
 
+func TestHasInstance(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	provider := newTestProvider(t)
+	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
+	mockVMClient := mockvmclient.NewMockInterface(ctrl)
+	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
+	provider.azureManager.azClient.virtualMachinesClient = mockVMClient
+	provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
+	provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
+
+	// Simulate node groups and instances
+	expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform)
+	expectedVMSSVMs := newTestVMSSVMList(3)
+
+	mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes()
+	mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
+
+	// Register node groups
+	assert.Equal(t, len(provider.NodeGroups()), 0)
+	registered := provider.azureManager.RegisterNodeGroup(
+		newTestScaleSet(provider.azureManager, "test-asg"),
+	)
+	provider.azureManager.explicitlyConfigured["test-asg"] = true
+	assert.True(t, registered)
+
+	assert.Equal(t, len(provider.NodeGroups()), 1)
+
+	// Refresh cache
+	provider.azureManager.forceRefresh()
+
+	// Test HasInstance for a node from the VMSS pool
+	node := newApiNode(compute.Uniform, 0)
+	hasInstance, err := provider.azureManager.azureCache.HasInstance(node.Spec.ProviderID)
+	assert.True(t, hasInstance)
+	assert.NoError(t, err)
+
+}
+
+func TestUnownedInstancesFallbackToDeletionTaint(t *testing.T) {
+	// VMSS Instances that belong to a VMSS on the cluster but do not belong to a registered ASG
+	// should return err unimplemented for HasInstance
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+	provider := newTestProvider(t)
+	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
+	mockVMClient := mockvmclient.NewMockInterface(ctrl)
+	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
+	provider.azureManager.azClient.virtualMachinesClient = mockVMClient
+	provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
+	provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
+
+	// 	// Simulate VMSS instances
+	unregisteredVMSSInstance := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "unregistered-vmss-node",
+		},
+		Spec: apiv1.NodeSpec{
+			ProviderID: "azure:///subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/unregistered-vmss-instance-id/virtualMachines/0",
+		},
+	}
+	// Mock responses to simulate that the instance belongs to a VMSS not in any registered ASG
+	expectedVMSSVMs := newTestVMSSVMList(1)
+	mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "unregistered-vmss-instance-id", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
+
+	// Call HasInstance and check the result
+	hasInstance, err := provider.azureManager.azureCache.HasInstance(unregisteredVMSSInstance.Spec.ProviderID)
+	assert.False(t, hasInstance)
+	assert.Equal(t, cloudprovider.ErrNotImplemented, err)
+}
+
+func TestHasInstanceProviderIDErrorValidation(t *testing.T) {
+	provider := newTestProvider(t)
+	// Test case: Node with an empty ProviderID
+	nodeWithoutValidProviderID := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "test-node",
+		},
+		Spec: apiv1.NodeSpec{
+			ProviderID: "",
+		},
+	}
+	_, err := provider.HasInstance(nodeWithoutValidProviderID)
+	assert.Equal(t, "ProviderID for node: test-node is empty, skipped", err.Error())
+
+	// Test cases: Nodes with invalid ProviderID prefixes
+	invalidProviderIDs := []string{
+		"aazure://",
+		"kubemark://",
+		"kwok://",
+		"incorrect!",
+	}
+
+	for _, providerID := range invalidProviderIDs {
+		invalidProviderIDNode := &apiv1.Node{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "test-node",
+			},
+			Spec: apiv1.NodeSpec{
+				ProviderID: providerID,
+			},
+		}
+		_, err := provider.HasInstance(invalidProviderIDNode)
+		assert.Equal(t, "invalid azure ProviderID prefix for node: test-node, skipped", err.Error())
+	}
+}
+
 func TestNodeGroupForNode(t *testing.T) {
 	ctrl := gomock.NewController(t)
 	defer ctrl.Finish()
-	orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible}
+	orchestrationModes := []compute.OrchestrationMode{compute.Uniform, compute.Flexible}
 
 	expectedVMSSVMs := newTestVMSSVMList(3)
 	expectedVMs := newTestVMList(3)
 
 	for _, orchMode := range orchestrationModes {
-		expectedScaleSets := newTestVMSSList(3, testASG, "eastus", orchMode)
-		provider := newTestProvider(t)
-		mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
-		mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil)
-		provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
-
-		mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
-		provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
-
-		mockVMClient := mockvmclient.NewMockInterface(ctrl)
-		provider.azureManager.azClient.virtualMachinesClient = mockVMClient
-
-		if orchMode == compute.Uniform {
-			mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup,
-				testASG, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
-		} else {
-			provider.azureManager.config.EnableVmssFlex = true
-			mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), testASG).Return(expectedVMs, nil).AnyTimes()
-		}
+		t.Run(fmt.Sprintf("OrchestrationMode_%v", orchMode), func(t *testing.T) {
+			expectedScaleSets := newTestVMSSList(3, testASG, "eastus", orchMode)
+			provider := newTestProvider(t)
+			mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
+			mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil)
+			provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
 
-		registered := provider.azureManager.RegisterNodeGroup(
+			mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
+			provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
+
+			mockVMClient := mockvmclient.NewMockInterface(ctrl)
+			provider.azureManager.azClient.virtualMachinesClient = mockVMClient
+
+			if orchMode == compute.Uniform {
+				mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup,
+					testASG, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
+			} else {
+				provider.azureManager.config.EnableVmssFlex = true
+				mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), testASG).Return(expectedVMs, nil).AnyTimes()
+			}
+
+			registered := provider.azureManager.RegisterNodeGroup(
 			newTestScaleSet(provider.azureManager, testASG))
-		provider.azureManager.explicitlyConfigured[testASG] = true
-		assert.True(t, registered)
-		assert.Equal(t, len(provider.NodeGroups()), 1)
-
-		node := newApiNode(orchMode, 0)
-		// refresh cache
-		err := provider.azureManager.forceRefresh()
-		assert.NoError(t, err)
-		group, err := provider.NodeGroupForNode(node)
-		assert.NoError(t, err)
-		assert.NotNil(t, group, "Group should not be nil")
-		assert.Equal(t, group.Id(), testASG)
-		assert.Equal(t, group.MinSize(), 1)
-		assert.Equal(t, group.MaxSize(), 5)
-
-		// test node in cluster that is not in a group managed by cluster autoscaler
-		nodeNotInGroup := &apiv1.Node{
-			Spec: apiv1.NodeSpec{
-				ProviderID: azurePrefix + "/subscriptions/subscripion/resourceGroups/test-resource-group/providers/" +
-					"Microsoft.Compute/virtualMachines/test-instance-id-not-in-group",
-			},
-		}
-		group, err = provider.NodeGroupForNode(nodeNotInGroup)
-		assert.NoError(t, err)
-		assert.Nil(t, group)
+			provider.azureManager.explicitlyConfigured[testASG] = true
+			assert.True(t, registered)
+			assert.Equal(t, len(provider.NodeGroups()), 1)
+
+			node := newApiNode(orchMode, 0)
+			// refresh cache
+			err := provider.azureManager.forceRefresh()
+			assert.NoError(t, err)
+			group, err := provider.NodeGroupForNode(node)
+			assert.NoError(t, err)
+			assert.NotNil(t, group, "Group should not be nil")
+			assert.Equal(t, group.Id(), testASG)
+			assert.Equal(t, group.MinSize(), 1)
+			assert.Equal(t, group.MaxSize(), 5)
+
+			hasInstance, err := provider.HasInstance(node)
+			assert.True(t, hasInstance)
+			assert.NoError(t, err)
+
+			// test node in cluster that is not in a group managed by cluster autoscaler
+			nodeNotInGroup := &apiv1.Node{
+				Spec: apiv1.NodeSpec{
+					ProviderID: "azure:///subscriptions/subscription/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachineScaleSets/test/virtualMachines/test-instance-id-not-in-group",
+				},
+			}
+			group, err = provider.NodeGroupForNode(nodeNotInGroup)
+			assert.NoError(t, err)
+			assert.Nil(t, group)
+
+			hasInstance, err = provider.HasInstance(nodeNotInGroup)
+			assert.False(t, hasInstance)
+			assert.Error(t, err)
+			assert.Equal(t, err, cloudprovider.ErrNotImplemented)
+		})
 	}
 }