Skip to content

Commit

Permalink
Merge pull request #4372 from marwanad/cherry-pick-4130-1.21
Browse files Browse the repository at this point in the history
Cherry-pick #4130 onto 1.21: dont proactively decrement azure cache for unregistered nodes
  • Loading branch information
k8s-ci-robot authored Oct 1, 2021
2 parents 38dd2fc + 836506e commit 7c4fb20
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 8 deletions.
17 changes: 12 additions & 5 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ func (scaleSet *ScaleSet) Belongs(node *apiv1.Node) (bool, error) {
}

// DeleteInstances deletes the given instances. All instances must be controlled by the same ASG.
func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error {
func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregisteredNodes bool) error {
if len(instances) == 0 {
return nil
}
Expand Down Expand Up @@ -405,9 +405,12 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef) error {

// Proactively decrement scale set size so that we don't
// go below minimum node count if cache data is stale
scaleSet.sizeMutex.Lock()
scaleSet.curSize -= int64(len(instanceIDs))
scaleSet.sizeMutex.Unlock()
// only do it for non-unregistered nodes
if !hasUnregisteredNodes {
scaleSet.sizeMutex.Lock()
scaleSet.curSize -= int64(len(instanceIDs))
scaleSet.sizeMutex.Unlock()
}

// Proactively set the status of the instances to be deleted in cache
for _, instance := range instancesToDelete {
Expand All @@ -432,6 +435,7 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error {
}

refs := make([]*azureRef, 0, len(nodes))
hasUnregisteredNodes := false
for _, node := range nodes {
belongs, err := scaleSet.Belongs(node)
if err != nil {
Expand All @@ -442,13 +446,16 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error {
return fmt.Errorf("%s belongs to a different asg than %s", node.Name, scaleSet.Id())
}

if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered {
hasUnregisteredNodes = true
}
ref := &azureRef{
Name: node.Spec.ProviderID,
}
refs = append(refs, ref)
}

return scaleSet.DeleteInstances(refs)
return scaleSet.DeleteInstances(refs, hasUnregisteredNodes)
}

// Id returns ScaleSet id.
Expand Down
77 changes: 77 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"fmt"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"net/http"
"testing"
"time"
Expand Down Expand Up @@ -346,6 +347,82 @@ func TestDeleteNodes(t *testing.T) {
assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting)
}

func TestDeleteNodeUnregistered(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

manager := newTestAzureManager(t)
vmssName := "test-asg"
var vmssCapacity int64 = 2

expectedScaleSets := []compute.VirtualMachineScaleSet{
{
Name: &vmssName,
Sku: &compute.Sku{
Capacity: &vmssCapacity,
},
},
}
expectedVMSSVMs := newTestVMSSVMList(2)

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2)
mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any()).Return(nil, nil)
mockVMSSClient.EXPECT().WaitForAsyncOperationResult(gomock.Any(), gomock.Any()).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
err := manager.forceRefresh()
assert.NoError(t, err)

resourceLimiter := cloudprovider.NewResourceLimiter(
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000})
provider, err := BuildAzureCloudProvider(manager, resourceLimiter)
assert.NoError(t, err)

registered := manager.RegisterNodeGroup(
newTestScaleSet(manager, "test-asg"))
manager.explicitlyConfigured["test-asg"] = true
assert.True(t, registered)
err = manager.forceRefresh()
assert.NoError(t, err)

scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet)
assert.True(t, ok)

targetSize, err := scaleSet.TargetSize()
assert.NoError(t, err)
assert.Equal(t, 2, targetSize)

// annotate node with unregistered annotation
annotations := make(map[string]string)
annotations[cloudprovider.FakeNodeReasonAnnotation] = cloudprovider.FakeNodeUnregistered
nodesToDelete := []*apiv1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Annotations: annotations,
},
Spec: apiv1.NodeSpec{
ProviderID: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0),
},
},
}
err = scaleSet.DeleteNodes(nodesToDelete)
assert.NoError(t, err)

// Ensure the the cached size has NOT been proactively decremented
targetSize, err = scaleSet.TargetSize()
assert.NoError(t, err)
assert.Equal(t, 2, targetSize)

// Ensure that the status for the instances is Deleting
instance0, found := scaleSet.getInstanceByProviderID("azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0))
assert.True(t, found, true)
assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting)
}

func TestDeleteNoConflictRequest(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
Expand Down
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,16 @@ func (c InstanceErrorClass) String() string {
}
}

const (
// FakeNodeReasonAnnotation is an annotation added to the fake placeholder nodes CA has created
// Note that this don't map to real nodes in k8s and are merely used for error handling
FakeNodeReasonAnnotation = "k8s.io/cluster-autoscaler/fake-node-reason"
// FakeNodeUnregistered represents a node that is identified by CA as unregistered
FakeNodeUnregistered = "unregistered"
// FakeNodeCreateError represents a node that is identified by CA as a created node with errors
FakeNodeCreateError = "create-error"
)

// PricingModel contains information about the node price and how it changes in time.
type PricingModel interface {
// NodePrice returns a price of running the given node for a given period of time.
Expand Down
9 changes: 6 additions & 3 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ func getNotRegisteredNodes(allNodes []*apiv1.Node, cloudProviderNodeInstances ma
for _, instance := range instances {
if !registered.Has(instance.Id) {
notRegistered = append(notRegistered, UnregisteredNode{
Node: fakeNode(instance),
Node: fakeNode(instance, cloudprovider.FakeNodeUnregistered),
UnregisteredSince: time,
})
}
Expand Down Expand Up @@ -1091,7 +1091,7 @@ func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node {
_, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
for _, instances := range instancesByErrorCode {
for _, instance := range instances {
nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance))
nodesWithCreateErrors = append(nodesWithCreateErrors, fakeNode(instance, cloudprovider.FakeNodeCreateError))
}
}
}
Expand All @@ -1108,10 +1108,13 @@ func (csr *ClusterStateRegistry) InvalidateNodeInstancesCacheEntry(nodeGroup clo
csr.cloudProviderNodeInstancesCache.InvalidateCacheEntry(nodeGroup)
}

func fakeNode(instance cloudprovider.Instance) *apiv1.Node {
func fakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node {
return &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: instance.Id,
Annotations: map[string]string{
cloudprovider.FakeNodeReasonAnnotation: reason,
},
},
Spec: apiv1.NodeSpec{
ProviderID: instance.Id,
Expand Down

0 comments on commit 7c4fb20

Please sign in to comment.