Check min size of node group and resource limits for set of nodes #5502

Merged · 1 commit · Feb 14, 2023
63 changes: 44 additions & 19 deletions cluster-autoscaler/core/scaledown/unneeded/nodes.go
@@ -119,19 +119,24 @@ func (n *Nodes) Drop(node string) {
 // unneeded, but are not removable, annotated by reason.
 func (n *Nodes) RemovableAt(context *context.AutoscalingContext, ts time.Time, resourcesLeft resource.Limits, resourcesWithLimits []string, as scaledown.ActuationStatus) (empty, needDrain []simulator.NodeToBeRemoved, unremovable []*simulator.UnremovableNode) {
 	nodeGroupSize := utils.GetNodeGroupSizeMap(context.CloudProvider)
-	for nodeName, v := range n.byName {
-		klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
+	resourcesLeftCopy := resourcesLeft.DeepCopy()
+	emptyNodes, drainNodes := n.splitEmptyAndNonEmptyNodes()
 
-		if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeft, resourcesWithLimits, as); r != simulator.NoReason {
+	for nodeName, v := range emptyNodes {
+		klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
+		if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
 			unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
 			continue
 		}
-
-		if len(v.ntbr.PodsToReschedule) > 0 {
-			needDrain = append(needDrain, v.ntbr)
-		} else {
-			empty = append(empty, v.ntbr)
-		}
+		empty = append(empty, v.ntbr)
+	}
+	for nodeName, v := range drainNodes {
+		klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
+		if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
+			unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
+			continue
+		}
+		needDrain = append(needDrain, v.ntbr)
 	}
 	return
 }
@@ -177,25 +182,18 @@ func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node,
 		}
 	}
 
-	size, found := nodeGroupSize[nodeGroup.Id()]
-	if !found {
-		klog.Errorf("Error while checking node group size %s: group size not found in cache", nodeGroup.Id())
-		return simulator.UnexpectedError
-	}
 
-	deletionsInProgress := as.DeletionsCount(nodeGroup.Id())
-	if size-deletionsInProgress <= nodeGroup.MinSize() {
-		klog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
-		return simulator.NodeGroupMinSizeReached
+	if reason := verifyMinSize(node.Name, nodeGroup, nodeGroupSize, as); reason != simulator.NoReason {
+		return reason
 	}
 	nodeGroupSize[nodeGroup.Id()]--
 
 	resourceDelta, err := n.limitsFinder.DeltaForNode(context, node, nodeGroup, resourcesWithLimits)
 	if err != nil {
 		klog.Errorf("Error getting node resources: %v", err)
 		return simulator.UnexpectedError
 	}
 
-	checkResult := resourcesLeft.CheckDeltaWithinLimits(resourceDelta)
+	checkResult := resourcesLeft.TryDecrementBy(resourceDelta)
 	if checkResult.Exceeded() {
 		klog.V(4).Infof("Skipping %s - minimal limit exceeded for %v", node.Name, checkResult.ExceededResources)
 		for _, resource := range checkResult.ExceededResources {
@@ -213,3 +211,30 @@ func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node,
 
 	return simulator.NoReason
 }
+
+func (n *Nodes) splitEmptyAndNonEmptyNodes() (empty, needDrain map[string]*node) {
+	empty = make(map[string]*node)
+	needDrain = make(map[string]*node)
+	for name, v := range n.byName {
+		if len(v.ntbr.PodsToReschedule) > 0 {
+			needDrain[name] = v
+		} else {
+			empty[name] = v
+		}
+	}
+	return
+}
+
+func verifyMinSize(nodeName string, nodeGroup cloudprovider.NodeGroup, nodeGroupSize map[string]int, as scaledown.ActuationStatus) simulator.UnremovableReason {
+	size, found := nodeGroupSize[nodeGroup.Id()]
+	if !found {
+		klog.Errorf("Error while checking node group size %s: group size not found in cache", nodeGroup.Id())
+		return simulator.UnexpectedError
+	}
+	deletionsInProgress := as.DeletionsCount(nodeGroup.Id())
+	if size-deletionsInProgress <= nodeGroup.MinSize() {
+		klog.V(1).Infof("Skipping %s - node group min size reached", nodeName)
+		return simulator.NodeGroupMinSizeReached
+	}
+	return simulator.NoReason
+}
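
Note on the change above: RemovableAt now deep-copies the remaining resource limits once and spends that copy across the whole candidate set, walking empty nodes before drain nodes so the cheaper removals claim the budget first. TryDecrementBy (replacing CheckDeltaWithinLimits) mutates the running budget instead of re-checking every node against the original, untouched limits. A minimal sketch of that budget semantics, using a toy Limits type rather than the autoscaler's real resource.Limits:

package main

import "fmt"

// Limits is a toy stand-in for a per-resource budget that removals draw down.
type Limits map[string]int64

// TryDecrementBy subtracts delta from the budget and reports which resources
// would drop below zero. Unlike a pure check, it mutates the receiver, so
// each approved node shrinks what later nodes see.
func (l Limits) TryDecrementBy(delta map[string]int64) (exceeded []string) {
	for res, d := range delta {
		left, tracked := l[res]
		if !tracked {
			continue // no limit configured for this resource
		}
		if left < d {
			exceeded = append(exceeded, res)
			continue
		}
		l[res] = left - d
	}
	return exceeded
}

func main() {
	budget := Limits{"cpu": 4}
	// Two 3-CPU nodes: the first fits, the second exceeds the *remaining*
	// budget — the per-set behavior this PR introduces.
	fmt.Println(budget.TryDecrementBy(map[string]int64{"cpu": 3})) // []
	fmt.Println(budget.TryDecrementBy(map[string]int64{"cpu": 3})) // [cpu]
}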
127 changes: 127 additions & 0 deletions cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
@@ -17,11 +17,22 @@ limitations under the License.
 package unneeded
 
 import (
+	"fmt"
 	"testing"
 	"time"
 
+	apiv1 "k8s.io/api/core/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
+	testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
+	"k8s.io/autoscaler/cluster-autoscaler/config"
+	"k8s.io/autoscaler/cluster-autoscaler/context"
+	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
+	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
+	. "k8s.io/autoscaler/cluster-autoscaler/core/test"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
+	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
+	"k8s.io/client-go/kubernetes/fake"
 
 	"github.com/stretchr/testify/assert"
 )
@@ -115,3 +126,119 @@ func makeNode(name, version string) simulator.NodeToBeRemoved {
 func version(n simulator.NodeToBeRemoved) string {
 	return n.Node.Annotations[testVersion]
 }
+
+func TestRemovableAt(t *testing.T) {
+	testCases := []struct {
+		name                string
+		numEmpty            int
+		numDrain            int
+		minSize             int
+		targetSize          int
+		numOngoingDeletions int
+		numEmptyToRemove    int
+		numDrainToRemove    int
+	}{
+		{
+			name:                "Node group min size is not reached",
+			numEmpty:            3,
+			numDrain:            2,
+			minSize:             1,
+			targetSize:          10,
+			numOngoingDeletions: 2,
+			numEmptyToRemove:    3,
+			numDrainToRemove:    2,
+		},
+		{
+			name:                "Node group min size is reached for drain nodes",
+			numEmpty:            3,
+			numDrain:            5,
+			minSize:             1,
+			targetSize:          10,
+			numOngoingDeletions: 2,
+			numEmptyToRemove:    3,
+			numDrainToRemove:    4,
+		},
+		{
+			name:                "Node group min size is reached for empty and drain nodes",
+			numEmpty:            3,
+			numDrain:            5,
+			minSize:             1,
+			targetSize:          5,
+			numOngoingDeletions: 2,
+			numEmptyToRemove:    2,
+			numDrainToRemove:    0,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			ng := testprovider.NewTestNodeGroup("ng", 100, tc.minSize, tc.targetSize, true, false, "", nil, nil)
+			empty := []simulator.NodeToBeRemoved{}
+			for i := 0; i < tc.numEmpty; i++ {
+				empty = append(empty, simulator.NodeToBeRemoved{
+					Node: BuildTestNode(fmt.Sprintf("empty-%d", i), 10, 100),
+				})
+			}
+			drain := []simulator.NodeToBeRemoved{}
+			for i := 0; i < tc.numDrain; i++ {
+				drain = append(drain, simulator.NodeToBeRemoved{
+					Node:             BuildTestNode(fmt.Sprintf("drain-%d", i), 10, 100),
+					PodsToReschedule: []*apiv1.Pod{BuildTestPod(fmt.Sprintf("pod-%d", i), 1, 1)},
+				})
+			}
+
+			nodes := append(empty, drain...)
+			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.InsertNodeGroup(ng)
+			for _, node := range nodes {
+				provider.AddNode("ng", node.Node)
+			}
+
+			as := &fakeActuationStatus{deletionCount: map[string]int{"ng": tc.numOngoingDeletions}}
+
+			rsLister, err := kube_util.NewTestReplicaSetLister(nil)
+			assert.NoError(t, err)
+			registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
+			ctx, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{ScaleDownSimulationTimeout: 5 * time.Minute}, &fake.Clientset{}, registry, provider, nil, nil)
+			assert.NoError(t, err)
+
+			n := NewNodes(&fakeScaleDownTimeGetter{}, &resource.LimitsFinder{})
+			n.Update(nodes, time.Now())
+			gotEmptyToRemove, gotDrainToRemove, _ := n.RemovableAt(&ctx, time.Now(), resource.Limits{}, []string{}, as)
+			if len(gotDrainToRemove) != tc.numDrainToRemove || len(gotEmptyToRemove) != tc.numEmptyToRemove {
+				t.Errorf("%s: RemovableAt() returned %d empty, %d drain nodes to remove, want %d, %d", tc.name, len(gotEmptyToRemove), len(gotDrainToRemove), tc.numEmptyToRemove, tc.numDrainToRemove)
+			}
+		})
+	}
+}
+
+type fakeActuationStatus struct {
+	recentEvictions []*apiv1.Pod
+	deletionCount   map[string]int
+}
+
+func (f *fakeActuationStatus) RecentEvictions() []*apiv1.Pod {
+	return f.recentEvictions
+}
+
+func (f *fakeActuationStatus) DeletionsInProgress() ([]string, []string) {
+	return nil, nil
+}
+
+func (f *fakeActuationStatus) DeletionResults() (map[string]status.NodeDeleteResult, time.Time) {
+	return nil, time.Time{}
+}
+
+func (f *fakeActuationStatus) DeletionsCount(nodeGroup string) int {
+	return f.deletionCount[nodeGroup]
+}
+
+type fakeScaleDownTimeGetter struct{}
+
+func (f *fakeScaleDownTimeGetter) GetScaleDownUnneededTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
+	return 0 * time.Second, nil
+}
+
+func (f *fakeScaleDownTimeGetter) GetScaleDownUnreadyTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
+	return 0 * time.Second, nil
+}
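
To see why the third case expects 2 empty and 0 drain removals: the group's cached size starts at the target size of 5, two deletions are already in flight, and each approved removal decrements the cached size, so size - deletionsInProgress hits the min size of 1 after two approvals; empty nodes are processed first and take both slots. A toy walkthrough of that bookkeeping (hypothetical helper, not part of the PR):

package main

import "fmt"

// removableCount mirrors verifyMinSize plus the caller's
// nodeGroupSize[...]-- bookkeeping: approvals stop once the group,
// net of in-flight deletions, would shrink below its minimum.
func removableCount(size, deletionsInProgress, minSize, candidates int) int {
	approved := 0
	for i := 0; i < candidates; i++ {
		if size-deletionsInProgress <= minSize {
			break // NodeGroupMinSizeReached for the remaining candidates
		}
		size-- // the caller decrements the cached group size
		approved++
	}
	return approved
}

func main() {
	// Third test case: targetSize 5, 2 ongoing deletions, minSize 1,
	// 8 candidates (3 empty + 5 drain) -> only 2 removals are approved.
	fmt.Println(removableCount(5, 2, 1, 8)) // 2
}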