Skip to content

Commit

Permalink
cherry-pick kubernetes#4162 and kubernetes#4172 [cluster-autoscaler]A…
Browse files Browse the repository at this point in the history
…dd flag to control DaemonSet eviction on non-empty nodes & Allow DaemonSet pods to opt in/out

from eviction.
  • Loading branch information
Shubham82 committed May 25, 2022
1 parent 0ad1731 commit fcd0433
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 31 deletions.
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ type AutoscalingOptions struct {
CordonNodeBeforeTerminate bool
// DaemonSetEvictionForEmptyNodes is whether CA will gracefully terminate DaemonSet pods from empty nodes.
DaemonSetEvictionForEmptyNodes bool
// DaemonSetEvictionForOccupiedNodes is whether CA will gracefully terminate DaemonSet pods from non-empty nodes.
DaemonSetEvictionForOccupiedNodes bool
// User agent to use for HTTP calls.
UserAgent string
}
15 changes: 9 additions & 6 deletions cluster-autoscaler/core/scale_down.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/processors/customresources"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/utils"
"k8s.io/autoscaler/cluster-autoscaler/utils/daemonset"
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
Expand Down Expand Up @@ -1114,7 +1115,7 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
return deletedNodes, errors.ToAutoscalerError(errors.ApiCallError, taintErr)
}
deletedNodes = append(deletedNodes, node)
go func(nodeToDelete *apiv1.Node, nodeGroupForDeletedNode cloudprovider.NodeGroup, needToEvictDaemonSetPods bool) {
go func(nodeToDelete *apiv1.Node, nodeGroupForDeletedNode cloudprovider.NodeGroup, evictByDefault bool) {
sd.nodeDeletionTracker.StartDeletion(nodeGroupForDeletedNode.Id())
defer sd.nodeDeletionTracker.EndDeletion(nodeGroupForDeletedNode.Id())
var result status.NodeDeleteResult
Expand All @@ -1130,10 +1131,8 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
sd.context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDownEmpty", "Scale-down: empty node %s removed", nodeToDelete.Name)
}
}()
if needToEvictDaemonSetPods {
if err := evictDaemonSetPods(sd.context.ClusterSnapshot, nodeToDelete, client, sd.context.MaxGracefulTerminationSec, time.Now(), DaemonSetEvictionEmptyNodeTimeout, DeamonSetTimeBetweenEvictionRetries, recorder); err != nil {
klog.Warningf("error while evicting DS pods from an empty node: %v", err)
}
if err := evictDaemonSetPods(sd.context.ClusterSnapshot, nodeToDelete, client, sd.context.MaxGracefulTerminationSec, time.Now(), DaemonSetEvictionEmptyNodeTimeout, DeamonSetTimeBetweenEvictionRetries, recorder, evictByDefault); err != nil {
klog.Warningf("error while evicting DS pods from an empty node: %v", err)
}
deleteErr = waitForDelayDeletion(nodeToDelete, sd.context.ListerRegistry.AllNodeLister(), sd.context.AutoscalingOptions.NodeDeletionDelayTimeout)
if deleteErr != nil {
Expand Down Expand Up @@ -1161,7 +1160,7 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k

// Create eviction object for all DaemonSet pods on the node
func evictDaemonSetPods(clusterSnapshot simulator.ClusterSnapshot, nodeToDelete *apiv1.Node, client kube_client.Interface, maxGracefulTerminationSec int, timeNow time.Time, dsEvictionTimeout time.Duration, waitBetweenRetries time.Duration,
recorder kube_record.EventRecorder) error {
recorder kube_record.EventRecorder, evictByDefault bool) error {
nodeInfo, err := clusterSnapshot.NodeInfos().Get(nodeToDelete.Name)
if err != nil {
return fmt.Errorf("failed to get node info for %s", nodeToDelete.Name)
Expand All @@ -1171,6 +1170,8 @@ func evictDaemonSetPods(clusterSnapshot simulator.ClusterSnapshot, nodeToDelete
return fmt.Errorf("failed to get DaemonSet pods for %s (error: %v)", nodeToDelete.Name, err)
}

daemonSetPods = daemonset.PodsToEvict(daemonSetPods, evictByDefault)

dsEviction := make(chan status.PodEvictionResult, len(daemonSetPods))

// Perform eviction of DaemonSet pods
Expand Down Expand Up @@ -1226,6 +1227,8 @@ func (sd *ScaleDown) deleteNode(node *apiv1.Node, pods []*apiv1.Pod, daemonSetPo

sd.context.Recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "marked the node as toBeDeleted/unschedulable")

daemonSetPods = daemonset.PodsToEvict(daemonSetPods, sd.context.DaemonSetEvictionForOccupiedNodes)

// attempt drain
evictionResults, err := drainNode(node, pods, daemonSetPods, sd.context.ClientSet, sd.context.Recorder, sd.context.MaxGracefulTerminationSec, MaxPodEvictionTime, EvictionRetryTime, PodEvictionHeadroom)
if err != nil {
Expand Down
85 changes: 60 additions & 25 deletions cluster-autoscaler/core/scale_down_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/utils/daemonset"
"k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
Expand Down Expand Up @@ -1217,33 +1218,35 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) {
dsEvictionTimeout time.Duration
evictionSuccess bool
err error
evictByDefault bool
extraAnnotationValue map[string]string
expectNotEvicted map[string]struct{}
}{
{
name: "Successful attempt to evict DaemonSet pods",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
evictionTimeoutExceed: false,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
err: nil,
name: "Successful attempt to evict DaemonSet pods",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
evictByDefault: true,
},
{
name: "Failed to get node info",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: false,
evictionTimeoutExceed: false,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
err: fmt.Errorf("failed to get node info"),
name: "Failed to get node info",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: false,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
err: fmt.Errorf("failed to get node info"),
evictByDefault: true,
},
{
name: "Failed to create DaemonSet eviction",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
evictionTimeoutExceed: false,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: false,
err: fmt.Errorf("following DaemonSet pod failed to evict on the"),
name: "Failed to create DaemonSet eviction",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: false,
err: fmt.Errorf("following DaemonSet pod failed to evict on the"),
evictByDefault: true,
},
{
name: "Eviction timeout exceed",
Expand All @@ -1253,11 +1256,33 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) {
dsEvictionTimeout: 100 * time.Millisecond,
evictionSuccess: true,
err: fmt.Errorf("failed to create DaemonSet eviction for"),
evictByDefault: true,
},
{
name: "Evict single pod due to annotation",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
extraAnnotationValue: map[string]string{"d1": "true"},
expectNotEvicted: map[string]struct{}{"d2": {}},
},
{
name: "Don't evict single pod due to annotation",
dsPods: []string{"d1", "d2"},
nodeInfoSuccess: true,
dsEvictionTimeout: 5000 * time.Millisecond,
evictionSuccess: true,
evictByDefault: true,
extraAnnotationValue: map[string]string{"d1": "false"},
expectNotEvicted: map[string]struct{}{"d1": {}},
},
}

for _, scenario := range testScenarios {
scenario := scenario
t.Run(scenario.name, func(t *testing.T) {
t.Parallel()
options := config.AutoscalingOptions{
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.5,
Expand All @@ -1277,6 +1302,9 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) {
ds := BuildTestPod(dsName, 100, 0)
ds.Spec.NodeName = "n1"
ds.OwnerReferences = GenerateOwnerReferences("", "DaemonSet", "", "")
if v, ok := scenario.extraAnnotationValue[dsName]; ok {
ds.Annotations[daemonset.EnableDsEvictionKey] = v
}
dsPods[i] = ds
}

Expand Down Expand Up @@ -1312,18 +1340,25 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) {
simulator.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, []*apiv1.Node{}, []*apiv1.Pod{})
}

err = evictDaemonSetPods(context.ClusterSnapshot, n1, fakeClient, options.MaxGracefulTerminationSec, timeNow, scenario.dsEvictionTimeout, waitBetweenRetries, kube_util.CreateEventRecorder(fakeClient))
err = evictDaemonSetPods(context.ClusterSnapshot, n1, fakeClient, options.MaxGracefulTerminationSec, timeNow, scenario.dsEvictionTimeout, waitBetweenRetries, kube_util.CreateEventRecorder(fakeClient), scenario.evictByDefault)
if scenario.err != nil {
assert.NotNil(t, err)
assert.Contains(t, err.Error(), scenario.err.Error())
return
}
assert.Nil(t, err)
deleted := make([]string, len(scenario.dsPods))
for i := 0; i < len(scenario.dsPods); i++ {
var expectEvicted []string
for _, p := range scenario.dsPods {
if _, found := scenario.expectNotEvicted[p]; found {
continue
}
expectEvicted = append(expectEvicted, p)
}
deleted := make([]string, len(expectEvicted))
for i := 0; i < len(expectEvicted); i++ {
deleted[i] = utils.GetStringFromChan(deletedPods)
}
assert.ElementsMatch(t, deleted, scenario.dsPods)
assert.ElementsMatch(t, deleted, expectEvicted)
})
}
}
Expand Down
2 changes: 2 additions & 0 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ var (
clusterAPICloudConfigAuthoritative = flag.Bool("clusterapi-cloud-config-authoritative", false, "Treat the cloud-config flag authoritatively (do not fallback to using kubeconfig flag). ClusterAPI only")
cordonNodeBeforeTerminate = flag.Bool("cordon-node-before-terminating", false, "Should CA cordon nodes before terminating during downscale process")
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")

emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
Expand Down Expand Up @@ -257,6 +258,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative,
CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate,
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
UserAgent: *userAgent,
}
}
Expand Down
20 changes: 20 additions & 0 deletions cluster-autoscaler/utils/daemonset/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ import (
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
// EnableDsEvictionKey is the name of annotation controlling whether a
// certain DaemonSet pod should be evicted.
EnableDsEvictionKey = "cluster-autoscaler.kubernetes.io/enable-ds-eviction"
)

// GetDaemonSetPodsForNode returns daemonset nodes for the given pod.
func GetDaemonSetPodsForNode(nodeInfo *schedulerframework.NodeInfo, daemonsets []*appsv1.DaemonSet, predicateChecker simulator.PredicateChecker) ([]*apiv1.Pod, error) {
result := make([]*apiv1.Pod, 0)
Expand Down Expand Up @@ -66,3 +72,17 @@ func newPod(ds *appsv1.DaemonSet, nodeName string) *apiv1.Pod {
newPod.Spec.NodeName = nodeName
return newPod
}

// PodsToEvict returns a list of DaemonSet pods that should be evicted during scale down.
func PodsToEvict(pods []*apiv1.Pod, evictByDefault bool) (evictable []*apiv1.Pod) {
for _, pod := range pods {
if a, ok := pod.Annotations[EnableDsEvictionKey]; ok {
if a == "true" {
evictable = append(evictable, pod)
}
} else if evictByDefault {
evictable = append(evictable, pod)
}
}
return
}
70 changes: 70 additions & 0 deletions cluster-autoscaler/utils/daemonset/daemonset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,76 @@ func TestGetDaemonSetPodsForNode(t *testing.T) {
}
}

func TestEvictedPodsFilter(t *testing.T) {
testCases := []struct {
name string
pods map[string]string
evictionDefault bool
expectedPods []string
}{
{
name: "all pods evicted by default",
pods: map[string]string{
"p1": "",
"p2": "",
"p3": "",
},
evictionDefault: true,
expectedPods: []string{"p1", "p2", "p3"},
},
{
name: "no pods evicted by default",
pods: map[string]string{
"p1": "",
"p2": "",
"p3": "",
},
evictionDefault: false,
expectedPods: []string{},
},
{
name: "all pods evicted by default, one opt-out",
pods: map[string]string{
"p1": "",
"p2": "false",
"p3": "",
},
evictionDefault: true,
expectedPods: []string{"p1", "p3"},
},
{
name: "no pods evicted by default, one opt-in",
pods: map[string]string{
"p1": "",
"p2": "true",
"p3": "",
},
evictionDefault: false,
expectedPods: []string{"p2"},
},
}
for _, tc := range testCases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
var dsPods []*apiv1.Pod
for n, av := range tc.pods {
p := BuildTestPod(n, 100, 0)
if av != "" {
p.Annotations[EnableDsEvictionKey] = av
}
dsPods = append(dsPods, p)
}
pte := PodsToEvict(dsPods, tc.evictionDefault)
got := make([]string, len(pte))
for i, p := range pte {
got[i] = p.Name
}
assert.ElementsMatch(t, got, tc.expectedPods)
})
}
}

func newDaemonSet(name string) *appsv1.DaemonSet {
return &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Expand Down

0 comments on commit fcd0433

Please sign in to comment.