From 89e844771999f272b5d190d59707775d49b426f8 Mon Sep 17 00:00:00 2001
From: Julien Balestra
Date: Fri, 27 Mar 2020 11:51:34 +0100
Subject: [PATCH] cluster-autoscaler: override node scale-down times by annotation

Signed-off-by: Julien Balestra
---
 cluster-autoscaler/FAQ.md             | 17 +++++++++++++++--
 cluster-autoscaler/core/scale_down.go | 22 ++++++++++++++++++++--
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index c498a474c051..f5b8f27e8cb2 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -307,6 +307,14 @@ It can be added to (or removed from) a node using kubectl:
 kubectl annotate node <nodename> cluster-autoscaler.kubernetes.io/scale-down-disabled=true
 ```
 
+Note that you can also annotate any node with the following annotations to customise its scale-down times:
+```yaml
+metadata:
+  annotations:
+    cluster-autoscaler.kubernetes.io/scale-down-unneeded-time: "1h"
+    cluster-autoscaler.kubernetes.io/scale-down-unready-time: "2h"
+```
+
 ### How can I configure overprovisioning with Cluster Autoscaler?
 
 Below solution works since version 1.1 (to be shipped with Kubernetes 1.9).
@@ -493,7 +501,9 @@ the scheduler will place the pods somewhere else.
 * It doesn't have scale-down disabled annotation (see [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node))
 
 If a node is unneeded for more than 10 minutes, it will be terminated. (This time can
-be configured by flags - please see [I have a couple of nodes with low utilization, but they are not scaled down. Why?](#i-have-a-couple-of-nodes-with-low-utilization-but-they-are-not-scaled-down-why) section for a more detailed explanation.)
+be configured by flags - please see [I have a couple of nodes with low utilization, but they are not scaled down. Why?](#i-have-a-couple-of-nodes-with-low-utilization-but-they-are-not-scaled-down-why) section for a more detailed explanation.)
+You can override this global value per node with the annotation `cluster-autoscaler.kubernetes.io/scale-down-unneeded-time: 60m`.
+
 Cluster Autoscaler terminates one non-empty node at a time to reduce the risk of
 creating new unschedulable pods. The next node may possibly be terminated just
 after the first one, if it was also unneeded for more than 10 min and didn't rely on the same nodes
@@ -539,6 +549,9 @@ CA stops all operations until the situation improves. If there are fewer unready
 but they are concentrated in a particular node group, then this node group may be
 excluded from future scale-ups.
 
+Cluster Autoscaler scales down NotReady nodes once they have been unready for longer than the configured `--scale-down-unready-time`, provided scale-down is not in cooldown.
+This time can also be set per node with the annotation `cluster-autoscaler.kubernetes.io/scale-down-unready-time: "2h"`.
+
 ### How fast is Cluster Autoscaler?
 
 By default, scale-up is considered up to 10 seconds after pod is marked as unschedulable, and scale-down 10 minutes after a node becomes unneeded.
@@ -692,7 +705,7 @@ CA doesn't remove underutilized nodes if they are running pods [that it shouldn'
 * node has the scale-down disabled annotation (see [How can I prevent Cluster Autoscaler from scaling down a particular node?](#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node))
 
 * node was unneeded for less than 10 minutes (configurable by
-  `--scale-down-unneeded-time` flag),
+  `--scale-down-unneeded-time` flag or the node annotation `cluster-autoscaler.kubernetes.io/scale-down-unneeded-time: 1h`),
 
 * there was a scale-up in the last 10 min (configurable by
   `--scale-down-delay-after-add` flag),
diff --git a/cluster-autoscaler/core/scale_down.go b/cluster-autoscaler/core/scale_down.go
index 2df6cb072c62..b4df7764baaa 100644
--- a/cluster-autoscaler/core/scale_down.go
+++ b/cluster-autoscaler/core/scale_down.go
@@ -54,6 +54,10 @@ import (
 const (
 	// ScaleDownDisabledKey is the name of annotation marking node as not eligible for scale down.
 	ScaleDownDisabledKey = "cluster-autoscaler.kubernetes.io/scale-down-disabled"
+	// ScaleDownUnneededTimeKey is the name of the annotation used to override the cluster-wide --scale-down-unneeded-time for a node.
+	ScaleDownUnneededTimeKey = "cluster-autoscaler.kubernetes.io/scale-down-unneeded-time"
+	// ScaleDownUnreadyTimeKey is the name of the annotation used to override the cluster-wide --scale-down-unready-time for a node.
+	ScaleDownUnreadyTimeKey = "cluster-autoscaler.kubernetes.io/scale-down-unready-time"
 	// DelayDeletionAnnotationPrefix is the prefix of annotation marking node as it needs to wait
 	// for other K8s components before deleting node.
 	DelayDeletionAnnotationPrefix = "delay-deletion.cluster-autoscaler.kubernetes.io/"
@@ -746,6 +750,20 @@ func (sd *ScaleDown) SoftTaintUnneededNodes(allNodes []*apiv1.Node) (errors []er
 	return
 }
 
+func getNodeScaleDownTime(annotationKey string, clusterTime time.Duration, node *apiv1.Node) time.Duration {
+	annotationTimeValue, ok := node.Annotations[annotationKey]
+	if !ok {
+		return clusterTime
+	}
+	d, err := time.ParseDuration(annotationTimeValue)
+	if err != nil {
+		klog.Warningf("Failed to parse node %s annotation %s:%s: %v, using cluster time: %s", node.Name, annotationKey, annotationTimeValue, err, clusterTime.String())
+		return clusterTime
+	}
+	klog.V(4).Infof("Node %s overrides cluster-wide scale-down time via annotation %s: %s", node.Name, annotationKey, d.String())
+	return d
+}
+
 // TryToScaleDown tries to scale down the cluster. It returns a result inside a ScaleDownStatus indicating if any node was
 // removed and error if such occurred.
 func (sd *ScaleDown) TryToScaleDown(
@@ -808,13 +826,13 @@ func (sd *ScaleDown) TryToScaleDown(
 		readinessMap[node.Name] = ready
 
 		// Check how long a ready node was underutilized.
-		if ready && !unneededSince.Add(sd.context.ScaleDownUnneededTime).Before(currentTime) {
+		if ready && !unneededSince.Add(getNodeScaleDownTime(ScaleDownUnneededTimeKey, sd.context.ScaleDownUnneededTime, node)).Before(currentTime) {
 			sd.addUnremovableNodeReason(node, simulator.NotUnneededLongEnough)
 			continue
 		}
 
 		// Unready nodes may be deleted after a different time than underutilized nodes.
-		if !ready && !unneededSince.Add(sd.context.ScaleDownUnreadyTime).Before(currentTime) {
+		if !ready && !unneededSince.Add(getNodeScaleDownTime(ScaleDownUnreadyTimeKey, sd.context.ScaleDownUnreadyTime, node)).Before(currentTime) {
 			sd.addUnremovableNodeReason(node, simulator.NotUnreadyLongEnough)
 			continue
 		}
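
As a minimal illustration of the behaviour the new helper is expected to have, the following hypothetical Go test (not part of the patch above; the test name and cases are illustrative) exercises `getNodeScaleDownTime` for the three paths: missing annotation, valid per-node override, and unparsable value. It assumes the function and the annotation keys live in the same `core` package as scale_down.go:

```go
package core

import (
	"testing"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Hypothetical test, not part of the patch: checks the three code paths of
// getNodeScaleDownTime (missing annotation, valid override, unparsable value).
func TestGetNodeScaleDownTime(t *testing.T) {
	clusterWide := 10 * time.Minute

	nodeWith := func(annotations map[string]string) *apiv1.Node {
		return &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node", Annotations: annotations}}
	}

	// No annotation: the cluster-wide value is returned unchanged.
	if got := getNodeScaleDownTime(ScaleDownUnneededTimeKey, clusterWide, nodeWith(nil)); got != clusterWide {
		t.Errorf("expected cluster-wide value %s, got %s", clusterWide, got)
	}

	// Valid annotation: the per-node duration takes precedence.
	override := map[string]string{ScaleDownUnneededTimeKey: "1h"}
	if got := getNodeScaleDownTime(ScaleDownUnneededTimeKey, clusterWide, nodeWith(override)); got != time.Hour {
		t.Errorf("expected 1h override, got %s", got)
	}

	// Unparsable annotation: falls back to the cluster-wide value.
	broken := map[string]string{ScaleDownUnneededTimeKey: "soon"}
	if got := getNodeScaleDownTime(ScaleDownUnneededTimeKey, clusterWide, nodeWith(broken)); got != clusterWide {
		t.Errorf("expected fallback to cluster-wide value %s, got %s", clusterWide, got)
	}
}
```

In practice the override is applied by annotating the node, e.g. `kubectl annotate node <nodename> cluster-autoscaler.kubernetes.io/scale-down-unneeded-time=1h`, as described in the FAQ changes above.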