From f87c1da2cb5e6d72c8e3020b98d366b6dcd1645a Mon Sep 17 00:00:00 2001
From: Aleksandra Malinowska <aleksandram@google.com>
Date: Fri, 8 Dec 2017 12:54:30 +0100
Subject: [PATCH 1/5] Use allocatable instead of capacity to calculate node
 utilization

---
 cluster-autoscaler/FAQ.md               | 3 ++-
 cluster-autoscaler/simulator/cluster.go | 9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index e7aab6d3e547..bf20537576cd 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -351,7 +351,8 @@ Every 10 seconds (configurable by `--scan-interval` flag), if no scale-up is
 needed, Cluster Autoscaler checks which nodes are unneeded. A node is considered for removal when:
 
 * The sum of cpu and memory requests of all pods running on this node is smaller
-  than 50% of the node's capacity. Utilization threshold can be configured using
+  than 50% of the node's allocatable. (Before 1.1.0, node capacity was used
+  instead of allocatable.) Utilization threshold can be configured using
   `--scale-down-utilization-threshold` flag.
 
 * All pods running on the node (except these that run on all nodes by default, like manifest-run pods
diff --git a/cluster-autoscaler/simulator/cluster.go b/cluster-autoscaler/simulator/cluster.go
index e1f3c13862f0..f49c426af354 100644
--- a/cluster-autoscaler/simulator/cluster.go
+++ b/cluster-autoscaler/simulator/cluster.go
@@ -137,7 +137,8 @@ func FindEmptyNodesToRemove(candidates []*apiv1.Node, pods []*apiv1.Pod) []*apiv
 	return result
 }
 
-// CalculateUtilization calculates utilization of a node, defined as total amount of requested resources divided by capacity.
+// CalculateUtilization calculates utilization of a node, defined as maximum of (cpu, memory) utilization.
+// Per resource utilization is the sum of requests for it divided by allocatable.
 func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo) (float64, error) {
 	cpu, err := calculateUtilizationOfResource(node, nodeInfo, apiv1.ResourceCPU)
 	if err != nil {
@@ -151,11 +152,11 @@ func CalculateUtilization(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo) (
 }
 
 func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulercache.NodeInfo, resourceName apiv1.ResourceName) (float64, error) {
-	nodeCapacity, found := node.Status.Capacity[resourceName]
+	nodeAllocatable, found := node.Status.Allocatable[resourceName]
 	if !found {
 		return 0, fmt.Errorf("Failed to get %v from %s", resourceName, node.Name)
 	}
-	if nodeCapacity.MilliValue() == 0 {
+	if nodeAllocatable.MilliValue() == 0 {
 		return 0, fmt.Errorf("%v is 0 at %s", resourceName, node.Name)
 	}
 	podsRequest := resource.MustParse("0")
@@ -166,7 +167,7 @@ func calculateUtilizationOfResource(node *apiv1.Node, nodeInfo *schedulercache.N
 			}
 		}
 	}
-	return float64(podsRequest.MilliValue()) / float64(nodeCapacity.MilliValue()), nil
+	return float64(podsRequest.MilliValue()) / float64(nodeAllocatable.MilliValue()), nil
 }
 
 // TODO: We don't need to pass list of nodes here as they are already available in nodeInfos.

From c7a8bee5895ca7afa9692901c58461b307f32fad Mon Sep 17 00:00:00 2001
From: Tim Hockin <thockin@google.com>
Date: Fri, 22 Dec 2017 09:55:16 -0800
Subject: [PATCH 2/5] Convert registry to k8s.gcr.io

---
 addon-resizer/Makefile                           | 2 +-
 addon-resizer/README.md                          | 2 +-
 cluster-autoscaler/cloudprovider/aws/README.md   | 6 +++---
 cluster-autoscaler/cloudprovider/azure/README.md | 4 ++--
 vertical-pod-autoscaler/initializer/Dockerfile   | 2 +-
 vertical-pod-autoscaler/updater/Dockerfile       | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/addon-resizer/Makefile b/addon-resizer/Makefile
index c1efa43e1010..a6b4df93941c 100644
--- a/addon-resizer/Makefile
+++ b/addon-resizer/Makefile
@@ -22,7 +22,7 @@ ARCH ?= amd64
 
 GOARM=7
 GOLANG_VERSION = 1.8.3
-REGISTRY = gcr.io/google_containers
+REGISTRY = k8s.gcr.io
 IMGNAME = addon-resizer
 IMAGE = $(REGISTRY)/$(IMGNAME)
 MULTI_ARCH_IMG = $(IMAGE)-$(ARCH)
diff --git a/addon-resizer/README.md b/addon-resizer/README.md
index e3dae72f0538..d27a539cbe7f 100644
--- a/addon-resizer/README.md
+++ b/addon-resizer/README.md
@@ -55,7 +55,7 @@ spec:
         kubernetes.io/cluster-service: "true"
     spec:
       containers:
-        - image: gcr.io/google_containers/addon-resizer:1.0
+        - image: k8s.gcr.io/addon-resizer:1.0
           imagePullPolicy: Always
           name: pod-nanny
           resources:
diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md
index 57549f530ad0..3da8550aff20 100644
--- a/cluster-autoscaler/cloudprovider/aws/README.md
+++ b/cluster-autoscaler/cloudprovider/aws/README.md
@@ -72,7 +72,7 @@ spec:
         app: cluster-autoscaler
     spec:
       containers:
-        - image: gcr.io/google_containers/cluster-autoscaler:v0.6.0
+        - image: k8s.gcr.io/cluster-autoscaler:v0.6.0
           name: cluster-autoscaler
           resources:
             limits:
@@ -123,7 +123,7 @@ spec:
         app: cluster-autoscaler
     spec:
       containers:
-        - image: gcr.io/google_containers/cluster-autoscaler:v0.6.0
+        - image: k8s.gcr.io/cluster-autoscaler:v0.6.0
           name: cluster-autoscaler
           resources:
             limits:
@@ -183,7 +183,7 @@ spec:
       nodeSelector:
         kubernetes.io/role: master
       containers:
-        - image: gcr.io/google_containers/cluster-autoscaler:{{ ca_version }}
+        - image: k8s.gcr.io/cluster-autoscaler:{{ ca_version }}
           name: cluster-autoscaler
           resources:
             limits:
diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md
index 0922642d57bd..d9430527e5b4 100644
--- a/cluster-autoscaler/cloudprovider/azure/README.md
+++ b/cluster-autoscaler/cloudprovider/azure/README.md
@@ -59,7 +59,7 @@ spec:
         app: cluster-autoscaler
     spec:
       containers:
-      - image: gcr.io/google_containers/cluster-autoscaler:{{ ca_version }}
+      - image: k8s.gcr.io/cluster-autoscaler:{{ ca_version }}
         name: cluster-autoscaler
         resources:
           limits:
@@ -144,7 +144,7 @@ spec:
       nodeSelector:
         kubernetes.io/role: master
       containers:
-      - image: gcr.io/google_containers/cluster-autoscaler:{{ ca_version }}
+      - image: k8s.gcr.io/cluster-autoscaler:{{ ca_version }}
         name: cluster-autoscaler
         resources:
           limits:
diff --git a/vertical-pod-autoscaler/initializer/Dockerfile b/vertical-pod-autoscaler/initializer/Dockerfile
index 5a7fd5ad4002..dbe9b39beafd 100644
--- a/vertical-pod-autoscaler/initializer/Dockerfile
+++ b/vertical-pod-autoscaler/initializer/Dockerfile
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-FROM gcr.io/google_containers/ubuntu-slim:0.1
+FROM k8s.gcr.io/ubuntu-slim:0.1
 MAINTAINER Beata Skiba "bskiba@google.com"
 MAINTAINER Marcin Wielgus "mwielgus@google.com"
 
diff --git a/vertical-pod-autoscaler/updater/Dockerfile b/vertical-pod-autoscaler/updater/Dockerfile
index fcce056aa1ba..bf27d63693f2 100644
--- a/vertical-pod-autoscaler/updater/Dockerfile
+++ b/vertical-pod-autoscaler/updater/Dockerfile
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-FROM gcr.io/google_containers/ubuntu-slim:0.1
+FROM k8s.gcr.io/ubuntu-slim:0.1
 MAINTAINER Gabriela Filipek "gfilipek@google.com"
 MAINTAINER Marcin Wielgus "mwielgus@google.com"
 

From 847bd797269834b52188443ef410f7cb7c3fb53e Mon Sep 17 00:00:00 2001
From: Pengfei Ni <peni@microsoft.com>
Date: Thu, 28 Dec 2017 13:27:06 +0800
Subject: [PATCH 3/5] Support go1.9 when verifying gofmt

---
 hack/verify-gofmt.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hack/verify-gofmt.sh b/hack/verify-gofmt.sh
index fd8527c36d07..5d5ebd132ed5 100755
--- a/hack/verify-gofmt.sh
+++ b/hack/verify-gofmt.sh
@@ -24,7 +24,7 @@ KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
 
 GO_VERSION=($(go version))
 
-if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.2|go1.3|go1.4|go1.5|go1.6|go1.7|go1.8') ]]; then
+if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.2|go1.3|go1.4|go1.5|go1.6|go1.7|go1.8|go1.9') ]]; then
   echo "Unknown go version '${GO_VERSION}', skipping gofmt."
   exit 1
 fi

From 62ff748f09bf13dc6516f978937cd7f4a58cff01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomek=20Kulczy=C5=84ski?= <tkulczynski@google.com>
Date: Thu, 28 Dec 2017 16:33:09 +0100
Subject: [PATCH 4/5] Fix error branch in pod initialization

---
 vertical-pod-autoscaler/initializer/core/initializer.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vertical-pod-autoscaler/initializer/core/initializer.go b/vertical-pod-autoscaler/initializer/core/initializer.go
index bb63c4dedd9a..25001da7647d 100644
--- a/vertical-pod-autoscaler/initializer/core/initializer.go
+++ b/vertical-pod-autoscaler/initializer/core/initializer.go
@@ -130,11 +130,13 @@ func (initializer *initializer) updateResourceRequests(obj interface{}) {
 		failedPod, err := markAsFailed(pod)
 		if err != nil {
 			glog.Errorf("unable to mark failed initialization for pod %v: %v", pod.Name, err)
+			return
 		}
 		err = initializer.doUpdatePod(failedPod)
 		if err != nil {
-			glog.Errorf("unable to mark failed initialization for pod %v: %v", pod.Name, err)
+			glog.Errorf("unable to update pod %v after failed initialization: %v", pod.Name, err)
 		}
+		return
 	}
 	err = initializer.doUpdatePod(initializedPod)
 	if err != nil {

From 15b10c8f6760f3afc213af11c9e5ad4898c0fd0c Mon Sep 17 00:00:00 2001
From: Marcin Wielgus <mwielgus@google.com>
Date: Wed, 20 Dec 2017 21:19:10 +0100
Subject: [PATCH 5/5] Skip iteration if pending pods are too new

---
 cluster-autoscaler/core/static_autoscaler.go | 17 ++++++++++++++++-
 cluster-autoscaler/core/utils.go             | 10 ++++++++++
 cluster-autoscaler/core/utils_test.go        | 12 ++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 7cc1ad47bae4..719849c854b4 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -32,6 +32,11 @@ import (
 	"github.com/golang/glog"
 )
 
+const (
+	// How old the oldest unschedulable pod should be before starting scale up.
+	unschedulablePodTimeBuffer = 2 * time.Second
+)
+
 // StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
 type StaticAutoscaler struct {
 	// AutoscalingContext consists of validated settings and options for this autoscaler
@@ -240,10 +245,19 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		glog.V(4).Info("No schedulable pods")
 	}
 
+	// If all pending pods are new we may want to skip a real scale down (just like if the pods were handled).
+	allPendingPodsToHelpAreNew := false
+
 	if len(unschedulablePodsToHelp) == 0 {
 		glog.V(1).Info("No unschedulable pods")
 	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
 		glog.V(1).Info("Max total nodes in cluster reached")
+	} else if getOldestCreateTime(unschedulablePodsToHelp).Add(unschedulablePodTimeBuffer).After(currentTime) {
+		// The assumption here is that these pods have been created very recently and probably there
+		// is more pods to come. In theory we could check the newest pod time but then if pod were created
+		// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
+		allPendingPodsToHelpAreNew = true
+		glog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
 	} else {
 		daemonsets, err := a.ListerRegistry.DaemonSetLister().List()
 		if err != nil {
@@ -301,7 +315,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 			a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
 			a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime) ||
 			schedulablePodsPresent ||
-			scaleDown.nodeDeleteStatus.IsDeleteInProgress()
+			scaleDown.nodeDeleteStatus.IsDeleteInProgress() ||
+			allPendingPodsToHelpAreNew
 
 		glog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
 			"lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s schedulablePodsPresent=%v isDeleteInProgress=%v",
diff --git a/cluster-autoscaler/core/utils.go b/cluster-autoscaler/core/utils.go
index 8c2c9ccf9d4a..9aeb1301dfa4 100644
--- a/cluster-autoscaler/core/utils.go
+++ b/cluster-autoscaler/core/utils.go
@@ -491,3 +491,13 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
 	readiness := csr.GetClusterReadiness()
 	metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted)
 }
+
+func getOldestCreateTime(pods []*apiv1.Pod) time.Time {
+	oldest := time.Now()
+	for _, pod := range pods {
+		if oldest.After(pod.CreationTimestamp.Time) {
+			oldest = pod.CreationTimestamp.Time
+		}
+	}
+	return oldest
+}
diff --git a/cluster-autoscaler/core/utils_test.go b/cluster-autoscaler/core/utils_test.go
index 8bb1a74fe251..d596d2c08a4f 100644
--- a/cluster-autoscaler/core/utils_test.go
+++ b/cluster-autoscaler/core/utils_test.go
@@ -527,3 +527,15 @@ func TestGetNodeCoresAndMemory(t *testing.T) {
 	_, _, err = getNodeCoresAndMemory(node)
 	assert.Error(t, err)
 }
+
+func TestGetOldestPod(t *testing.T) {
+	p1 := BuildTestPod("p1", 500, 1000)
+	p1.CreationTimestamp = metav1.NewTime(time.Now().Add(-1 * time.Minute))
+	p2 := BuildTestPod("p2", 500, 1000)
+	p2.CreationTimestamp = metav1.NewTime(time.Now().Add(+1 * time.Minute))
+	p3 := BuildTestPod("p3", 500, 1000)
+	p3.CreationTimestamp = metav1.NewTime(time.Now())
+
+	assert.Equal(t, p1.CreationTimestamp.Time, getOldestCreateTime([]*apiv1.Pod{p1, p2, p3}))
+	assert.Equal(t, p1.CreationTimestamp.Time, getOldestCreateTime([]*apiv1.Pod{p3, p2, p1}))
+}