Fix bug where a node that becomes ready after 2 mins can be treated as unready. Deprecated LongNotStarted

In a case where node n1:
1) is created at t=0min
2) has its Ready condition turn true at t=2.5min
3) has its not-ready taint removed at t=3min
the node was counted as unready even though it was ready.
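
For context, a minimal sketch (not the committed code, and with the to-be-deleted-taint check omitted) of the classification order this change introduces in updateReadinessStats: a ready node always counts as Ready, an unready node younger than MaxNodeStartupTime counts as NotStarted, and only an older unready node counts as Unready. The constant maxNodeStartupTime and the helper isReady below are simplified stand-ins for the real MaxNodeStartupTime and GetReadinessState.

    package main

    import (
    	"fmt"
    	"time"

    	apiv1 "k8s.io/api/core/v1"
    )

    // maxNodeStartupTime mirrors the autoscaler's MaxNodeStartupTime (15 minutes);
    // the value here is an assumption for the sketch.
    const maxNodeStartupTime = 15 * time.Minute

    // isReady is a simplified stand-in for GetReadinessState: the node must have a
    // Ready condition that is True and must not carry the not-ready taint.
    func isReady(node *apiv1.Node) bool {
    	for _, taint := range node.Spec.Taints {
    		if taint.Key == apiv1.TaintNodeNotReady {
    			return false
    		}
    	}
    	for _, cond := range node.Status.Conditions {
    		if cond.Type == apiv1.NodeReady {
    			return cond.Status == apiv1.ConditionTrue
    		}
    	}
    	return false
    }

    // classify reproduces the order of checks after this change: Ready wins outright,
    // a node younger than maxNodeStartupTime is NotStarted, older unready nodes are Unready.
    func classify(node *apiv1.Node, now time.Time) string {
    	switch {
    	case isReady(node):
    		return "Ready"
    	case node.CreationTimestamp.Time.Add(maxNodeStartupTime).After(now):
    		return "NotStarted"
    	default:
    		return "Unready"
    	}
    }

    func main() {
    	now := time.Now()
    	node := &apiv1.Node{}
    	node.CreationTimestamp.Time = now.Add(-3 * time.Minute) // created at t=0, evaluated at t=3min
    	node.Status.Conditions = []apiv1.NodeCondition{{
    		Type:   apiv1.NodeReady,
    		Status: apiv1.ConditionTrue, // in the scenario, the condition turned true at t=2.5min
    	}}
    	// The not-ready taint was removed at t=3min, so the node carries no taints here.
    	fmt.Println(classify(node, now)) // prints "Ready"
    }

With the earlier isNodeStillStarting logic, a Ready transition later than 2 minutes after creation meant the node was no longer treated as still starting, so it could land in the Unready bucket; with this change it is counted as NotStarted until it is actually ready, and as Ready afterwards.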

Cases tested after the fix:
1) The case described above
2) Nodes that have not started even after 15 mins are still treated as unready
3) Nodes created long ago that suddenly become unready are counted as unready.
vivekbagade committed Mar 5, 2021
1 parent 5f2e54a commit 47039e0
Showing 5 changed files with 168 additions and 99 deletions.
55 changes: 8 additions & 47 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -399,7 +399,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {

if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
float64(readiness.Ready+readiness.Unready+readiness.NotStarted+readiness.LongNotStarted) {
float64(readiness.Ready+readiness.Unready+readiness.NotStarted) {
return false
}

@@ -452,7 +452,7 @@ func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGr
}
return 0, target, true
}
provisioned = readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
provisioned = readiness.Registered - readiness.NotStarted

return provisioned, target, true
}
@@ -531,8 +531,6 @@ type Readiness struct {
// Number of nodes that are being currently deleted. They exist in K8S but
// are not included in NodeGroup.TargetSize().
Deleted int
// Number of nodes that failed to start within a reasonable limit.
LongNotStarted int
// Number of nodes that are not yet fully started.
NotStarted int
// Number of all registered nodes in the group (ready/unready/deleted/etc).
@@ -554,12 +552,10 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
current.Registered++
if deletetaint.HasToBeDeletedTaint(node) {
current.Deleted++
} else if stillStarting := isNodeStillStarting(node); stillStarting && node.CreationTimestamp.Time.Add(MaxNodeStartupTime).Before(currentTime) {
current.LongNotStarted++
} else if stillStarting {
current.NotStarted++
} else if ready {
current.Ready++
} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
current.NotStarted++
} else {
current.Unready++
}
@@ -743,11 +739,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
acceptable.CurrentTarget,
@@ -798,11 +793,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
),
@@ -860,39 +854,6 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
return condition
}

func hasTaint(node *apiv1.Node, taintKey string) bool {
for _, taint := range node.Spec.Taints {
if taint.Key == taintKey {
return true
}
}
return false
}

func isNodeStillStarting(node *apiv1.Node) bool {
for _, condition := range node.Status.Conditions {
if condition.Type == apiv1.NodeReady {
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeDiskPressure {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeNetworkUnavailable {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
}
return false
}

func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) {
newStatus.ClusterwideConditions = updateLastTransitionSingleList(
oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions)
@@ -955,7 +916,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int {
readiness := csr.perNodeGroupReadiness[id]
ar := csr.acceptableRanges[id]
// newNodes is the number of nodes that
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongNotStarted + readiness.LongUnregistered)
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongUnregistered)
if newNodes <= 0 {
// Negative value is unlikely but theoretically possible.
continue
@@ -1006,7 +967,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS
targetSize += accRange.CurrentTarget
}
for _, readiness := range csr.perNodeGroupReadiness {
currentSize += readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
currentSize += readiness.Registered - readiness.NotStarted
}
return currentSize, targetSize
}
123 changes: 72 additions & 51 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -319,6 +319,78 @@ func TestTooManyUnready(t *testing.T) {
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
}

func TestUnreadyLongAfterCreation(t *testing.T) {
now := time.Now()

ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-time.Minute))
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-30 * time.Minute)}

provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)

assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false)
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Unready)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
upcoming := clusterstate.GetUpcomingNodes()
assert.Equal(t, 0, upcoming["ng1"])
}

func TestNotStarted(t *testing.T) {
now := time.Now()

ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-4*time.Minute))
SetNodeNotReadyTaint(ng2_1)
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-10 * time.Minute)}

provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)

assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false)
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)

// node ng2_1's Ready condition becomes true, but the not-ready taint is still present
SetNodeReadyState(ng2_1, true, now.Add(-4*time.Minute))
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)

// node ng2_1 no longer has the taint
RemoveNodeNotReadyTaint(ng2_1)
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 2, clusterstate.GetClusterReadiness().Ready)
}

func TestExpiredScaleUp(t *testing.T) {
now := time.Now()

@@ -768,57 +840,6 @@ func TestUpdateScaleUp(t *testing.T) {
assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
}

func TestIsNodeStillStarting(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)

SetNodeCondition(node, tc.condition, tc.status, testedTime)

if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}

return node
}
t.Run("recent "+tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
})
t.Run("long "+tc.desc, func(t *testing.T) {
node := createTestNode(30 * time.Minute)
// No matter what the node's conditions are, stop considering it not started after long enough.
assert.False(t, isNodeStillStarting(node))
})
}
}

func TestScaleUpFailures(t *testing.T) {
now := time.Now()

2 changes: 1 addition & 1 deletion cluster-autoscaler/core/utils/utils.go
@@ -312,7 +312,7 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
}
metrics.UpdateClusterSafeToAutoscale(csr.IsClusterHealthy())
readiness := csr.GetClusterReadiness()
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
}

// GetOldestCreateTime returns oldest creation time out of the pods in the set
70 changes: 70 additions & 0 deletions cluster-autoscaler/utils/kubernetes/ready_test.go
@@ -0,0 +1,70 @@
package kubernetes

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

func TestGetReadiness(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
excludeLong bool
}{
{"ready", apiv1.NodeReady, apiv1.ConditionTrue, "", true, true},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, false, false},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, false, false},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, false, false},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, false, false},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, false, false},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, false, false},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, false, false},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)

SetNodeCondition(node, tc.condition, tc.status, testedTime)
if tc.condition != apiv1.NodeReady {
SetNodeCondition(node, apiv1.NodeReady, apiv1.ConditionTrue, testedTime)
}

if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}

return node
}
t.Run("recent "+tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
isReady, _, err := GetReadinessState(node)
assert.NoError(t, err)
assert.Equal(t, tc.expectedResult, isReady)
})
t.Run("long "+tc.desc, func(t *testing.T) {
if tc.excludeLong {
return
}
node := createTestNode(30 * time.Minute)
// No matter what the node's conditions are, stop considering it not started after long enough.
isReady, _, err := GetReadinessState(node)
assert.NoError(t, err)
assert.False(t, isReady)
})
}
}
17 changes: 17 additions & 0 deletions cluster-autoscaler/utils/test/test_utils.go
@@ -162,6 +162,23 @@ func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
}
}

// SetNodeNotReadyTaint sets the not ready taint on node.
func SetNodeNotReadyTaint(node *apiv1.Node) {
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{Key: apiv1.TaintNodeNotReady, Effect: apiv1.TaintEffectNoSchedule})
}

// RemoveNodeNotReadyTaint removes the not ready taint.
func RemoveNodeNotReadyTaint(node *apiv1.Node) {
var final []apiv1.Taint
for i := range node.Spec.Taints {
if node.Spec.Taints[i].Key == apiv1.TaintNodeNotReady {
continue
}
final = append(final, node.Spec.Taints[i])
}
node.Spec.Taints = final
}

// SetNodeCondition sets node condition.
func SetNodeCondition(node *apiv1.Node, conditionType apiv1.NodeConditionType, status apiv1.ConditionStatus, lastTransition time.Time) {
for i := range node.Status.Conditions {
