Skip to content

Commit

Permalink
Merge pull request kubernetes#4489 from airbnb/drmorr--early-abort-if…
Browse files Browse the repository at this point in the history
…-aws-node-group-no-capacity

Early abort if AWS node group has no capacity
  • Loading branch information
k8s-ci-robot authored and Bruno Batista committed Nov 7, 2022
1 parent c147ba9 commit 7db4d8c
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 19 deletions.
39 changes: 35 additions & 4 deletions cluster-autoscaler/cloudprovider/aws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,41 @@ discover](#auto-discovery-setup) EC2 Auto Scaling Groups **(recommended)**, add
Launch Configuration) and/or `ec2:DescribeLaunchTemplateVersions` (if you
created your ASG using a Launch Template) to the `Action` list.

If you prefer, you can restrict the target resources for the autoscaling actions
by specifying Auto Scaling Group ARNs in the `Resource` list of the policy. More
information can be found
[here](https://docs.aws.amazon.com/autoscaling/latest/userguide/control-access-using-iam.html#policy-auto-scaling-resources).
*NOTE:* The below policies/arguments to the Cluster Autoscaler need to be modified as appropriate
for the names of your ASGs, as well as account ID and AWS region before being used.

The following policy provides the minimum privileges necessary for Cluster Autoscaler to run.
When using this policy, you cannot use autodiscovery of ASGs. In addition, it restricts the
IAM permissions to the node groups the Cluster Autoscaler is configured to scale.

This in turn means that you must pass the following arguments to the Cluster Autoscaler
binary, replacing min and max node counts and the ASG:

```bash
--aws-use-static-instance-list=false
--nodes=1:100:exampleASG1
--nodes=1:100:exampleASG2
```

```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"autoscaling:DescribeAutoScalingGroups",
"autoscaling:DescribeAutoScalingInstances",
"autoscaling:DescribeLaunchConfigurations",
"autoscaling:DescribeScalingActivities",
"autoscaling:SetDesiredCapacity",
"autoscaling:TerminateInstanceInAutoScalingGroup"
],
"Resource": ["arn:aws:autoscaling:${YOUR_CLUSTER_AWS_REGION}:${YOUR_AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/${YOUR_ASG_NAME}"]
}
]
}
```

### Using OIDC Federated Authentication

Expand Down
13 changes: 7 additions & 6 deletions cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ type asg struct {
minSize int
maxSize int
curSize int
lastUpdateTime *time.Time
lastUpdateTime time.Time

AvailabilityZones []string
LaunchConfigurationName string
Expand Down Expand Up @@ -254,7 +254,7 @@ func (m *asgCache) setAsgSizeNoLock(asg *asg, size int) error {
}

// Proactively set the ASG size so autoscaler makes better decisions
asg.lastUpdateTime = &start
asg.lastUpdateTime = start
asg.curSize = size

return nil
Expand Down Expand Up @@ -479,7 +479,6 @@ func (m *asgCache) createPlaceholdersForDesiredNonStartedInstances(groups []*aut
func (m *asgCache) isNodeGroupAvailable(group *autoscaling.Group) (bool, error) {
input := &autoscaling.DescribeScalingActivitiesInput{
AutoScalingGroupName: group.AutoScalingGroupName,
MaxRecords: aws.Int64(1), // We only care about the most recent event
}

start := time.Now()
Expand All @@ -489,12 +488,14 @@ func (m *asgCache) isNodeGroupAvailable(group *autoscaling.Group) (bool, error)
return true, err // If we can't describe the scaling activities we assume the node group is available
}

if len(response.Activities) > 0 {
activity := response.Activities[0]
for _, activity := range response.Activities {
asgRef := AwsRef{Name: *group.AutoScalingGroupName}
if a, ok := m.registeredAsgs[asgRef]; ok {
lut := a.lastUpdateTime
if lut != nil && activity.StartTime.After(*lut) && *activity.StatusCode == "Failed" {
if activity.StartTime.Before(lut) {
break
} else if *activity.StatusCode == "Failed" {
klog.Warningf("ASG %s scaling failed with %s", asgRef.Name, *activity)
return false, nil
}
} else {
Expand Down
11 changes: 5 additions & 6 deletions cluster-autoscaler/cloudprovider/aws/auto_scaling_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func TestCreatePlaceholders(t *testing.T) {
name string
desiredCapacity *int64
activities []*autoscaling.Activity
groupLastUpdateTime *time.Time
groupLastUpdateTime time.Time
describeErr error
asgToCheck *string
}{
Expand All @@ -85,7 +85,7 @@ func TestCreatePlaceholders(t *testing.T) {
StartTime: aws.Time(time.Unix(10, 0)),
},
},
groupLastUpdateTime: aws.Time(time.Unix(9, 0)),
groupLastUpdateTime: time.Unix(9, 0),
},
{
name: "AWS scaling failed event before CA scale_up",
Expand All @@ -96,7 +96,7 @@ func TestCreatePlaceholders(t *testing.T) {
StartTime: aws.Time(time.Unix(9, 0)),
},
},
groupLastUpdateTime: aws.Time(time.Unix(10, 0)),
groupLastUpdateTime: time.Unix(10, 0),
},
{
name: "asg not registered",
Expand All @@ -107,7 +107,7 @@ func TestCreatePlaceholders(t *testing.T) {
StartTime: aws.Time(time.Unix(10, 0)),
},
},
groupLastUpdateTime: aws.Time(time.Unix(9, 0)),
groupLastUpdateTime: time.Unix(9, 0),
asgToCheck: aws.String("unregisteredAsgName"),
},
}
Expand All @@ -128,7 +128,6 @@ func TestCreatePlaceholders(t *testing.T) {
if shouldCallDescribeScalingActivities {
a.On("DescribeScalingActivities", &autoscaling.DescribeScalingActivitiesInput{
AutoScalingGroupName: asgName,
MaxRecords: aws.Int64(1),
}).Return(
&autoscaling.DescribeScalingActivitiesOutput{Activities: tc.activities},
tc.describeErr,
Expand Down Expand Up @@ -158,7 +157,7 @@ func TestCreatePlaceholders(t *testing.T) {
}
asgCache.createPlaceholdersForDesiredNonStartedInstances(groups)
assert.Equal(t, int64(len(groups[0].Instances)), *tc.desiredCapacity)
if tc.activities != nil && *tc.activities[0].StatusCode == "Failed" && tc.activities[0].StartTime.After(*tc.groupLastUpdateTime) && asgName == registeredAsgName {
if tc.activities != nil && *tc.activities[0].StatusCode == "Failed" && tc.activities[0].StartTime.After(tc.groupLastUpdateTime) && asgName == registeredAsgName {
assert.Equal(t, *groups[0].Instances[0].HealthStatus, placeholderUnfulfillableStatus)
} else if len(groups[0].Instances) > 0 {
assert.Equal(t, *groups[0].Instances[0].HealthStatus, "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,6 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) {
a.On("DescribeScalingActivities",
&autoscaling.DescribeScalingActivitiesInput{
AutoScalingGroupName: aws.String("test-asg"),
MaxRecords: aws.Int64(1),
},
).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil)

Expand Down
2 changes: 0 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/aws_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ func TestFetchExplicitAsgs(t *testing.T) {
a.On("DescribeScalingActivities",
&autoscaling.DescribeScalingActivitiesInput{
AutoScalingGroupName: aws.String("coolasg"),
MaxRecords: aws.Int64(1),
},
).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil)

Expand Down Expand Up @@ -559,7 +558,6 @@ func TestFetchAutoAsgs(t *testing.T) {
a.On("DescribeScalingActivities",
&autoscaling.DescribeScalingActivitiesInput{
AutoScalingGroupName: aws.String("coolasg"),
MaxRecords: aws.Int64(1),
},
).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil)

Expand Down

0 comments on commit 7db4d8c

Please sign in to comment.