Skip to content

Commit

Permalink
Merge pull request #2993 from Jeffwan/cherry-pick-1.16
Browse files Browse the repository at this point in the history
Cherry pick  #2249  #2737 #2931 #2929 to 1.16
  • Loading branch information
k8s-ci-robot authored Mar 30, 2020
2 parents 2b4a46a + d3c7a55 commit 277f19e
Show file tree
Hide file tree
Showing 12 changed files with 770 additions and 160 deletions.
1 change: 1 addition & 0 deletions cluster-autoscaler/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ The following startup parameters are supported for cluster autoscaler:
| `leader-elect-renew-deadline` | The interval between attempts by the acting master to renew a leadership slot before it stops leading.<br>This must be less than or equal to the lease duration.<br>This is only applicable if leader election is enabled | 10 seconds
| `leader-elect-retry-period` | The duration the clients should wait between attempting acquisition and renewal of a leadership.<br>This is only applicable if leader election is enabled | 2 seconds
| `leader-elect-resource-lock` | The type of resource object that is used for locking during leader election.<br>Supported options are `endpoints` (default) and `configmaps` | "endpoints"
| `aws-use-static-instance-list` | Whether CA should use a static list of instance types instead of fetching them at runtime. AWS only | false

# Troubleshooting:

Expand Down
20 changes: 20 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ kubectl apply -f examples/cluster-autoscaler-autodiscover.yaml
From CA 0.6.1 - it is possible to scale a node group to 0 (and obviously from 0), assuming that all scale-down conditions are met.

If you are using `nodeSelector` you need to tag the ASG with a node-template key `"k8s.io/cluster-autoscaler/node-template/label/"` and `"k8s.io/cluster-autoscaler/node-template/taint/"` if you are using taints.
If your pods request resources other than `cpu` and `memory`, you need to tag the ASG with a node-template key `k8s.io/cluster-autoscaler/node-template/resources/`.

For example for a node label of `foo=bar` you would tag the ASG with:

Expand All @@ -153,6 +154,18 @@ And for a taint of `"dedicated": "foo:NoSchedule"` you would tag the ASG with:
"Key": "k8s.io/cluster-autoscaler/node-template/taint/dedicated"
}
```
If you request other resources on the node, such as `vpc.amazonaws.com/PrivateIPv4Address` for Windows nodes or `ephemeral-storage`, you would tag the ASG with:

```json
{
"ResourceType": "auto-scaling-group",
"ResourceId": "foo.example.com",
"PropagateAtLaunch": true,
"Value": "2",
"Key": "k8s.io/cluster-autoscaler/node-template/resources/vpc.amazonaws.com/PrivateIPv4Address"
}
```
> Note: This is only supported in CA 1.14.x and above
If you'd like to scale node groups from 0, an `autoscaling:DescribeLaunchConfigurations` or `ec2:DescribeLaunchTemplateVersions` permission is required, depending on whether you created your ASG with a Launch Configuration or a Launch Template:

Expand Down Expand Up @@ -204,6 +217,13 @@ spec:
- r5ad.2xlarge
```
## Use Static Instance List
The set of the latest supported EC2 instance types will be fetched by the CA at run time. You can find all the available instance types in the CA logs.
If your network access is restricted such that fetching this set is infeasible, you can specify the command-line flag `--aws-use-static-instance-list=true` to switch the CA back to its original use of a statically defined set.

To refresh the static list, run `go run ec2_instance_types/gen.go` under `cluster-autoscaler/cloudprovider/aws/` and update `staticListLastUpdateTime` in `aws_util.go`.


### Example usage:

* Create a [Launch Template](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-autoscaling-autoscalinggroup-launchtemplate.html) (LT) with an instance type, for example, r5.2xlarge. Consider this the 'base' instance type. Do not define any spot purchase options here.
Expand Down
34 changes: 31 additions & 3 deletions cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,16 @@ var (
type awsCloudProvider struct {
// awsManager is the AWS manager handed to BuildAwsCloudProvider.
awsManager *AwsManager
// resourceLimiter is the resource limiter handed to BuildAwsCloudProvider.
resourceLimiter *cloudprovider.ResourceLimiter
// instanceTypes maps a string key to the description of an EC2 instance type.
// NOTE(review): presumably keyed by instance type name (e.g. "c5.xlarge") — confirm.
instanceTypes map[string]*InstanceType
}

// BuildAwsCloudProvider builds CloudProvider implementation for AWS.
// It stores its arguments in a new awsCloudProvider and returns that value;
// the returned error is always nil.
func BuildAwsCloudProvider(awsManager *AwsManager, resourceLimiter *cloudprovider.ResourceLimiter) (cloudprovider.CloudProvider, error) {
func BuildAwsCloudProvider(awsManager *AwsManager, instanceTypes map[string]*InstanceType, resourceLimiter *cloudprovider.ResourceLimiter) (cloudprovider.CloudProvider, error) {
aws := &awsCloudProvider{
awsManager: awsManager,
resourceLimiter: resourceLimiter,
instanceTypes: instanceTypes,
}
return aws, nil
}
Expand Down Expand Up @@ -156,7 +159,7 @@ type AwsInstanceRef struct {
Name string
}

var validAwsRefIdRegex = regexp.MustCompile(fmt.Sprintf(`^aws\:\/\/\/[-0-9a-z]*\/[-0-9a-z]*$|aws\:\/\/\/[-0-9a-z]*\/%s.*$`, placeholderInstanceNamePrefix))
// validAwsRefIdRegex matches provider ids of the form aws:///<zone>/<name>,
// optionally followed by one extra /<segment> (which may contain dots), or ids
// whose name part starts with placeholderInstanceNamePrefix.
// NOTE(review): the second alternative has no leading ^, so it can match
// anywhere in the input rather than only at the start — confirm this is intended.
var validAwsRefIdRegex = regexp.MustCompile(fmt.Sprintf(`^aws\:\/\/\/[-0-9a-z]*\/[-0-9a-z]*(\/[-0-9a-z\.]*)?$|aws\:\/\/\/[-0-9a-z]*\/%s.*$`, placeholderInstanceNamePrefix))

// AwsRefFromProviderId creates InstanceConfig object from provider id which
// must be in format: aws:///zone/name
Expand Down Expand Up @@ -343,12 +346,37 @@ func BuildAWS(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
defer config.Close()
}

// Generate EC2 list
var instanceTypes map[string]*InstanceType
var lastUpdateTime string
if opts.AWSUseStaticInstanceList {
instanceTypes, lastUpdateTime = GetStaticEC2InstanceTypes()
klog.Warningf("Use static EC2 Instance Types and list could be outdated. Last update time: %s", lastUpdateTime)
} else {
region, err := GetCurrentAwsRegion()
if err != nil {
klog.Fatalf("Failed to get AWS Region: %v", err)
}

instanceTypes, err = GenerateEC2InstanceTypes(region)
if err != nil {
klog.Fatalf("Failed to generate AWS EC2 Instance Types: %v", err)
}

keys := make([]string, 0, len(instanceTypes))
for key := range instanceTypes {
keys = append(keys, key)
}

klog.Infof("Successfully load %d EC2 Instance Types %s", len(keys), keys)
}

manager, err := CreateAwsManager(config, do)
if err != nil {
klog.Fatalf("Failed to create AWS Manager: %v", err)
}

provider, err := BuildAwsCloudProvider(manager, rl)
provider, err := BuildAwsCloudProvider(manager, instanceTypes, rl)
if err != nil {
klog.Fatalf("Failed to create AWS cloud provider: %v", err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ func testProvider(t *testing.T, m *AwsManager) *awsCloudProvider {
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000})

provider, err := BuildAwsCloudProvider(m, resourceLimiter)
instanceTypes, _ := GetStaticEC2InstanceTypes()
provider, err := BuildAwsCloudProvider(m, instanceTypes, resourceLimiter)
assert.NoError(t, err)
return provider.(*awsCloudProvider)
}
Expand All @@ -143,7 +144,8 @@ func TestBuildAwsCloudProvider(t *testing.T) {
map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000},
map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000})

_, err := BuildAwsCloudProvider(testAwsManager, resourceLimiter)
instanceTypes, _ := GetStaticEC2InstanceTypes()
_, err := BuildAwsCloudProvider(testAwsManager, instanceTypes, resourceLimiter)
assert.NoError(t, err)
}

Expand Down
6 changes: 3 additions & 3 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ type AwsManager struct {
}

type asgTemplate struct {
InstanceType *instanceType
InstanceType *InstanceType
Region string
Zone string
Tags []*autoscaling.TagDescription
Expand Down Expand Up @@ -363,8 +363,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI)

resourcesFromTags := extractAllocatableResourcesFromAsg(template.Tags)
if val, ok := resourcesFromTags["ephemeral-storage"]; ok {
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *val
for resourceName, val := range resourcesFromTags {
node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val
}

// TODO: use proper allocatable!!
Expand Down
71 changes: 70 additions & 1 deletion cluster-autoscaler/cloudprovider/aws/aws_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os"
"reflect"
"sort"
"strconv"
"strings"
"testing"

Expand Down Expand Up @@ -70,7 +71,7 @@ func TestGetRegion(t *testing.T) {

func TestBuildGenericLabels(t *testing.T) {
labels := buildGenericLabels(&asgTemplate{
InstanceType: &instanceType{
InstanceType: &InstanceType{
InstanceType: "c4.large",
VCPU: 2,
MemoryMb: 3840,
Expand Down Expand Up @@ -109,6 +110,74 @@ func TestExtractAllocatableResourcesFromAsg(t *testing.T) {
assert.Equal(t, (&expectedEphemeralStorage).String(), labels["ephemeral-storage"].String())
}

// TestBuildNodeFromTemplate checks that buildNodeFromTemplate turns ASG
// node-template tags into node capacity (resources/ tags), node labels
// (label/ tags) and node taints (taint/ tags).
func TestBuildNodeFromTemplate(t *testing.T) {
	awsManager := &AwsManager{}
	asg := &asg{AwsRef: AwsRef{Name: "test-auto-scaling-group"}}
	c5Instance := &InstanceType{
		InstanceType: "c5.xlarge",
		VCPU:         4,
		MemoryMb:     8192,
		GPU:          0,
	}

	// Node with custom resource
	ephemeralStorageKey := "ephemeral-storage"
	ephemeralStorageValue := int64(20)
	vpcIPKey := "vpc.amazonaws.com/PrivateIPv4Address"
	observedNode, observedErr := awsManager.buildNodeFromTemplate(asg, &asgTemplate{
		InstanceType: c5Instance,
		Tags: []*autoscaling.TagDescription{
			{
				Key:   aws.String(fmt.Sprintf("k8s.io/cluster-autoscaler/node-template/resources/%s", ephemeralStorageKey)),
				Value: aws.String(strconv.FormatInt(ephemeralStorageValue, 10)),
			},
		},
	})
	// assert.* only records a failure and keeps going; stop here so a missing
	// node is reported as a test failure instead of a nil-pointer panic below.
	if !assert.NoError(t, observedErr) || !assert.NotNil(t, observedNode) {
		t.FailNow()
	}
	esValue, esExist := observedNode.Status.Capacity[apiv1.ResourceName(ephemeralStorageKey)]
	assert.True(t, esExist)
	// Compare against the value that was put into the tag, not a second literal.
	assert.Equal(t, ephemeralStorageValue, esValue.Value())
	// A resource that was never tagged must not show up in the capacity.
	_, ipExist := observedNode.Status.Capacity[apiv1.ResourceName(vpcIPKey)]
	assert.False(t, ipExist)

	// Node with labels
	gpuLabelValue := "nvidia-telsa-v100"
	observedNode, observedErr = awsManager.buildNodeFromTemplate(asg, &asgTemplate{
		InstanceType: c5Instance,
		Tags: []*autoscaling.TagDescription{
			{
				Key:   aws.String(fmt.Sprintf("k8s.io/cluster-autoscaler/node-template/label/%s", GPULabel)),
				Value: aws.String(gpuLabelValue),
			},
		},
	})
	if !assert.NoError(t, observedErr) || !assert.NotNil(t, observedNode) {
		t.FailNow()
	}
	gpuValue, gpuLabelExist := observedNode.Labels[GPULabel]
	assert.True(t, gpuLabelExist)
	assert.Equal(t, gpuLabelValue, gpuValue)

	// Node with taints
	gpuTaint := apiv1.Taint{
		Key:    "nvidia.com/gpu",
		Value:  "present",
		Effect: "NoSchedule",
	}
	observedNode, observedErr = awsManager.buildNodeFromTemplate(asg, &asgTemplate{
		InstanceType: c5Instance,
		Tags: []*autoscaling.TagDescription{
			{
				Key:   aws.String(fmt.Sprintf("k8s.io/cluster-autoscaler/node-template/taint/%s", gpuTaint.Key)),
				Value: aws.String(fmt.Sprintf("%s:%s", gpuTaint.Value, gpuTaint.Effect)),
			},
		},
	})
	if !assert.NoError(t, observedErr) || !assert.NotNil(t, observedNode) {
		t.FailNow()
	}

	observedTaints := observedNode.Spec.Taints
	// Only index into the slice once its length has been confirmed; otherwise a
	// wrong taint count would surface as an index-out-of-range panic.
	if assert.Len(t, observedTaints, 1) {
		assert.Equal(t, gpuTaint, observedTaints[0])
	}
}

func TestExtractLabelsFromAsg(t *testing.T) {
tags := []*autoscaling.TagDescription{
{
Expand Down
Loading

0 comments on commit 277f19e

Please sign in to comment.