From 7bdde04750f5c40be5d7a65d7288b10d81b6c6cd Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 28 Jul 2020 17:31:04 -0700 Subject: [PATCH 1/2] *: set desired capacity defaults Either of these two defaults: 1. desired 10, min 0, max 0 ==> desired 10, min 10, max 10 2. desired 0, min 1, max 10 ==> desired 0, min 1, max 10 Either desired or min must be >0. Signed-off-by: Gyuho Lee --- ec2/asgs.go | 17 ++++++----- ec2/asgs_test.go | 5 ++-- ec2config/validate-defaults.go | 14 +++++---- eks/gpu/gpu.go | 6 ++-- eks/mng/nodes.go | 15 ++++++---- eks/mng/version-upgrade/version-upgrade.go | 8 ++++-- eks/mng/wait/wait.go | 9 ++++-- eks/ng/nodes.go | 33 +++++++++++++--------- eks/ng/nodes_test.go | 5 ++-- eks/ng/wait/wait.go | 9 ++++-- eksconfig/add-on-managed-node-groups.go | 10 +++---- eksconfig/add-on-node-groups.go | 8 +++--- 12 files changed, 86 insertions(+), 53 deletions(-) diff --git a/ec2/asgs.go b/ec2/asgs.go index 2cbe635a6..050d67ff2 100644 --- a/ec2/asgs.go +++ b/ec2/asgs.go @@ -129,7 +129,7 @@ Parameters: ASGDesiredCapacity: Type: Number Description: Desired size auto scaling group - Default: 1 + Default: 0 MinValue: 1 MaxValue: 1000 @@ -222,7 +222,7 @@ Resources: Type: AWS::AutoScaling::AutoScalingGroup UpdatePolicy: AutoScalingRollingUpdate: - MinInstancesInService: !Ref ASGDesiredCapacity + MinInstancesInService: !Ref ASGMinSize MaxBatchSize: 1 SuspendProcesses: - HealthCheck @@ -233,8 +233,8 @@ Resources: Properties: AutoScalingGroupName: !Ref ASGName MinSize: !Ref ASGMinSize - MaxSize: !Ref ASGMaxSize - DesiredCapacity: !Ref ASGDesiredCapacity + MaxSize: !Ref ASGMaxSize{{ if ne .ASGDesiredCapacity 0 }} + DesiredCapacity: !Ref ASGDesiredCapacity{{ end }} VPCZoneIdentifier: !Ref PublicSubnetIDs MetricsCollection: - Granularity: "1Minute" @@ -367,8 +367,9 @@ const userDataAL2InstallSSM = ` UserData: sudo docker info` type templateASG struct { - Metadata string - UserData string + Metadata string + UserData string + ASGDesiredCapacity int64 } func (ts *Tester) createASGs() (err error) { @@ -391,7 +392,9 @@ func (ts *Tester) createASGs() (err error) { // TODO: may not be necessary // "/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2" // already includes SSM agent + AWS CLI - tg := templateASG{} + tg := templateASG{ + ASGDesiredCapacity: cur.ASGDesiredCapacity, + } switch cur.AMIType { case ec2config.AMITypeBottleRocketCPU: // "bottlerocket" comes with SSM agent diff --git a/ec2/asgs_test.go b/ec2/asgs_test.go index 4895665b6..ea4b5cfec 100644 --- a/ec2/asgs_test.go +++ b/ec2/asgs_test.go @@ -17,8 +17,9 @@ func TestTemplateASG(t *testing.T) { buf.Reset() if err := tpl.Execute(buf, templateASG{ - Metadata: metadataAL2InstallSSM, - UserData: userDataAL2InstallSSM, + Metadata: metadataAL2InstallSSM, + UserData: userDataAL2InstallSSM, + ASGDesiredCapacity: 1, }); err != nil { t.Fatal(err) } diff --git a/ec2config/validate-defaults.go b/ec2config/validate-defaults.go index a797e0499..017d59a92 100644 --- a/ec2config/validate-defaults.go +++ b/ec2config/validate-defaults.go @@ -414,13 +414,13 @@ func (cfg *Config) validateASGs() error { return fmt.Errorf("unknown ASGs[%q].AMIType %q", k, cur.AMIType) } - if cur.ASGDesiredCapacity == 0 { - return fmt.Errorf("ASGs[%q].ASGDesiredCapacity must be >0", k) + if cur.ASGMinSize == 0 && cur.ASGDesiredCapacity == 0 { + return fmt.Errorf("ASGs[%q].ASGMinSize/ASGDesiredCapacity must be >0", k) } - if cur.ASGMinSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMinSize == 0 { cur.ASGMinSize = cur.ASGDesiredCapacity } - if cur.ASGMaxSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMaxSize == 0 { cur.ASGMaxSize = cur.ASGDesiredCapacity } @@ -464,7 +464,11 @@ func (cfg *Config) validateASGs() error { case false: // use existing one, or don't run any SSM } - total += cur.ASGDesiredCapacity + expectedN := cur.ASGDesiredCapacity + if expectedN == 0 { + expectedN = cur.ASGMinSize + } + total += expectedN processed[k] = cur } diff --git a/eks/gpu/gpu.go b/eks/gpu/gpu.go index 7198c029f..161b7b0c7 100644 --- a/eks/gpu/gpu.go +++ b/eks/gpu/gpu.go @@ -258,11 +258,12 @@ func (ts *tester) InstallNvidiaDriver() (err error) { } ts.cfg.Logger.Info("nodes", zap.Int64("current-ready-nodes", foundReady), + zap.Int64("min-ready-nodes", cur.ASGMinSize), zap.Int64("desired-ready-nodes", cur.ASGDesiredCapacity), ) time.Sleep(5 * time.Second) - if foundReady >= cur.ASGDesiredCapacity { + if foundReady >= cur.ASGMinSize { readyNGs[ngName] = struct{}{} break } @@ -338,11 +339,12 @@ func (ts *tester) InstallNvidiaDriver() (err error) { } ts.cfg.Logger.Info("nodes", zap.Int("current-ready-nodes", foundReady), + zap.Int("min-ready-nodes", cur.ASGMinSize), zap.Int("desired-ready-nodes", cur.ASGDesiredCapacity), ) time.Sleep(5 * time.Second) - if foundReady >= cur.ASGDesiredCapacity { + if foundReady >= cur.ASGMinSize { readyMNGs[mngName] = struct{}{} break } diff --git a/eks/mng/nodes.go b/eks/mng/nodes.go index f44b1a8fc..a963d20c5 100644 --- a/eks/mng/nodes.go +++ b/eks/mng/nodes.go @@ -75,7 +75,7 @@ Parameters: ASGDesiredCapacity: Type: Number - Default: 2 + Default: 0 Description: Desired capacity of Node Group Auto Scaling Group. InstanceTypes: @@ -105,8 +105,8 @@ Resources: Ec2SshKey: !Ref RemoteAccessKeyName ScalingConfig: MinSize: !Ref ASGMinSize - MaxSize: !Ref ASGMaxSize - DesiredSize: !Ref ASGDesiredCapacity + MaxSize: !Ref ASGMaxSize{{ if ne .ASGDesiredCapacity 0 }} + DesiredSize: !Ref ASGDesiredCapacity{{ end }} Subnets: !Ref PublicSubnetIDs Labels: NodeType: regular @@ -130,6 +130,7 @@ const parametersReleaseVersion = ` ReleaseVersion: const propertyReleaseVersion = ` ReleaseVersion: !Ref ReleaseVersion` type templateMNG struct { + ASGDesiredCapacity int64 ParameterReleaseVersion string PropertyReleaseVersion string } @@ -183,9 +184,8 @@ func (ts *tester) createASGs() (err error) { Ec2SshKey: aws.String(ts.cfg.EKSConfig.RemoteAccessKeyName), }, ScalingConfig: &aws_eks.NodegroupScalingConfig{ - MinSize: aws.Int64(int64(cur.ASGMinSize)), - MaxSize: aws.Int64(int64(cur.ASGMaxSize)), - DesiredSize: aws.Int64(int64(cur.ASGDesiredCapacity)), + MinSize: aws.Int64(int64(cur.ASGMinSize)), + MaxSize: aws.Int64(int64(cur.ASGMaxSize)), }, Subnets: aws.StringSlice(ts.cfg.EKSConfig.Parameters.PublicSubnetIDs), Tags: map[string]*string{ @@ -200,6 +200,9 @@ func (ts *tester) createASGs() (err error) { "NGName": aws.String(cur.Name), }, } + if cur.ASGDesiredCapacity > 0 { + createInput.ScalingConfig.DesiredSize = aws.Int64(int64(cur.ASGDesiredCapacity)) + } for k, v := range cur.Tags { createInput.Tags[k] = aws.String(v) ts.cfg.Logger.Info("added EKS tag", zap.String("key", k), zap.String("value", v)) diff --git a/eks/mng/version-upgrade/version-upgrade.go b/eks/mng/version-upgrade/version-upgrade.go index 8409134ed..72029b589 100644 --- a/eks/mng/version-upgrade/version-upgrade.go +++ b/eks/mng/version-upgrade/version-upgrade.go @@ -122,10 +122,12 @@ func (ts *tester) Upgrade(mngName string) (err error) { reqID = aws.StringValue(updateOut.Update.Id) } - // takes TODO initialWait := 5 * time.Minute - totalWait := 2*time.Hour + 30*time.Minute + 3*time.Minute*time.Duration(cur.ASGDesiredCapacity) - + checkN := time.Duration(cur.ASGDesiredCapacity) + if checkN == 0 { + checkN = time.Duration(cur.ASGMinSize) + } + totalWait := 2*time.Hour + 30*time.Minute + 3*time.Minute*checkN ts.cfg.Logger.Info("sent MNG upgrade request; polling", zap.String("cluster-name", ts.cfg.EKSConfig.Name), zap.String("mng-name", mngName), diff --git a/eks/mng/wait/wait.go b/eks/mng/wait/wait.go index 3ae7baf3a..57ab3f51c 100644 --- a/eks/mng/wait/wait.go +++ b/eks/mng/wait/wait.go @@ -140,7 +140,11 @@ func (ts *tester) waitForNodes(mngName string, retriesLeft int) error { } } - waitDur := 3*time.Minute + time.Duration(5*cur.ASGDesiredCapacity)*time.Second + checkN := time.Duration(cur.ASGDesiredCapacity) + if checkN == 0 { + checkN = time.Duration(cur.ASGMinSize) + } + waitDur := 3*time.Minute + 5*time.Second*checkN ts.cfg.Logger.Info( "describing EC2 instances in ASG", zap.String("asg-name", cur.ASGName), @@ -315,10 +319,11 @@ func (ts *tester) waitForNodes(mngName string, retriesLeft int) error { zap.String("command", ts.cfg.EKSConfig.KubectlCommand()+" get nodes"), zap.String("mng-name", cur.Name), zap.Int("current-ready-nodes", readies), + zap.Int("min-ready-nodes", cur.ASGMinSize), zap.Int("desired-ready-nodes", cur.ASGDesiredCapacity), zap.String("all-csrs", fmt.Sprintf("%+v", allCSRs)), ) - if readies >= cur.ASGDesiredCapacity { + if readies >= cur.ASGMinSize { ready = true break } diff --git a/eks/ng/nodes.go b/eks/ng/nodes.go index 7f88232b0..29cc3c50f 100644 --- a/eks/ng/nodes.go +++ b/eks/ng/nodes.go @@ -124,7 +124,7 @@ Parameters: ASGDesiredCapacity: Type: Number - Default: 2 + Default: 0 Description: Desired capacity of Node Group ASG. Conditions: @@ -216,7 +216,7 @@ Resources: Type: AWS::AutoScaling::AutoScalingGroup UpdatePolicy: AutoScalingRollingUpdate: - MinInstancesInService: !Ref ASGDesiredCapacity + MinInstancesInService: !Ref ASGMinSize MaxBatchSize: 1 SuspendProcesses: - HealthCheck @@ -227,8 +227,8 @@ Resources: Properties: AutoScalingGroupName: !Ref ASGName MinSize: !Ref ASGMinSize - MaxSize: !Ref ASGMaxSize - DesiredCapacity: !Ref ASGDesiredCapacity + MaxSize: !Ref ASGMaxSize{{ if ne .ASGDesiredCapacity 0 }} + DesiredCapacity: !Ref ASGDesiredCapacity{{ end }} VPCZoneIdentifier: !Ref PublicSubnetIDs MetricsCollection: - Granularity: "1Minute" @@ -239,7 +239,7 @@ Resources: - Key: !Sub kubernetes.io/cluster/${ClusterName} Value: owned PropagateAtLaunch: true -{{ if ne .AsgTagData "" }}{{.AsgTagData}}{{ end }} +{{ if ne .ASGTagData "" }}{{.ASGTagData}}{{ end }} MixedInstancesPolicy: InstancesDistribution: OnDemandAllocationStrategy: "prioritized" @@ -374,9 +374,10 @@ const asgTagDataNG = ` - Key: !Sub k8s.io/cluster-autoscaler/${ClusterName} ` type templateASG struct { - Metadata string - UserData string - AsgTagData string + Metadata string + UserData string + ASGDesiredCapacity int64 + ASGTagData string } func (ts *tester) createASGs() error { @@ -398,7 +399,9 @@ func (ts *tester) createASGs() error { // TODO: may not be necessary // "/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2" // already includes SSM agent + AWS CLI - tg := templateASG{} + tg := templateASG{ + ASGDesiredCapacity: cur.ASGDesiredCapacity, + } switch cur.AMIType { case ec2config.AMITypeBottleRocketCPU: // "bottlerocket" comes with SSM agent @@ -448,9 +451,9 @@ func (ts *tester) createASGs() error { tg.UserData += "\n" tg.UserData += ` /opt/aws/bin/cfn-signal --exit-code $? --stack ${AWS::StackName} --resource ASG --region ${AWS::Region}` } - tg.AsgTagData = "" + tg.ASGTagData = "" if cur.ClusterAutoscaler != nil && cur.ClusterAutoscaler.Enable { - tg.AsgTagData = asgTagDataNG + tg.ASGTagData = asgTagDataNG } tpl := template.Must(template.New("TemplateASG").Parse(TemplateASG)) buf := bytes.NewBuffer(nil) @@ -586,11 +589,15 @@ func (ts *tester) createASGs() error { return fmt.Errorf("ASG name %q not found after creation", asgName) } - waitDur := 30*time.Minute + 5*time.Second*time.Duration(cur.ASGDesiredCapacity) + checkN := time.Duration(cur.ASGDesiredCapacity) + if checkN == 0 { + checkN = time.Duration(cur.ASGMinSize) + } + waitDur := 30*time.Minute + 5*time.Second*checkN for _, it := range cur.InstanceTypes { if strings.Contains(it, ".metal") { // "i3.metal" takes much longer ts.cfg.Logger.Info("increasing wait time for metal instance", zap.String("instance-type", it)) - waitDur = time.Hour + time.Minute*time.Duration(cur.ASGDesiredCapacity) + waitDur = time.Hour + time.Minute*checkN } } timeStart := time.Now() diff --git a/eks/ng/nodes_test.go b/eks/ng/nodes_test.go index 7f351bccc..662c5b054 100644 --- a/eks/ng/nodes_test.go +++ b/eks/ng/nodes_test.go @@ -17,8 +17,9 @@ func TestTemplateASG(t *testing.T) { buf.Reset() if err := tpl.Execute(buf, templateASG{ - Metadata: metadataAL2InstallSSM, - UserData: userDataAL2InstallSSM, + Metadata: metadataAL2InstallSSM, + UserData: userDataAL2InstallSSM, + ASGDesiredCapacity: 1, }); err != nil { t.Fatal(err) } diff --git a/eks/ng/wait/wait.go b/eks/ng/wait/wait.go index 57bfbb1b2..cbb827167 100644 --- a/eks/ng/wait/wait.go +++ b/eks/ng/wait/wait.go @@ -102,7 +102,11 @@ func (ts *tester) waitForNodes(asgName string, retriesLeft int) error { } } - waitDur := 3*time.Minute + time.Duration(5*cur.ASGDesiredCapacity)*time.Second + checkN := time.Duration(cur.ASGDesiredCapacity) + if checkN == 0 { + checkN = time.Duration(cur.ASGMinSize) + } + waitDur := 3*time.Minute + 5*time.Second*checkN ts.cfg.Logger.Info( "waiting for EC2 instances in ASG", zap.String("asg-name", cur.Name), @@ -277,10 +281,11 @@ func (ts *tester) waitForNodes(asgName string, retriesLeft int) error { zap.String("command", ts.cfg.EKSConfig.KubectlCommand()+" get nodes"), zap.String("ng-name", cur.Name), zap.Int("current-ready-nodes", readies), + zap.Int64("min-ready-nodes", cur.ASGMinSize), zap.Int64("desired-ready-nodes", cur.ASGDesiredCapacity), zap.String("all-csrs", fmt.Sprintf("%+v", allCSRs)), ) - if int64(readies) >= cur.ASGDesiredCapacity { + if int64(readies) >= cur.ASGMinSize { ready = true break } diff --git a/eksconfig/add-on-managed-node-groups.go b/eksconfig/add-on-managed-node-groups.go index 0a3082474..7c8428974 100644 --- a/eksconfig/add-on-managed-node-groups.go +++ b/eksconfig/add-on-managed-node-groups.go @@ -119,7 +119,7 @@ type MNG struct { // ref. https://docs.aws.amazon.com/eks/latest/userguide/create-managed-node-group.html // ref. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-eks-nodegroup.html ASGMaxSize int `json:"asg-max-size,omitempty"` - // ASGDesiredCapacity is is the desired capacity of Node Group ASG. + // ASGDesiredCapacity is the desired capacity of Node Group ASG. // ref. https://docs.aws.amazon.com/eks/latest/userguide/create-managed-node-group.html // ref. https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-eks-nodegroup.html ASGDesiredCapacity int `json:"asg-desired-capacity,omitempty"` @@ -501,13 +501,13 @@ func (cfg *Config) validateAddOnManagedNodeGroups() error { } } - if cur.ASGDesiredCapacity == 0 { - return fmt.Errorf("AddOnManagedNodeGroups.MNGs[%q].ASGDesiredCapacity must be >0", k) + if cur.ASGMinSize == 0 && cur.ASGDesiredCapacity == 0 { + return fmt.Errorf("AddOnManagedNodeGroups.MNGs[%q].ASGMinSize/ASGDesiredCapacity must be >0", k) } - if cur.ASGMinSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMinSize == 0 { cur.ASGMinSize = cur.ASGDesiredCapacity } - if cur.ASGMaxSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMaxSize == 0 { cur.ASGMaxSize = cur.ASGDesiredCapacity } diff --git a/eksconfig/add-on-node-groups.go b/eksconfig/add-on-node-groups.go index 9029ee3dc..35e7fcd93 100644 --- a/eksconfig/add-on-node-groups.go +++ b/eksconfig/add-on-node-groups.go @@ -341,13 +341,13 @@ func (cfg *Config) validateAddOnNodeGroups() error { } } - if cur.ASGDesiredCapacity == 0 { - return fmt.Errorf("AddOnNodeGroups.ASGs[%q].ASGDesiredCapacity must be >0", k) + if cur.ASGMinSize == 0 && cur.ASGDesiredCapacity == 0 { + return fmt.Errorf("AddOnNodeGroups.ASGs[%q].ASGMinSize/ASGDesiredCapacity must be >0", k) } - if cur.ASGMinSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMinSize == 0 { cur.ASGMinSize = cur.ASGDesiredCapacity } - if cur.ASGMaxSize == 0 { + if cur.ASGDesiredCapacity > 0 && cur.ASGMaxSize == 0 { cur.ASGMaxSize = cur.ASGDesiredCapacity } From 39cba9c4c6150c00f350153c742189b7671a85af Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 28 Jul 2020 17:35:32 -0700 Subject: [PATCH 2/2] CHANGELOG: update Signed-off-by: Gyuho Lee --- CHANGELOG/CHANGELOG-1.5.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG/CHANGELOG-1.5.md b/CHANGELOG/CHANGELOG-1.5.md index 07ecd2e87..1c42d391d 100644 --- a/CHANGELOG/CHANGELOG-1.5.md +++ b/CHANGELOG/CHANGELOG-1.5.md @@ -10,11 +10,17 @@ See [code changes](https://github.com/aws/aws-k8s-tester/compare/v1.4.8...v1.5.0 ### `ec2config` - Set [ASG size defaults based on desired capacities](https://github.com/aws/aws-k8s-tester/pull/140). + - Either ["desired" or "minimum" must be >0](https://github.com/aws/aws-k8s-tester/pull/143). + - `desired 10, min 0, max 0 ==> desired 10, min 10, max 10`. + - `desired 0, min 1, max 10 ==> desired 0, min 1, max 10`. ### `eksconfig` - Add [`AWS_K8S_TESTER_EKS_CONFIG`](https://github.com/aws/aws-k8s-tester/pull/138). - Set [ASG size defaults based on desired capacities](https://github.com/aws/aws-k8s-tester/pull/140). + - Either ["desired" or "minimum" must be >0](https://github.com/aws/aws-k8s-tester/pull/143). + - `desired 10, min 0, max 0 ==> desired 10, min 10, max 10`. + - `desired 0, min 1, max 10 ==> desired 0, min 1, max 10`. ### `eks`