diff --git a/go.mod b/go.mod index 39f20201dc54..e89a99c967f0 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.16 require ( github.com/Pallinder/go-randomdata v1.2.0 + github.com/avast/retry-go v2.7.0+incompatible github.com/aws/aws-sdk-go v1.38.69 github.com/deckarep/golang-set v1.7.1 github.com/go-logr/zapr v0.4.0 diff --git a/go.sum b/go.sum index 777cd8bb6c16..f25c7dda3cfd 100644 --- a/go.sum +++ b/go.sum @@ -80,6 +80,8 @@ github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmV github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/aryann/difflib v0.0.0-20170710044230-e206f873d14a/go.mod h1:DAHtR1m6lCRdSC2Tm3DSWRPvIPr6xNKyeHdqDQSQT+A= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= +github.com/avast/retry-go v2.7.0+incompatible h1:XaGnzl7gESAideSjr+I8Hki/JBi+Yb9baHlMRPeSC84= +github.com/avast/retry-go v2.7.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU= github.com/aws/aws-sdk-go v1.23.20/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aws/aws-sdk-go v1.27.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 179789ffd519..6c0559faf5ac 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -26,7 +26,6 @@ import ( "github.com/aws/aws-sdk-go/service/ssm" "github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3" "github.com/awslabs/karpenter/pkg/cloudprovider" - "github.com/awslabs/karpenter/pkg/cloudprovider/aws/utils" "github.com/awslabs/karpenter/pkg/utils/parallel" "github.com/awslabs/karpenter/pkg/utils/project" v1 "k8s.io/api/core/v1" @@ -76,10 +75,7 @@ type CloudProvider struct { } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { - sess := withUserAgent(session.Must( - session.NewSession(request.WithRetryer( - &aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}, - utils.NewRetryer())))) + sess := withUserAgent(session.Must(session.NewSession(&aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}))) if *sess.Config.Region == "" { logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") *sess.Config.Region = getRegionFromIMDS(sess) diff --git a/pkg/cloudprovider/aws/node.go b/pkg/cloudprovider/aws/node.go index a2c5068fc279..93fb0df7b646 100644 --- a/pkg/cloudprovider/aws/node.go +++ b/pkg/cloudprovider/aws/node.go @@ -17,6 +17,9 @@ package aws import ( "context" "fmt" + "time" + + "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -35,32 +38,21 @@ type NodeFactory struct { // For a given set of instanceIDs return a map of instanceID to Kubernetes node object. func (n *NodeFactory) For(ctx context.Context, instanceId *string) (*v1.Node, error) { - describeInstancesOutput, err := n.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: []*string{instanceId}}) - if aerr, ok := err.(awserr.Error); ok { - return nil, aerr - } - if err != nil { - return nil, fmt.Errorf("failed to describe ec2 instances, %w", err) - } - if len(describeInstancesOutput.Reservations) != 1 { - return nil, fmt.Errorf("expected a single instance reservation, got %d", len(describeInstancesOutput.Reservations)) + instance := ec2.Instance{} + // EC2 is eventually consistent, so backoff-retry until we have the data we need. + if err := retry.Do( + func() (err error) { return n.getInstance(ctx, instanceId, &instance) }, + retry.Delay(1 * time.Second), + retry.Attempts(3), + ); err != nil { + return nil, err } - if len(describeInstancesOutput.Reservations[0].Instances) != 1 { - return nil, fmt.Errorf("expected a single instance, got %d", len(describeInstancesOutput.Reservations[0].Instances)) - } - instance := *describeInstancesOutput.Reservations[0].Instances[0] - logging.FromContext(ctx).Infof("Launched instance: %s, type: %s, zone: %s, hostname: %s", - *instance.InstanceId, - *instance.InstanceType, - *instance.Placement.AvailabilityZone, - *instance.PrivateDnsName, - ) return &v1.Node{ ObjectMeta: metav1.ObjectMeta{ - Name: *instance.PrivateDnsName, + Name: aws.StringValue(instance.PrivateDnsName), }, Spec: v1.NodeSpec{ - ProviderID: fmt.Sprintf("aws:///%s/%s", *instance.Placement.AvailabilityZone, *instance.InstanceId), + ProviderID: fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.Placement.AvailabilityZone), aws.StringValue(instance.InstanceId)), }, Status: v1.NodeStatus{ Allocatable: v1.ResourceList{ @@ -76,3 +68,30 @@ func (n *NodeFactory) For(ctx context.Context, instanceId *string) (*v1.Node, er }, }, nil } + +func (n *NodeFactory) getInstance(ctx context.Context, instanceId *string, instance *ec2.Instance) error { + describeInstancesOutput, err := n.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: []*string{instanceId}}) + if aerr, ok := err.(awserr.Error); ok && aerr.Code() == "InvalidInstanceID.NotFound" { + return aerr + } + if err != nil { + return fmt.Errorf("failed to describe ec2 instances, %w", err) + } + if len(describeInstancesOutput.Reservations) != 1 { + return fmt.Errorf("expected a single instance reservation, got %d", len(describeInstancesOutput.Reservations)) + } + if len(describeInstancesOutput.Reservations[0].Instances) != 1 { + return fmt.Errorf("expected a single instance, got %d", len(describeInstancesOutput.Reservations[0].Instances)) + } + *instance = *describeInstancesOutput.Reservations[0].Instances[0] + if len(aws.StringValue(instance.PrivateDnsName)) == 0 { + return fmt.Errorf("expected PrivateDnsName to be set") + } + logging.FromContext(ctx).Infof("Launched instance: %s, type: %s, zone: %s, hostname: %s", + aws.StringValue(instance.InstanceId), + aws.StringValue(instance.InstanceType), + aws.StringValue(instance.Placement.AvailabilityZone), + aws.StringValue(instance.PrivateDnsName), + ) + return nil +} diff --git a/pkg/cloudprovider/aws/utils/retryer.go b/pkg/cloudprovider/aws/utils/retryer.go deleted file mode 100644 index 75938b7a46c9..000000000000 --- a/pkg/cloudprovider/aws/utils/retryer.go +++ /dev/null @@ -1,50 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package utils - -import ( - "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/aws/client" - "github.com/aws/aws-sdk-go/aws/request" -) - -// Retryer implements the aws request.Retryer interface -// and adds support for retrying ec2 InvalidInstanceID.NotFound -// which can occur when instances have recently been created -// and are not yet describe-able due to eventual consistency -type Retryer struct { - request.Retryer -} - -// NewRetryer instantiates a Retryer based on aws client.DefaultRetryer w/ added functionality for karpenter -func NewRetryer() *Retryer { - return &Retryer{ - Retryer: client.DefaultRetryer{ - NumMaxRetries: 3, - }, - } -} - -// ShouldRetry returns true if the request should be retried -func (r Retryer) ShouldRetry(req *request.Request) bool { - if r.Retryer.ShouldRetry(req) { - return true - } - // Retry DescribeInstances because EC2 is eventually consistent - if aerr, ok := req.Error.(awserr.Error); ok && aerr.Code() == "InvalidInstanceID.NotFound" { - return true - } - return false -}