Skip to content

Commit

Permalink
chore: Fix IAM instance profile eventual consistency error (#7849)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Mar 6, 2025
1 parent 0159dd5 commit e7c25e5
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 33 deletions.
45 changes: 26 additions & 19 deletions pkg/controllers/nodeclass/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,10 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
v.validateCreateLaunchTemplateAuthorization,
v.validateRunInstancesAuthorization,
} {
if failureReason, err := isValid(ctx, nodeClass, nodeClaim, tags); err != nil {
if failureReason, requeue, err := isValid(ctx, nodeClass, nodeClaim, tags); err != nil {
return reconcile.Result{}, err
} else if requeue {
return reconcile.Result{Requeue: true}, nil
} else if failureReason != "" {
v.cache.SetDefault(v.cacheKey(nodeClass, tags), failureReason)
nodeClass.StatusConditions().SetFalse(
Expand All @@ -143,64 +145,64 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
return reconcile.Result{}, nil
}

type validatorFunc func(context.Context, *v1.EC2NodeClass, *karpv1.NodeClaim, map[string]string) (string, error)
type validatorFunc func(context.Context, *v1.EC2NodeClass, *karpv1.NodeClaim, map[string]string) (string, bool, error)

func (v *Validation) validateCreateFleetAuthorization(
ctx context.Context,
nodeClass *v1.EC2NodeClass,
_ *karpv1.NodeClaim,
tags map[string]string,
) (reason string, err error) {
) (reason string, requeue bool, err error) {
createFleetInput := instance.GetCreateFleetInput(nodeClass, karpv1.CapacityTypeOnDemand, tags, mockLaunchTemplateConfig())
createFleetInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateFleet(ctx, createFleetInput); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
// it would be an unexpected state
return "", fmt.Errorf("validating ec2:CreateFleet authorization, %w", err)
return "", false, fmt.Errorf("validating ec2:CreateFleet authorization, %w", err)
}
return ConditionReasonCreateFleetAuthFailed, nil
return ConditionReasonCreateFleetAuthFailed, false, nil
}
return "", nil
return "", false, nil
}

func (v *Validation) validateCreateLaunchTemplateAuthorization(
ctx context.Context,
nodeClass *v1.EC2NodeClass,
nodeClaim *karpv1.NodeClaim,
tags map[string]string,
) (reason string, err error) {
) (reason string, requeue bool, err error) {
createLaunchTemplateInput := launchtemplate.GetCreateLaunchTemplateInput(ctx, mockOptions(*nodeClaim, nodeClass, tags), corev1.IPv4Protocol, "")
createLaunchTemplateInput.DryRun = lo.ToPtr(true)
if _, err := v.ec2api.CreateLaunchTemplate(ctx, createLaunchTemplateInput); awserrors.IgnoreDryRunError(err) != nil {
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
// it would be an unexpected state
return "", fmt.Errorf("validating ec2:CreateLaunchTemplates authorization, %w", err)
return "", false, fmt.Errorf("validating ec2:CreateLaunchTemplates authorization, %w", err)
}
return ConditionReasonCreateLaunchTemplateAuthFailed, nil
return ConditionReasonCreateLaunchTemplateAuthFailed, false, nil
}
return "", nil
return "", false, nil
}

func (v *Validation) validateRunInstancesAuthorization(
ctx context.Context,
nodeClass *v1.EC2NodeClass,
nodeClaim *karpv1.NodeClaim,
tags map[string]string,
) (reason string, err error) {
) (reason string, requeue bool, err error) {
// NOTE: Since we've already validated the status conditions are true, these should never occur
if len(nodeClass.Status.AMIs) == 0 {
return "", fmt.Errorf("no resolved amis in status")
return "", false, fmt.Errorf("no resolved amis in status")
}
if len(nodeClass.Status.Subnets) == 0 {
return "", fmt.Errorf("no resolved subnets in status")
return "", false, fmt.Errorf("no resolved subnets in status")
}
if len(nodeClass.Status.SecurityGroups) == 0 {
return "", fmt.Errorf("no resolved security groups in status")
return "", false, fmt.Errorf("no resolved security groups in status")
}
if nodeClass.Status.InstanceProfile == "" {
return "", fmt.Errorf("no instance profile in status")
return "", false, fmt.Errorf("no instance profile in status")
}

var instanceType ec2types.InstanceType
Expand Down Expand Up @@ -258,15 +260,20 @@ func (v *Validation) validateRunInstancesAuthorization(
},
}

if _, err := v.ec2api.RunInstances(ctx, runInstancesInput); awserrors.IgnoreDryRunError(err) != nil {
if _, err = v.ec2api.RunInstances(ctx, runInstancesInput); awserrors.IgnoreDryRunError(err) != nil {
// If we get InstanceProfile NotFound, but we have a resolved instance profile in the status,
// this means there is most likely an eventual consistency issue and we just need to requeue
if awserrors.IsInstanceProfileNotFound(err) {
return "", true, nil
}
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
// it would be an unexpected state
return "", fmt.Errorf("validating ec2:RunInstances authorization, %w", err)
return "", false, fmt.Errorf("validating ec2:RunInstances authorization, %w", err)
}
return ConditionReasonRunInstancesAuthFailed, nil
return ConditionReasonRunInstancesAuthFailed, false, nil
}
return "", nil
return "", false, nil
}

func (*Validation) requiredConditions() []string {
Expand Down
34 changes: 20 additions & 14 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,19 @@ limitations under the License.
package errors

import (
"errors"
"strings"

ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/aws/smithy-go"
"github.com/samber/lo"
"k8s.io/apimachinery/pkg/util/sets"
)

const (
launchTemplateNameNotFoundCode = "InvalidLaunchTemplateName.NotFoundException"
DryRunOperationErrorCode = "DryRunOperation"
UnauthorizedOperationErrorCode = "UnauthorizedOperation"
launchTemplateNameNotFoundCode = "InvalidLaunchTemplateName.NotFoundException"
RunInstancesInvalidParameterValueCode = "InvalidParameterValue"
DryRunOperationErrorCode = "DryRunOperation"
UnauthorizedOperationErrorCode = "UnauthorizedOperation"
)

var (
Expand Down Expand Up @@ -64,8 +65,7 @@ func IsNotFound(err error) bool {
if err == nil {
return false
}
var apiErr smithy.APIError
if errors.As(err, &apiErr) {
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return notFoundErrorCodes.Has(apiErr.ErrorCode())
}
return false
Expand All @@ -82,8 +82,7 @@ func IsAlreadyExists(err error) bool {
if err == nil {
return false
}
var apiErr smithy.APIError
if errors.As(err, &apiErr) {
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return alreadyExistsErrorCodes.Has(apiErr.ErrorCode())
}
return false
Expand All @@ -100,8 +99,7 @@ func IsDryRunError(err error) bool {
if err == nil {
return false
}
var apiErr smithy.APIError
if errors.As(err, &apiErr) {
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return apiErr.ErrorCode() == DryRunOperationErrorCode
}
return false
Expand All @@ -118,8 +116,7 @@ func IsUnauthorizedOperationError(err error) bool {
if err == nil {
return false
}
var apiErr smithy.APIError
if errors.As(err, &apiErr) {
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return apiErr.ErrorCode() == UnauthorizedOperationErrorCode
}
return false
Expand Down Expand Up @@ -148,13 +145,22 @@ func IsLaunchTemplateNotFound(err error) bool {
if err == nil {
return false
}
var apiErr smithy.APIError
if errors.As(err, &apiErr) {
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return apiErr.ErrorCode() == launchTemplateNameNotFoundCode
}
return false
}

func IsInstanceProfileNotFound(err error) bool {
if err == nil {
return false
}
if apiErr, ok := lo.ErrorsAs[smithy.APIError](err); ok {
return apiErr.ErrorCode() == RunInstancesInvalidParameterValueCode && strings.Contains(apiErr.ErrorMessage(), "Invalid IAM Instance Profile name")
}
return false
}

// ToReasonMessage converts an error message from AWS into a well-known condition reason
// and well-known condition message that can be used for Launch failure classification
// nolint:gocyclo
Expand Down

0 comments on commit e7c25e5

Please sign in to comment.