diff --git a/go.mod b/go.mod index 0e971701773d..1a2866111858 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,7 @@ require ( k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 sigs.k8s.io/controller-runtime v0.19.1 - sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2 + sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index 6ca544a38fa8..dcc03fbc1620 100644 --- a/go.sum +++ b/go.sum @@ -323,8 +323,8 @@ sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2 h1:wnXbS7okpGz7RHFrnjJS4r7BfGwCCcOaGO8naB1+thw= -sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2/go.mod h1:RDaWii2JY4Qvnc99/UBjPzYfk/yfGQV4ihpk34BX2EQ= +sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe h1:OEIvm8hg0wQXtAC5pxuWnlbSgdcDGO+Mes8H7W7Cv4s= +sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe/go.mod h1:RDaWii2JY4Qvnc99/UBjPzYfk/yfGQV4ihpk34BX2EQ= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 1ac650f56cf4..d03be9193d07 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -94,18 +94,23 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim) return nil, cloudprovider.NewNodeClassNotReadyError(stderrors.New(nodeClassReady.Message)) } if nodeClassReady.IsUnknown() { - return nil, fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message) + return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message), "NodeClass is in Ready=Unknown") } instanceTypes, err := c.resolveInstanceTypes(ctx, nodeClaim, nodeClass) if err != nil { - return nil, fmt.Errorf("resolving instance types, %w", err) + return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "Error resolving instance types") } if len(instanceTypes) == 0 { return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch")) } instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, getTags(ctx, nodeClass, nodeClaim), instanceTypes) if err != nil { - return nil, fmt.Errorf("creating instance, %w", err) + conditionMessage := "Error creating instance" + var createError *cloudprovider.CreateError + if stderrors.As(err, &createError) { + conditionMessage = createError.ConditionMessage + } + return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance, %w", err), conditionMessage) } instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool { return i.Name == string(instance.Type) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 45db942c331b..e1ac2b409de5 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -105,8 +105,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass } instanceTypes, err := cloudprovider.InstanceTypes(instanceTypes).Truncate(schedulingRequirements, maxInstanceTypes) if err != nil { - log.FromContext(ctx).Error(err, "truncating instance types") - return nil, fmt.Errorf("truncating instance types, %w", err) + return nil, cloudprovider.NewCreateError(fmt.Errorf("truncating instance types, %w", err), "Error truncating instance types based on the passed-in requirements") } fleetInstance, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags) if awserrors.IsLaunchTemplateNotFound(err) { @@ -115,7 +114,6 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass fleetInstance, err = p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags) } if err != nil { - log.FromContext(ctx).Error(err, "launching instance") return nil, err } efaEnabled := lo.Contains(lo.Keys(nodeClaim.Spec.Resources.Requests), v1.ResourceEFA) @@ -213,13 +211,13 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N capacityType := p.getCapacityType(nodeClaim, instanceTypes) zonalSubnets, err := p.subnetProvider.ZonalSubnetsForLaunch(ctx, nodeClass, instanceTypes, capacityType) if err != nil { - return ec2types.CreateFleetInstance{}, fmt.Errorf("getting subnets, %w", err) + return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting subnets, %w", err), "Error getting subnets") } // Get Launch Template Configs, which may differ due to GPU or Architecture requirements launchTemplateConfigs, err := p.getLaunchTemplateConfigs(ctx, nodeClass, nodeClaim, instanceTypes, zonalSubnets, capacityType, tags) if err != nil { - return ec2types.CreateFleetInstance{}, fmt.Errorf("getting launch template configs, %w", err) + return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting launch template configs, %w", err), "Error getting launch template configs") } if err := p.checkODFallback(nodeClaim, instanceTypes, launchTemplateConfigs); err != nil { log.FromContext(ctx).Error(err, "failed while checking on-demand fallback") @@ -248,6 +246,7 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N createFleetOutput, err := p.ec2Batcher.CreateFleet(ctx, createFleetInput) p.subnetProvider.UpdateInflightIPs(createFleetInput, createFleetOutput, instanceTypes, lo.Values(zonalSubnets), capacityType) if err != nil { + conditionMessage := "Error creating fleet" if awserrors.IsLaunchTemplateNotFound(err) { for _, lt := range launchTemplateConfigs { p.launchTemplateProvider.InvalidateCache(ctx, aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateName), aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateId)) @@ -256,9 +255,9 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N } var reqErr *awshttp.ResponseError if errors.As(err, &reqErr) { - return ec2types.CreateFleetInstance{}, fmt.Errorf("creating fleet %w (%v)", err, reqErr.ServiceRequestID()) + return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w (%v)", err, reqErr.ServiceRequestID()), conditionMessage) } - return ec2types.CreateFleetInstance{}, fmt.Errorf("creating fleet %w", err) + return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w", err), conditionMessage) } p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType) if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 { @@ -503,5 +502,5 @@ func combineFleetErrors(fleetErrs []ec2types.CreateFleetError) (errs error) { if iceErrorCount == len(fleetErrs) { return cloudprovider.NewInsufficientCapacityError(fmt.Errorf("with fleet error(s), %w", errs)) } - return fmt.Errorf("with fleet error(s), %w", errs) + return cloudprovider.NewCreateError(errs, "Error creating fleet") }