Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AWS launch template deletion on cache eviction #1278

Merged
merged 7 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/karpenter/templates/configmap-logging.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: karpenter-config-logging
name: config-logging
namespace: {{ .Release.Namespace }}
labels:
{{- include "karpenter.labels" . | nindent 4 }}
Expand Down
2 changes: 2 additions & 0 deletions pkg/cloudprovider/aws/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type CloudProvider struct {
}

func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws"))
bwagner5 marked this conversation as resolved.
Show resolved Hide resolved
sess := withUserAgent(session.Must(session.NewSession(
request.WithRetryer(
&aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint},
Expand All @@ -85,6 +86,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud
subnetProvider: subnetProvider,
instanceProvider: &InstanceProvider{ec2api, instanceTypeProvider, subnetProvider,
NewLaunchTemplateProvider(
ctx,
ec2api,
NewAMIProvider(ssm.New(sess), options.ClientSet),
NewSecurityGroupProvider(ec2api),
Expand Down
44 changes: 42 additions & 2 deletions pkg/cloudprovider/aws/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"sort"
"strings"
"sync"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
Expand All @@ -33,6 +34,7 @@ import (
"github.com/aws/karpenter/pkg/utils/functional"
"github.com/aws/karpenter/pkg/utils/injection"
"github.com/mitchellh/hashstructure/v2"
"go.uber.org/zap"
core "k8s.io/api/core/v1"
"k8s.io/client-go/transport"
"knative.dev/pkg/logging"
Expand All @@ -47,19 +49,24 @@ const (

// LaunchTemplateProvider resolves EC2 launch templates through ec2api and
// keeps the resolved templates in an in-memory, expiring cache.
type LaunchTemplateProvider struct {
// Embedded mutex; onCacheEvicted holds it for the duration of the eviction
// callback. NOTE(review): other methods presumably take the same lock to
// keep the cache consistent with EC2 -- confirm against ensureLaunchTemplate.
sync.Mutex
// logger is derived from the constructor's context logger and named
// "launchtemplate".
logger *zap.SugaredLogger
// ec2api is the EC2 client used to describe and delete launch templates.
ec2api ec2iface.EC2API
amiProvider *AMIProvider
securityGroupProvider *SecurityGroupProvider
// cache maps a launch template name to its *ec2.LaunchTemplate. It is built
// with CacheTTL / CacheCleanupInterval and has onCacheEvicted registered as
// its eviction callback.
cache *cache.Cache
}

func NewLaunchTemplateProvider(ec2api ec2iface.EC2API, amiProvider *AMIProvider, securityGroupProvider *SecurityGroupProvider) *LaunchTemplateProvider {
return &LaunchTemplateProvider{
// NewLaunchTemplateProvider builds a LaunchTemplateProvider around the given
// EC2 client, AMI provider and security group provider.
//
// The logger is taken from ctx and named "launchtemplate". The cache is created
// with CacheTTL and CacheCleanupInterval, onCacheEvicted is registered as its
// eviction callback, and the cache is hydrated from EC2 before the provider is
// returned. hydrateCache panics on any error, so construction panics if the
// initial DescribeLaunchTemplates query fails.
func NewLaunchTemplateProvider(ctx context.Context, ec2api ec2iface.EC2API, amiProvider *AMIProvider, securityGroupProvider *SecurityGroupProvider) *LaunchTemplateProvider {
l := &LaunchTemplateProvider{
ec2api: ec2api,
logger: logging.FromContext(ctx).Named("launchtemplate"),
amiProvider: amiProvider,
securityGroupProvider: securityGroupProvider,
cache: cache.New(CacheTTL, CacheCleanupInterval),
}
// Register the callback before hydrating so that every hydrated entry is
// subject to the same eviction handling.
l.cache.OnEvicted(l.onCacheEvicted)
l.hydrateCache(ctx)
bwagner5 marked this conversation as resolved.
Show resolved Hide resolved
return l
}

func launchTemplateName(options *launchTemplateOptions) string {
Expand Down Expand Up @@ -138,6 +145,7 @@ func (p *LaunchTemplateProvider) ensureLaunchTemplate(ctx context.Context, optio
name := launchTemplateName(options)
// Read from cache
if launchTemplate, ok := p.cache.Get(name); ok {
p.cache.SetDefault(name, launchTemplate)
return launchTemplate.(*ec2.LaunchTemplate), nil
}
// Attempt to find an existing LT.
Expand Down Expand Up @@ -210,6 +218,38 @@ func (p *LaunchTemplateProvider) createLaunchTemplate(ctx context.Context, optio
return output.LaunchTemplate, nil
}

// hydrateCache seeds the launch template cache from EC2. It lists every launch
// template whose name matches launchTemplateNameFormat rendered with the
// current cluster name and a "*" wildcard, and stores each result in the cache
// under its launch template name with the default expiration.
//
// Hydration is all-or-nothing: if the paginated describe call returns an error
// this method panics rather than returning it.
func (p *LaunchTemplateProvider) hydrateCache(ctx context.Context) {
	clusterName := injection.GetOptions(ctx).ClusterName
	namePattern := fmt.Sprintf(launchTemplateNameFormat, clusterName, "*")
	p.logger.Debugf("Hydrating the launch template cache with names matching \"%s\"", namePattern)

	input := &ec2.DescribeLaunchTemplatesInput{
		Filters: []*ec2.Filter{{
			Name:   aws.String("launch-template-name"),
			Values: []*string{aws.String(namePattern)},
		}},
	}
	// cachePage stores one page of results and always asks for the next page.
	cachePage := func(page *ec2.DescribeLaunchTemplatesOutput, _ bool) bool {
		for i := range page.LaunchTemplates {
			template := page.LaunchTemplates[i]
			p.cache.SetDefault(*template.LaunchTemplateName, template)
		}
		return true
	}

	err := p.ec2api.DescribeLaunchTemplatesPagesWithContext(ctx, input, cachePage)
	if err != nil {
		panic(fmt.Sprintf("Unable to hydrate the AWS launch template cache, %s", err.Error()))
	}
	p.logger.Debugf("Finished hydrating the launch template cache with %d items", p.cache.ItemCount())
}

// onCacheEvicted is the eviction callback registered on the launch template
// cache. It receives the evicted key and value, where the value is asserted to
// be an *ec2.LaunchTemplate, and deletes that launch template from EC2 by ID.
//
// The provider mutex is held for the whole callback. A failed delete is logged
// at error level and is not retried here.
func (p *LaunchTemplateProvider) onCacheEvicted(key string, lt interface{}) {
bwagner5 marked this conversation as resolved.
Show resolved Hide resolved
p.Lock()
Copy link
Contributor

@ellistarn ellistarn Feb 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to lock since the cache eviction is already threadsafe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cache eviction is threadsafe, however onCacheEvicted is not threadsafe. For example, if an LT is evicted from the cache, ensureLaunchTemplate can be executed that will receive a cache miss, and then query LTs and find the LT that was evicted but has not been deleted yet. After it finds that LT, onCacheEvicted can run (if the lock is removed) and delete the LT before it is used which will propagate as a launch failure in Fleet.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lock isn't saving us in this case, since the LT creation has already happened. The only thing saving us from this race is the expiration timeout -- I guess this is fine (necessary?).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the lock does save us in the scenario I mentioned. The LT creation I mentioned would occur in the ensureLaunchTemplate func which also takes a lock. Locking in both of these funcs ensures that the cache is always consistent with the state of EC2.

defer p.Unlock()
// Skip the delete when the key is present in the cache again with an
// expiration still in the future. NOTE(review): presumably this means the
// entry was re-added between eviction and this callback -- confirm.
if _, expiration, _ := p.cache.GetWithExpiration(key); expiration.After(time.Now()) {
return
}
launchTemplate := lt.(*ec2.LaunchTemplate)
// The callback signature carries no context, so the non-context delete call is used.
if _, err := p.ec2api.DeleteLaunchTemplate(&ec2.DeleteLaunchTemplateInput{LaunchTemplateId: launchTemplate.LaunchTemplateId}); err != nil {
p.logger.Errorf("Unable to delete launch template, %v", err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any way we'll want to retry this deletion if we fail? Is this called within a controller somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not retried (doesn't reconcile) since this is only executed on cache eviction, BUT it does rehydrate on startup, so if something did happen, a restart of Karpenter would clean them up.

return
}
p.logger.Debugf("Deleted launch template %v", aws.StringValue(launchTemplate.LaunchTemplateId))
}

func sortedTaints(ts []core.Taint) []core.Taint {
sorted := append(ts[:0:0], ts...) // copy to avoid touching original
sort.Slice(sorted, func(i, j int) bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ resource "aws_iam_role_policy" "karpenter_controller" {
"iam:PassRole",
"ec2:TerminateInstances",
"ec2:DescribeLaunchTemplates",
"ec2:DeleteLaunchTemplate",
"ec2:DescribeInstances",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSubnets",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Resources:
- ec2:CreateTags
- iam:PassRole
- ec2:TerminateInstances
- ec2:DeleteLaunchTemplate
Copy link
Contributor

@ellistarn ellistarn Feb 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should make some upgrade instructions to help users migrate.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We definitely should... I'm not sure how we'd structure them, though. I was thinking release notes, but another option would be adding an upgrade section to the versioned docs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just want a single command to run which updates my IAM. Maybe cfn deploy works out of the box.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, cfn should work fine, but I suspect most users are integrating our template (cfn, terraform, or cdk) into their own infrastructure-as-code, so there won't be a one-size-fits-all solution.

# Read Operations
- ec2:DescribeLaunchTemplates
- ec2:DescribeInstances
Expand Down