Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable new instance types for 0.33.x #5579

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/pr-snapshot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ jobs:
- uses: ./.github/actions/install-deps
- uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1
with:
role-to-assume: 'arn:aws:iam::${{ vars.ECR_ACCOUNT_ID }}:role/${{ vars.ECR_SNAPSHOT_ROLE_NAME }}'
aws-region: ${{ vars.ECR_REGION }}
role-to-assume: 'arn:aws:iam::${{ vars.SNAPSHOT_ACCOUNT_ID }}:role/${{ vars.SNAPSHOT_ROLE_NAME }}'
aws-region: ${{ vars.SNAPSHOT_REGION }}
- run: make snapshot
env:
GH_PR_NUMBER: ${{steps.metadata.outputs.PR_NUMBER}}
Expand All @@ -43,7 +43,7 @@ jobs:
issue_number: `${{steps.metadata.outputs.PR_NUMBER}}`,
owner: context.repo.owner,
repo: context.repo.repo,
body: 'Snapshot successfully published to `oci://${{ vars.ECR_ACCOUNT_ID }}.dkr.ecr.${{ vars.ECR_REGION }}.amazonaws.com/karpenter/snapshot/karpenter:v0-${{steps.metadata.outputs.PR_COMMIT}}`.'
body: 'Snapshot successfully published to `oci://${{ vars.SNAPSHOT_ACCOUNT_ID }}.dkr.ecr.${{ vars.SNAPSHOT_REGION }}.amazonaws.com/karpenter/snapshot/karpenter:v0-${{steps.metadata.outputs.PR_COMMIT}}`.'
})
- if: always()
uses: ./.github/actions/commit-status/end
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
id-token: write # aws-actions/[email protected]
contents: write # marvinpinto/[email protected]
pull-requests: write # name: Create PR
if: github.repository == 'aws/karpenter'
if: github.repository == 'aws/karpenter-provider-aws'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
Expand All @@ -30,9 +30,13 @@ jobs:
version: v3.12.3 # Pinned to this version since v3.13.0 has issues with pushing to public ECR: https://github.com/helm/helm/issues/12442
- uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1
with:
role-to-assume: 'arn:aws:iam::${{ vars.ECR_ACCOUNT_ID }}:role/${{ vars.ECR_RELEASE_ROLE_NAME }}'
aws-region: ${{ vars.ECR_REGION }}
role-to-assume: 'arn:aws:iam::${{ vars.RELEASE_ACCOUNT_ID }}:role/${{ vars.RELEASE_ROLE_NAME }}'
aws-region: ${{ vars.RELEASE_REGION }}
- run: make release
- uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1
with:
role-to-assume: 'arn:aws:iam::${{ vars.READONLY_ACCOUNT_ID }}:role/${{ vars.READONLY_ROLE_NAME }}'
aws-region: ${{ vars.READONLY_REGION }}
- run: make docgen
- run: make prepare-website
- run: make stable-release-pr
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/snapshot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
release:
permissions:
id-token: write # aws-actions/[email protected]
if: github.repository == 'aws/karpenter'
if: github.repository == 'aws/karpenter-provider-aws'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
Expand All @@ -17,6 +17,6 @@ jobs:
- uses: ./.github/actions/install-deps
- uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1
with:
role-to-assume: 'arn:aws:iam::${{ vars.ECR_ACCOUNT_ID }}:role/${{ vars.ECR_SNAPSHOT_ROLE_NAME }}'
aws-region: ${{ vars.ECR_REGION }}
role-to-assume: 'arn:aws:iam::${{ vars.SNAPSHOT_ACCOUNT_ID }}:role/${{ vars.SNAPSHOT_ROLE_NAME }}'
aws-region: ${{ vars.SNAPSHOT_REGION }}
- run: make snapshot
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ require (
k8s.io/utils v0.0.0-20230726121419-3b25d923346b
knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd
sigs.k8s.io/controller-runtime v0.16.3
sigs.k8s.io/karpenter v0.33.0
sigs.k8s.io/karpenter v0.33.2
)

require (
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,8 @@ sigs.k8s.io/controller-runtime v0.16.3 h1:2TuvuokmfXvDUamSx1SuAOO3eTyye+47mJCigw
sigs.k8s.io/controller-runtime v0.16.3/go.mod h1:j7bialYoSn142nv9sCOJmQgDXQXxnroFU4VnX/brVJ0=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/karpenter v0.33.0 h1:FvFzhYMwFackafImFeFz/ti/z/55NiZtr8HXiS+z9lU=
sigs.k8s.io/karpenter v0.33.0/go.mod h1:7hPB7kdImFHAFp7pqPUqgBDOrh8GEcRnHT5uIlsXMKA=
sigs.k8s.io/karpenter v0.33.2 h1:JAL5LCKoyj1ToMRoSItvBhqlAWcKfjdDvyI8t+tuPKo=
sigs.k8s.io/karpenter v0.33.2/go.mod h1:7hPB7kdImFHAFp7pqPUqgBDOrh8GEcRnHT5uIlsXMKA=
sigs.k8s.io/structured-merge-diff/v4 v4.3.0 h1:UZbZAZfX0wV2zr7YZorDz6GXROfDFj6LvqCRm4VUVKk=
sigs.k8s.io/structured-merge-diff/v4 v4.3.0/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
Expand Down
6 changes: 3 additions & 3 deletions hack/release/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ config(){
RELEASE_REPO_GH=${RELEASE_REPO_GH:-ghcr.io/${GITHUB_ACCOUNT}/karpenter}

PRIVATE_HOST="${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com"
SNAPSHOT_REPO_ECR=${SNAPSHOT_REPO_ECR:-${PRIVATE_HOST}/karpenter/snapshot/}
SNAPSHOT_ECR="021119463062.dkr.ecr.us-east-1.amazonaws.com"
SNAPSHOT_REPO_ECR=${SNAPSHOT_REPO_ECR:-${SNAPSHOT_ECR}/karpenter/snapshot/}

CURRENT_MAJOR_VERSION="0"
RELEASE_PLATFORM="--platform=linux/amd64,linux/arm64"
Expand Down Expand Up @@ -57,15 +58,14 @@ Helm Chart Version $(helmChartVersion $RELEASE_VERSION)"
cosignImages
publishHelmChart "karpenter" "${RELEASE_VERSION}" "${RELEASE_REPO_ECR}"
publishHelmChart "karpenter-crd" "${RELEASE_VERSION}" "${RELEASE_REPO_ECR}"
pullPrivateReplica "$RELEASE_VERSION"
}

authenticate() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${RELEASE_REPO_ECR}"
}

authenticatePrivateRepo() {
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${PRIVATE_HOST}
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${SNAPSHOT_ECR}
}

buildImages() {
Expand Down
2 changes: 1 addition & 1 deletion hack/toolchain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ tools() {
go install github.com/mikefarah/yq/v4@latest
go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest
go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest
go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.13.0
go install github.com/sigstore/cosign/cmd/cosign@latest
go install -tags extended github.com/gohugoio/[email protected]
go install golang.org/x/vuln/cmd/govulncheck@latest
Expand Down
2 changes: 1 addition & 1 deletion pkg/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ const (

const (
// DefaultCleanupInterval triggers cache cleanup (lazy eviction) at this interval.
DefaultCleanupInterval = 10 * time.Minute
DefaultCleanupInterval = time.Minute
)
68 changes: 32 additions & 36 deletions pkg/providers/instancetype/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ import (
const (
InstanceTypesCacheKey = "types"
InstanceTypeOfferingsCacheKey = "offerings"
AvailabilityZonesCacheKey = "zones"
ZonesCacheKey = "zones"
)

type Provider struct {
Expand Down Expand Up @@ -98,12 +98,9 @@ func (p *Provider) List(ctx context.Context, kc *corev1beta1.KubeletConfiguratio
if err != nil {
return nil, err
}
// Get AvailabilityZones from EC2
availabilityZones, err := p.getAvailabilityZones(ctx)
if err != nil {
return nil, err
}
// Constrain AZs from subnets
// Get zones from instancetypeOfferings
zones := p.getZones(ctx, instanceTypeOfferings)
// Constrain zones from subnets
subnets, err := p.subnetProvider.List(ctx, nodeClass)
if err != nil {
return nil, err
Expand All @@ -115,13 +112,14 @@ func (p *Provider) List(ctx context.Context, kc *corev1beta1.KubeletConfiguratio
// Compute fully initialized instance types hash key
subnetHash, _ := hashstructure.Hash(subnets, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true})
kcHash, _ := hashstructure.Hash(kc, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true})
key := fmt.Sprintf("%d-%d-%d-%s-%016x-%016x", p.instanceTypesSeqNum, p.instanceTypeOfferingsSeqNum, p.unavailableOfferings.SeqNum, nodeClass.UID, subnetHash, kcHash)
blockDeviceMappingsHash, _ := hashstructure.Hash(nodeClass.Spec.BlockDeviceMappings, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true})
key := fmt.Sprintf("%d-%d-%d-%016x-%016x-%016x-%s", p.instanceTypesSeqNum, p.instanceTypeOfferingsSeqNum, p.unavailableOfferings.SeqNum, subnetHash, kcHash, blockDeviceMappingsHash, aws.StringValue(nodeClass.Spec.AMIFamily))

if item, ok := p.cache.Get(key); ok {
return item.([]*cloudprovider.InstanceType), nil
}
result := lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType {
return NewInstanceType(ctx, i, kc, p.region, nodeClass, p.createOfferings(ctx, i, instanceTypeOfferings[aws.StringValue(i.InstanceType)], availabilityZones, subnetZones))
return NewInstanceType(ctx, i, kc, p.region, nodeClass, p.createOfferings(ctx, i, instanceTypeOfferings[aws.StringValue(i.InstanceType)], zones, subnetZones))
})
for _, instanceType := range instanceTypes {
InstanceTypeVCPU.With(prometheus.Labels{
Expand All @@ -142,18 +140,18 @@ func (p *Provider) LivenessProbe(req *http.Request) error {
return p.pricingProvider.LivenessProbe(req)
}

func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.InstanceTypeInfo, instanceTypeZones, availabilityZones, subnetZones sets.Set[string]) []cloudprovider.Offering {
func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.InstanceTypeInfo, instanceTypeZones, zones, subnetZones sets.Set[string]) []cloudprovider.Offering {
var offerings []cloudprovider.Offering
for az := range availabilityZones {
for zone := range zones {
// while usage classes should be a distinct set, there's no guarantee of that
for capacityType := range sets.NewString(aws.StringValueSlice(instanceType.SupportedUsageClasses)...) {
// exclude any offerings that have recently seen an insufficient capacity error from EC2
isUnavailable := p.unavailableOfferings.IsUnavailable(*instanceType.InstanceType, az, capacityType)
isUnavailable := p.unavailableOfferings.IsUnavailable(*instanceType.InstanceType, zone, capacityType)
var price float64
var ok bool
switch capacityType {
case ec2.UsageClassTypeSpot:
price, ok = p.pricingProvider.SpotPrice(*instanceType.InstanceType, az)
price, ok = p.pricingProvider.SpotPrice(*instanceType.InstanceType, zone)
case ec2.UsageClassTypeOnDemand:
price, ok = p.pricingProvider.OnDemandPrice(*instanceType.InstanceType)
case "capacity-block":
Expand All @@ -163,9 +161,9 @@ func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.Instan
logging.FromContext(ctx).Errorf("Received unknown capacity type %s for instance type %s", capacityType, *instanceType.InstanceType)
continue
}
available := !isUnavailable && ok && instanceTypeZones.Has(az) && subnetZones.Has(az)
available := !isUnavailable && ok && instanceTypeZones.Has(zone) && subnetZones.Has(zone)
offerings = append(offerings, cloudprovider.Offering{
Zone: az,
Zone: zone,
CapacityType: capacityType,
Price: price,
Available: available,
Expand All @@ -175,34 +173,28 @@ func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.Instan
return offerings
}

func (p *Provider) getAvailabilityZones(ctx context.Context) (sets.Set[string], error) {
func (p *Provider) getZones(ctx context.Context, instanceTypeOfferings map[string]sets.Set[string]) sets.Set[string] {
// DO NOT REMOVE THIS LOCK ----------------------------------------------------------------------------
// We lock here so that multiple callers to getAvailabilityZones do not result in cache misses and multiple
// calls to EC2 when we could have just made one call.
// TODO @joinnis: This can be made more efficient by holding a Read lock and only obtaining the Write if not in cache
p.mu.Lock()
defer p.mu.Unlock()
if cached, ok := p.cache.Get(AvailabilityZonesCacheKey); ok {
return cached.(sets.Set[string]), nil
}

// Get zones from EC2
instanceTypeZones := sets.Set[string]{}
output, err := p.ec2api.DescribeAvailabilityZonesWithContext(ctx, &ec2.DescribeAvailabilityZonesInput{})
if err != nil {
return nil, fmt.Errorf("describing availability zones, %w", err)
if cached, ok := p.cache.Get(ZonesCacheKey); ok {
return cached.(sets.Set[string])
}
for i := range output.AvailabilityZones {
zone := output.AvailabilityZones[i]
if aws.StringValue(zone.ZoneType) == "availability-zone" {
instanceTypeZones.Insert(aws.StringValue(zone.ZoneName))
// Get zones from offerings
zones := sets.Set[string]{}
for _, offeringZones := range instanceTypeOfferings {
for zone := range offeringZones {
zones.Insert(zone)
}
}
if p.cm.HasChanged("zones", instanceTypeZones) {
logging.FromContext(ctx).With("zones", instanceTypeZones.UnsortedList()).Debugf("discovered availability zones")
if p.cm.HasChanged("zones", zones) {
logging.FromContext(ctx).With("zones", zones.UnsortedList()).Debugf("discovered zones")
}
p.cache.Set(AvailabilityZonesCacheKey, instanceTypeZones, 24*time.Hour)
return instanceTypeZones, nil
p.cache.Set(ZonesCacheKey, zones, 24*time.Hour)
return zones
}

func (p *Provider) getInstanceTypeOfferings(ctx context.Context) (map[string]sets.Set[string], error) {
Expand Down Expand Up @@ -230,10 +222,12 @@ func (p *Provider) getInstanceTypeOfferings(ctx context.Context) (map[string]set
}); err != nil {
return nil, fmt.Errorf("describing instance type zone offerings, %w", err)
}
if p.cm.HasChanged("instance-type-count", len(instanceTypeOfferings)) {
if p.cm.HasChanged("instance-type-offering", instanceTypeOfferings) {
// Only update instanceTypesSeqNun with the instance type offerings have been changed
// This is to not create new keys with duplicate instance type offerings option
atomic.AddUint64(&p.instanceTypeOfferingsSeqNum, 1)
logging.FromContext(ctx).With("instance-type-count", len(instanceTypeOfferings)).Debugf("discovered offerings for instance types")
}
atomic.AddUint64(&p.instanceTypeOfferingsSeqNum, 1)
p.cache.SetDefault(InstanceTypeOfferingsCacheKey, instanceTypeOfferings)
return instanceTypeOfferings, nil
}
Expand Down Expand Up @@ -270,10 +264,12 @@ func (p *Provider) GetInstanceTypes(ctx context.Context) ([]*ec2.InstanceTypeInf
return nil, fmt.Errorf("fetching instance types using ec2.DescribeInstanceTypes, %w", err)
}
if p.cm.HasChanged("instance-types", instanceTypes) {
// Only update instanceTypesSeqNun with the instance types have been changed
// This is to not create new keys with duplicate instance types option
atomic.AddUint64(&p.instanceTypesSeqNum, 1)
logging.FromContext(ctx).With(
"count", len(instanceTypes)).Debugf("discovered instance types")
}
atomic.AddUint64(&p.instanceTypesSeqNum, 1)
p.cache.SetDefault(InstanceTypesCacheKey, instanceTypes)
return instanceTypes, nil
}
Loading