Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Offerings to InstanceType to hold zone and capacity type, remove OS from InstanceType, promote capacity-type label #780

Merged
merged 11 commits into from
Nov 10, 2021
7 changes: 7 additions & 0 deletions TROUBLESHOOTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,10 @@ github.com/awslabs/karpenter/pkg/controllers/provisioning/v1alpha1/reallocation.
github.com/awslabs/karpenter/pkg/controllers.(*GenericController).Reconcile(0xc000b00720, 0x23354c0, 0xc000e209f0, 0xc001db9be0, 0x7, 0xc001db9bd0, 0x7, 0xc000e209f0, 0x7fc864172d20, 0xc0000be2a0, ...)
```
This is fixed in Karpenter v0.2.7+. Reinstall Karpenter on the latest version.

### Nodes stuck in pending and not running the kubelet due to outdated CNI
If you have an EC2 instance get launched that is stuck in pending and ultimately not running the kubelet, you may see a message like this in your `/var/log/user-data.log`:

> No entry for c6i.xlarge in /etc/eks/eni-max-pods.txt

This means that your CNI plugin is out of date. You can find instructions on how to update your plugin [here](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html).
8 changes: 4 additions & 4 deletions pkg/apis/provisioning/v1alpha5/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,26 +94,26 @@ type ProvisionerList struct {
Items []Provisioner `json:"items"`
}

// Zones for the constraints
func (r Requirements) Zones() sets.String {
return r.Requirement(v1.LabelTopologyZone)
}

// InstanceTypes for the constraints
func (r Requirements) InstanceTypes() sets.String {
return r.Requirement(v1.LabelInstanceTypeStable)
}

// Architectures for the constraints
func (r Requirements) Architectures() sets.String {
return r.Requirement(v1.LabelArchStable)
}

// OperatingSystems for the constraints
func (r Requirements) OperatingSystems() sets.String {
return r.Requirement(v1.LabelOSStable)
}

func (r Requirements) CapacityTypes() sets.String {
eptiger marked this conversation as resolved.
Show resolved Hide resolved
return r.Requirement(LabelCapacityType)
}

func (r Requirements) WithProvisioner(provisioner Provisioner) Requirements {
return r.
With(provisioner.Spec.Requirements).
Expand Down
6 changes: 4 additions & 2 deletions pkg/apis/provisioning/v1alpha5/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,19 @@ var (
v1.LabelHostname,
}
// These are either prohibited by the kubelet or reserved by karpenter
KarpenterLabelDomain = "karpenter.sh"
RestrictedLabelDomains = []string{
"kubernetes.io",
"k8s.io",
"karpenter.sh",
KarpenterLabelDomain,
}
LabelCapacityType = KarpenterLabelDomain + "/capacity-type"
// WellKnownLabels supported by karpenter
WellKnownLabels = sets.NewString(
v1.LabelTopologyZone,
v1.LabelInstanceTypeStable,
v1.LabelArchStable,
v1.LabelOSStable,
bwagner5 marked this conversation as resolved.
Show resolved Hide resolved
LabelCapacityType,
)
DefaultHook = func(ctx context.Context, constraints *Constraints) {}
ValidateHook = func(ctx context.Context, constraints *Constraints) *apis.FieldError { return nil }
Expand Down
6 changes: 3 additions & 3 deletions pkg/cloudprovider/aws/apis/v1alpha1/provider_defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ func (c *Constraints) Default(ctx context.Context) {
}

func (c *Constraints) defaultCapacityTypes() {
if _, ok := c.Labels[CapacityTypeLabel]; ok {
if _, ok := c.Labels[v1alpha5.LabelCapacityType]; ok {
return
}
if functional.ContainsString(c.Requirements.Keys(), CapacityTypeLabel) {
if functional.ContainsString(c.Requirements.Keys(), v1alpha5.LabelCapacityType) {
return
}
c.Requirements = append(c.Requirements, v1.NodeSelectorRequirement{
Key: CapacityTypeLabel,
Key: v1alpha5.LabelCapacityType,
Operator: v1.NodeSelectorOpIn,
Values: []string{CapacityTypeOnDemand},
})
Expand Down
2 changes: 0 additions & 2 deletions pkg/cloudprovider/aws/apis/v1alpha1/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (

var (
AWSLabelPrefix = "node.k8s.aws/"
CapacityTypeLabel = AWSLabelPrefix + "capacity-type"
CapacityTypeSpot = ec2.DefaultTargetCapacityTypeSpot
CapacityTypeOnDemand = ec2.DefaultTargetCapacityTypeOnDemand
AWSToKubeArchitectures = map[string]string{
Expand All @@ -45,5 +44,4 @@ func init() {
Scheme.AddKnownTypes(schema.GroupVersion{Group: v1alpha5.ExtensionsGroup, Version: "v1alpha1"}, &AWS{})
v1alpha5.RestrictedLabels = append(v1alpha5.RestrictedLabels, AWSLabelPrefix)
v1alpha5.RestrictedLabelDomains = append(v1alpha5.RestrictedLabelDomains, AWSRestrictedLabelDomains...)
v1alpha5.WellKnownLabels.Insert(CapacityTypeLabel)
}
10 changes: 5 additions & 5 deletions pkg/cloudprovider/aws/fake/ec2api.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func (e *EC2API) DescribeInstanceTypesPagesWithContext(_ context.Context, _ *ec2
InstanceTypes: []*ec2.InstanceTypeInfo{
{
InstanceType: aws.String("m5.large"),
SupportedUsageClasses: []*string{aws.String("on-demand")},
SupportedUsageClasses: []*string{aws.String("on-demand"), aws.String("spot")},
SupportedVirtualizationTypes: []*string{aws.String("hvm")},
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Expand All @@ -186,7 +186,7 @@ func (e *EC2API) DescribeInstanceTypesPagesWithContext(_ context.Context, _ *ec2
},
{
InstanceType: aws.String("m5.xlarge"),
SupportedUsageClasses: []*string{aws.String("on-demand")},
SupportedUsageClasses: []*string{aws.String("on-demand"), aws.String("spot")},
SupportedVirtualizationTypes: []*string{aws.String("hvm")},
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Expand All @@ -206,7 +206,7 @@ func (e *EC2API) DescribeInstanceTypesPagesWithContext(_ context.Context, _ *ec2
},
{
InstanceType: aws.String("p3.8xlarge"),
SupportedUsageClasses: []*string{aws.String("on-demand")},
SupportedUsageClasses: []*string{aws.String("on-demand"), aws.String("spot")},
SupportedVirtualizationTypes: []*string{aws.String("hvm")},
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Expand All @@ -232,7 +232,7 @@ func (e *EC2API) DescribeInstanceTypesPagesWithContext(_ context.Context, _ *ec2
},
{
InstanceType: aws.String("c6g.large"),
SupportedUsageClasses: []*string{aws.String("on-demand")},
SupportedUsageClasses: []*string{aws.String("on-demand"), aws.String("spot")},
SupportedVirtualizationTypes: []*string{aws.String("hvm")},
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Expand All @@ -252,7 +252,7 @@ func (e *EC2API) DescribeInstanceTypesPagesWithContext(_ context.Context, _ *ec2
},
{
InstanceType: aws.String("inf1.6xlarge"),
SupportedUsageClasses: []*string{aws.String("on-demand")},
SupportedUsageClasses: []*string{aws.String("on-demand"), aws.String("spot")},
SupportedVirtualizationTypes: []*string{aws.String("hvm")},
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Expand Down
14 changes: 9 additions & 5 deletions pkg/cloudprovider/aws/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (p *InstanceProvider) launchInstances(ctx context.Context, constraints *v1a
// by constraints.Constrain(). Spot may be selected by constraining the provisioner,
// or using nodeSelectors, required node affinity, or preferred node affinity.
capacityType := v1alpha1.CapacityTypeOnDemand
if capacityTypes := constraints.Requirements.Requirement(v1alpha1.CapacityTypeLabel); len(capacityTypes) == 0 {
if capacityTypes := constraints.Requirements.CapacityTypes(); len(capacityTypes) == 0 {
return nil, fmt.Errorf("invariant violated, must contain at least one capacity type")
} else if len(capacityTypes) == 1 {
capacityType = capacityTypes.UnsortedList()[0]
Expand Down Expand Up @@ -153,7 +153,7 @@ func (p *InstanceProvider) getLaunchTemplateConfigs(ctx context.Context, constra
return nil, fmt.Errorf("getting subnets, %w", err)
}
var launchTemplateConfigs []*ec2.FleetLaunchTemplateConfigRequest
launchTemplates, err := p.launchTemplateProvider.Get(ctx, constraints, instanceTypes, map[string]string{v1alpha1.CapacityTypeLabel: capacityType})
launchTemplates, err := p.launchTemplateProvider.Get(ctx, constraints, instanceTypes, map[string]string{v1alpha5.LabelCapacityType: capacityType})
if err != nil {
return nil, fmt.Errorf("getting launch templates, %w", err)
}
Expand All @@ -172,9 +172,13 @@ func (p *InstanceProvider) getLaunchTemplateConfigs(ctx context.Context, constra
func (p *InstanceProvider) getOverrides(instanceTypeOptions []cloudprovider.InstanceType, subnets []*ec2.Subnet, capacityType string) []*ec2.FleetLaunchTemplateOverridesRequest {
var overrides []*ec2.FleetLaunchTemplateOverridesRequest
for i, instanceType := range instanceTypeOptions {
for zone := range instanceType.Zones() {
for _, offering := range instanceType.Offerings() {
// we can't assume that all zones will be available for all capacity types, hence this check
if offering.CapacityType != capacityType {
continue
}
for _, subnet := range subnets {
if aws.StringValue(subnet.AvailabilityZone) == zone {
if aws.StringValue(subnet.AvailabilityZone) == offering.Zone {
override := &ec2.FleetLaunchTemplateOverridesRequest{
InstanceType: aws.String(instanceType.Name()),
SubnetId: subnet.SubnetId,
Expand Down Expand Up @@ -227,7 +231,7 @@ func (p *InstanceProvider) instanceToNode(instance *ec2.Instance, instanceTypes
Labels: map[string]string{
v1.LabelTopologyZone: aws.StringValue(instance.Placement.AvailabilityZone),
v1.LabelInstanceTypeStable: aws.StringValue(instance.InstanceType),
v1alpha1.CapacityTypeLabel: getCapacityType(instance),
v1alpha5.LabelCapacityType: getCapacityType(instance),
},
},
Spec: v1.NodeSpec{
Expand Down
17 changes: 4 additions & 13 deletions pkg/cloudprovider/aws/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,27 @@ import (

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha5"
"github.com/awslabs/karpenter/pkg/cloudprovider"
"github.com/awslabs/karpenter/pkg/cloudprovider/aws/apis/v1alpha1"
"github.com/awslabs/karpenter/pkg/utils/resources"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
)

// EC2VMAvailableMemoryFactor assumes the EC2 VM will consume <7.25% of the memory of a given machine
const EC2VMAvailableMemoryFactor = .925

type InstanceType struct {
ec2.InstanceTypeInfo
ZoneOptions sets.String
AvailableOfferings []cloudprovider.Offering
}

func (i *InstanceType) Name() string {
return aws.StringValue(i.InstanceType)
}

func (i *InstanceType) Zones() sets.String {
return i.ZoneOptions
}

func (i *InstanceType) CapacityTypes() sets.String {
return sets.NewString(aws.StringValueSlice(i.SupportedUsageClasses)...)
func (i *InstanceType) Offerings() []cloudprovider.Offering {
return i.AvailableOfferings
}

func (i *InstanceType) Architecture() string {
Expand All @@ -56,10 +51,6 @@ func (i *InstanceType) Architecture() string {
return fmt.Sprint(aws.StringValueSlice(i.ProcessorInfo.SupportedArchitectures)) // Unrecognized, but used for error printing
}

func (i *InstanceType) OperatingSystems() sets.String {
return sets.NewString(v1alpha5.OperatingSystemLinux)
}

func (i *InstanceType) CPU() *resource.Quantity {
return resources.Quantity(fmt.Sprint(*i.VCpuInfo.DefaultVCpus))
}
Expand Down
10 changes: 9 additions & 1 deletion pkg/cloudprovider/aws/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,15 @@ func (p *InstanceTypeProvider) Get(ctx context.Context, constraints *v1alpha1.Co
// Convert to cloudprovider.InstanceType
result := []cloudprovider.InstanceType{}
for _, instanceType := range instanceTypes {
instanceType.ZoneOptions = subnetZones.Intersection(instanceTypeZones[instanceType.Name()])
//TODO filter out possible zones and capacity types using an ICE cache https://github.com/awslabs/karpenter/issues/371
offerings := []cloudprovider.Offering{}
for zone := range subnetZones.Intersection(instanceTypeZones[instanceType.Name()]) {
// while usage classes should be a distinct set, there's no guarantee of that
for capacityType := range sets.NewString(aws.StringValueSlice(instanceType.SupportedUsageClasses)...) {
offerings = append(offerings, cloudprovider.Offering{Zone: zone, CapacityType: capacityType})
}
}
instanceType.AvailableOfferings = offerings
result = append(result, instanceType)
}
return result, nil
Expand Down
10 changes: 5 additions & 5 deletions pkg/cloudprovider/aws/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,10 +223,10 @@ var _ = Describe("Allocation", func() {
})
It("should launch spot capacity if flexible to both spot and on demand", func() {
// Setup
provisioner.Spec.Requirements = v1alpha5.Requirements{{Key: v1alpha1.CapacityTypeLabel, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeSpot, v1alpha1.CapacityTypeOnDemand}}}
provisioner.Spec.Requirements = v1alpha5.Requirements{{Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, Values: []string{v1alpha1.CapacityTypeSpot, v1alpha1.CapacityTypeOnDemand}}}
ExpectCreated(env.Client, provisioner)
pods := ExpectProvisioningSucceeded(ctx, env.Client, controller, provisioner,
test.UnschedulablePod(test.PodOptions{NodeSelector: map[string]string{v1alpha1.CapacityTypeLabel: v1alpha1.CapacityTypeSpot}}),
test.UnschedulablePod(test.PodOptions{NodeSelector: map[string]string{v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot}}),
)
// Assertions
ExpectNodeExists(env.Client, pods[0].Spec.NodeName)
Expand Down Expand Up @@ -356,7 +356,7 @@ var _ = Describe("Allocation", func() {
})
It("should default requirements", func() {
provisioner.SetDefaults(ctx)
Expect(provisioner.Spec.Requirements.Requirement(v1alpha1.CapacityTypeLabel).UnsortedList()).To(ConsistOf(v1alpha1.CapacityTypeOnDemand))
Expect(provisioner.Spec.Requirements.CapacityTypes().UnsortedList()).To(ConsistOf(v1alpha1.CapacityTypeOnDemand))
Expect(provisioner.Spec.Requirements.Architectures().UnsortedList()).To(ConsistOf(v1alpha5.ArchitectureAmd64))
})
})
Expand Down Expand Up @@ -419,8 +419,8 @@ var _ = Describe("Allocation", func() {
})
It("should support a capacity type label", func() {
for _, value := range []string{v1alpha1.CapacityTypeOnDemand, v1alpha1.CapacityTypeSpot} {
provisioner.Spec.Labels = map[string]string{v1alpha1.CapacityTypeLabel: value}
Expect(provisioner.Validate(ctx)).ToNot(Succeed())
provisioner.Spec.Labels = map[string]string{v1alpha5.LabelCapacityType: value}
Expect(provisioner.Validate(ctx)).To(Succeed())
eptiger marked this conversation as resolved.
Show resolved Hide resolved
}
})
})
Expand Down
20 changes: 12 additions & 8 deletions pkg/cloudprovider/fake/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
)

type CloudProvider struct{}
Expand All @@ -37,8 +36,16 @@ func (c *CloudProvider) Create(_ context.Context, constraints *v1alpha5.Constrai
for i := 0; i < quantity; i++ {
name := strings.ToLower(randomdata.SillyName())
instance := instanceTypes[0]
zone := instance.Zones().Intersection(constraints.Requirements.Zones()).UnsortedList()[0]
operatingSystem := instance.OperatingSystems().UnsortedList()[0]
var zone, capacityType string
for _, o := range instance.Offerings() {
if constraints.Requirements.CapacityTypes().Has(o.CapacityType) {
if constraints.Requirements.Zones().Has(o.Zone) {
zone = o.Zone
capacityType = o.CapacityType
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to break here? Does it matter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh shoot yeah, good catch. I mean it ultimately doesn't matter, but yeah I was supposed to. I don't know how I missed that =P

break
}
}
}

go func() {
err <- bind(&v1.Node{
Expand All @@ -47,6 +54,7 @@ func (c *CloudProvider) Create(_ context.Context, constraints *v1alpha5.Constrai
Labels: map[string]string{
v1.LabelTopologyZone: zone,
v1.LabelInstanceTypeStable: instance.Name(),
v1alpha5.LabelCapacityType: capacityType,
},
},
Spec: v1.NodeSpec{
Expand All @@ -55,7 +63,7 @@ func (c *CloudProvider) Create(_ context.Context, constraints *v1alpha5.Constrai
Status: v1.NodeStatus{
NodeInfo: v1.NodeSystemInfo{
Architecture: instance.Architecture(),
OperatingSystem: operatingSystem,
OperatingSystem: v1alpha5.OperatingSystemLinux,
},
Allocatable: v1.ResourceList{
v1.ResourcePods: *instance.Pods(),
Expand Down Expand Up @@ -86,10 +94,6 @@ func (c *CloudProvider) GetInstanceTypes(_ context.Context, _ *v1alpha5.Constrai
name: "aws-neuron-instance-type",
awsNeurons: resource.MustParse("2"),
}),
NewInstanceType(InstanceTypeOptions{
name: "windows-instance-type",
operatingSystems: sets.NewString("windows"),
}),
NewInstanceType(InstanceTypeOptions{
name: "arm-instance-type",
architecture: "arm64",
Expand Down
Loading