Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove the specialness from GPU requests #1489

Merged
merged 4 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/cloudprovider/aws/amifamily/al2.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package amifamily
import (
"fmt"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
core "k8s.io/api/core/v1"

Expand All @@ -33,7 +35,7 @@ type AL2 struct {
// SSMAlias returns the AMI Alias to query SSM
func (a AL2) SSMAlias(version string, instanceType cloudprovider.InstanceType) string {
amiSuffix := ""
if !instanceType.NvidiaGPUs().IsZero() || !instanceType.AWSNeurons().IsZero() {
if !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU]) || !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceAWSNeuron]) {
amiSuffix = "-gpu"
} else if instanceType.Architecture() == v1alpha5.ArchitectureArm64 {
amiSuffix = fmt.Sprintf("-%s", instanceType.Architecture())
Expand Down
4 changes: 3 additions & 1 deletion pkg/cloudprovider/aws/amifamily/bottlerocket.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package amifamily
import (
"fmt"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
core "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand All @@ -35,7 +37,7 @@ type Bottlerocket struct {
func (b Bottlerocket) SSMAlias(version string, instanceType cloudprovider.InstanceType) string {
arch := "x86_64"
amiSuffix := ""
if !instanceType.NvidiaGPUs().IsZero() {
if !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU]) {
amiSuffix = "-nvidia"
}
if instanceType.Architecture() == v1alpha5.ArchitectureArm64 {
Expand Down
5 changes: 5 additions & 0 deletions pkg/cloudprovider/aws/apis/v1alpha1/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package v1alpha1

import (
"github.com/aws/aws-sdk-go/service/ec2"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer"
Expand All @@ -41,6 +42,10 @@ var (
AMIFamilyAL2,
AMIFamilyUbuntu,
}
ResourceNVIDIAGPU v1.ResourceName = "nvidia.com/gpu"
ResourceAMDGPU v1.ResourceName = "amd.com/gpu"
ResourceAWSNeuron v1.ResourceName = "aws.amazon.com/neuron"
ResourceAWSPodENI v1.ResourceName = "vpc.amazonaws.com/pod-eni"
)

var (
Expand Down
24 changes: 24 additions & 0 deletions pkg/cloudprovider/aws/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"fmt"
"time"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/client"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
Expand Down Expand Up @@ -104,6 +106,8 @@ func (c *CloudProvider) Create(ctx context.Context, constraints *v1alpha5.Constr
if err != nil {
return err
}
instanceTypes = c.filterInstanceTypes(instanceTypes)

// Create will only return an error if zero nodes could be launched.
// Partial fulfillment will be logged
nodes, err := c.instanceProvider.Create(ctx, vendorConstraints, instanceTypes, quantity)
Expand Down Expand Up @@ -157,6 +161,26 @@ func (c *CloudProvider) Name() string {
return "aws"
}

// filterInstanceTypes drops accelerator-backed instance types whenever a plain
// instance type is also available. An instance type counts as accelerator-backed
// when it reports a non-zero AWS Neuron, AMD GPU or NVIDIA GPU resource. If every
// candidate is accelerator-backed (or the input is empty), the input is returned
// unchanged.
func (c *CloudProvider) filterInstanceTypes(instanceTypes []cloudprovider.InstanceType) []cloudprovider.InstanceType {
	var withoutAccelerators []cloudprovider.InstanceType
	for _, candidate := range instanceTypes {
		capacity := candidate.Resources()
		hasAccelerator := !resources.IsZero(capacity[v1alpha1.ResourceAWSNeuron]) ||
			!resources.IsZero(capacity[v1alpha1.ResourceAMDGPU]) ||
			!resources.IsZero(capacity[v1alpha1.ResourceNVIDIAGPU])
		if hasAccelerator {
			continue
		}
		withoutAccelerators = append(withoutAccelerators, candidate)
	}
	// Nothing but accelerator types to choose from: keep the original list.
	if len(withoutAccelerators) == 0 {
		return instanceTypes
	}
	return withoutAccelerators
}

// get the current region from EC2 IMDS
func getRegionFromIMDS(sess *session.Session) string {
region, err := ec2metadata.New(sess).Region()
Expand Down
9 changes: 7 additions & 2 deletions pkg/cloudprovider/aws/fake/ssmapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ package fake

import (
"context"
"fmt"

"github.com/mitchellh/hashstructure/v2"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/request"
Expand All @@ -29,14 +32,16 @@ type SSMAPI struct {
WantErr error
}

func (a SSMAPI) GetParameterWithContext(context.Context, *ssm.GetParameterInput, ...request.Option) (*ssm.GetParameterOutput, error) {
func (a SSMAPI) GetParameterWithContext(ctx context.Context, input *ssm.GetParameterInput, opts ...request.Option) (*ssm.GetParameterOutput, error) {
if a.WantErr != nil {
return nil, a.WantErr
}
hc, _ := hashstructure.Hash(input.Name, hashstructure.FormatV2, nil)
if a.GetParameterOutput != nil {
return a.GetParameterOutput, nil
}

return &ssm.GetParameterOutput{
Parameter: &ssm.Parameter{Value: aws.String("test-ami-id")},
Parameter: &ssm.Parameter{Value: aws.String(fmt.Sprintf("test-ami-id-%x", hc))},
}, nil
}
22 changes: 10 additions & 12 deletions pkg/cloudprovider/aws/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@ const (
CreationQPS = 2
// CreationBurst limits the additional burst requests.
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/throttling.html#throttling-limits
CreationBurst = 100
nvidiaGPUResourceName v1.ResourceName = "nvidia.com/gpu"
amdGPUResourceName v1.ResourceName = "amd.com/gpu"
awsNeuronResourceName v1.ResourceName = "aws.amazon.com/neuron"
CreationBurst = 100
)

type InstanceProvider struct {
Expand Down Expand Up @@ -276,17 +273,18 @@ func (p *InstanceProvider) instanceToNode(ctx context.Context, instance *ec2.Ins
if injection.GetOptions(ctx).GetAWSNodeNameConvention() == options.ResourceName {
nodeName = aws.StringValue(instance.InstanceId)
}

resources := v1.ResourceList{}
for resourceName, quantity := range map[v1.ResourceName]*resource.Quantity{
v1.ResourcePods: instanceType.Pods(),
v1.ResourceCPU: instanceType.CPU(),
v1.ResourceMemory: instanceType.Memory(),
nvidiaGPUResourceName: instanceType.NvidiaGPUs(),
amdGPUResourceName: instanceType.AMDGPUs(),
awsNeuronResourceName: instanceType.AWSNeurons(),
for resourceName, quantity := range map[v1.ResourceName]resource.Quantity{
v1.ResourcePods: instanceType.Resources()[v1.ResourcePods],
v1.ResourceCPU: instanceType.Resources()[v1.ResourceCPU],
v1.ResourceMemory: instanceType.Resources()[v1.ResourceMemory],
v1alpha1.ResourceNVIDIAGPU: instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU],
v1alpha1.ResourceAMDGPU: instanceType.Resources()[v1alpha1.ResourceAMDGPU],
v1alpha1.ResourceAWSNeuron: instanceType.Resources()[v1alpha1.ResourceAWSNeuron],
} {
if !quantity.IsZero() {
resources[resourceName] = *quantity
resources[resourceName] = quantity
}
}

Expand Down
71 changes: 52 additions & 19 deletions pkg/cloudprovider/aws/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,66 +60,98 @@ func (i *InstanceType) Architecture() string {
return fmt.Sprint(aws.StringValueSlice(i.ProcessorInfo.SupportedArchitectures)) // Unrecognized, but used for error printing
}

func (i *InstanceType) CPU() *resource.Quantity {
return resources.Quantity(fmt.Sprint(*i.VCpuInfo.DefaultVCpus))
// Resources returns the resource quantities this instance type reports, keyed by
// resource name: CPU, memory, pods, pod ENIs, NVIDIA GPUs, AMD GPUs and AWS
// Neurons. Every key is always present; a resource the instance type lacks is
// reported with whatever quantity its helper computes (zero for the accelerator
// helpers).
func (i *InstanceType) Resources() v1.ResourceList {
	capacity := make(v1.ResourceList, 7)
	capacity[v1.ResourceCPU] = i.cpu()
	capacity[v1.ResourceMemory] = i.memory()
	capacity[v1.ResourcePods] = i.pods()
	capacity[v1alpha1.ResourceAWSPodENI] = i.awsPodENI()
	capacity[v1alpha1.ResourceNVIDIAGPU] = i.nvidiaGPUs()
	capacity[v1alpha1.ResourceAMDGPU] = i.amdGPUs()
	capacity[v1alpha1.ResourceAWSNeuron] = i.awsNeurons()
	return capacity
}

// Price returns a weighted sum of the instance type's default vCPU count, its
// memory size in MiB, and its total GPU count. GPU entries with a nil Count are
// ignored; an instance type without GpuInfo contributes no GPU term.
func (i *InstanceType) Price() float64 {
	const (
		gpuWeight    = 5
		cpuWeight    = 1
		memoryWeight = 1024
	)

	var gpus float64
	if info := i.GpuInfo; info != nil {
		for _, device := range info.Gpus {
			if device.Count == nil {
				continue
			}
			gpus += float64(*device.Count)
		}
	}

	cpuTerm := cpuWeight * float64(*i.VCpuInfo.DefaultVCpus)
	memoryTerm := memoryWeight * float64(*i.MemoryInfo.SizeInMiB)
	gpuTerm := gpuWeight * gpus
	return cpuTerm + memoryTerm + gpuTerm
}
// cpu returns the instance type's default vCPU count as a resource quantity.
func (i *InstanceType) cpu() resource.Quantity {
	vcpus := fmt.Sprint(*i.VCpuInfo.DefaultVCpus)
	return *resources.Quantity(vcpus)
}

func (i *InstanceType) Memory() *resource.Quantity {
return resources.Quantity(
func (i *InstanceType) memory() resource.Quantity {
return *resources.Quantity(
fmt.Sprintf("%dMi", int32(
float64(*i.MemoryInfo.SizeInMiB)*EC2VMAvailableMemoryFactor,
)),
)
}

func (i *InstanceType) Pods() *resource.Quantity {
func (i *InstanceType) pods() resource.Quantity {
if i.MaxPods != nil {
return resources.Quantity(fmt.Sprint(ptr.Int32Value(i.MaxPods)))
return *resources.Quantity(fmt.Sprint(ptr.Int32Value(i.MaxPods)))
}
return resources.Quantity(fmt.Sprint(i.eniLimitedPods()))
return *resources.Quantity(fmt.Sprint(i.eniLimitedPods()))
}

func (i *InstanceType) AWSPodENI() *resource.Quantity {
func (i *InstanceType) awsPodENI() resource.Quantity {
// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
limits, ok := vpc.Limits[aws.StringValue(i.InstanceType)]
if ok && limits.IsTrunkingCompatible {
return resources.Quantity(fmt.Sprint(limits.BranchInterface))
return *resources.Quantity(fmt.Sprint(limits.BranchInterface))
}
return resources.Quantity("0")
return *resources.Quantity("0")
}

func (i *InstanceType) NvidiaGPUs() *resource.Quantity {
func (i *InstanceType) nvidiaGPUs() resource.Quantity {
count := int64(0)
if i.GpuInfo != nil {
for _, gpu := range i.GpuInfo.Gpus {
if *i.GpuInfo.Gpus[0].Manufacturer == "NVIDIA" {
if *gpu.Manufacturer == "NVIDIA" {
count += *gpu.Count
}
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

func (i *InstanceType) AMDGPUs() *resource.Quantity {
func (i *InstanceType) amdGPUs() resource.Quantity {
count := int64(0)
if i.GpuInfo != nil {
for _, gpu := range i.GpuInfo.Gpus {
if *i.GpuInfo.Gpus[0].Manufacturer == "AMD" {
if *gpu.Manufacturer == "NVIDIA" {
count += *gpu.Count
}
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

func (i *InstanceType) AWSNeurons() *resource.Quantity {
func (i *InstanceType) awsNeurons() resource.Quantity {
count := int64(0)
if i.InferenceAcceleratorInfo != nil {
for _, accelerator := range i.InferenceAcceleratorInfo.Accelerators {
count += *accelerator.Count
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

// Overhead computes overhead for https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable
Expand Down Expand Up @@ -152,7 +184,8 @@ func (i *InstanceType) Overhead() v1.ResourceList {
{start: 2000, end: 4000, percentage: 0.005},
{start: 4000, end: 1 << 31, percentage: 0.0025},
} {
if cpu := i.CPU().MilliValue(); cpu >= cpuRange.start {
cpuSt := i.cpu()
if cpu := cpuSt.MilliValue(); cpu >= cpuRange.start {
r := float64(cpuRange.end - cpuRange.start)
if cpu < cpuRange.end {
r = float64(cpu - cpuRange.start)
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/aws/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (p *InstanceTypeProvider) Get(ctx context.Context, provider *v1alpha1.AWS)
if err != nil {
return nil, err
}
result := []cloudprovider.InstanceType{}
var result []cloudprovider.InstanceType
for _, instanceType := range instanceTypes {
offerings := p.createOfferings(instanceType, subnetZones, instanceTypeZones[instanceType.Name()])
if len(offerings) > 0 {
Expand Down
Loading