Scheduling benchmark #1594

Merged
4 commits merged on Mar 30, 2022

Changes from 2 commits
16 changes: 13 additions & 3 deletions Makefile
@@ -6,6 +6,13 @@ LDFLAGS ?= -ldflags=-X=github.com/aws/karpenter/pkg/utils/project.Version=$(shel
GOFLAGS ?= -tags=$(CLOUD_PROVIDER) $(LDFLAGS)
WITH_GOFLAGS = GOFLAGS="$(GOFLAGS)"

# detect nice binary
Contributor:
In the interest of keeping this Makefile manageable, I think it's fair to assume that you need nice installed if you want to run the benchmark.

Contributor (Author):
I'm fine removing it entirely, doubt CI has it and if you want it locally you can just run nice -19 make benchmark

NICE := $(shell command -v nice 2> /dev/null)
NICE_PREFIX = ""
ifdef NICE
NICE_PREFIX = ${NICE} -19
endif

## Extra helm options
CLUSTER_NAME ?= $(shell kubectl config view --minify -o jsonpath='{.clusters[].name}' | rev | cut -d"/" -f1 | rev)
CLUSTER_ENDPOINT ?= $(shell kubectl config view --minify -o jsonpath='{.clusters[].cluster.server}')
@@ -21,21 +28,24 @@ help: ## Display help

dev: verify test ## Run all steps in the developer loop

ci: toolchain verify licenses battletest ## Run all steps used by continuous integration
ci: toolchain verify licenses battletest benchmark ## Run all steps used by continuous integration

test: ## Run tests
ginkgo -r

strongertests:
# Run randomized, parallelized, racing, code coveraged, tests
# Run randomized, racing, code coveraged, tests
ginkgo -r \
-cover -coverprofile=coverage.out -outputdir=. -coverpkg=./pkg/... \
--randomizeAllSpecs --randomizeSuites -race

benchmark:
${NICE_PREFIX} go test -tags=test_performance -run=NoTests -bench=. ./...

deflake:
for i in {1..10}; do make strongertests || exit 1; done

battletest: strongertests
battletest: strongertests
go tool cover -html coverage.out -o coverage.html

verify: codegen ## Verify code. Includes dependencies, linting, formatting, etc
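A note on what the new benchmark target invokes: -tags=test_performance compiles only files guarded by that build tag, -run=NoTests matches no regular unit tests, and -bench=. runs every Benchmark function under ./... . Below is a minimal sketch of the kind of file such a target would pick up; the package, function, and helper names are hypothetical illustrations, not code from this PR.

```go
//go:build test_performance
// +build test_performance

package scheduling_test

import "testing"

// BenchmarkScheduling is a hypothetical benchmark that a target like `make benchmark`
// would run: the build tag keeps it out of normal test runs, and -bench=. matches its name.
func BenchmarkScheduling(b *testing.B) {
	for n := 0; n < b.N; n++ {
		schedulePods() // placeholder for the scheduling work being measured
	}
}

// schedulePods stands in for constructing provisioners, instance types, and pods,
// then running the scheduler against them.
func schedulePods() {}
```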
16 changes: 16 additions & 0 deletions pkg/cloudprovider/aws/instancetype.go
@@ -37,6 +37,15 @@ type InstanceType struct {
ec2.InstanceTypeInfo
AvailableOfferings []cloudprovider.Offering
MaxPods *int32
resources v1.ResourceList
overhead v1.ResourceList
}

func newInstanceType(info ec2.InstanceTypeInfo) *InstanceType {
it := &InstanceType{InstanceTypeInfo: info}
it.resources = it.computeResources()
it.overhead = it.computeOverhead()
return it
}

func (i *InstanceType) Name() string {
@@ -61,6 +70,10 @@ func (i *InstanceType) Architecture() string {
}

func (i *InstanceType) Resources() v1.ResourceList {
return i.resources
}

func (i *InstanceType) computeResources() v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: i.cpu(),
v1.ResourceMemory: i.memory(),
@@ -175,6 +188,9 @@ func (i *InstanceType) awsNeurons() resource.Quantity {
// While this doesn't calculate the correct overhead for non-ENI-limited nodes, we're using this approach until further
// analysis can be performed
func (i *InstanceType) Overhead() v1.ResourceList {
return i.overhead
}
func (i *InstanceType) computeOverhead() v1.ResourceList {
overhead := v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(
100, // system-reserved
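The change above moves the ResourceList construction out of Resources() and Overhead() and into newInstanceType, so every call returns the same cached map instead of allocating a fresh one. A trimmed-down sketch of that compute-once pattern follows, using simplified stand-in types rather than the real Karpenter and Kubernetes ones.

```go
package main

import "fmt"

// ResourceList stands in for the Kubernetes v1.ResourceList map type.
type ResourceList map[string]string

// InstanceType caches its derived maps so hot scheduling paths don't re-allocate them.
type InstanceType struct {
	resources ResourceList
	overhead  ResourceList
}

// newInstanceType computes both maps once, at construction time.
func newInstanceType() *InstanceType {
	it := &InstanceType{}
	it.resources = ResourceList{"cpu": "4", "memory": "16Gi"}   // would be computeResources()
	it.overhead = ResourceList{"cpu": "100m", "memory": "10Mi"} // would be computeOverhead()
	return it
}

// Resources returns the cached map; callers must treat it as read-only.
func (i *InstanceType) Resources() ResourceList { return i.resources }

// Overhead likewise returns the shared cached map.
func (i *InstanceType) Overhead() ResourceList { return i.overhead }

func main() {
	it := newInstanceType()
	fmt.Println(it.Resources(), it.Overhead())
}
```

The trade-off is that the returned maps are shared, so any caller that mutated them would corrupt every later lookup; that is the invariant the scheduling test below checks with hashes.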
2 changes: 1 addition & 1 deletion pkg/cloudprovider/aws/instancetypes.go
@@ -151,7 +151,7 @@ func (p *InstanceTypeProvider) getInstanceTypes(ctx context.Context) (map[string
}, func(page *ec2.DescribeInstanceTypesOutput, lastPage bool) bool {
for _, instanceType := range page.InstanceTypes {
if p.filter(instanceType) {
instanceTypes[aws.StringValue(instanceType.InstanceType)] = &InstanceType{InstanceTypeInfo: *instanceType}
instanceTypes[aws.StringValue(instanceType.InstanceType)] = newInstanceType(*instanceType)
}
}
return true
13 changes: 9 additions & 4 deletions pkg/cloudprovider/fake/instancetype.go
@@ -33,6 +33,12 @@ func NewInstanceType(options InstanceTypeOptions) *InstanceType {
if options.Resources == nil {
options.Resources = map[v1.ResourceName]resource.Quantity{}
}
if options.Overhead == nil {
options.Overhead = v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("10Mi"),
}
}
if len(options.Offerings) == 0 {
options.Offerings = []cloudprovider.Offering{
{CapacityType: "spot", Zone: "test-zone-1"},
@@ -64,6 +70,7 @@ func NewInstanceType(options InstanceTypeOptions) *InstanceType {
Architecture: options.Architecture,
OperatingSystems: options.OperatingSystems,
Resources: options.Resources,
Overhead: options.Overhead,
Price: options.Price},
}
}
@@ -127,6 +134,7 @@ type InstanceTypeOptions struct {
Offerings []cloudprovider.Offering
Architecture string
OperatingSystems sets.String
Overhead v1.ResourceList
Resources v1.ResourceList
Price float64
}
@@ -175,8 +183,5 @@ func (i *InstanceType) OperatingSystems() sets.String {
}

func (i *InstanceType) Overhead() v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("10Mi"),
}
return i.options.Overhead
}
@@ -16,6 +16,7 @@ package scheduling_test

import (
"fmt"
"github.com/mitchellh/hashstructure/v2"
"math"
"math/rand"

@@ -450,6 +451,18 @@ var _ = Describe("Instance Type Selection", func() {
Expect(cloudProv.CreateCalls).To(HaveLen(0))
})
It("should schedule on an instance with enough resources", func() {
// this is a pretty thorough exercise of scheduling, so we also check an invariant that scheduling doesn't
// modify the instance type's Overhead() or Resources() maps so they can return the same map every time instead
// of re-allocating a new one per call
resourceHashes := map[string]uint64{}
overheadHashes := map[string]uint64{}
for _, it := range cloudProv.InstanceTypes {
var err error
resourceHashes[it.Name()], err = hashstructure.Hash(it.Resources(), hashstructure.FormatV2, nil)
Expect(err).To(BeNil())
overheadHashes[it.Name()], err = hashstructure.Hash(it.Overhead(), hashstructure.FormatV2, nil)
Expect(err).To(BeNil())
}

// these values are constructed so that three of these pods can always fit on at least one of our instance types
for _, cpu := range []float64{0.1, 1.0, 2, 2.5, 4, 8, 16} {
for _, mem := range []float64{0.1, 1.0, 2, 4, 8, 16, 32} {
@@ -478,6 +491,14 @@
}
}
}
for _, it := range cloudProv.InstanceTypes {
resourceHash, err := hashstructure.Hash(it.Resources(), hashstructure.FormatV2, nil)
Expect(err).To(BeNil())
overheadHash, err := hashstructure.Hash(it.Overhead(), hashstructure.FormatV2, nil)
Expect(err).To(BeNil())
Expect(resourceHash).To(Equal(resourceHashes[it.Name()]), fmt.Sprintf("expected %s Resources() to not be modified by scheduling", it.Name()))
Expect(overheadHash).To(Equal(overheadHashes[it.Name()]), fmt.Sprintf("expected %s Overhead() to not be modified by scheduling", it.Name()))
}
})
})
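The hash bookkeeping in this test is a simple before/after snapshot: hashstructure.Hash from the mitchellh/hashstructure/v2 module folds a whole value into a uint64, so comparing the hash taken before scheduling with the one taken after detects any mutation of the shared Resources() and Overhead() maps. A standalone sketch of that technique, with made-up values:

```go
package main

import (
	"fmt"

	"github.com/mitchellh/hashstructure/v2"
)

func main() {
	resources := map[string]string{"cpu": "4", "memory": "16Gi"}

	// Snapshot the map before handing it to code that must treat it as read-only.
	before, err := hashstructure.Hash(resources, hashstructure.FormatV2, nil)
	if err != nil {
		panic(err)
	}

	// ... the code under test would run here and must not mutate `resources` ...

	// Hash again afterwards; a different value means the map was modified.
	after, err := hashstructure.Hash(resources, hashstructure.FormatV2, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println("unchanged:", before == after)
}
```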
