From 612e685fdbee25b778d7e8ce3f284ceae32e0957 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Sun, 3 Mar 2024 00:06:25 -0800 Subject: [PATCH] Add instance type offering availability to metrics --- pkg/controllers/pricing/suite_test.go | 20 ---- pkg/providers/instancetype/instancetype.go | 25 +++-- pkg/providers/instancetype/metrics.go | 45 +++++++-- pkg/providers/instancetype/suite_test.go | 93 +++++++++++++------ pkg/providers/pricing/metrics.go | 50 ---------- pkg/providers/pricing/pricing.go | 15 --- .../content/en/preview/reference/metrics.md | 7 +- .../en/preview/upgrading/upgrade-guide.md | 8 ++ 8 files changed, 131 insertions(+), 132 deletions(-) delete mode 100644 pkg/providers/pricing/metrics.go diff --git a/pkg/controllers/pricing/suite_test.go b/pkg/controllers/pricing/suite_test.go index 6aad6dba3c90..0427d6b2d642 100644 --- a/pkg/controllers/pricing/suite_test.go +++ b/pkg/controllers/pricing/suite_test.go @@ -124,12 +124,10 @@ var _ = Describe("Pricing", func() { price, ok := awsEnv.PricingProvider.OnDemandPrice("c98.large") Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.20)) - Expect(getPricingEstimateMetricValue("c98.large", ec2.UsageClassTypeOnDemand, "")).To(BeNumerically("==", 1.20)) price, ok = awsEnv.PricingProvider.OnDemandPrice("c99.large") Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.23)) - Expect(getPricingEstimateMetricValue("c99.large", ec2.UsageClassTypeOnDemand, "")).To(BeNumerically("==", 1.23)) }) It("should update spot pricing with response from the pricing API", func() { now := time.Now() @@ -172,12 +170,10 @@ var _ = Describe("Pricing", func() { price, ok := awsEnv.PricingProvider.SpotPrice("c98.large", "test-zone-1b") Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.10)) - Expect(getPricingEstimateMetricValue("c98.large", ec2.UsageClassTypeSpot, "test-zone-1b")).To(BeNumerically("==", 1.10)) price, ok = awsEnv.PricingProvider.SpotPrice("c99.large", "test-zone-1a") 
Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.23)) - Expect(getPricingEstimateMetricValue("c99.large", ec2.UsageClassTypeSpot, "test-zone-1a")).To(BeNumerically("==", 1.23)) }) It("should update zonal pricing with data from the spot pricing API", func() { now := time.Now() @@ -208,7 +204,6 @@ var _ = Describe("Pricing", func() { price, ok := awsEnv.PricingProvider.SpotPrice("c98.large", "test-zone-1a") Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.20)) - Expect(getPricingEstimateMetricValue("c98.large", ec2.UsageClassTypeSpot, "test-zone-1a")).To(BeNumerically("==", 1.20)) _, ok = awsEnv.PricingProvider.SpotPrice("c98.large", "test-zone-1b") Expect(ok).ToNot(BeTrue()) @@ -301,20 +296,5 @@ var _ = Describe("Pricing", func() { price, ok = awsEnv.PricingProvider.SpotPrice("c98.large", "test-zone-1b") Expect(ok).To(BeTrue()) Expect(price).To(BeNumerically("==", 1.10)) - Expect(getPricingEstimateMetricValue("c98.large", ec2.UsageClassTypeSpot, "test-zone-1b")).To(BeNumerically("==", 1.10)) }) }) - -func getPricingEstimateMetricValue(instanceType string, capacityType string, zone string) float64 { - var value *float64 - metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_price_estimate", map[string]string{ - pricing.InstanceTypeLabel: instanceType, - pricing.CapacityTypeLabel: capacityType, - pricing.RegionLabel: fake.DefaultRegion, - pricing.TopologyLabel: zone, - }) - Expect(ok).To(BeTrue()) - value = metric.GetGauge().Value - Expect(value).To(Not(BeNil())) - return *value -} diff --git a/pkg/providers/instancetype/instancetype.go b/pkg/providers/instancetype/instancetype.go index 3f1a157aeac4..6540e2590037 100644 --- a/pkg/providers/instancetype/instancetype.go +++ b/pkg/providers/instancetype/instancetype.go @@ -118,16 +118,15 @@ func (p *Provider) List(ctx context.Context, kc *corev1beta1.KubeletConfiguratio return item.([]*cloudprovider.InstanceType), nil } result := lo.Map(instanceTypes, func(i 
*ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType { + instanceTypeVCPU.With(prometheus.Labels{ + instanceTypeLabel: *i.InstanceType, + }).Set(float64(aws.Int64Value(i.VCpuInfo.DefaultVCpus))) + instanceTypeMemory.With(prometheus.Labels{ + instanceTypeLabel: *i.InstanceType, + }).Set(float64(aws.Int64Value(i.MemoryInfo.SizeInMiB) * 1024 * 1024)) + return NewInstanceType(ctx, i, kc, p.region, nodeClass, p.createOfferings(ctx, i, instanceTypeOfferings[aws.StringValue(i.InstanceType)], zones, subnetZones)) }) - for _, instanceType := range instanceTypes { - InstanceTypeVCPU.With(prometheus.Labels{ - InstanceTypeLabel: *instanceType.InstanceType, - }).Set(float64(aws.Int64Value(instanceType.VCpuInfo.DefaultVCpus))) - InstanceTypeMemory.With(prometheus.Labels{ - InstanceTypeLabel: *instanceType.InstanceType, - }).Set(float64(aws.Int64Value(instanceType.MemoryInfo.SizeInMiB) * 1024 * 1024)) - } p.cache.SetDefault(key, result) return result, nil } @@ -167,6 +166,16 @@ func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.Instan Price: price, Available: available, }) + instanceTypeOfferingAvailable.With(prometheus.Labels{ + instanceTypeLabel: *instanceType.InstanceType, + capacityTypeLabel: capacityType, + zoneLabel: zone, + }).Set(float64(lo.Ternary(available, 1, 0))) + instanceTypeOfferingPriceEstimate.With(prometheus.Labels{ + instanceTypeLabel: *instanceType.InstanceType, + capacityTypeLabel: capacityType, + zoneLabel: zone, + }).Set(price) } } return offerings diff --git a/pkg/providers/instancetype/metrics.go b/pkg/providers/instancetype/metrics.go index 5e343aa9b12b..5632d3dd14dc 100644 --- a/pkg/providers/instancetype/metrics.go +++ b/pkg/providers/instancetype/metrics.go @@ -23,12 +23,13 @@ import ( const ( cloudProviderSubsystem = "cloudprovider" + instanceTypeLabel = "instance_type" + capacityTypeLabel = "capacity_type" + zoneLabel = "zone" ) var ( - InstanceTypeLabel = "instance_type" - - InstanceTypeVCPU = prometheus.NewGaugeVec( + 
instanceTypeVCPU = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: cloudProviderSubsystem, @@ -36,10 +37,10 @@ var ( Help: "VCPUs cores for a given instance type.", }, []string{ - InstanceTypeLabel, - }) - - InstanceTypeMemory = prometheus.NewGaugeVec( + instanceTypeLabel, + }, + ) + instanceTypeMemory = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: cloudProviderSubsystem, @@ -47,10 +48,36 @@ var ( Help: "Memory, in bytes, for a given instance type.", }, []string{ - InstanceTypeLabel, + instanceTypeLabel, + }, + ) + instanceTypeOfferingAvailable = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: cloudProviderSubsystem, + Name: "instance_type_offering_available", + Help: "Instance type offering availability, based on instance type, capacity type, and zone", + }, + []string{ + instanceTypeLabel, + capacityTypeLabel, + zoneLabel, + }, + ) + instanceTypeOfferingPriceEstimate = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: cloudProviderSubsystem, + Name: "instance_type_offering_price_estimate", + Help: "Instance type offering estimated estimated hourly price used when making informed decisions on node cost calculation, based on instance type, capacity type, and zone.", + }, + []string{ + instanceTypeLabel, + capacityTypeLabel, + zoneLabel, }) ) func init() { - crmetrics.Registry.MustRegister(InstanceTypeVCPU, InstanceTypeMemory) + crmetrics.Registry.MustRegister(instanceTypeVCPU, instanceTypeMemory, instanceTypeOfferingAvailable, instanceTypeOfferingPriceEstimate) } diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 052b362e0418..9f251e5f395f 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -741,34 +741,71 @@ var _ = Describe("InstanceTypes", func() { 
Expect(it.Capacity.Pods().Value()).To(BeNumerically("==", 110)) } }) - - It("should expose vcpu metrics for instance types", func() { - instanceInfo, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) - Expect(err).To(BeNil()) - Expect(len(instanceInfo)).To(BeNumerically(">", 0)) - for _, info := range instanceInfo { - metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_cpu_cores", map[string]string{ - instancetype.InstanceTypeLabel: info.Name, - }) - Expect(ok).To(BeTrue()) - Expect(metric).To(Not(BeNil())) - value := metric.GetGauge().Value - Expect(aws.Float64Value(value)).To(BeNumerically(">", 0)) - } - }) - It("should expose memory metrics for instance types", func() { - instanceInfo, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) - Expect(err).To(BeNil()) - Expect(len(instanceInfo)).To(BeNumerically(">", 0)) - for _, info := range instanceInfo { - metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_memory_bytes", map[string]string{ - instancetype.InstanceTypeLabel: info.Name, - }) - Expect(ok).To(BeTrue()) - Expect(metric).To(Not(BeNil())) - value := metric.GetGauge().Value - Expect(aws.Float64Value(value)).To(BeNumerically(">", 0)) - } + Context("Metrics", func() { + It("should expose vcpu metrics for instance types", func() { + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) + Expect(err).To(BeNil()) + Expect(len(instanceTypes)).To(BeNumerically(">", 0)) + for _, it := range instanceTypes { + metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_cpu_cores", map[string]string{ + "instance_type": it.Name, + }) + Expect(ok).To(BeTrue()) + Expect(metric).To(Not(BeNil())) + value := metric.GetGauge().Value + Expect(aws.Float64Value(value)).To(BeNumerically(">", 0)) + } + }) + It("should expose memory metrics for instance types", func() 
{ + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) + Expect(err).To(BeNil()) + Expect(len(instanceTypes)).To(BeNumerically(">", 0)) + for _, it := range instanceTypes { + metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_memory_bytes", map[string]string{ + "instance_type": it.Name, + }) + Expect(ok).To(BeTrue()) + Expect(metric).To(Not(BeNil())) + value := metric.GetGauge().Value + Expect(aws.Float64Value(value)).To(BeNumerically(">", 0)) + } + }) + It("should expose availability metrics for instance types", func() { + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) + Expect(err).To(BeNil()) + Expect(len(instanceTypes)).To(BeNumerically(">", 0)) + for _, it := range instanceTypes { + for _, of := range it.Offerings { + metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_offering_available", map[string]string{ + "instance_type": it.Name, + "capacity_type": of.CapacityType, + "zone": of.Zone, + }) + Expect(ok).To(BeTrue()) + Expect(metric).To(Not(BeNil())) + value := metric.GetGauge().Value + Expect(aws.Float64Value(value)).To(BeNumerically("==", lo.Ternary(of.Available, 1, 0))) + } + } + }) + It("should expose pricing metrics for instance types", func() { + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, nodePool.Spec.Template.Spec.Kubelet, nodeClass) + Expect(err).To(BeNil()) + Expect(len(instanceTypes)).To(BeNumerically(">", 0)) + for _, it := range instanceTypes { + for _, of := range it.Offerings { + metric, ok := FindMetricWithLabelValues("karpenter_cloudprovider_instance_type_offering_price_estimate", map[string]string{ + "instance_type": it.Name, + "capacity_type": of.CapacityType, + "zone": of.Zone, + }) + Expect(ok).To(BeTrue()) + Expect(metric).To(Not(BeNil())) + value := metric.GetGauge().Value + Expect(aws.Float64Value(value)).To(BeNumerically("==", of.Price)) + } + 
} + }) }) It("should launch instances in local zones", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) diff --git a/pkg/providers/pricing/metrics.go b/pkg/providers/pricing/metrics.go deleted file mode 100644 index 60c053580439..000000000000 --- a/pkg/providers/pricing/metrics.go +++ /dev/null @@ -1,50 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package pricing - -import ( - "github.com/prometheus/client_golang/prometheus" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - - "sigs.k8s.io/karpenter/pkg/metrics" -) - -const ( - cloudProviderSubsystem = "cloudprovider" -) - -var ( - InstanceTypeLabel = "instance_type" - CapacityTypeLabel = "capacity_type" - RegionLabel = "region" - TopologyLabel = "zone" - InstancePriceEstimate = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: cloudProviderSubsystem, - Name: "instance_type_price_estimate", - Help: "Estimated hourly price used when making informed decisions on node cost calculation. 
This is updated once on startup and then every 12 hours.", - }, - []string{ - InstanceTypeLabel, - CapacityTypeLabel, - RegionLabel, - TopologyLabel, - }) -) - -func init() { - crmetrics.Registry.MustRegister(InstancePriceEstimate) -} diff --git a/pkg/providers/pricing/pricing.go b/pkg/providers/pricing/pricing.go index 3681fa80535d..7edc92edba20 100644 --- a/pkg/providers/pricing/pricing.go +++ b/pkg/providers/pricing/pricing.go @@ -33,7 +33,6 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/aws/aws-sdk-go/service/pricing" "github.com/aws/aws-sdk-go/service/pricing/pricingiface" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" "knative.dev/pkg/logging" @@ -206,14 +205,6 @@ func (p *Provider) UpdateOnDemandPricing(ctx context.Context) error { } p.onDemandPrices = lo.Assign(onDemandPrices, onDemandMetalPrices) - for instanceType, price := range p.onDemandPrices { - InstancePriceEstimate.With(prometheus.Labels{ - InstanceTypeLabel: instanceType, - CapacityTypeLabel: ec2.UsageClassTypeOnDemand, - RegionLabel: p.region, - TopologyLabel: "", - }).Set(price) - } if p.cm.HasChanged("on-demand-prices", p.onDemandPrices) { logging.FromContext(ctx).With("instance-type-count", len(p.onDemandPrices)).Debugf("updated on-demand pricing") } @@ -343,12 +334,6 @@ func (p *Provider) UpdateSpotPricing(ctx context.Context) error { prices[instanceType] = map[string]float64{} } prices[instanceType][az] = spotPrice - InstancePriceEstimate.With(prometheus.Labels{ - InstanceTypeLabel: instanceType, - CapacityTypeLabel: ec2.UsageClassTypeSpot, - RegionLabel: p.region, - TopologyLabel: az, - }).Set(spotPrice) } return true }) diff --git a/website/content/en/preview/reference/metrics.md b/website/content/en/preview/reference/metrics.md index b7cf1366f50f..c2c3ba455dcd 100644 --- a/website/content/en/preview/reference/metrics.md +++ b/website/content/en/preview/reference/metrics.md @@ -151,8 +151,11 @@ Current count of 
nodes in cluster state ## Cloudprovider Metrics -### `karpenter_cloudprovider_instance_type_price_estimate` -Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours. +### `karpenter_cloudprovider_instance_type_offering_price_estimate` +Instance type offering estimated hourly price used when making informed decisions on node cost calculation, based on instance type, capacity type, and zone. + +### `karpenter_cloudprovider_instance_type_offering_available` +Instance type offering availability, based on instance type, capacity type, and zone ### `karpenter_cloudprovider_instance_type_memory_bytes` Memory, in bytes, for a given instance type. diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 2c6f86930005..25b41fc19025 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -37,6 +37,14 @@ kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef WHEN CREATING A NEW SECTION OF THE UPGRADE GUIDANCE FOR NEWER VERSIONS, ENSURE THAT YOU COPY THE BETA API ALERT SECTION FROM THE LAST RELEASE TO PROPERLY WARN USERS OF THE RISK OF UPGRADING WITHOUT GOING TO 0.32.x FIRST --> +### Upgrading to `0.36.0`+ + +{{% alert title="Warning" color="warning" %}} +`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.0`+ without first [upgrading to `0.32.x`]({{<ref "#upgrading-to-0320" >}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. 
+{{% /alert %}} + +* Karpenter changed the name of the `karpenter_cloudprovider_instance_type_price_estimate` metric to `karpenter_cloudprovider_instance_type_offering_price_estimate` to align with the new `karpenter_cloudprovider_instance_type_offering_available` metric. The `region` label was also dropped from the metric, since the region can be inferred from the environment that Karpenter is running in. Update any dashboards or alerts that query the old metric name or filter on the `region` label. + ### Upgrading to `0.35.0`+ {{% alert title="Warning" color="warning" %}}