aws · cjerad · Sep 16, 2021 · Sep 15, 2021 · Sep 16, 2021 · Sep 16, 2021
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3"
+	"github.com/awslabs/karpenter/pkg/metrics"
 	"github.com/prometheus/client_golang/prometheus"
 	"go.uber.org/multierr"
 	v1 "k8s.io/api/core/v1"
@@ -29,7 +30,7 @@ import (
 	"k8s.io/client-go/util/workqueue"
 	"knative.dev/pkg/logging"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/metrics"
+	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
 type Binder interface {
@@ -99,23 +100,18 @@ type binderMetricsDecorator struct {
 	bindTimeHistogramVec *prometheus.HistogramVec
 }
 
-const metricLabelResult = "result"
-
+// DecorateBinderMetrics wraps binder in a binderMetricsDecorator that carries a
+// bind-duration histogram ("bind_duration_seconds" in the
+// "allocation_controller" subsystem, labeled by result).
+//
+// A new histogram is created and registered with the controller-runtime
+// metrics registry on every call.
+// NOTE(review): MustRegister presumably panics on a duplicate registration, so
+// this looks safe to call only once per process — confirm against callers.
 func DecorateBinderMetrics(binder Binder) Binder {
 	bindTimeHistogramVec := prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
-			Namespace: "karpenter",
+			Namespace: metrics.KarpenterNamespace,
 			Subsystem: "allocation_controller",
 			Name:      "bind_duration_seconds",
 			Help:      "Duration of bind process in seconds. Broken down by result.",
-			// Use same bucket thresholds as controller-runtime.
-			// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.10.0/pkg/internal/controller/metrics/metrics.go#L47-L48
-			Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
-				1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
+			Buckets:   metrics.DurationBuckets(),
 		},
-		[]string{metricLabelResult},
+		[]string{metrics.ResultLabel},
 	)
-	metrics.Registry.MustRegister(bindTimeHistogramVec)
+	crmetrics.Registry.MustRegister(bindTimeHistogramVec)
 
 	return &binderMetricsDecorator{binder: binder, bindTimeHistogramVec: bindTimeHistogramVec}
 }
@@ -130,11 +126,11 @@ func (b *binderMetricsDecorator) Bind(ctx context.Context, node *v1.Node, pods [
 		result = "error"
 	}
 
-	observer, promErr := b.bindTimeHistogramVec.GetMetricWith(prometheus.Labels{metricLabelResult: result})
+	observer, promErr := b.bindTimeHistogramVec.GetMetricWith(prometheus.Labels{metrics.ResultLabel: result})
 	if promErr != nil {
 		logging.FromContext(ctx).Warnf(
 			"Failed to record bind duration metric [%s=%s, duration=%f]: error=%w",
-			metricLabelResult,
+			metrics.ResultLabel,
 			result,
 			durationSeconds,
 			promErr,

@@ -17,25 +17,35 @@ package scheduling
 import (
 	"context"
 	"fmt"
+	"time"
 
 	"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3"
 	"github.com/awslabs/karpenter/pkg/cloudprovider"
+	"github.com/awslabs/karpenter/pkg/metrics"
 	"github.com/mitchellh/hashstructure/v2"
+	"github.com/prometheus/client_golang/prometheus"
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
+	"knative.dev/pkg/logging"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
-func NewScheduler(cloudProvider cloudprovider.CloudProvider, kubeClient client.Client) *Scheduler {
-	return &Scheduler{
-		KubeClient: kubeClient,
-		Topology: &Topology{
-			cloudProvider: cloudProvider,
-			kubeClient:    kubeClient,
-		},
-	}
+// scheduleTimeHistogramVec is the histogram that Scheduler.Solve feeds with the
+// elapsed time of each scheduling attempt. Observations are partitioned by the
+// provisioner and result labels.
+var scheduleTimeHistogramVec = prometheus.NewHistogramVec(
+	prometheus.HistogramOpts{
+		Namespace: metrics.KarpenterNamespace,
+		Subsystem: "allocation_controller",
+		Name:      "scheduling_duration_seconds",
+		Help:      "Duration of scheduling process in seconds. Broken down by provisioner and result.",
+		Buckets:   metrics.DurationBuckets(),
+	},
+	[]string{metrics.ProvisionerLabel, metrics.ResultLabel},
+)
+
+// init registers scheduleTimeHistogramVec with the controller-runtime metrics
+// registry when the package is loaded.
+func init() {
+	crmetrics.Registry.MustRegister(scheduleTimeHistogramVec)
 }
 
 type Scheduler struct {
@@ -51,7 +61,49 @@ type Schedule struct {
 	Daemons []*v1.Pod
 }
 
+// NewScheduler builds a Scheduler backed by kubeClient. The Scheduler's
+// Topology is given both the cloud provider and the same Kubernetes client.
+func NewScheduler(cloudProvider cloudprovider.CloudProvider, kubeClient client.Client) *Scheduler {
+	topology := &Topology{
+		cloudProvider: cloudProvider,
+		kubeClient:    kubeClient,
+	}
+	return &Scheduler{KubeClient: kubeClient, Topology: topology}
+}
+
 func (s *Scheduler) Solve(ctx context.Context, provisioner *v1alpha3.Provisioner, pods []*v1.Pod) ([]*Schedule, error) {
+	startTime := time.Now()
+	schedules, scheduleErr := s.solve(ctx, provisioner, pods)
+	durationSeconds := time.Since(startTime).Seconds()
+
+	result := "success"
+	if scheduleErr != nil {
+		result = "error"
+	}
+
+	provisionerName := provisioner.ObjectMeta.Name
+	observer, promErr := scheduleTimeHistogramVec.GetMetricWith(prometheus.Labels{
+		metrics.ProvisionerLabel: provisionerName,
+		metrics.ResultLabel:      result,
+	})
+	if promErr != nil {
+		logging.FromContext(ctx).Warnf(
+			"Failed to record scheduling duration metric [%s=%s, %s=%s, duration=%f]: error=%w",
+			metrics.ProvisionerLabel,
+			provisionerName,
+			metrics.ResultLabel,
+			result,
+			durationSeconds,
+			promErr,
+		)
+	} else {
+		observer.Observe(durationSeconds)
+	}
+
+	return schedules, scheduleErr
+}
+
+func (s *Scheduler) solve(ctx context.Context, provisioner *v1alpha3.Provisioner, pods []*v1.Pod) ([]*Schedule, error) {
 	// 1. Inject temporarily adds specific NodeSelectors to pods, which are then
 	// used by scheduling logic. This isn't strictly necessary, but is a useful
 	// trick to avoid passing topology decisions through the scheduling code. It

@@ -0,0 +1,33 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+const (
+	// KarpenterNamespace is the metric namespace shared by the application's
+	// metrics.
+	KarpenterNamespace = "karpenter"
+
+	// ResultLabel is the name of the metric label that carries an operation's
+	// result.
+	ResultLabel      = "result"
+	// ProvisionerLabel is the name of the metric label that carries a
+	// provisioner's name.
+	ProvisionerLabel = "provisioner"
+)
+
+// DurationBuckets returns the default bucket thresholds for duration
+// histograms. A fresh slice is built on every call, so a caller may modify the
+// result without affecting any other bucket definition.
+func DurationBuckets() []float64 {
+	// The thresholds match those used by controller-runtime:
+	// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.10.0/pkg/internal/controller/metrics/metrics.go#L47-L48
+	buckets := []float64{
+		0.005, 0.01, 0.025, 0.05,
+		0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+		1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5,
+		5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60,
+	}
+	return buckets
+}