Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add allocation scheduling duration metric #680

Merged
merged 3 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 8 additions & 12 deletions pkg/controllers/allocation/bind.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"time"

"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3"
"github.com/awslabs/karpenter/pkg/metrics"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
Expand All @@ -29,7 +30,7 @@ import (
"k8s.io/client-go/util/workqueue"
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/metrics"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

type Binder interface {
Expand Down Expand Up @@ -99,23 +100,18 @@ type binderMetricsDecorator struct {
bindTimeHistogramVec *prometheus.HistogramVec
}

const metricLabelResult = "result"

func DecorateBinderMetrics(binder Binder) Binder {
bindTimeHistogramVec := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "karpenter",
Namespace: metrics.KarpenterNamespace,
Subsystem: "allocation_controller",
Name: "bind_duration_seconds",
Help: "Duration of bind process in seconds. Broken down by result.",
// Use same bucket thresholds as controller-runtime.
// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.10.0/pkg/internal/controller/metrics/metrics.go#L47-L48
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
Buckets: metrics.DurationBuckets(),
},
[]string{metricLabelResult},
[]string{metrics.ResultLabel},
)
metrics.Registry.MustRegister(bindTimeHistogramVec)
crmetrics.Registry.MustRegister(bindTimeHistogramVec)

return &binderMetricsDecorator{binder: binder, bindTimeHistogramVec: bindTimeHistogramVec}
}
Expand All @@ -130,11 +126,11 @@ func (b *binderMetricsDecorator) Bind(ctx context.Context, node *v1.Node, pods [
result = "error"
}

observer, promErr := b.bindTimeHistogramVec.GetMetricWith(prometheus.Labels{metricLabelResult: result})
observer, promErr := b.bindTimeHistogramVec.GetMetricWith(prometheus.Labels{metrics.ResultLabel: result})
if promErr != nil {
logging.FromContext(ctx).Warnf(
"Failed to record bind duration metric [%s=%s, duration=%f]: error=%w",
metricLabelResult,
metrics.ResultLabel,
result,
durationSeconds,
promErr,
Expand Down
68 changes: 60 additions & 8 deletions pkg/controllers/allocation/scheduling/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,35 @@ package scheduling
import (
"context"
"fmt"
"time"

"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3"
"github.com/awslabs/karpenter/pkg/cloudprovider"
"github.com/awslabs/karpenter/pkg/metrics"
"github.com/mitchellh/hashstructure/v2"
"github.com/prometheus/client_golang/prometheus"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func NewScheduler(cloudProvider cloudprovider.CloudProvider, kubeClient client.Client) *Scheduler {
return &Scheduler{
KubeClient: kubeClient,
Topology: &Topology{
cloudProvider: cloudProvider,
kubeClient: kubeClient,
},
}
var scheduleTimeHistogramVec = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.KarpenterNamespace,
Subsystem: "allocation_controller",
Name: "scheduling_duration_seconds",
Help: "Duration of scheduling process in seconds. Broken down by provisioner and result.",
Buckets: metrics.DurationBuckets(),
},
[]string{metrics.ProvisionerLabel, metrics.ResultLabel},
)

func init() {
crmetrics.Registry.MustRegister(scheduleTimeHistogramVec)
}

type Scheduler struct {
Expand All @@ -51,7 +61,49 @@ type Schedule struct {
Daemons []*v1.Pod
}

func NewScheduler(cloudProvider cloudprovider.CloudProvider, kubeClient client.Client) *Scheduler {
return &Scheduler{
KubeClient: kubeClient,
Topology: &Topology{
cloudProvider: cloudProvider,
kubeClient: kubeClient,
},
}
}

func (s *Scheduler) Solve(ctx context.Context, provisioner *v1alpha3.Provisioner, pods []*v1.Pod) ([]*Schedule, error) {
startTime := time.Now()
schedules, scheduleErr := s.solve(ctx, provisioner, pods)
durationSeconds := time.Since(startTime).Seconds()

result := "success"
if scheduleErr != nil {
result = "error"
}

provisionerName := provisioner.ObjectMeta.Name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not to overly focus on this, but I just realized that you could print out the labels in the log line to avoid the case you were talking about. It's a map, so it should print [key1: value, key2: value2]

labels = prometheus.Labels{...}
logging(...)Warnf("%s", labels)

observer, promErr := scheduleTimeHistogramVec.GetMetricWith(prometheus.Labels{
metrics.ProvisionerLabel: provisionerName,
metrics.ResultLabel: result,
})
if promErr != nil {
logging.FromContext(ctx).Warnf(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: It just struck me that our ctx already decorates the log line with the provisioner name controller.allocation.provisioner/default, so you could get away with leaving it out here.

"Failed to record scheduling duration metric [%s=%s, %s=%s, duration=%f]: error=%w",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

%w is intended for error chaining. When logging an error, you should use something like fmt.Errorf("%s", error.Error())

metrics.ProvisionerLabel,
provisionerName,
metrics.ResultLabel,
result,
durationSeconds,
promErr,
)
} else {
observer.Observe(durationSeconds)
}

return schedules, scheduleErr
}

func (s *Scheduler) solve(ctx context.Context, provisioner *v1alpha3.Provisioner, pods []*v1.Pod) ([]*Schedule, error) {
// 1. Inject temporarily adds specific NodeSelectors to pods, which are then
// used by scheduling logic. This isn't strictly necessary, but is a useful
// trick to avoid passing topology decisions through the scheduling code. It
Expand Down
33 changes: 33 additions & 0 deletions pkg/metrics/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

const (
// Common namespace for application metrics.
KarpenterNamespace = "karpenter"

// Common set of metric label names.
ResultLabel = "result"
ProvisionerLabel = "provisioner"
)

// DurationBuckets returns a []float64 of default threshold values for duration histograms.
// Each returned slice is new and may be modified without impacting other bucket definitions.
func DurationBuckets() []float64 {
// Use same bucket thresholds as controller-runtime.
// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.10.0/pkg/internal/controller/metrics/metrics.go#L47-L48
return []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}
}