Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce High Level MR metrics #683

Merged
merged 6 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/evanphx/json-patch v5.6.0+incompatible
github.com/go-logr/logr v1.4.1
github.com/google/go-cmp v0.6.0
github.com/prometheus/client_golang v1.18.0
github.com/spf13/afero v1.11.0
golang.org/x/time v0.5.0
google.golang.org/grpc v1.61.0
Expand All @@ -19,6 +20,7 @@ require (
k8s.io/apiextensions-apiserver v0.29.1
k8s.io/apimachinery v0.29.1
k8s.io/client-go v0.29.1
k8s.io/component-base v0.29.1
k8s.io/klog/v2 v2.110.1
sigs.k8s.io/controller-runtime v0.17.0
sigs.k8s.io/controller-tools v0.14.0
Expand All @@ -31,6 +33,7 @@ require (
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/Microsoft/go-winio v0.6.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/bufbuild/protocompile v0.6.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/containerd/log v0.1.0 // indirect
Expand Down Expand Up @@ -87,7 +90,6 @@ require (
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pkg/profile v1.7.0 // indirect
github.com/prometheus/client_golang v1.18.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
Expand Down Expand Up @@ -123,7 +125,6 @@ require (
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/component-base v0.29.1 // indirect
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/bufbuild/buf v1.27.2 h1:uX2kvZfPfRoOsrxUW4LwpykSyH+wI5dUnIG0QWHDCCU=
github.com/bufbuild/buf v1.27.2/go.mod h1:7RImDhFDqhEsdK5wbuMhoVSlnrMggGGcd3s9WozvHtM=
github.com/bufbuild/protocompile v0.6.0 h1:Uu7WiSQ6Yj9DbkdnOe7U4mNKp58y9WDMKDn28/ZlunY=
Expand Down
17 changes: 17 additions & 0 deletions pkg/controller/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import (
"github.com/crossplane/crossplane-runtime/pkg/feature"
"github.com/crossplane/crossplane-runtime/pkg/logging"
"github.com/crossplane/crossplane-runtime/pkg/ratelimiter"
"github.com/crossplane/crossplane-runtime/pkg/reconciler/managed"
"github.com/crossplane/crossplane-runtime/pkg/statemetrics"
)

// DefaultOptions returns a functional set of options with conservative
Expand Down Expand Up @@ -61,6 +63,9 @@ type Options struct {

// ESSOptions for External Secret Stores.
ESSOptions *ESSOptions

// MetricOptions for recording metrics.
MetricOptions *MetricOptions
}

// ForControllerRuntime extracts options for controller-runtime.
Expand All @@ -79,3 +84,15 @@ type ESSOptions struct {
TLSConfig *tls.Config
TLSSecretName *string
}

// MetricOptions for recording metrics.
type MetricOptions struct {
// PollStateMetricInterval at which each controller should record state
PollStateMetricInterval time.Duration

// MetricsRecorder to use for recording metrics.
MRMetrics managed.MetricRecorder

// MRStateMetrics to use for recording state metrics.
MRStateMetrics *statemetrics.MRStateMetrics
}
178 changes: 178 additions & 0 deletions pkg/reconciler/managed/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
Copyright 2024 The Crossplane Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package managed

import (
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
kmetrics "k8s.io/component-base/metrics"

xpv1 "github.com/crossplane/crossplane-runtime/apis/common/v1"
"github.com/crossplane/crossplane-runtime/pkg/resource"
)

const subSystem = "crossplane"

// MetricRecorder records the managed resource metrics.
type MetricRecorder interface { //nolint:interfacebloat // The first two methods are coming from Prometheus
Describe(ch chan<- *prometheus.Desc)
Collect(ch chan<- prometheus.Metric)

recordUnchanged(name string)
recordFirstTimeReconciled(managed resource.Managed)
recordFirstTimeReady(managed resource.Managed)
recordDrift(managed resource.Managed)
recordDeleted(managed resource.Managed)
}

// MRMetricRecorder records the lifecycle metrics of managed resources.
type MRMetricRecorder struct {
firstObservation sync.Map
lastObservation sync.Map

mrDetected *prometheus.HistogramVec
mrFirstTimeReady *prometheus.HistogramVec
mrDeletion *prometheus.HistogramVec
mrDrift *prometheus.HistogramVec
}

// NewMRMetricRecorder returns a new MRMetricRecorder which records metrics for managed resources.
func NewMRMetricRecorder() *MRMetricRecorder {
return &MRMetricRecorder{
mrDetected: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: subSystem,
Name: "managed_resource_first_time_to_reconcile_seconds",
Help: "The time it took for a managed resource to be detected by the controller",
Buckets: kmetrics.ExponentialBuckets(10e-9, 10, 10),
}, []string{"gvk"}),
mrFirstTimeReady: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: subSystem,
Name: "managed_resource_first_time_to_readiness_seconds",
Help: "The time it took for a managed resource to become ready first time after creation",
Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
}, []string{"gvk"}),
mrDeletion: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: subSystem,
Name: "managed_resource_deletion_seconds",
Help: "The time it took for a managed resource to be deleted",
Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
}, []string{"gvk"}),
mrDrift: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: subSystem,
Name: "managed_resource_drift_seconds",
Help: "ALPHA: How long since the previous successful reconcile when a resource was found to be out of sync; excludes restart of the provider",
Buckets: kmetrics.ExponentialBuckets(10e-9, 10, 10),
}, []string{"gvk"}),
}
}

// Describe sends the super-set of all possible descriptors of metrics
// collected by this Collector to the provided channel and returns once
// the last descriptor has been sent.
func (r *MRMetricRecorder) Describe(ch chan<- *prometheus.Desc) {
r.mrDetected.Describe(ch)
r.mrFirstTimeReady.Describe(ch)
r.mrDeletion.Describe(ch)
r.mrDrift.Describe(ch)
}

// Collect is called by the Prometheus registry when collecting
// metrics. The implementation sends each collected metric via the
// provided channel and returns once the last metric has been sent.
func (r *MRMetricRecorder) Collect(ch chan<- prometheus.Metric) {
r.mrDetected.Collect(ch)
r.mrFirstTimeReady.Collect(ch)
r.mrDeletion.Collect(ch)
r.mrDrift.Collect(ch)
}

func (r *MRMetricRecorder) recordUnchanged(name string) {
r.lastObservation.Store(name, time.Now())
}

func (r *MRMetricRecorder) recordFirstTimeReconciled(managed resource.Managed) {
if managed.GetCondition(xpv1.TypeSynced).Status == corev1.ConditionUnknown {
r.mrDetected.With(getLabels(managed)).Observe(time.Since(managed.GetCreationTimestamp().Time).Seconds())
r.firstObservation.Store(managed.GetName(), time.Now()) // this is the first time we reconciled on this resource
}
}

func (r *MRMetricRecorder) recordDrift(managed resource.Managed) {
name := managed.GetName()
last, ok := r.lastObservation.Load(name)
if !ok {
return
}
lt, ok := last.(time.Time)
if !ok {
return
}

r.mrDrift.With(getLabels(managed)).Observe(time.Since(lt).Seconds())

r.lastObservation.Store(name, time.Now())
}

func (r *MRMetricRecorder) recordDeleted(managed resource.Managed) {
r.mrDeletion.With(getLabels(managed)).Observe(time.Since(managed.GetDeletionTimestamp().Time).Seconds())
}

func (r *MRMetricRecorder) recordFirstTimeReady(managed resource.Managed) {
// Note that providers may set the ready condition to "True", so we need
// to check the value here to send the ready metric
if managed.GetCondition(xpv1.TypeReady).Status == corev1.ConditionTrue {
_, ok := r.firstObservation.Load(managed.GetName()) // This map is used to identify the first time to readiness
if !ok {
return
}
r.mrFirstTimeReady.With(getLabels(managed)).Observe(time.Since(managed.GetCreationTimestamp().Time).Seconds())
r.firstObservation.Delete(managed.GetName())
}
}

// A NopMetricRecorder does nothing.
type NopMetricRecorder struct{}

// NewNopMetricRecorder returns a MRMetricRecorder that does nothing.
func NewNopMetricRecorder() *NopMetricRecorder {
return &NopMetricRecorder{}
}

// Describe does nothing.
func (r *NopMetricRecorder) Describe(_ chan<- *prometheus.Desc) {}

// Collect does nothing.
func (r *NopMetricRecorder) Collect(_ chan<- prometheus.Metric) {}

func (r *NopMetricRecorder) recordUnchanged(_ string) {}

func (r *NopMetricRecorder) recordFirstTimeReconciled(_ resource.Managed) {}

func (r *NopMetricRecorder) recordDrift(_ resource.Managed) {}

func (r *NopMetricRecorder) recordDeleted(_ resource.Managed) {}

func (r *NopMetricRecorder) recordFirstTimeReady(_ resource.Managed) {}

func getLabels(r resource.Managed) prometheus.Labels {
return prometheus.Labels{
"gvk": r.GetObjectKind().GroupVersionKind().String(),
}
}
28 changes: 26 additions & 2 deletions pkg/reconciler/managed/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,9 @@ type Reconciler struct {

supportedManagementPolicies []sets.Set[xpv1.ManagementAction]

log logging.Logger
record event.Recorder
log logging.Logger
record event.Recorder
metricRecorder MetricRecorder
}

type mrManaged struct {
Expand Down Expand Up @@ -544,6 +545,13 @@ func WithPollInterval(after time.Duration) ReconcilerOption {
}
}

// WithMetricRecorder configures the Reconciler to use the supplied MetricRecorder.
func WithMetricRecorder(recorder MetricRecorder) ReconcilerOption {
return func(r *Reconciler) {
r.metricRecorder = recorder
}
}

// PollIntervalHook represents the function type passed to the
// WithPollIntervalHook option to support dynamic computation of the poll
// interval.
Expand Down Expand Up @@ -701,6 +709,7 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp
supportedManagementPolicies: defaultSupportedManagementPolicies(),
log: logging.NewNopLogger(),
record: event.NewNopRecorder(),
metricRecorder: NewNopMetricRecorder(),
}

for _, ro := range o {
Expand Down Expand Up @@ -734,6 +743,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu
return reconcile.Result{}, errors.Wrap(resource.IgnoreNotFound(err), errGetManaged)
}

r.metricRecorder.recordFirstTimeReconciled(managed)

record := r.record.WithAnnotations("external-name", meta.GetExternalName(managed))
log = log.WithValues(
"uid", managed.GetUID(),
Expand Down Expand Up @@ -823,6 +834,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu
// details and removed our finalizer. If we assume we were the only
// controller that added a finalizer to this resource then it should no
// longer exist and thus there is no point trying to update its status.
r.metricRecorder.recordDeleted(managed)
log.Debug("Successfully deleted managed resource")
return reconcile.Result{Requeue: false}, nil
}
Expand Down Expand Up @@ -994,6 +1006,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu
// removed our finalizer. If we assume we were the only controller that
// added a finalizer to this resource then it should no longer exist and
// thus there is no point trying to update its status.
r.metricRecorder.recordDeleted(managed)
log.Debug("Successfully deleted managed resource")
return reconcile.Result{Requeue: false}, nil
}
Expand Down Expand Up @@ -1150,6 +1163,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu
reconcileAfter := r.pollIntervalHook(managed, r.pollInterval)
log.Debug("External resource is up to date", "requeue-after", time.Now().Add(reconcileAfter))
managed.SetConditions(xpv1.ReconcileSuccess())
r.metricRecorder.recordFirstTimeReady(managed)

// record that we intentionally did not update the managed resource
// because no drift was detected. We call this so late in the reconcile
// because all the cases above could contribute (for different reasons)
// that the external object would not have been updated.
r.metricRecorder.recordUnchanged(managed.GetName())

return reconcile.Result{RequeueAfter: reconcileAfter}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
}

Expand Down Expand Up @@ -1178,6 +1199,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (resu
return reconcile.Result{Requeue: true}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
}

// record the drift after the successful update.
r.metricRecorder.recordDrift(managed)

if _, err := r.managed.PublishConnection(ctx, managed, update.ConnectionDetails); err != nil {
// If this is the first time we encounter this issue we'll be requeued
// implicitly when we update our status with the new error condition. If
Expand Down
Loading
Loading