crossplane · sttts · Aug 3, 2023 · Aug 3, 2023 · negz · Aug 7, 2023
diff --git a/go.mod b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/go-logr/logr v1.2.4
 	github.com/google/go-cmp v0.5.9
 	github.com/hashicorp/vault/api v1.9.2
+	github.com/prometheus/client_golang v1.15.1
 	github.com/spf13/afero v1.9.5
 	golang.org/x/time v0.3.0
 	google.golang.org/grpc v1.57.0
@@ -17,6 +18,7 @@ require (
 	k8s.io/apiextensions-apiserver v0.27.4
 	k8s.io/apimachinery v0.27.4
 	k8s.io/client-go v0.27.4
+	k8s.io/component-base v0.27.4
 	sigs.k8s.io/controller-runtime v0.15.0
 	sigs.k8s.io/controller-tools v0.12.1
 	sigs.k8s.io/yaml v1.3.0
@@ -26,6 +28,7 @@ require (
 	github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
 	github.com/Microsoft/go-winio v0.6.1 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/bufbuild/connect-go v1.9.0 // indirect
 	github.com/bufbuild/connect-opentelemetry-go v0.4.0 // indirect
 	github.com/bufbuild/protocompile v0.6.0 // indirect
@@ -93,7 +96,6 @@ require (
 	github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pkg/profile v1.7.0 // indirect
-	github.com/prometheus/client_golang v1.15.1 // indirect
 	github.com/prometheus/client_model v0.4.0 // indirect
 	github.com/prometheus/common v0.44.0 // indirect
 	github.com/prometheus/procfs v0.10.0 // indirect
@@ -127,7 +129,6 @@ require (
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
-	k8s.io/component-base v0.27.4 // indirect
 	k8s.io/klog/v2 v2.100.1 // indirect
 	k8s.io/kube-openapi v0.0.0-20230525220651-2546d827e515 // indirect
 	k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect

diff --git a/go.sum b/go.sum
@@ -52,6 +52,8 @@ github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
+github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
+github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
 github.com/bufbuild/buf v1.25.1 h1:8ed5AjZ+zPIJf72rxtfsDit/MtaBimaSRn9Y+5G++y0=
 github.com/bufbuild/buf v1.25.1/go.mod h1:UMPncXMWgrmIM+0QpwTEwjNr2SA0z2YIVZZsmNflvB4=
 github.com/bufbuild/connect-go v1.9.0 h1:JIgAeNuFpo+SUPfU19Yt5TcWlznsN5Bv10/gI/6Pjoc=

diff --git a/pkg/reconciler/managed/metrics.go b/pkg/reconciler/managed/metrics.go
@@ -0,0 +1,100 @@
+/*
+Copyright 2023 The Crossplane Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package managed
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/tools/cache"
+	kmetrics "k8s.io/component-base/metrics"
+	"sigs.k8s.io/controller-runtime/pkg/cluster"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
+	"github.com/crossplane/crossplane-runtime/pkg/errors"
+	"github.com/crossplane/crossplane-runtime/pkg/resource"
+)
+
+func init() {
+	metrics.Registry.MustRegister(drift)
+}
+
+var subSystem = "crossplane"
+
+var (
+	drift = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Subsystem: subSystem,
+		Name:      "resource_drift_seconds",
+		Help:      "ALPHA: How long since the previous successful reconcile when a resource was found to be out of sync; excludes restart of the provider",
+		Buckets:   kmetrics.ExponentialBuckets(10e-9, 10, 10),
+	}, []string{"group", "kind"})
+)
+
+// driftRecorder records the time since the last observation of a resource
+// and records the time since on update as a metric. This represents an upper
+// bound for the duration the drift existed.
+type driftRecorder struct {
+	lastObservation sync.Map
+	gvk             schema.GroupVersionKind
+
+	cluster cluster.Cluster
+}
+
+var _ manager.Runnable = &driftRecorder{}
+
+func (r *driftRecorder) Start(ctx context.Context) error {
+	inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk)
+	if err != nil {
+		return errors.Wrapf(err, "cannot get informer for drift recorder for resource %s", r.gvk)
+	}
+
+	registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		DeleteFunc: func(obj interface{}) {
+			if final, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+				obj = final.Obj
+			}
+			managed := obj.(resource.Managed)
+			r.lastObservation.Delete(managed.GetName())
+		},
+	})
+	if err != nil {
+		return errors.Wrap(err, "cannot add delete event handler to informer for drift recorder")
+	}
+	defer inf.RemoveEventHandler(registered) //nolint:errcheck // this happens on destruction. We cannot do anything anyway.
+
+	<-ctx.Done()
+
+	return nil
+}
+
+func (r *driftRecorder) recordUnchanged(name string) {
+	r.lastObservation.Store(name, time.Now())
+}
+
+func (r *driftRecorder) recordUpdate(name string) {
+	last, ok := r.lastObservation.Load(name)
+	if !ok {
+		return
+	}
+	drift.WithLabelValues(r.gvk.Group, r.gvk.Kind).Observe(time.Since(last.(time.Time)).Seconds())
+
+	r.lastObservation.Store(name, time.Now())
+}
diff --git a/pkg/reconciler/managed/reconciler.go b/pkg/reconciler/managed/reconciler.go
@@ -18,6 +18,7 @@ package managed
 
 import (
 	"context"
+	"fmt"
 	"strings"
 	"time"
 
@@ -88,6 +89,30 @@ const (
 	reasonReconciliationPaused event.Reason = "ReconciliationPaused"
 )
 
+// RetryAfterError is returned by a reconciler when it is unable to complete
+// the actions, and the reconciler should retry after the specified duration.
+type RetryAfterError struct {
+	retryAfter time.Duration
+}
+
+// NewRetryAfterError creates a new ErrRetryAfter with the given retry-after duration.
+func NewRetryAfterError(retryAfter time.Duration) RetryAfterError {
+	return RetryAfterError{retryAfter: retryAfter}
+}
+
+// DefaultRetryAfter is the default retry-after duration for ErrRetryAfter.
+const DefaultRetryAfter = 30 * time.Second
+
+// Error implements the error interface.
+func (e RetryAfterError) Error() string {
+	return fmt.Sprintf("retry after %v", e.retryAfter)
+}
+
+// RetryAfter returns the duration after which the reconciler should retry.
+func (e RetryAfterError) RetryAfter() time.Duration {
+	return e.retryAfter
+}
+
 // ControllerName returns the recommended name for controllers that use this
 // package to reconcile a particular kind of managed resource.
 func ControllerName(kind string) string {
@@ -318,15 +343,27 @@ type ExternalClient interface {
 	// external resource does not exist. Create implementations may update
 	// managed resource annotations, and those updates will be persisted.
 	// All other updates will be discarded.
+	//
+	// This can return RetryAfterError to indicate that the resource cannot
+	// be created at this time, and that the controller should retry after
+	// the given duration.
 	Create(ctx context.Context, mg resource.Managed) (ExternalCreation, error)
 
 	// Update the external resource represented by the supplied Managed
 	// resource, if necessary. Called unless Observe reports that the
 	// associated external resource is up to date.
+	//
+	// This can return RetryAfterError to indicate that the resource cannot
+	// be updated at this time, and that the controller should retry after
+	// the given duration.
 	Update(ctx context.Context, mg resource.Managed) (ExternalUpdate, error)
 
 	// Delete the external resource upon deletion of its associated Managed
 	// resource. Called when the managed resource has been deleted.
+	//
+	// This can return RetryAfterError to indicate that the resource cannot
+	// be deleted at this time, and that the controller should retry after
+	// the given duration.
 	Delete(ctx context.Context, mg resource.Managed) error
 }
 
@@ -482,6 +519,8 @@ type Reconciler struct {
 
 	features feature.Flags
 
+	driftRecorder driftRecorder
+
 	// The below structs embed the set of interfaces used to implement the
 	// managed resource reconciler. We do this primarily for readability, so
 	// that the reconciler logic reads r.external.Connect(),
@@ -671,6 +710,7 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp
 		creationGracePeriod:         defaultGracePeriod,
 		timeout:                     reconcileTimeout,
 		managed:                     defaultMRManaged(m),
+		driftRecorder:               driftRecorder{cluster: m},
 		external:                    defaultMRExternal(),
 		supportedManagementPolicies: defaultSupportedManagementPolicies(),
 		log:                         logging.NewNopLogger(),
@@ -681,6 +721,11 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp
 		ro(r)
 	}
 
+	if err := m.Add(&r.driftRecorder); err != nil {
+		r.log.Info("unable to register drift recorder with controller manager", "error", err)
+		// no way to recover from this
+	}
+
 	return r
 }
 
@@ -892,7 +937,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 		log = log.WithValues("deletion-timestamp", managed.GetDeletionTimestamp())
 
 		if observation.ResourceExists && policy.ShouldDelete() {
-			if err := external.Delete(externalCtx, managed); err != nil {
+			err := external.Delete(externalCtx, managed)
+			var retryAfterErr RetryAfterError
+			if errors.As(err, &retryAfterErr) {
+				log.Debug("External resource cannot be deleted now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter()))
+				return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil
+			}
+			if err != nil {
 				// We'll hit this condition if we can't delete our external
 				// resource, for example if our provider credentials don't have
 				// access to delete it. If this is the first time we encounter
@@ -981,6 +1032,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 		}
 
 		creation, err := external.Create(externalCtx, managed)
+		var retryAfterErr RetryAfterError
+		if errors.As(err, &retryAfterErr) {
+			log.Debug("External resource cannot be created now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter()))
+			return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil
+		}
 		if err != nil {
 			// We'll hit this condition if we can't create our external
 			// resource, for example if our provider credentials don't have
@@ -1079,6 +1135,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 		// https://github.com/crossplane/crossplane/issues/289
 		log.Debug("External resource is up to date", "requeue-after", time.Now().Add(r.pollInterval))
 		managed.SetConditions(xpv1.ReconcileSuccess())
+
+		// record that we intentionally did not update the managed resource
+		// because no drift was detected. We call this so late in the reconcile
+		// because all the cases above could contribute (for different reasons)
+		// that the external object would not have been updated.
+		r.driftRecorder.recordUnchanged(managed.GetName())
+
 		return reconcile.Result{RequeueAfter: r.pollInterval}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
 	}
 
@@ -1094,6 +1157,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 	}
 
 	update, err := external.Update(externalCtx, managed)
+	var retryAfterErr RetryAfterError
+	if errors.As(err, &retryAfterErr) {
+		log.Debug("External resource cannot be updated now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter()))
+		return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil
+	}
 	if err != nil {
 		// We'll hit this condition if we can't update our external resource,
 		// for example if our provider credentials don't have access to update
@@ -1106,6 +1174,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 		return reconcile.Result{Requeue: true}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
 	}
 
+	// record the drift after the successful update.
+	r.driftRecorder.recordUpdate(managed.GetName())
+
 	if _, err := r.managed.PublishConnection(ctx, managed, update.ConnectionDetails); err != nil {
 		// If this is the first time we encounter this issue we'll be requeued
 		// implicitly when we update our status with the new error condition. If

diff --git a/pkg/resource/fake/mocks.go b/pkg/resource/fake/mocks.go
@@ -480,6 +480,11 @@ func (m *Manager) GetRESTMapper() meta.RESTMapper { return m.RESTMapper }
 // GetLogger returns the logger.
 func (m *Manager) GetLogger() logr.Logger { return m.Logger }
 
+// Add adds a runnable to the manager.
+func (m *Manager) Add(_ manager.Runnable) error {
+	return nil // do nothing
+}
+
 // GV returns a mock schema.GroupVersion.
 var GV = schema.GroupVersion{Group: "g", Version: "v"}