diff --git a/go.mod b/go.mod index c4ae75394..992adb6cc 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/go-logr/logr v1.2.4 github.com/google/go-cmp v0.5.9 github.com/hashicorp/vault/api v1.9.2 + github.com/prometheus/client_golang v1.15.1 github.com/spf13/afero v1.9.5 golang.org/x/time v0.3.0 google.golang.org/grpc v1.57.0 @@ -17,6 +18,7 @@ require ( k8s.io/apiextensions-apiserver v0.27.4 k8s.io/apimachinery v0.27.4 k8s.io/client-go v0.27.4 + k8s.io/component-base v0.27.4 sigs.k8s.io/controller-runtime v0.15.0 sigs.k8s.io/controller-tools v0.12.1 sigs.k8s.io/yaml v1.3.0 @@ -26,6 +28,7 @@ require ( github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/connect-go v1.9.0 // indirect github.com/bufbuild/connect-opentelemetry-go v0.4.0 // indirect github.com/bufbuild/protocompile v0.6.0 // indirect @@ -93,7 +96,6 @@ require ( github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pkg/profile v1.7.0 // indirect - github.com/prometheus/client_golang v1.15.1 // indirect github.com/prometheus/client_model v0.4.0 // indirect github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/procfs v0.10.0 // indirect @@ -127,7 +129,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/component-base v0.27.4 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230525220651-2546d827e515 // indirect k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect diff --git a/go.sum b/go.sum index 7c0d0dd17..1035ac846 100644 --- a/go.sum +++ b/go.sum @@ -52,6 +52,8 @@ github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bufbuild/buf v1.25.1 h1:8ed5AjZ+zPIJf72rxtfsDit/MtaBimaSRn9Y+5G++y0= github.com/bufbuild/buf v1.25.1/go.mod h1:UMPncXMWgrmIM+0QpwTEwjNr2SA0z2YIVZZsmNflvB4= github.com/bufbuild/connect-go v1.9.0 h1:JIgAeNuFpo+SUPfU19Yt5TcWlznsN5Bv10/gI/6Pjoc= diff --git a/pkg/reconciler/managed/metrics.go b/pkg/reconciler/managed/metrics.go new file mode 100644 index 000000000..a9b07f85c --- /dev/null +++ b/pkg/reconciler/managed/metrics.go @@ -0,0 +1,100 @@ +/* +Copyright 2023 The Crossplane Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package managed + +import ( + "context" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/cache" + kmetrics "k8s.io/component-base/metrics" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/crossplane/crossplane-runtime/pkg/errors" + "github.com/crossplane/crossplane-runtime/pkg/resource" +) + +func init() { + metrics.Registry.MustRegister(drift) +} + +var subSystem = "crossplane" + +var ( + drift = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: subSystem, + Name: "resource_drift_seconds", + Help: "ALPHA: How long since the previous successful reconcile when a resource was found to be out of sync; excludes restart of the provider", + Buckets: kmetrics.ExponentialBuckets(10e-9, 10, 10), + }, []string{"group", "kind"}) +) + +// driftRecorder records the time since the last observation of a resource +// and records the time since on update as a metric. This represents an upper +// bound for the duration the drift existed. +type driftRecorder struct { + lastObservation sync.Map + gvk schema.GroupVersionKind + + cluster cluster.Cluster +} + +var _ manager.Runnable = &driftRecorder{} + +func (r *driftRecorder) Start(ctx context.Context) error { + inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk) + if err != nil { + return errors.Wrapf(err, "cannot get informer for drift recorder for resource %s", r.gvk) + } + + registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ + DeleteFunc: func(obj interface{}) { + if final, ok := obj.(cache.DeletedFinalStateUnknown); ok { + obj = final.Obj + } + managed := obj.(resource.Managed) + r.lastObservation.Delete(managed.GetName()) + }, + }) + if err != nil { + return errors.Wrap(err, "cannot add delete event handler to informer for drift recorder") + } + defer inf.RemoveEventHandler(registered) //nolint:errcheck // this happens on destruction. We cannot do anything anyway. + + <-ctx.Done() + + return nil +} + +func (r *driftRecorder) recordUnchanged(name string) { + r.lastObservation.Store(name, time.Now()) +} + +func (r *driftRecorder) recordUpdate(name string) { + last, ok := r.lastObservation.Load(name) + if !ok { + return + } + drift.WithLabelValues(r.gvk.Group, r.gvk.Kind).Observe(time.Since(last.(time.Time)).Seconds()) + + r.lastObservation.Store(name, time.Now()) +} diff --git a/pkg/reconciler/managed/reconciler.go b/pkg/reconciler/managed/reconciler.go index 803a7cbdf..e277ccddd 100644 --- a/pkg/reconciler/managed/reconciler.go +++ b/pkg/reconciler/managed/reconciler.go @@ -18,6 +18,7 @@ package managed import ( "context" + "fmt" "strings" "time" @@ -88,6 +89,30 @@ const ( reasonReconciliationPaused event.Reason = "ReconciliationPaused" ) +// RetryAfterError is returned by a reconciler when it is unable to complete +// the actions, and the reconciler should retry after the specified duration. +type RetryAfterError struct { + retryAfter time.Duration +} + +// NewRetryAfterError creates a new ErrRetryAfter with the given retry-after duration. +func NewRetryAfterError(retryAfter time.Duration) RetryAfterError { + return RetryAfterError{retryAfter: retryAfter} +} + +// DefaultRetryAfter is the default retry-after duration for ErrRetryAfter. +const DefaultRetryAfter = 30 * time.Second + +// Error implements the error interface. +func (e RetryAfterError) Error() string { + return fmt.Sprintf("retry after %v", e.retryAfter) +} + +// RetryAfter returns the duration after which the reconciler should retry. +func (e RetryAfterError) RetryAfter() time.Duration { + return e.retryAfter +} + // ControllerName returns the recommended name for controllers that use this // package to reconcile a particular kind of managed resource. func ControllerName(kind string) string { @@ -318,15 +343,27 @@ type ExternalClient interface { // external resource does not exist. Create implementations may update // managed resource annotations, and those updates will be persisted. // All other updates will be discarded. + // + // This can return RetryAfterError to indicate that the resource cannot + // be created at this time, and that the controller should retry after + // the given duration. Create(ctx context.Context, mg resource.Managed) (ExternalCreation, error) // Update the external resource represented by the supplied Managed // resource, if necessary. Called unless Observe reports that the // associated external resource is up to date. + // + // This can return RetryAfterError to indicate that the resource cannot + // be updated at this time, and that the controller should retry after + // the given duration. Update(ctx context.Context, mg resource.Managed) (ExternalUpdate, error) // Delete the external resource upon deletion of its associated Managed // resource. Called when the managed resource has been deleted. + // + // This can return RetryAfterError to indicate that the resource cannot + // be deleted at this time, and that the controller should retry after + // the given duration. Delete(ctx context.Context, mg resource.Managed) error } @@ -482,6 +519,8 @@ type Reconciler struct { features feature.Flags + driftRecorder driftRecorder + // The below structs embed the set of interfaces used to implement the // managed resource reconciler. We do this primarily for readability, so // that the reconciler logic reads r.external.Connect(), @@ -671,6 +710,7 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp creationGracePeriod: defaultGracePeriod, timeout: reconcileTimeout, managed: defaultMRManaged(m), + driftRecorder: driftRecorder{cluster: m}, external: defaultMRExternal(), supportedManagementPolicies: defaultSupportedManagementPolicies(), log: logging.NewNopLogger(), @@ -681,6 +721,11 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp ro(r) } + if err := m.Add(&r.driftRecorder); err != nil { + r.log.Info("unable to register drift recorder with controller manager", "error", err) + // no way to recover from this + } + return r } @@ -892,7 +937,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco log = log.WithValues("deletion-timestamp", managed.GetDeletionTimestamp()) if observation.ResourceExists && policy.ShouldDelete() { - if err := external.Delete(externalCtx, managed); err != nil { + err := external.Delete(externalCtx, managed) + var retryAfterErr RetryAfterError + if errors.As(err, &retryAfterErr) { + log.Debug("External resource cannot be deleted now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter())) + return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil + } + if err != nil { // We'll hit this condition if we can't delete our external // resource, for example if our provider credentials don't have // access to delete it. If this is the first time we encounter @@ -981,6 +1032,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco } creation, err := external.Create(externalCtx, managed) + var retryAfterErr RetryAfterError + if errors.As(err, &retryAfterErr) { + log.Debug("External resource cannot be created now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter())) + return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil + } if err != nil { // We'll hit this condition if we can't create our external // resource, for example if our provider credentials don't have @@ -1079,6 +1135,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco // https://github.com/crossplane/crossplane/issues/289 log.Debug("External resource is up to date", "requeue-after", time.Now().Add(r.pollInterval)) managed.SetConditions(xpv1.ReconcileSuccess()) + + // record that we intentionally did not update the managed resource + // because no drift was detected. We call this so late in the reconcile + // because all the cases above could contribute (for different reasons) + // that the external object would not have been updated. + r.driftRecorder.recordUnchanged(managed.GetName()) + return reconcile.Result{RequeueAfter: r.pollInterval}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus) } @@ -1094,6 +1157,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco } update, err := external.Update(externalCtx, managed) + var retryAfterErr RetryAfterError + if errors.As(err, &retryAfterErr) { + log.Debug("External resource cannot be updated now", "requeue-after", time.Now().Add(retryAfterErr.RetryAfter())) + return reconcile.Result{RequeueAfter: retryAfterErr.RetryAfter()}, nil + } if err != nil { // We'll hit this condition if we can't update our external resource, // for example if our provider credentials don't have access to update @@ -1106,6 +1174,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco return reconcile.Result{Requeue: true}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus) } + // record the drift after the successful update. + r.driftRecorder.recordUpdate(managed.GetName()) + if _, err := r.managed.PublishConnection(ctx, managed, update.ConnectionDetails); err != nil { // If this is the first time we encounter this issue we'll be requeued // implicitly when we update our status with the new error condition. If diff --git a/pkg/resource/fake/mocks.go b/pkg/resource/fake/mocks.go index 9a41b7bca..339974d38 100644 --- a/pkg/resource/fake/mocks.go +++ b/pkg/resource/fake/mocks.go @@ -480,6 +480,11 @@ func (m *Manager) GetRESTMapper() meta.RESTMapper { return m.RESTMapper } // GetLogger returns the logger. func (m *Manager) GetLogger() logr.Logger { return m.Logger } +// Add adds a runnable to the manager. +func (m *Manager) Add(_ manager.Runnable) error { + return nil // do nothing +} + // GV returns a mock schema.GroupVersion. var GV = schema.GroupVersion{Group: "g", Version: "v"}