From 16904169517295bc8b3f7bd80a7b99b1d5cf2ff5 Mon Sep 17 00:00:00 2001
From: rakeshgm
Date: Thu, 16 Jan 2025 19:53:43 +0530
Subject: [PATCH] adding DRCluster metrics

Signed-off-by: rakeshgm
---
 config/prometheus/alerts.yaml               |  8 ++++
 internal/controller/drcluster_controller.go | 25 ++++++++++++-
 internal/controller/metrics.go              | 50 +++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/config/prometheus/alerts.yaml b/config/prometheus/alerts.yaml
index 3230f3c59..8f9250637 100644
--- a/config/prometheus/alerts.yaml
+++ b/config/prometheus/alerts.yaml
@@ -40,3 +40,11 @@ spec:
       annotations:
         description: "Workload is not protected for disaster recovery (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }}). Inspect DRPC status.conditions for details."
         alert_type: "DisasterRecovery"
+    - alert: DRClusterAvailableStatus
+      expr: drcluster_available_status == 0
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        description: "Error from DRCluster (DRCluster: {{ $labels.obj_name }}, Error: {{ $labels.error_message }})."
+        alert_type: "DisasterRecovery"
diff --git a/internal/controller/drcluster_controller.go b/internal/controller/drcluster_controller.go
index a04186c92..cfc8be76a 100644
--- a/internal/controller/drcluster_controller.go
+++ b/internal/controller/drcluster_controller.go
@@ -65,6 +65,12 @@ const (
 	DRClusterConditionReasonErrorUnknown = "UnknownError"
 )
 
+// s3Error reasons
+const (
+	s3ConnectionFailed = "s3ConnectionFailed"
+	s3ListFailed       = "s3ListFailed"
+)
+
 //nolint:gosec
 const (
 	StorageAnnotationSecretName = "drcluster.ramendr.openshift.io/storage-secret-name"
@@ -425,6 +431,8 @@ func (r DRClusterReconciler) processCreateOrUpdate(u *drclusterInstance) (ctrl.R
 
 	if reason, err := validateS3Profile(u.ctx, r.APIReader, r.ObjectStoreGetter, u.object, u.namespacedName.String(),
 		u.log); err != nil {
+		u.setDRClusterAvailableStatusMetric(reason)
+
 		return ctrl.Result{}, fmt.Errorf("drclusters s3Profile validate: %w",
 			u.validatedSetFalseAndUpdate(reason, err))
 	}
@@ -511,16 +519,29 @@ func s3ProfileValidate(ctx context.Context, apiReader client.Reader,
 	objectStore, _, err := objectStoreGetter.ObjectStore(
 		ctx, apiReader, s3ProfileName, "drpolicy validation", log)
 	if err != nil {
-		return "s3ConnectionFailed", fmt.Errorf("%s: %w", s3ProfileName, err)
+		return s3ConnectionFailed, fmt.Errorf("%s: %w", s3ProfileName, err)
 	}
 
 	if _, err := objectStore.ListKeys(listKeyPrefix); err != nil {
-		return "s3ListFailed", fmt.Errorf("%s: %w", s3ProfileName, err)
+		return s3ListFailed, fmt.Errorf("%s: %w", s3ProfileName, err)
 	}
 
 	return "", nil
 }
 
+// setDRClusterAvailableStatusMetric sets the DRCluster available status gauge to 0 for this
+// DRCluster. The series is labelled with the validation failure reason rather than the error
+// text: every distinct label value is a separate time series, and error text is unbounded.
+//
+// NOTE(review): nothing in this change sets the gauge back to 1 or deletes the series after a
+// later successful validation; confirm how the DRClusterAvailableStatus alert is meant to clear.
+func (u *drclusterInstance) setDRClusterAvailableStatusMetric(reason string) {
+	u.log.Info("setting DRCluster Metrics")
+
+	labels := DRClusterAvailableStatusLabels(u.object, reason)
+	NewDRClusterAvailableStatusMetric(labels).DRClusterAvailableStatus.Set(0)
+}
+
 func validateCIDRsFormat(drcluster *ramen.DRCluster, log logr.Logger) error {
 	// validate the CIDRs format
 	invalidCidrs := []string{}
diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go
index b8265615d..d9a169613 100644
--- a/internal/controller/metrics.go
+++ b/internal/controller/metrics.go
@@ -22,6 +22,7 @@ const (
 	LastSyncDurationSeconds = "last_sync_duration_seconds"
 	LastSyncDataBytes = "last_sync_data_bytes"
 	WorkloadProtectionStatus = "workload_protection_status"
+	DRClusterAvailableStatus = "drcluster_available_status"
 )
 
 type SyncTimeMetrics struct {
@@ -44,6 +45,11 @@ type WorkloadProtectionMetrics struct {
 	WorkloadProtectionStatus prometheus.Gauge
 }
 
+// DRClusterAvailableStatusMetrics holds the gauge of one DRCluster available status series.
+type DRClusterAvailableStatusMetrics struct {
+	DRClusterAvailableStatus prometheus.Gauge
+}
+
 type SyncMetrics struct {
 	SyncTimeMetrics
 	SyncDurationMetrics
@@ -56,6 +62,7 @@ const (
 	ObjNamespace = "obj_namespace"
 	Policyname = "policyname"
 	SchedulingInterval = "scheduling_interval"
+	ErrorMessage = "error_message"
 )
 
 var (
@@ -90,6 +97,12 @@ var (
 		ObjName,      // Name of the resoure [drpc-name]
 		ObjNamespace, // DRPC namespace
 	}
+
+	dRClusterAvailableStatusLabels = []string{
+		ObjType,      // Name of the type of the resource [drcluster]
+		ObjName,      // Name of the resource [drcluster-name]
+		ErrorMessage, // Value passed by the caller of DRClusterAvailableStatusLabels
+	}
 )
 
 var (
@@ -137,6 +150,18 @@ var (
 		},
 		workloadProtectionStatusLabels,
 	)
+
+	// The exported metric name is built from Namespace and Name below.
+	// NOTE(review): the DRClusterAvailableStatus alert in config/prometheus/alerts.yaml queries the
+	// bare name drcluster_available_status; confirm it matches the exported name.
+	drClusterAvailableStatus = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name:      DRClusterAvailableStatus,
+			Namespace: metricNamespace,
+			Help:      "DRCluster Availability Status",
+		},
+		dRClusterAvailableStatusLabels,
+	)
 )
 
 // lastSyncTime metrics reports value from lastGrpupSyncTime taken from DRPC status
@@ -234,6 +259,30 @@ func DeleteWorkloadProtectionStatusMetric(labels prometheus.Labels) bool {
 	return workloadProtectionStatus.Delete(labels)
 }
 
+// DRClusterAvailableStatusLabels returns the label set that identifies the available status
+// series of drCluster. errorMessage becomes the value of the error_message label; pass a value
+// from a small, fixed set, because every distinct value is exported as a separate series.
+func DRClusterAvailableStatusLabels(drCluster *rmn.DRCluster, errorMessage string) prometheus.Labels {
+	return prometheus.Labels{
+		ObjType:      "DRCluster",
+		ObjName:      drCluster.Name,
+		ErrorMessage: errorMessage,
+	}
+}
+
+// NewDRClusterAvailableStatusMetric returns the gauge of the series selected by labels.
+func NewDRClusterAvailableStatusMetric(labels prometheus.Labels) DRClusterAvailableStatusMetrics {
+	return DRClusterAvailableStatusMetrics{
+		DRClusterAvailableStatus: drClusterAvailableStatus.With(labels),
+	}
+}
+
+// DeleteDRClusterAvailableStatusMetric deletes the series selected by labels and returns the
+// result of the underlying GaugeVec Delete call.
+func DeleteDRClusterAvailableStatusMetric(labels prometheus.Labels) bool {
+	return drClusterAvailableStatus.Delete(labels)
+}
+
 func init() {
 	// Register custom metrics with the global prometheus registry
 	metrics.Registry.MustRegister(dRPolicySyncInterval)
@@ -241,4 +290,5 @@ func init() {
 	metrics.Registry.MustRegister(lastSyncDuration)
 	metrics.Registry.MustRegister(lastSyncDataBytes)
 	metrics.Registry.MustRegister(workloadProtectionStatus)
+	metrics.Registry.MustRegister(drClusterAvailableStatus)
 }