Skip to content

Commit

Permalink
adding DRCluster metrics
Browse files Browse the repository at this point in the history
Signed-off-by: rakeshgm <[email protected]>
  • Loading branch information
rakeshgm committed Jan 16, 2025
1 parent aae07f6 commit 1690416
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 2 deletions.
8 changes: 8 additions & 0 deletions config/prometheus/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,11 @@ spec:
annotations:
description: "Workload is not protected for disaster recovery (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }}). Inspect DRPC status.conditions for details."
alert_type: "DisasterRecovery"
# Fires when drcluster_available_status has been 0 for a DRCluster for 10 minutes.
# NOTE(review): the gauge is registered with a metric Namespace in metrics.go, which
# Prometheus prepends to the metric name as "<namespace>_" — confirm this expr uses the
# fully-qualified name, otherwise the rule never matches a series.
- alert: DRClusterAvailableStatus
expr: drcluster_available_status == 0
for: 10m
labels:
severity: critical
annotations:
description: "Error from DRCluster (DRCluster: {{ $labels.obj_name }}, Error: {{ $labels.error_message }})."
alert_type: "DisasterRecovery"
19 changes: 17 additions & 2 deletions internal/controller/drcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ const (
DRClusterConditionReasonErrorUnknown = "UnknownError"
)

// Reasons returned by s3ProfileValidate when an S3 profile cannot be validated.

// s3ConnectionFailed is returned when the object store for the profile cannot
// be obtained.
const s3ConnectionFailed = "s3ConnectionFailed"

// s3ListFailed is returned when listing keys in the object store fails.
const s3ListFailed = "s3ListFailed"

//nolint:gosec
const (
StorageAnnotationSecretName = "drcluster.ramendr.openshift.io/storage-secret-name"
Expand Down Expand Up @@ -425,6 +431,8 @@ func (r DRClusterReconciler) processCreateOrUpdate(u *drclusterInstance) (ctrl.R

if reason, err := validateS3Profile(u.ctx, r.APIReader, r.ObjectStoreGetter, u.object, u.namespacedName.String(),
u.log); err != nil {
u.setDRClusterAvailableStatusMetric(err)

return ctrl.Result{}, fmt.Errorf("drclusters s3Profile validate: %w", u.validatedSetFalseAndUpdate(reason, err))
}

Expand Down Expand Up @@ -511,16 +519,23 @@ func s3ProfileValidate(ctx context.Context, apiReader client.Reader,
objectStore, _, err := objectStoreGetter.ObjectStore(
ctx, apiReader, s3ProfileName, "drpolicy validation", log)
if err != nil {
return "s3ConnectionFailed", fmt.Errorf("%s: %w", s3ProfileName, err)
return s3ConnectionFailed, fmt.Errorf("%s: %w", s3ProfileName, err)
}

if _, err := objectStore.ListKeys(listKeyPrefix); err != nil {
return "s3ListFailed", fmt.Errorf("%s: %w", s3ProfileName, err)
return s3ListFailed, fmt.Errorf("%s: %w", s3ProfileName, err)
}

return "", nil
}

// setDRClusterAvailableStatusMetric records a DRCluster failure by setting the
// DRCluster availability gauge for u.object to 0, labelled with the text of err.
//
// A nil err is ignored: there is no failure to report, and calling err.Error()
// on a nil error would panic.
//
// NOTE(review): nothing in the visible code sets this gauge back to a non-zero
// value or deletes the series after a later successful validation, and every
// distinct error text produces its own series — confirm stale series are
// removed elsewhere (e.g. via DeleteDRClusterAvailableStatusMetric).
func (u *drclusterInstance) setDRClusterAvailableStatusMetric(err error) {
	if err == nil {
		return
	}

	u.log.Info("setting DRCluster Metrics")

	labels := DRClusterAvailableStatusLabels(u.object, err.Error())
	NewDRClusterAvailableStatusMetric(labels).DRClusterAvailableStatus.Set(0)
}

func validateCIDRsFormat(drcluster *ramen.DRCluster, log logr.Logger) error {
// validate the CIDRs format
invalidCidrs := []string{}
Expand Down
41 changes: 41 additions & 0 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
LastSyncDurationSeconds = "last_sync_duration_seconds"
LastSyncDataBytes = "last_sync_data_bytes"
WorkloadProtectionStatus = "workload_protection_status"
DRClusterAvailableStatus = "drcluster_available_status"
)

type SyncTimeMetrics struct {
Expand All @@ -44,6 +45,10 @@ type WorkloadProtectionMetrics struct {
WorkloadProtectionStatus prometheus.Gauge
}

// DRClusterAvailableStatusMetrics holds the gauge used to report the
// availability status of a DRCluster.
type DRClusterAvailableStatusMetrics struct {
// DRClusterAvailableStatus is the gauge bound to one set of DRCluster labels.
DRClusterAvailableStatus prometheus.Gauge
}

type SyncMetrics struct {
SyncTimeMetrics
SyncDurationMetrics
Expand All @@ -56,6 +61,7 @@ const (
ObjNamespace = "obj_namespace"
Policyname = "policyname"
SchedulingInterval = "scheduling_interval"
ErrorMessage = "error_message"
)

var (
Expand Down Expand Up @@ -90,6 +96,12 @@ var (
ObjName, // Name of the resoure [drpc-name]
ObjNamespace, // DRPC namespace
}

// dRClusterAvailableStatusLabels lists the label names of the DRCluster availability gauge.
// NOTE(review): ErrorMessage carries free-form error text, so each distinct message
// yields a separate time series — confirm the resulting cardinality is acceptable.
dRClusterAvailableStatusLabels = []string{
ObjType, // Name of the type of the resource [DRCluster]
ObjName, // Name of the resource [drcluster-name]
ErrorMessage, // Error text supplied by the caller of DRClusterAvailableStatusLabels
}
)

var (
Expand Down Expand Up @@ -137,6 +149,15 @@ var (
},
workloadProtectionStatusLabels,
)

// drClusterAvailableStatus is the gauge vector for the DRCluster availability
// metric, partitioned by dRClusterAvailableStatusLabels. Because Namespace is
// set, the exported metric name is prefixed with metricNamespace.
drClusterAvailableStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: DRClusterAvailableStatus,
Namespace: metricNamespace,
Help: "DRCluster Availability Status",
},
dRClusterAvailableStatusLabels,
)
)

// lastSyncTime metrics report the value of lastGroupSyncTime taken from the DRPC status
Expand Down Expand Up @@ -234,11 +255,31 @@ func DeleteWorkloadProtectionStatusMetric(labels prometheus.Labels) bool {
return workloadProtectionStatus.Delete(labels)
}

// DRClusterAvailableStatusLabels builds the label set for the DRCluster
// availability gauge: the fixed object type "DRCluster", the name of drCluster,
// and the supplied errorMessage.
func DRClusterAvailableStatusLabels(drCluster *rmn.DRCluster, errorMessage string) prometheus.Labels {
	labels := make(prometheus.Labels, len(dRClusterAvailableStatusLabels))
	labels[ObjType] = "DRCluster"
	labels[ObjName] = drCluster.Name
	labels[ErrorMessage] = errorMessage

	return labels
}

// NewDRClusterAvailableStatusMetric returns a DRClusterAvailableStatusMetrics
// whose gauge is the drClusterAvailableStatus series selected by labels.
func NewDRClusterAvailableStatusMetric(labels prometheus.Labels) DRClusterAvailableStatusMetrics {
	gauge := drClusterAvailableStatus.With(labels)

	return DRClusterAvailableStatusMetrics{DRClusterAvailableStatus: gauge}
}

// DeleteDRClusterAvailableStatusMetric removes the drClusterAvailableStatus
// series identified by labels and returns the result of that deletion.
func DeleteDRClusterAvailableStatusMetric(labels prometheus.Labels) bool {
	removed := drClusterAvailableStatus.Delete(labels)

	return removed
}

// init registers all custom metric collectors declared in this file with
// metrics.Registry, in declaration order.
func init() {
	// Register custom metrics with the global prometheus registry
	metrics.Registry.MustRegister(
		dRPolicySyncInterval,
		lastSyncTime,
		lastSyncDuration,
		lastSyncDataBytes,
		workloadProtectionStatus,
		drClusterAvailableStatus,
	)
}

0 comments on commit 1690416

Please sign in to comment.