From 3c45d4770b16289847acf44fa329226246fca8a8 Mon Sep 17 00:00:00 2001 From: avlitman Date: Mon, 23 Dec 2024 18:38:48 +0200 Subject: [PATCH] Fix hco system health metric values kubevirt_hco_system_health_status values changed to value+1, this pr remove unknown from the options and change back the values to be again 0 for health, 1 for warning and 2 for error. also tests and related recording rules and alerts updated accordingly. Signed-off-by: avlitman --- .../hyperconverged_controller.go | 43 ++++++++----------- .../hyperconverged_controller_test.go | 10 ++--- hack/prom-rule-ci/prom-rules-tests.yaml | 39 ++++++++--------- pkg/monitoring/metrics/operator_metrics.go | 33 +++++--------- .../metrics/operator_metrics_test.go | 16 +++---- pkg/monitoring/rules/alerts/health_alerts.go | 16 ++++--- 6 files changed, 68 insertions(+), 89 deletions(-) diff --git a/controllers/hyperconverged/hyperconverged_controller.go b/controllers/hyperconverged/hyperconverged_controller.go index cf25852f8..913ab6126 100644 --- a/controllers/hyperconverged/hyperconverged_controller.go +++ b/controllers/hyperconverged/hyperconverged_controller.go @@ -1011,6 +1011,8 @@ func (r *ReconcileHyperConverged) updateConditions(req *common.HcoRequest) { req.Instance.Status.SystemHealthStatus = systemHealthStatus req.StatusDirty = true } + + metrics.SetHCOMetricSystemHealthStatus(getNumericalHealthStatus(systemHealthStatus)) } func (r *ReconcileHyperConverged) setLabels(req *common.HcoRequest) { @@ -1066,42 +1068,23 @@ func (r *ReconcileHyperConverged) detectTaintedConfiguration(req *common.HcoRequ } func (r *ReconcileHyperConverged) getSystemHealthStatus(conditions common.HcoConditions) string { - if isError, reason := isSystemHealthStatusError(conditions); isError { - metrics.SetHCOSystemError(reason) + if isSystemHealthStatusError(conditions) { return systemHealthStatusError } - if isWarning, reason := isSystemHealthStatusWarning(conditions); isWarning { - metrics.SetHCOSystemWarning(reason) + if isSystemHealthStatusWarning(conditions) { return systemHealthStatusWarning } - metrics.SetHCOSystemHealthy() return systemHealthStatusHealthy } -func isSystemHealthStatusError(conditions common.HcoConditions) (bool, string) { - if cond, found := conditions.GetCondition(hcov1beta1.ConditionDegraded); found && cond.Status == metav1.ConditionTrue { - return true, cond.Reason - } - - if cond, found := conditions.GetCondition(hcov1beta1.ConditionAvailable); found && cond.Status != metav1.ConditionTrue { - return true, cond.Reason - } - - return false, "" +func isSystemHealthStatusError(conditions common.HcoConditions) bool { + return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionAvailable) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionDegraded) } -func isSystemHealthStatusWarning(conditions common.HcoConditions) (bool, string) { - if cond, found := conditions.GetCondition(hcov1beta1.ConditionProgressing); found && cond.Status == metav1.ConditionTrue { - return true, cond.Reason - } - - if cond, found := conditions.GetCondition(hcov1beta1.ConditionReconcileComplete); found && cond.Status != metav1.ConditionTrue { - return true, cond.Reason - } - - return false, "" +func isSystemHealthStatusWarning(conditions common.HcoConditions) bool { + return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionReconcileComplete) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionProgressing) } func getNumOfChangesJSONPatch(jsonPatch string) int { @@ -1112,6 +1095,16 @@ func getNumOfChangesJSONPatch(jsonPatch string) int { return len(patches) } +func getNumericalHealthStatus(status string) float64 { + healthStatusCodes := map[string]float64{ + systemHealthStatusHealthy: metrics.SystemHealthStatusHealthy, + systemHealthStatusWarning: metrics.SystemHealthStatusWarning, + systemHealthStatusError: metrics.SystemHealthStatusError, + } + + return healthStatusCodes[status] +} + func (r *ReconcileHyperConverged) firstLoopInitialization(request *common.HcoRequest) { // Initialize operand handler. r.operandHandler.FirstUseInitiation(r.scheme, hcoutil.GetClusterInfo(), request.Instance) diff --git a/controllers/hyperconverged/hyperconverged_controller_test.go b/controllers/hyperconverged/hyperconverged_controller_test.go index 0911388bb..0e98341e3 100644 --- a/controllers/hyperconverged/hyperconverged_controller_test.go +++ b/controllers/hyperconverged/hyperconverged_controller_test.go @@ -192,7 +192,7 @@ var _ = Describe("HyperconvergedController", func() { Message: "Initializing HyperConverged cluster", }))) - verifySystemHealthStatusError(foundResource, reconcileInit) + verifySystemHealthStatusError(foundResource) expectedFeatureGates := []string{ "CPUManager", @@ -307,7 +307,7 @@ var _ = Describe("HyperconvergedController", func() { Message: reconcileCompletedMessage, }))) - verifySystemHealthStatusError(foundResource, "SSPConditions") + verifySystemHealthStatusError(foundResource) Expect(foundResource.Status.RelatedObjects).To(HaveLen(21)) expectedRef := corev1.ObjectReference{ @@ -3794,15 +3794,15 @@ func verifyHyperConvergedCRExistsMetricFalse() { func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) { ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusHealthy)) - systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus("healthy") + systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus() ExpectWithOffset(1, err).ToNot(HaveOccurred()) ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusHealthy)) } -func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged, reason string) { +func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged) { ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusError)) - systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus(reason) + systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus() ExpectWithOffset(1, err).ToNot(HaveOccurred()) ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusError)) } diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml index cd69db952..7ebdf7083 100755 --- a/hack/prom-rule-ci/prom-rules-tests.yaml +++ b/hack/prom-rule-ci/prom-rules-tests.yaml @@ -610,7 +610,7 @@ tests: input_series: - series: 'kubevirt_hco_system_health_status' # time: 0 1 2 3 4 5 6 7 8 9 10 11 - values: "1 1 1 1 2 2 2 2 3 3 3 3" + values: "0 0 0 0 1 1 1 1 2 2 2 2" - series: 'ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"}' # time: 0 1 2 3 4 5 6 7 8 9 10 11 values: "1 1 stale stale 1 1 stale stale 1 1 stale stale" @@ -771,47 +771,46 @@ tests: kubernetes_operator_part_of: "kubevirt" kubernetes_operator_component: "hyperconverged-cluster-operator" -# Test OperatorConditionsUnhealthy +# Test HCOOperatorConditionsUnhealthy - interval: 1m input_series: - - series: 'kubevirt_hco_system_health_status{reason="healthy"}' - - values: "stale 1 stale stale stale" + - series: 'kubevirt_hco_system_health_status' + - values: "stale stale 2 stale" - - series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}' - values: "stale stale stale 3 stale" - - - series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}' - values: "stale stale stale stale stale 2 stale" + - series: 'kubevirt_hco_system_health_status' + values: "stale stale 2 stale 1 stale" alert_rule_test: - eval_time: 1m - alertname: OperatorConditionsUnhealthy + alertname: HCOOperatorConditionsUnhealthy exp_alerts: [ ] - - eval_time: 3m - alertname: OperatorConditionsUnhealthy + - eval_time: 1m + alertname: HCOOperatorConditionsUnhealthy + exp_alerts: [ ] + + - eval_time: 2m + alertname: HCOOperatorConditionsUnhealthy exp_alerts: - exp_annotations: - description: "HCO and its secondary resources are in a critical state due to SOME_ERROR." + description: "HCO and its secondary resources are in a critical state due to system error. Go to Installed Operators -> kubevirt-hyperconverged-operator -> HyperConverged details and inspect the Conditions." summary: "HCO and its secondary resources are in a critical state." - runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy" + runbook_url: "https://kubevirt.io/monitoring/runbooks/HCOOperatorConditionsUnhealthy" exp_labels: severity: "critical" operator_health_impact: "critical" kubernetes_operator_part_of: "kubevirt" kubernetes_operator_component: "hyperconverged-cluster-operator" - reason: "SOME_ERROR" - - eval_time: 5m - alertname: OperatorConditionsUnhealthy + - eval_time: 4m + alertname: HCOOperatorConditionsUnhealthy exp_alerts: - exp_annotations: - description: "HCO and its secondary resources are in a warning state due to SOME_WARNING." + description: "HCO and its secondary resources are in a warning state due to system warning. Go to Installed Operators -> kubevirt-hyperconverged-operator -> HyperConverged details and inspect the Conditions." summary: "HCO and its secondary resources are in a warning state." - runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy" + runbook_url: "https://kubevirt.io/monitoring/runbooks/HCOOperatorConditionsUnhealthy" exp_labels: severity: "warning" operator_health_impact: "warning" kubernetes_operator_part_of: "kubevirt" kubernetes_operator_component: "hyperconverged-cluster-operator" - reason: "SOME_WARNING" diff --git a/pkg/monitoring/metrics/operator_metrics.go b/pkg/monitoring/metrics/operator_metrics.go index 2190659ef..a988f1763 100644 --- a/pkg/monitoring/metrics/operator_metrics.go +++ b/pkg/monitoring/metrics/operator_metrics.go @@ -16,8 +16,7 @@ const ( ) const ( - SystemHealthStatusUnknown float64 = iota - SystemHealthStatusHealthy + SystemHealthStatusHealthy float64 = iota SystemHealthStatusWarning SystemHealthStatusError ) @@ -53,12 +52,11 @@ var ( }, ) - systemHealthStatus = operatormetrics.NewGaugeVec( + systemHealthStatus = operatormetrics.NewGauge( operatormetrics.MetricOpts{ Name: "kubevirt_hco_system_health_status", Help: "Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources", }, - []string{"reason"}, ) ) @@ -119,30 +117,19 @@ func IsHCOMetricHyperConvergedExists() (bool, error) { return value == hyperConvergedExists, nil } -func SetHCOSystemHealthy() { - systemHealthStatus.Reset() - systemHealthStatus.WithLabelValues("healthy").Set(SystemHealthStatusHealthy) +func SetHCOMetricSystemHealthStatus(status float64) { + systemHealthStatus.Set(status) } -func SetHCOSystemWarning(reason string) { - systemHealthStatus.Reset() - systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusWarning) -} - -func SetHCOSystemError(reason string) { - systemHealthStatus.Reset() - systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusError) -} - -// GetHCOMetricSystemHealthStatus returns current value of gauge. If error is not nil then value is undefined -func GetHCOMetricSystemHealthStatus(reason string) (float64, error) { +func GetHCOMetricSystemHealthStatus() (float64, error) { dto := &ioprometheusclient.Metric{} - err := systemHealthStatus.WithLabelValues(reason).Write(dto) + err := systemHealthStatus.Write(dto) + value := dto.Gauge.GetValue() + if err != nil { - return SystemHealthStatusUnknown, err + return 0, err } - - return dto.Gauge.GetValue(), nil + return value, nil } func getLabelsForObj(kind string, name string) string { diff --git a/pkg/monitoring/metrics/operator_metrics_test.go b/pkg/monitoring/metrics/operator_metrics_test.go index 3b3df11fd..26d758165 100644 --- a/pkg/monitoring/metrics/operator_metrics_test.go +++ b/pkg/monitoring/metrics/operator_metrics_test.go @@ -9,20 +9,16 @@ import ( var _ = Describe("Operator Metrics", func() { Context("kubevirt_hco_system_health_status", func() { - It("should set system error reason correctly", func() { - metrics.SetHCOSystemError("Reason1") - v, err := metrics.GetHCOMetricSystemHealthStatus("Reason1") + It("should set the correct system health status", func() { + metrics.SetHCOMetricSystemHealthStatus(2) + v, err := metrics.GetHCOMetricSystemHealthStatus() Expect(err).ToNot(HaveOccurred()) Expect(v).To(Equal(metrics.SystemHealthStatusError)) - metrics.SetHCOSystemError("Reason2") - v, err = metrics.GetHCOMetricSystemHealthStatus("Reason2") + metrics.SetHCOMetricSystemHealthStatus(1) + v, err = metrics.GetHCOMetricSystemHealthStatus() Expect(err).ToNot(HaveOccurred()) - Expect(v).To(Equal(metrics.SystemHealthStatusError)) - - v, err = metrics.GetHCOMetricSystemHealthStatus("Reason1") - Expect(err).ToNot(HaveOccurred()) - Expect(v).To(Equal(metrics.SystemHealthStatusUnknown)) + Expect(v).To(Equal(metrics.SystemHealthStatusWarning)) }) }) }) diff --git a/pkg/monitoring/rules/alerts/health_alerts.go b/pkg/monitoring/rules/alerts/health_alerts.go index 8ab62dfd6..0c5236531 100644 --- a/pkg/monitoring/rules/alerts/health_alerts.go +++ b/pkg/monitoring/rules/alerts/health_alerts.go @@ -12,11 +12,13 @@ import ( func healthAlerts() []promv1.Rule { return []promv1.Rule{ { - Alert: "OperatorConditionsUnhealthy", + Alert: "HCOOperatorConditionsUnhealthy", Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusError)), Annotations: map[string]string{ - "description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.", - "summary": "HCO and its secondary resources are in a critical state.", + "description": "HCO and its secondary resources are in a critical state due to system error. Go to " + + "Installed Operators -> kubevirt-hyperconverged-operator -> HyperConverged details " + + "and inspect the Conditions.", + "summary": "HCO and its secondary resources are in a critical state.", }, Labels: map[string]string{ severityAlertLabelKey: "critical", @@ -24,11 +26,13 @@ func healthAlerts() []promv1.Rule { }, }, { - Alert: "OperatorConditionsUnhealthy", + Alert: "HCOOperatorConditionsUnhealthy", Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusWarning)), Annotations: map[string]string{ - "description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.", - "summary": "HCO and its secondary resources are in a warning state.", + "description": "HCO and its secondary resources are in a warning state due to system warning. Go to " + + "Installed Operators -> kubevirt-hyperconverged-operator -> HyperConverged details " + + "and inspect the Conditions.", + "summary": "HCO and its secondary resources are in a warning state.", }, Labels: map[string]string{ severityAlertLabelKey: "warning",