From 91ebbd24fcffac92ce6ec3d7cc3c0728d158483b Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 6 Nov 2024 18:57:56 +0200 Subject: [PATCH 1/4] Fix default Validated condition It was added in failed state (Status=False, Reason=PrerequisiteNotMet) by default which is wrong. The issue was hidden since we check the condition only when deleting the VRG. We want to inspect the condition when the when the VRG is live, so we can report the condition status in the protected pvcs conditions. Signed-off-by: Nir Soffer --- internal/controller/vrg_volrep_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/controller/vrg_volrep_test.go b/internal/controller/vrg_volrep_test.go index d428e1f83..a7138e736 100644 --- a/internal/controller/vrg_volrep_test.go +++ b/internal/controller/vrg_volrep_test.go @@ -2388,6 +2388,9 @@ func (v *vrgTest) promoteVolRepsAndDo(options promoteOptions, do func(int, int)) v.waitForVolRepCondition(volrepKey, volrep.ConditionValidated, metav1.ConditionFalse) } } else { + if !options.ValidatedMissing { + v.waitForVolRepCondition(volrepKey, volrep.ConditionValidated, metav1.ConditionTrue) + } v.waitForVolRepCondition(volrepKey, volrep.ConditionCompleted, metav1.ConditionTrue) v.waitForProtectedPVCs(volrepKey) } @@ -2404,9 +2407,9 @@ func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) if !options.ValidatedMissing { validated := metav1.Condition{ Type: volrep.ConditionValidated, - Reason: volrep.PrerequisiteNotMet, + Reason: volrep.PrerequisiteMet, ObservedGeneration: generation, - Status: metav1.ConditionFalse, + Status: metav1.ConditionTrue, LastTransitionTime: lastTransitionTime, } From 586281b34f25ab9787cff63ba5bd394274503a2a Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 6 Nov 2024 19:03:07 +0200 Subject: [PATCH 2/4] Add .Message to VR conditions Based on the messages addded in https://github.com/csi-addons/kubernetes-csi-addons/pull/691. We want to propagate the error messages to the protected pvcs conditions. Signed-off-by: Nir Soffer --- internal/controller/vrg_volrep_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/internal/controller/vrg_volrep_test.go b/internal/controller/vrg_volrep_test.go index a7138e736..83138d0f5 100644 --- a/internal/controller/vrg_volrep_test.go +++ b/internal/controller/vrg_volrep_test.go @@ -2368,7 +2368,7 @@ func (v *vrgTest) promoteVolRepsAndDo(options promoteOptions, do func(int, int)) if options.ValidatedFailed { volRepStatus.State = volrep.UnknownState - volRepStatus.Message = "precondition failed ..." + volRepStatus.Message = "failed to meet prerequisite: details..." } volRep.Status = volRepStatus @@ -2411,11 +2411,13 @@ func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) ObservedGeneration: generation, Status: metav1.ConditionTrue, LastTransitionTime: lastTransitionTime, + Message: "volume is validated", } if options.ValidatedFailed { validated.Status = metav1.ConditionFalse validated.Reason = volrep.PrerequisiteNotMet + validated.Message = "failed to meet prerequisite: details..." } conditions = append(conditions, validated) @@ -2427,11 +2429,13 @@ func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) ObservedGeneration: generation, Status: metav1.ConditionTrue, LastTransitionTime: lastTransitionTime, + Message: "volume is completed", } if options.ValidatedFailed { completed.Status = metav1.ConditionFalse completed.Reason = volrep.FailedToPromote + completed.Message = "failed to promote" } degraded := metav1.Condition{ @@ -2440,6 +2444,7 @@ func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) ObservedGeneration: generation, Status: metav1.ConditionFalse, LastTransitionTime: lastTransitionTime, + Message: "volume is healthy", } resyncing := metav1.Condition{ Type: volrep.ConditionResyncing, @@ -2447,6 +2452,7 @@ func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) ObservedGeneration: generation, Status: metav1.ConditionFalse, LastTransitionTime: lastTransitionTime, + Message: "volume is not resyncing", } return append(conditions, completed, degraded, resyncing) From b8871c67d07b2091134c6859efaa3e77ca344d1d Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Tue, 5 Nov 2024 16:18:52 +0200 Subject: [PATCH 3/4] Propagate VR conditions .Message When a VR condition is not met, we set the protected PVC condition message using the error message returned from isVRConditionMet(). When using csi-addons > 0.10.0, we use now the message from the condition instead of the default message. Since the Validated condition is not reported by older version of csi-addons, and we must wait until the Validated condition status is known when VRG is deleted, isVRConditionMet() returns now also the state of the condition, which can be: - missing: condition not found - stale: observed generation does not match object generation - unknown: the special "Unknown" value - known: status is True or False When we validate the Validate condition we have these cases: - Condition is missing: continue to next condition. - Condition is met: continue to the next condition. - Condition not met and its status is False. This VR will never complete and it is safe to delete since replication will never start. If VRG is deleted, we return true since the VR reached the designed state. Otherwise we return false. In this case we updated the protected pvc condition with the message from the VR condition. - Condition is not met and is stale or unnown: we need to check again later. There is no point to check the completed condition since a VR cannot complete without validation.In this case we updated the protected pvc condition with the message generated by isVRConditionMet() for stale or unknown conditions. Example protected pvc DataReady condition with propagated message when VR validation failed: conditions: - lastTransitionTime: "2024-11-06T15:33:06Z" message: 'failed to meet prerequisite: rpc error: code = FailedPrecondition desc = system is not in a state required for the operation''s execution: failed to enable mirroring on image "replicapool/csi-vol-fe2ca7f8-713c-4c51-bf52-0d4b2c11d329": parent image "replicapool/csi-snap-e2114105-b451-469b-ad97-eb3cbe2af54e" is not enabled for mirroring' observedGeneration: 1 reason: Error status: "False" type: DataReady Signed-off-by: Nir Soffer --- internal/controller/vrg_volrep.go | 91 +++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/internal/controller/vrg_volrep.go b/internal/controller/vrg_volrep.go index 5b41ab12f..ee2e2695f 100644 --- a/internal/controller/vrg_volrep.go +++ b/internal/controller/vrg_volrep.go @@ -1430,20 +1430,27 @@ func (v *VRGInstance) checkVRStatus(pvc *corev1.PersistentVolumeClaim, volRep *v // // We handle 3 cases: // - Primary deleted VRG: If Validated condition exists and false, the VR will never complete and can be -// deleted safely. Otherwise Completed condition is checked. -// - Primary VRG: Completed condition is checked. +// deleted safely. +// - Primary VRG: Validated condition is checked, and if successful the Completed conditions is also checked. // - Secondary VRG: Completed, Degraded and Resyncing conditions are checked and ensured healthy. func (v *VRGInstance) validateVRStatus(pvc *corev1.PersistentVolumeClaim, volRep *volrep.VolumeReplication, state ramendrv1alpha1.ReplicationState, ) bool { - // Check validated for primary during VRG deletion. - if state == ramendrv1alpha1.Primary && rmnutil.ResourceIsDeleted(v.instance) { - validated, ok := v.validateVRValidatedStatus(volRep) - if !validated && ok { - v.log.Info(fmt.Sprintf("VolumeReplication %s/%s failed validation and can be deleted", - volRep.GetName(), volRep.GetNamespace())) + // If primary, check the validated condition. + if state == ramendrv1alpha1.Primary { + validated, condState := v.validateVRValidatedStatus(pvc, volRep) + if !validated && condState != conditionMissing { + // If the condition is known, this VR will never complete since it failed initial validation. + if condState == conditionKnown { + v.log.Info(fmt.Sprintf("VolumeReplication %s/%s failed validation and can be deleted", + volRep.GetName(), volRep.GetNamespace())) + + // If the VRG is deleted the VR has reached the desired state. + return rmnutil.ResourceIsDeleted(v.instance) + } - return true + // The condition is stale or unknown so we need to check again later. + return false } } @@ -1467,19 +1474,25 @@ func (v *VRGInstance) validateVRStatus(pvc *corev1.PersistentVolumeClaim, volRep return true } -// validateVRValidatedStatus validates that VolumeReplicaion resource was validated. -// Return 2 booleans +// validateVRValidatedStatus validates that VolumeReplication resource was validated. +// Returns 2 values: // - validated: true if the condition is true, otherwise false -// - ok: true if the check was succeesfull, false if the condition is missing, stale, or unknown. +// - state: condition state func (v *VRGInstance) validateVRValidatedStatus( + pvc *corev1.PersistentVolumeClaim, volRep *volrep.VolumeReplication, -) (bool, bool) { - conditionMet, errorMsg := isVRConditionMet(volRep, volrep.ConditionValidated, metav1.ConditionTrue) - if errorMsg != "" { - v.log.Info(fmt.Sprintf("%s (VolRep: %s/%s)", errorMsg, volRep.GetName(), volRep.GetNamespace())) +) (bool, conditionState) { + conditionMet, condState, errorMsg := isVRConditionMet(volRep, volrep.ConditionValidated, metav1.ConditionTrue) + if !conditionMet && condState != conditionMissing { + defaultMsg := "VolumeReplication resource not validated" + v.updatePVCDataReadyConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, errorMsg, + defaultMsg) + v.updatePVCDataProtectedConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, errorMsg, + defaultMsg) + v.log.Info(fmt.Sprintf("%s (VolRep: %s/%s)", defaultMsg, volRep.GetName(), volRep.GetNamespace())) } - return conditionMet, errorMsg == "" + return conditionMet, condState } // validateVRCompletedStatus validates if the VolumeReplication resource Completed condition is met and update @@ -1502,7 +1515,7 @@ func (v *VRGInstance) validateVRCompletedStatus(pvc *corev1.PersistentVolumeClai action = "demoted" } - conditionMet, msg := isVRConditionMet(volRep, volrep.ConditionCompleted, metav1.ConditionTrue) + conditionMet, _, msg := isVRConditionMet(volRep, volrep.ConditionCompleted, metav1.ConditionTrue) if !conditionMet { defaultMsg := fmt.Sprintf("VolumeReplication resource for pvc not %s to %s", action, stateString) v.updatePVCDataReadyConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg, @@ -1539,12 +1552,12 @@ func (v *VRGInstance) validateAdditionalVRStatusForSecondary(pvc *corev1.Persist ) bool { v.updatePVCLastSyncCounters(pvc.Namespace, pvc.Name, nil) - conditionMet, _ := isVRConditionMet(volRep, volrep.ConditionResyncing, metav1.ConditionTrue) + conditionMet, _, _ := isVRConditionMet(volRep, volrep.ConditionResyncing, metav1.ConditionTrue) if !conditionMet { return v.checkResyncCompletionAsSecondary(pvc, volRep) } - conditionMet, msg := isVRConditionMet(volRep, volrep.ConditionDegraded, metav1.ConditionTrue) + conditionMet, _, msg := isVRConditionMet(volRep, volrep.ConditionDegraded, metav1.ConditionTrue) if !conditionMet { defaultMsg := "VolumeReplication resource for pvc is not in Degraded condition while resyncing" v.updatePVCDataProtectedConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg, @@ -1573,7 +1586,7 @@ func (v *VRGInstance) validateAdditionalVRStatusForSecondary(pvc *corev1.Persist func (v *VRGInstance) checkResyncCompletionAsSecondary(pvc *corev1.PersistentVolumeClaim, volRep *volrep.VolumeReplication, ) bool { - conditionMet, msg := isVRConditionMet(volRep, volrep.ConditionResyncing, metav1.ConditionFalse) + conditionMet, _, msg := isVRConditionMet(volRep, volrep.ConditionResyncing, metav1.ConditionFalse) if !conditionMet { defaultMsg := "VolumeReplication resource for pvc not syncing as Secondary" v.updatePVCDataReadyConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg, @@ -1587,7 +1600,7 @@ func (v *VRGInstance) checkResyncCompletionAsSecondary(pvc *corev1.PersistentVol return false } - conditionMet, msg = isVRConditionMet(volRep, volrep.ConditionDegraded, metav1.ConditionFalse) + conditionMet, _, msg = isVRConditionMet(volRep, volrep.ConditionDegraded, metav1.ConditionFalse) if !conditionMet { defaultMsg := "VolumeReplication resource for pvc is not syncing and is degraded as Secondary" v.updatePVCDataReadyConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg, @@ -1611,35 +1624,55 @@ func (v *VRGInstance) checkResyncCompletionAsSecondary(pvc *corev1.PersistentVol return true } -// isVRConditionMet returns true if the condition is met, and an error mesage if we could not get the -// condition value. +type conditionState string + +const ( + conditionMissing = conditionState("missing") + conditionStale = conditionState("stale") + conditionUnknown = conditionState("unknown") + conditionKnown = conditionState("known") +) + +// isVRConditionMet check if condition is met. +// Returns 3 values: +// - met: true if the condition status matches the desired status, otherwise false +// - state: one of (conditionMissing, conditionStale, conditionUnknown, conditionKnown) +// generation, and its value is not unknown. +// - errorMsg: error message describing why the condition is not met func isVRConditionMet(volRep *volrep.VolumeReplication, conditionType string, desiredStatus metav1.ConditionStatus, -) (bool, string) { +) (bool, conditionState, string) { + met := true + volRepCondition := findCondition(volRep.Status.Conditions, conditionType) if volRepCondition == nil { errorMsg := fmt.Sprintf("Failed to get the %s condition from status of VolumeReplication resource.", conditionType) - return false, errorMsg + return !met, conditionMissing, errorMsg } if volRep.GetGeneration() != volRepCondition.ObservedGeneration { errorMsg := fmt.Sprintf("Stale generation for condition %s from status of VolumeReplication resource.", conditionType) - return false, errorMsg + return !met, conditionStale, errorMsg } if volRepCondition.Status == metav1.ConditionUnknown { errorMsg := fmt.Sprintf("Unknown status for condition %s from status of VolumeReplication resource.", conditionType) - return false, errorMsg + return !met, conditionUnknown, errorMsg + } + + if volRepCondition.Status != desiredStatus { + // csi-addons > 0.10.0 returns detailed error message + return !met, conditionKnown, volRepCondition.Message } - return volRepCondition.Status == desiredStatus, "" + return met, conditionKnown, "" } // Disabling unparam linter as currently every invokation of this From aa0a819415cfb9bb5a22cd09434cec9e09434b66 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 6 Nov 2024 20:35:08 +0200 Subject: [PATCH 4/4] Test propagating VR conditions message Verify that when VR Validated condition is False, the condition message is propagated to the protected pvc DataReady condition message. To make this easy to test, we have a new helper for waiting until protected pvc condition status and message are updated to specified values. We propagate the same message to the DataProtected condition, but this behavior seems like unwanted behavior that should change and is not worth testing. Signed-off-by: Nir Soffer --- internal/controller/vrg_volrep_test.go | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/internal/controller/vrg_volrep_test.go b/internal/controller/vrg_volrep_test.go index 83138d0f5..965c82646 100644 --- a/internal/controller/vrg_volrep_test.go +++ b/internal/controller/vrg_volrep_test.go @@ -654,6 +654,19 @@ var _ = Describe("VolumeReplicationGroupVolRepController", func() { It("simulate VR with failed validation", func() { vrgDeleteFailedVR.promoteVolRepsWithOptions(promoteOptions{ValidatedFailed: true}) }) + It("propagate VR condition message to protected pvc conditions", func() { + vrName := vrgDeleteFailedVR.pvcNames[0] + vr := volrep.VolumeReplication{} + Expect(k8sClient.Get(context.TODO(), vrName, &vr)).To(Succeed()) + validated := meta.FindStatusCondition(vr.Status.Conditions, volrep.ConditionValidated) + Expect(validated).NotTo(BeNil()) + vrgDeleteFailedVR.waitForProtectedPVCCondition( + vrName, + vrgController.VRGConditionTypeDataReady, + metav1.ConditionFalse, + validated.Message, + ) + }) It("VRG can be deleted", func() { By("deleting the VRG") vrg := vrgDeleteFailedVR.getVRG() @@ -2532,6 +2545,30 @@ func (v *vrgTest) waitForVolRepCondition( "failed to wait for volRep condition %q to become %q", conditionType, conditionStatus) } +func (v *vrgTest) waitForProtectedPVCCondition( + key types.NamespacedName, + conditionType string, + conditionStatus metav1.ConditionStatus, + conditionMessage string, +) { + Eventually(func() bool { + vrg := v.getVRG() + protectedPVC := vrgController.FindProtectedPVC(vrg, key.Namespace, key.Name) + if protectedPVC == nil { + return false + } + + condition := meta.FindStatusCondition(protectedPVC.Conditions, conditionType) + if condition == nil { + return false + } + + return condition.Status == conditionStatus && condition.Message == conditionMessage + }, vrgtimeout, vrginterval).Should(BeTrue(), + "failed to wait for protected pvc condition %q to become %q with message %q", + conditionType, conditionStatus, conditionMessage) +} + func (v *vrgTest) waitForProtectedPVCs(vrNamespacedName types.NamespacedName) { Eventually(func() bool { vrg := v.getVRG()