From 75e52489139b3938d279c4d33565e6b90a827712 Mon Sep 17 00:00:00 2001 From: matthew-richerson <82597529+matthew-richerson@users.noreply.github.com> Date: Wed, 2 Oct 2024 19:26:02 -0500 Subject: [PATCH] Wait for persistent storage owner label changes in teardown (#396) The workflow controller add/removes owner labels on the PersistentStorageInstance resource in the teardown phase of the create_persistent/destroy_persistent directives. This is so that the later call to DeleteChildren() will find (or not find) the persistent storage and delete it if necessary. The call to DeleteChildren() may do the wrong thing if the PersistentStorageInstance resource in the cache is stale. This commit adds a check after the labels are changed to make sure the changes are visible in our client cache. Also, change the Requeue while waiting for children to delete to a RequeueAfter. Fix a bug in the NnfSystemStorage and NnfAccess tests. The Storage resource are created by the SystemConfiguration controller, so we don't need to create or delete them Signed-off-by: Matt Richerson --- .../controller/nnf_access_controller_test.go | 53 ++++----- .../controller/nnf_workflow_controller.go | 33 +++++- .../nnfsystemstorage_controller_test.go | 106 +++++++++--------- 3 files changed, 105 insertions(+), 87 deletions(-) diff --git a/internal/controller/nnf_access_controller_test.go b/internal/controller/nnf_access_controller_test.go index a41d9976..534897b3 100644 --- a/internal/controller/nnf_access_controller_test.go +++ b/internal/controller/nnf_access_controller_test.go @@ -44,7 +44,6 @@ var _ = Describe("Access Controller Test", func() { "rabbit-nnf-access-test-node-2"} nnfNodes := [2]*nnfv1alpha2.NnfNode{} - storages := [2]*dwsv1alpha2.Storage{} nodes := [2]*corev1.Node{} var systemConfiguration *dwsv1alpha2.SystemConfiguration @@ -59,6 +58,26 @@ var _ = Describe("Access Controller Test", func() { } }) + systemConfiguration = &dwsv1alpha2.SystemConfiguration{ + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "default", + Namespace: corev1.NamespaceDefault, + }, + Spec: dwsv1alpha2.SystemConfigurationSpec{ + StorageNodes: []dwsv1alpha2.SystemConfigurationStorageNode{ + { + Type: "Rabbit", + Name: "rabbit-nnf-access-test-node-1", + }, + { + Type: "Rabbit", + Name: "rabbit-nnf-access-test-node-2", + }, + }, + }, + } + for i, nodeName := range nodeNames { // Create the node - set it to up as ready nodes[i] = &corev1.Node{ @@ -98,34 +117,16 @@ var _ = Describe("Access Controller Test", func() { return k8sClient.Update(context.TODO(), nnfNodes[i]) }).Should(Succeed(), "set LNet Nid in NnfNode") - storages[i] = &dwsv1alpha2.Storage{ + storage := &dwsv1alpha2.Storage{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, Namespace: corev1.NamespaceDefault, }, } - Expect(k8sClient.Create(context.TODO(), storages[i])).To(Succeed()) - } - - systemConfiguration = &dwsv1alpha2.SystemConfiguration{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{ - Name: "default", - Namespace: corev1.NamespaceDefault, - }, - Spec: dwsv1alpha2.SystemConfigurationSpec{ - StorageNodes: []dwsv1alpha2.SystemConfigurationStorageNode{ - { - Type: "Rabbit", - Name: "rabbit-nnf-access-test-node-1", - }, - { - Type: "Rabbit", - Name: "rabbit-nnf-access-test-node-2", - }, - }, - }, + Eventually(func() error { // wait until the SystemConfiguration controller creates the storage + return k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(storage), storage) + }).ShouldNot(Succeed()) } Expect(k8sClient.Create(context.TODO(), systemConfiguration)).To(Succeed()) @@ -142,12 +143,6 @@ var _ = Describe("Access Controller Test", func() { }).ShouldNot(Succeed()) for i := range nodeNames { - Expect(k8sClient.Delete(context.TODO(), storages[i])).To(Succeed()) - tempStorage := &dwsv1alpha2.Storage{} - Eventually(func() error { // Delete can still return the cached object. Wait until the object is no longer present - return k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(storages[i]), tempStorage) - }).ShouldNot(Succeed()) - Expect(k8sClient.Delete(context.TODO(), nnfNodes[i])).To(Succeed()) tempNnfNode := &nnfv1alpha2.NnfNode{} Eventually(func() error { // Delete can still return the cached object. Wait until the object is no longer present diff --git a/internal/controller/nnf_workflow_controller.go b/internal/controller/nnf_workflow_controller.go index a4c714d3..730fcc32 100644 --- a/internal/controller/nnf_workflow_controller.go +++ b/internal/controller/nnf_workflow_controller.go @@ -1083,7 +1083,7 @@ func (r *NnfWorkflowReconciler) startTeardownState(ctx context.Context, workflow } if !deleteStatus.Complete() { - return Requeue("delete").withDeleteStatus(deleteStatus), nil + return Requeue("delete").after(5 * time.Second).withDeleteStatus(deleteStatus), nil } } @@ -1122,6 +1122,20 @@ func (r *NnfWorkflowReconciler) finishTeardownState(ctx context.Context, workflo return nil, dwsv1alpha2.NewResourceError("could not update PersistentStorage: %v", client.ObjectKeyFromObject(persistentStorage)).WithError(err).WithUserMessage("could not finalize peristent storage") } log.Info("Removed owner reference from persistent storage", "psi", persistentStorage) + + persistentStorage, err = r.findPersistentInstance(ctx, workflow, dwArgs["name"]) + if err != nil { + return nil, dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("could not find persistent storage %v", dwArgs["name"]) + } + + labels = persistentStorage.GetLabels() + if labels != nil { + if ownerUid, exists := labels[dwsv1alpha2.OwnerUidLabel]; exists { + if types.UID(ownerUid) == workflow.GetUID() { + return Requeue("persistent storage owner release").after(2 * time.Second).withObject(persistentStorage), nil + } + } + } case "destroy_persistent": persistentStorage, err := r.findPersistentInstance(ctx, workflow, dwArgs["name"]) if err != nil { @@ -1156,6 +1170,21 @@ func (r *NnfWorkflowReconciler) finishTeardownState(ctx context.Context, workflo return nil, dwsv1alpha2.NewResourceError("could not update PersistentInstance: %v", client.ObjectKeyFromObject(persistentStorage)).WithError(err).WithUserMessage("could not delete persistent storage %v", dwArgs["name"]) } log.Info("Add owner reference for persistent storage for deletion", "psi", persistentStorage) + + persistentStorage, err = r.findPersistentInstance(ctx, workflow, dwArgs["name"]) + if err != nil { + return nil, dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("could not find persistent storage %v", dwArgs["name"]) + } + + labels := persistentStorage.GetLabels() + if labels == nil { + return Requeue("persistent storage owner add").after(2 * time.Second).withObject(persistentStorage), nil + } + + ownerUid, exists := labels[dwsv1alpha2.OwnerUidLabel] + if !exists || types.UID(ownerUid) != workflow.GetUID() { + return Requeue("persistent storage owner add").after(2 * time.Second).withObject(persistentStorage), nil + } case "persistentdw": err := r.removePersistentStorageReference(ctx, workflow, index) if err != nil { @@ -1181,7 +1210,7 @@ func (r *NnfWorkflowReconciler) finishTeardownState(ctx context.Context, workflo } if !deleteStatus.Complete() { - return Requeue("delete").withDeleteStatus(deleteStatus), nil + return Requeue("delete").after(5 * time.Second).withDeleteStatus(deleteStatus), nil } return nil, nil diff --git a/internal/controller/nnfsystemstorage_controller_test.go b/internal/controller/nnfsystemstorage_controller_test.go index 153da5a8..cdb54c0f 100644 --- a/internal/controller/nnfsystemstorage_controller_test.go +++ b/internal/controller/nnfsystemstorage_controller_test.go @@ -43,7 +43,6 @@ var _ = Describe("NnfSystemStorage Controller Test", func() { "rabbit-systemstorage-node-2"} nnfNodes := [2]*nnfv1alpha2.NnfNode{} - storages := [2]*dwsv1alpha2.Storage{} nodes := [2]*corev1.Node{} var systemConfiguration *dwsv1alpha2.SystemConfiguration @@ -58,55 +57,6 @@ var _ = Describe("NnfSystemStorage Controller Test", func() { } }) - for i, nodeName := range nodeNames { - // Create the node - set it to up as ready - nodes[i] = &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{ - nnfv1alpha2.RabbitNodeSelectorLabel: "true", - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Status: corev1.ConditionTrue, - Type: corev1.NodeReady, - }, - }, - }, - } - - Expect(k8sClient.Create(context.TODO(), nodes[i])).To(Succeed()) - - nnfNodes[i] = &nnfv1alpha2.NnfNode{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{ - Name: "nnf-nlc", - Namespace: nodeName, - }, - Spec: nnfv1alpha2.NnfNodeSpec{ - State: nnfv1alpha2.ResourceEnable, - }, - } - Expect(k8sClient.Create(context.TODO(), nnfNodes[i])).To(Succeed()) - - Eventually(func(g Gomega) error { - g.Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(nnfNodes[i]), nnfNodes[i])).To(Succeed()) - nnfNodes[i].Status.LNetNid = "1.2.3.4@tcp0" - return k8sClient.Update(context.TODO(), nnfNodes[i]) - }).Should(Succeed(), "set LNet Nid in NnfNode") - - storages[i] = &dwsv1alpha2.Storage{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Namespace: corev1.NamespaceDefault, - }, - } - - Expect(k8sClient.Create(context.TODO(), storages[i])).To(Succeed()) - } - systemConfiguration = &dwsv1alpha2.SystemConfiguration{ TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{ @@ -260,6 +210,56 @@ var _ = Describe("NnfSystemStorage Controller Test", func() { } Expect(k8sClient.Create(context.TODO(), systemConfiguration)).To(Succeed()) + for i, nodeName := range nodeNames { + // Create the node - set it to up as ready + nodes[i] = &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + nnfv1alpha2.RabbitNodeSelectorLabel: "true", + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Status: corev1.ConditionTrue, + Type: corev1.NodeReady, + }, + }, + }, + } + + Expect(k8sClient.Create(context.TODO(), nodes[i])).To(Succeed()) + + nnfNodes[i] = &nnfv1alpha2.NnfNode{ + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "nnf-nlc", + Namespace: nodeName, + }, + Spec: nnfv1alpha2.NnfNodeSpec{ + State: nnfv1alpha2.ResourceEnable, + }, + } + Expect(k8sClient.Create(context.TODO(), nnfNodes[i])).To(Succeed()) + + Eventually(func(g Gomega) error { + g.Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(nnfNodes[i]), nnfNodes[i])).To(Succeed()) + nnfNodes[i].Status.LNetNid = "1.2.3.4@tcp0" + return k8sClient.Update(context.TODO(), nnfNodes[i]) + }).Should(Succeed(), "set LNet Nid in NnfNode") + + storage := &dwsv1alpha2.Storage{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Namespace: corev1.NamespaceDefault, + }, + } + + Eventually(func() error { // wait until the SystemConfiguration controller creates the storage + return k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(storage), storage) + }).Should(Succeed()) + } // Create a pinned NnfStorageProfile for the unit tests. storageProfile = createBasicPinnedNnfStorageProfile() @@ -273,12 +273,6 @@ var _ = Describe("NnfSystemStorage Controller Test", func() { }).ShouldNot(Succeed()) for i := range nodeNames { - Expect(k8sClient.Delete(context.TODO(), storages[i])).To(Succeed()) - tempStorage := &dwsv1alpha2.Storage{} - Eventually(func() error { // Delete can still return the cached object. Wait until the object is no longer present - return k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(storages[i]), tempStorage) - }).ShouldNot(Succeed()) - Expect(k8sClient.Delete(context.TODO(), nnfNodes[i])).To(Succeed()) tempNnfNode := &nnfv1alpha2.NnfNode{} Eventually(func() error { // Delete can still return the cached object. Wait until the object is no longer present