From aba25da7653a04e04e6df997fb5fc232512b084a Mon Sep 17 00:00:00 2001
From: matthew-richerson <82597529+matthew-richerson@users.noreply.github.com>
Date: Thu, 7 Nov 2024 08:47:34 -0600
Subject: [PATCH] Only aggregate status from children that would be created
 using the current spec (#410)

For NnfStorage and NnfAccess resources created by the NnfSystemStorage, the
spec section may change as Storage resources are disabled and enabled. When
aggregating status from child objects (NnfNodeBlockStorage, NnfNodeStorage,
and ClientMount), only check the status of child resources that are currently
requested by the spec. This avoids trying to collect status from Rabbits that
are disabled.

Signed-off-by: Matt Richerson
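
The pattern used in each aggregation function below boils down to a
set-membership filter over the spec. A minimal, self-contained Go sketch of
that pattern (the types and names here are invented for illustration; the
real code filters NnfNodeBlockStorage, NnfNodeStorage, and ClientMount lists
by namespace):

	package main

	import "fmt"

	// child stands in for a listed child resource; each child lives in the
	// namespace named after the Rabbit node it was created for.
	type child struct {
		namespace string
	}

	// filterBySpec keeps only the children whose namespace is a node name
	// the current spec still requests, dropping leftovers from disabled
	// nodes before any status is aggregated.
	func filterBySpec(specNodes []string, listed []child) []child {
		// empty struct values make the map a set with no per-entry payload
		want := map[string]struct{}{}
		for _, name := range specNodes {
			want[name] = struct{}{}
		}

		kept := []child{}
		for _, c := range listed {
			if _, exists := want[c.namespace]; exists {
				kept = append(kept, c)
			}
		}
		return kept
	}

	func main() {
		spec := []string{"rabbit-1", "rabbit-3"} // rabbit-2 is disabled
		listed := []child{{"rabbit-1"}, {"rabbit-2"}, {"rabbit-3"}}
		fmt.Println(len(filterBySpec(spec, listed))) // prints 2
	}

The count check at the end of each aggregation function then compares the
pruned list against the spec, so a disabled Rabbit no longer makes the counts
disagree.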
---
 go.mod                                        |  2 +-
 go.sum                                        |  4 ++--
 internal/controller/nnf_access_controller.go  | 39 +++++++++------
 internal/controller/nnf_storage_controller.go | 48 +++++++++++++++----
 .../dws/api/v1alpha2/owner_labels.go          |  1 +
 vendor/modules.txt                            |  2 +-
 6 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/go.mod b/go.mod
index cc8b41979..edb41aea4 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/NearNodeFlash/nnf-sos
 go 1.21
 
 require (
-	github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd
+	github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640
 	github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20240925185149-26d9d6071a1c
 	github.com/NearNodeFlash/nnf-ec v0.0.1-0.20241017152925-afc4d0cf1a4b
 	github.com/ghodss/yaml v1.0.0
diff --git a/go.sum b/go.sum
index 96657563c..7bb263ea8 100644
--- a/go.sum
+++ b/go.sum
@@ -1,7 +1,7 @@
 cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
-github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd h1:/bSL8rV9cB13b3I5SCf20t4q3ab91i533sPcYIISI+8=
-github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd/go.mod h1:6MrEEHISskyooSKcKU6R3mFqH6Yh6KzWgajhcw2s+nM=
+github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640 h1:JSjgesWkPo9sAc7QkjWisNDOlIOGR0MQX/hxXL56FTA=
+github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640/go.mod h1:6MrEEHISskyooSKcKU6R3mFqH6Yh6KzWgajhcw2s+nM=
 github.com/HewlettPackard/structex v1.0.4 h1:RVTdN5FWhDWr1IkjllU8wxuLjISo4gr6u5ryZpzyHcA=
 github.com/HewlettPackard/structex v1.0.4/go.mod h1:3frC4RY/cPsP/4+N8rkxsNAGlQwHV+zDC7qvrN+N+rE=
 github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20240925185149-26d9d6071a1c h1:fSuMz3j8UzlYZI59Ded8XuUjYd7C5IyLB55jwgSTIew=
diff --git a/internal/controller/nnf_access_controller.go b/internal/controller/nnf_access_controller.go
index 4fd42426e..426b793e1 100644
--- a/internal/controller/nnf_access_controller.go
+++ b/internal/controller/nnf_access_controller.go
@@ -1039,25 +1039,28 @@ func (r *NnfAccessReconciler) getClientMountStatus(ctx context.Context, access *
 		return true, nil
 	}
 
-	clientMounts := []dwsv1alpha2.ClientMount{}
+	clientMountList := &dwsv1alpha2.ClientMountList{}
+	matchLabels := dwsv1alpha2.MatchingOwner(access)
+
+	listOptions := []client.ListOption{
+		matchLabels,
+	}
+
+	if err := r.List(ctx, clientMountList, listOptions...); err != nil {
+		return false, dwsv1alpha2.NewResourceError("could not list ClientMounts").WithError(err)
+	}
+
+	// make a map with empty data of the client names to allow easy searching
+	clientNameMap := map[string]struct{}{}
 	for _, clientName := range clientList {
-		clientMount := &dwsv1alpha2.ClientMount{
-			ObjectMeta: metav1.ObjectMeta{
-				Name:      clientMountName(access),
-				Namespace: clientName,
-			},
-		}
-		err := r.Get(ctx, client.ObjectKeyFromObject(clientMount), clientMount)
-		if err != nil {
-			if apierrors.IsNotFound(err) {
-				return false, nil
-			}
+		clientNameMap[clientName] = struct{}{}
+	}
 
-			return false, err
+	clientMounts := []dwsv1alpha2.ClientMount{}
+	for _, clientMount := range clientMountList.Items {
+		if _, exists := clientNameMap[clientMount.GetNamespace()]; exists {
+			clientMounts = append(clientMounts, clientMount)
 		}
-
-		clientMounts = append(clientMounts, *clientMount)
 	}
 
 	// Check the clientmounts for any errors first
@@ -1106,6 +1109,12 @@ func (r *NnfAccessReconciler) getClientMountStatus(ctx context.Context, access *
 		}
 	}
 
+	if len(clientMounts) != len(clientList) {
+		if access.GetDeletionTimestamp().IsZero() {
+			log.Info("unexpected number of ClientMounts", "found", len(clientMounts), "expected", len(clientList))
+		}
+		return false, nil
+	}
 	return true, nil
 }
diff --git a/internal/controller/nnf_storage_controller.go b/internal/controller/nnf_storage_controller.go
index 560d2d06b..360ae828b 100644
--- a/internal/controller/nnf_storage_controller.go
+++ b/internal/controller/nnf_storage_controller.go
@@ -418,7 +418,22 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
 		return &ctrl.Result{}, dwsv1alpha2.NewResourceError("could not list NnfNodeBlockStorages").WithError(err)
 	}
 
+	// make a map with empty data of the Rabbit names to allow easy searching
+	nodeNameMap := map[string]struct{}{}
+	for _, node := range nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes {
+		nodeNameMap[node.Name] = struct{}{}
+	}
+
+	// prune out any entries that aren't in the NnfStorage. This can happen if the NnfStorage was modified
+	// after it was created, as is the case with NnfStorages from an NnfSystemStorage
+	nnfNodeBlockStorages := []nnfv1alpha3.NnfNodeBlockStorage{}
 	for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
+		if _, exists := nodeNameMap[nnfNodeBlockStorage.GetNamespace()]; exists {
+			nnfNodeBlockStorages = append(nnfNodeBlockStorages, nnfNodeBlockStorage)
+		}
+	}
+
+	for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
 		for _, nodeAllocation := range nnfNodeBlockStorage.Status.Allocations {
 			if nodeAllocation.CapacityAllocated > 0 {
 				allocationSet.AllocationCount++
@@ -426,7 +441,7 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
 		}
 	}
 
-	for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
+	for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
 		if nnfNodeBlockStorage.Status.Error != nil {
 			return &ctrl.Result{}, dwsv1alpha2.NewResourceError("Node: %s", nnfNodeBlockStorage.GetNamespace()).WithError(nnfNodeBlockStorage.Status.Error)
 		}
@@ -440,7 +455,7 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
 		childTimeout = 300
 	}
 
-	for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
+	for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
 		// check if the finalizer has been added by the controller on the Rabbit
 		if len(nnfNodeBlockStorage.GetFinalizers()) > 0 {
 			continue
@@ -454,7 +469,7 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
 		}
 	}
 
-	for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
+	for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
 		if nnfNodeBlockStorage.Status.Ready == false {
 			return &ctrl.Result{}, nil
 		}
 	}
@@ -463,9 +478,9 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
 	// Ensure that we found all the NnfNodeBlockStorage resources we were expecting. This can be expected
 	// transiently as it takes time for the client cache to be updated. Log a message in case the count
 	// never reaches the expected value.
-	if len(nnfNodeBlockStorageList.Items) != len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes) {
+	if len(nnfNodeBlockStorages) != len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes) {
 		if nnfStorage.GetDeletionTimestamp().IsZero() {
-			log.Info("unexpected number of NnfNodeBlockStorages", "found", len(nnfNodeBlockStorageList.Items), "expected", len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes))
+			log.Info("unexpected number of NnfNodeBlockStorages", "found", len(nnfNodeBlockStorages), "expected", len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes))
 		}
 		return &ctrl.Result{}, nil
 	}
@@ -642,7 +657,22 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
 		return &ctrl.Result{}, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err)
 	}
 
+	// make a map with empty data of the Rabbit names to allow easy searching
+	nodeNameMap := map[string]struct{}{}
+	for _, node := range storage.Spec.AllocationSets[allocationSetIndex].Nodes {
+		nodeNameMap[node.Name] = struct{}{}
+	}
+
+	// prune out any entries that aren't in the NnfStorage. This can happen if the NnfStorage was modified
+	// after it was created, as is the case with NnfStorages from an NnfSystemStorage
+	nnfNodeStorages := []nnfv1alpha3.NnfNodeStorage{}
 	for _, nnfNodeStorage := range nnfNodeStorageList.Items {
+		if _, exists := nodeNameMap[nnfNodeStorage.GetNamespace()]; exists {
+			nnfNodeStorages = append(nnfNodeStorages, nnfNodeStorage)
+		}
+	}
+
+	for _, nnfNodeStorage := range nnfNodeStorages {
 		// If we're in the delete path, only propagate errors for storages that are deleting. Errors
 		// from creation aren't interesting anymore
 		if deleting && nnfNodeStorage.GetDeletionTimestamp().IsZero() {
@@ -661,7 +691,7 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
 		childTimeout = 300
 	}
 
-	for _, nnfNodeStorage := range nnfNodeStorageList.Items {
+	for _, nnfNodeStorage := range nnfNodeStorages {
 		// check if the finalizer has been added by the controller on the Rabbit
 		if len(nnfNodeStorage.GetFinalizers()) > 0 {
 			continue
@@ -675,7 +705,7 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
 		}
 	}
 
-	for _, nnfNodeStorage := range nnfNodeStorageList.Items {
+	for _, nnfNodeStorage := range nnfNodeStorages {
 		if nnfNodeStorage.Status.Ready == false {
 			return &ctrl.Result{}, nil
 		}
 	}
@@ -684,9 +714,9 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
 	// Ensure that we found all the NnfNodeStorage resources we were expecting. This can be expected
 	// transiently as it takes time for the client cache to be updated. Log a message in case the count
 	// never reaches the expected value.
-	if len(nnfNodeStorageList.Items) != len(storage.Spec.AllocationSets[allocationSetIndex].Nodes) {
+	if len(nnfNodeStorages) != len(storage.Spec.AllocationSets[allocationSetIndex].Nodes) {
 		if storage.GetDeletionTimestamp().IsZero() {
-			log.Info("unexpected number of NnfNodeStorages", "found", len(nnfNodeStorageList.Items), "expected", len(storage.Spec.AllocationSets[allocationSetIndex].Nodes))
+			log.Info("unexpected number of NnfNodeStorages", "found", len(nnfNodeStorages), "expected", len(storage.Spec.AllocationSets[allocationSetIndex].Nodes))
 		}
 		return &ctrl.Result{}, nil
 	}
diff --git a/vendor/github.com/DataWorkflowServices/dws/api/v1alpha2/owner_labels.go b/vendor/github.com/DataWorkflowServices/dws/api/v1alpha2/owner_labels.go
index 2af9d95b2..313f618c4 100644
--- a/vendor/github.com/DataWorkflowServices/dws/api/v1alpha2/owner_labels.go
+++ b/vendor/github.com/DataWorkflowServices/dws/api/v1alpha2/owner_labels.go
@@ -63,6 +63,7 @@ func MatchingOwner(owner metav1.Object) client.MatchingLabels {
 		OwnerKindLabel:      reflect.Indirect(reflect.ValueOf(owner)).Type().Name(),
 		OwnerNameLabel:      owner.GetName(),
 		OwnerNamespaceLabel: owner.GetNamespace(),
+		OwnerUidLabel:       string(owner.GetUID()),
 	})
 }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index fbdd4a5a2..d14d11a7a 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1,4 +1,4 @@
-# github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd
+# github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640
 ## explicit; go 1.21
 github.com/DataWorkflowServices/dws/api/v1alpha2
 github.com/DataWorkflowServices/dws/config/crd/bases
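
A note on the vendored dws change above: MatchingOwner now also matches on
the owner's UID, so a List selects only children created by the current
instance of the owner, not by an earlier owner that happened to reuse the
same name and namespace. A toy, self-contained illustration of the effect
(the struct and field names below are invented; the real code matches on
owner labels):

	package main

	import "fmt"

	// ownerRef stands in for the owner labels stamped on a child resource.
	type ownerRef struct {
		kind, name, namespace, uid string
	}

	// matches mirrors label selection: with the UID included, children
	// left behind by a deleted owner of the same name no longer match.
	func matches(child, owner ownerRef) bool {
		return child == owner
	}

	func main() {
		owner := ownerRef{"NnfAccess", "system-mounts", "default", "uid-new"}
		stale := ownerRef{"NnfAccess", "system-mounts", "default", "uid-old"}

		fmt.Println(matches(owner, owner)) // true: same owner instance
		fmt.Println(matches(stale, owner)) // false: same name, stale UID
	}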