Skip to content

Commit

Permalink
Only aggregate status from children that would be created using the c…
Browse files Browse the repository at this point in the history
…urrent spec (#410)

For NnfStorage and NnfAccess resources created by the NnfSystemStorage, the spec section may
change as Storage resources are disabled/enabled. When aggregating status from child objects
(NnfNodeBlockStorage, NnfNodeStorage, and ClientMounts), only check the status from child
resources that are currently requested by the spec. This avoids trying to collect status
from Rabbits that are disabled.

Signed-off-by: Matt Richerson <[email protected]>
  • Loading branch information
matthew-richerson authored Nov 7, 2024
1 parent ed4c94b commit aba25da
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 28 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NearNodeFlash/nnf-sos
go 1.21

require (
github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd
github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640
github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20240925185149-26d9d6071a1c
github.com/NearNodeFlash/nnf-ec v0.0.1-0.20241017152925-afc4d0cf1a4b
github.com/ghodss/yaml v1.0.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd h1:/bSL8rV9cB13b3I5SCf20t4q3ab91i533sPcYIISI+8=
github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd/go.mod h1:6MrEEHISskyooSKcKU6R3mFqH6Yh6KzWgajhcw2s+nM=
github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640 h1:JSjgesWkPo9sAc7QkjWisNDOlIOGR0MQX/hxXL56FTA=
github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640/go.mod h1:6MrEEHISskyooSKcKU6R3mFqH6Yh6KzWgajhcw2s+nM=
github.com/HewlettPackard/structex v1.0.4 h1:RVTdN5FWhDWr1IkjllU8wxuLjISo4gr6u5ryZpzyHcA=
github.com/HewlettPackard/structex v1.0.4/go.mod h1:3frC4RY/cPsP/4+N8rkxsNAGlQwHV+zDC7qvrN+N+rE=
github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20240925185149-26d9d6071a1c h1:fSuMz3j8UzlYZI59Ded8XuUjYd7C5IyLB55jwgSTIew=
Expand Down
39 changes: 24 additions & 15 deletions internal/controller/nnf_access_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1039,25 +1039,28 @@ func (r *NnfAccessReconciler) getClientMountStatus(ctx context.Context, access *
return true, nil
}

clientMounts := []dwsv1alpha2.ClientMount{}
clientMountList := &dwsv1alpha2.ClientMountList{}
matchLabels := dwsv1alpha2.MatchingOwner(access)

listOptions := []client.ListOption{
matchLabels,
}

if err := r.List(ctx, clientMountList, listOptions...); err != nil {
return false, dwsv1alpha2.NewResourceError("could not list ClientMounts").WithError(err)
}

// make a map with empty data of the client names to allow easy searching
clientNameMap := map[string]struct{}{}
for _, clientName := range clientList {
clientMount := &dwsv1alpha2.ClientMount{
ObjectMeta: metav1.ObjectMeta{
Name: clientMountName(access),
Namespace: clientName,
},
}
err := r.Get(ctx, client.ObjectKeyFromObject(clientMount), clientMount)
if err != nil {
if apierrors.IsNotFound(err) {
return false, nil
}
clientNameMap[clientName] = struct{}{}
}

return false, err
clientMounts := []dwsv1alpha2.ClientMount{}
for _, clientMount := range clientMountList.Items {
if _, exists := clientNameMap[clientMount.GetNamespace()]; exists {
clientMounts = append(clientMounts, clientMount)
}

clientMounts = append(clientMounts, *clientMount)
}

// Check the clientmounts for any errors first
Expand Down Expand Up @@ -1106,6 +1109,12 @@ func (r *NnfAccessReconciler) getClientMountStatus(ctx context.Context, access *
}
}

if len(clientMounts) != len(clientList) {
if access.GetDeletionTimestamp().IsZero() {
log.Info("unexpected number of ClientMounts", "found", len(clientMounts), "expected", len(clientList))
}
return false, nil
}
return true, nil
}

Expand Down
48 changes: 39 additions & 9 deletions internal/controller/nnf_storage_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,15 +418,30 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
return &ctrl.Result{}, dwsv1alpha2.NewResourceError("could not list NnfNodeBlockStorages").WithError(err)
}

// make a map with empty data of the Rabbit names to allow easy searching
nodeNameMap := map[string]struct{}{}
for _, node := range nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes {
nodeNameMap[node.Name] = struct{}{}
}

// prune out any entries that aren't in the NnfStorage. This can happen if the NnfStorage was modified
// after it was created, as is the case with NnfStorages from an NnfSystemStorage
nnfNodeBlockStorages := []nnfv1alpha3.NnfNodeBlockStorage{}
for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
if _, exists := nodeNameMap[nnfNodeBlockStorage.GetNamespace()]; exists {
nnfNodeBlockStorages = append(nnfNodeBlockStorages, nnfNodeBlockStorage)
}
}

for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
for _, nodeAllocation := range nnfNodeBlockStorage.Status.Allocations {
if nodeAllocation.CapacityAllocated > 0 {
allocationSet.AllocationCount++
}
}
}

for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
if nnfNodeBlockStorage.Status.Error != nil {
return &ctrl.Result{}, dwsv1alpha2.NewResourceError("Node: %s", nnfNodeBlockStorage.GetNamespace()).WithError(nnfNodeBlockStorage.Status.Error)
}
Expand All @@ -440,7 +455,7 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
childTimeout = 300
}

for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
// check if the finalizer has been added by the controller on the Rabbit
if len(nnfNodeBlockStorage.GetFinalizers()) > 0 {
continue
Expand All @@ -454,7 +469,7 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
}
}

for _, nnfNodeBlockStorage := range nnfNodeBlockStorageList.Items {
for _, nnfNodeBlockStorage := range nnfNodeBlockStorages {
if nnfNodeBlockStorage.Status.Ready == false {
return &ctrl.Result{}, nil
}
Expand All @@ -463,9 +478,9 @@ func (r *NnfStorageReconciler) aggregateNodeBlockStorageStatus(ctx context.Conte
// Ensure that we found all the NnfNodeBlockStorage resources we were expecting. This can be expected
// transiently as it takes time for the client cache to be updated. Log a message in case the count
// never reaches the expected value.
if len(nnfNodeBlockStorageList.Items) != len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes) {
if len(nnfNodeBlockStorages) != len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes) {
if nnfStorage.GetDeletionTimestamp().IsZero() {
log.Info("unexpected number of NnfNodeBlockStorages", "found", len(nnfNodeBlockStorageList.Items), "expected", len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes))
log.Info("unexpected number of NnfNodeBlockStorages", "found", len(nnfNodeBlockStorages), "expected", len(nnfStorage.Spec.AllocationSets[allocationSetIndex].Nodes))
}
return &ctrl.Result{}, nil
}
Expand Down Expand Up @@ -642,7 +657,22 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
return &ctrl.Result{}, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err)
}

// make a map with empty data of the Rabbit names to allow easy searching
nodeNameMap := map[string]struct{}{}
for _, node := range storage.Spec.AllocationSets[allocationSetIndex].Nodes {
nodeNameMap[node.Name] = struct{}{}
}

// prune out any entries that aren't in the NnfStorage. This can happen if the NnfStorage was modified
// after it was created, as is the case with NnfStorages from an NnfSystemStorage
nnfNodeStorages := []nnfv1alpha3.NnfNodeStorage{}
for _, nnfNodeStorage := range nnfNodeStorageList.Items {
if _, exists := nodeNameMap[nnfNodeStorage.GetNamespace()]; exists {
nnfNodeStorages = append(nnfNodeStorages, nnfNodeStorage)
}
}

for _, nnfNodeStorage := range nnfNodeStorages {
// If we're in the delete path, only propagate errors for storages that are deleting. Errors
// from creation aren't interesting anymore
if deleting && nnfNodeStorage.GetDeletionTimestamp().IsZero() {
Expand All @@ -661,7 +691,7 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
childTimeout = 300
}

for _, nnfNodeStorage := range nnfNodeStorageList.Items {
for _, nnfNodeStorage := range nnfNodeStorages {
// check if the finalizer has been added by the controller on the Rabbit
if len(nnfNodeStorage.GetFinalizers()) > 0 {
continue
Expand All @@ -675,7 +705,7 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
}
}

for _, nnfNodeStorage := range nnfNodeStorageList.Items {
for _, nnfNodeStorage := range nnfNodeStorages {
if nnfNodeStorage.Status.Ready == false {
return &ctrl.Result{}, nil
}
Expand All @@ -684,9 +714,9 @@ func (r *NnfStorageReconciler) aggregateNodeStorageStatus(ctx context.Context, s
// Ensure that we found all the NnfNodeStorage resources we were expecting. This can be expected
// transiently as it takes time for the client cache to be updated. Log a message in case the count
// never reaches the expected value.
if len(nnfNodeStorageList.Items) != len(storage.Spec.AllocationSets[allocationSetIndex].Nodes) {
if len(nnfNodeStorages) != len(storage.Spec.AllocationSets[allocationSetIndex].Nodes) {
if storage.GetDeletionTimestamp().IsZero() {
log.Info("unexpected number of NnfNodeStorages", "found", len(nnfNodeStorageList.Items), "expected", len(storage.Spec.AllocationSets[allocationSetIndex].Nodes))
log.Info("unexpected number of NnfNodeStorages", "found", len(nnfNodeStorages), "expected", len(storage.Spec.AllocationSets[allocationSetIndex].Nodes))
}
return &ctrl.Result{}, nil
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vendor/modules.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# github.com/DataWorkflowServices/dws v0.0.1-0.20240916174522-9062a91241cd
# github.com/DataWorkflowServices/dws v0.0.1-0.20241029172011-d5898d0b8640
## explicit; go 1.21
github.com/DataWorkflowServices/dws/api/v1alpha2
github.com/DataWorkflowServices/dws/config/crd/bases
Expand Down

0 comments on commit aba25da

Please sign in to comment.