From f3983e845696ca70cf97c858924552fa18162982 Mon Sep 17 00:00:00 2001 From: Blake Devcich Date: Wed, 13 Nov 2024 14:45:22 -0600 Subject: [PATCH] Use label for ost0 and move OST0 ordering to NnfStorage controller Signed-off-by: Blake Devcich --- api/v1alpha3/nnfstorage_types.go | 2 + .../controller/nnf_node_storage_controller.go | 98 +++++++++---------- internal/controller/nnf_storage_controller.go | 41 +++++++- 3 files changed, 91 insertions(+), 50 deletions(-) diff --git a/api/v1alpha3/nnfstorage_types.go b/api/v1alpha3/nnfstorage_types.go index 841e2e9a..b706e476 100644 --- a/api/v1alpha3/nnfstorage_types.go +++ b/api/v1alpha3/nnfstorage_types.go @@ -29,6 +29,8 @@ import ( const ( AllocationSetLabel = "nnf.cray.hpe.com/allocationset" + // TODO: + // AllocationSetOST0Label = "nnf.cray.hpe.com/allocationset_ost0" ) // NnfStorageAllocationNodes identifies the node and properties of the allocation to make on that node diff --git a/internal/controller/nnf_node_storage_controller.go b/internal/controller/nnf_node_storage_controller.go index e89192de..97a9a148 100644 --- a/internal/controller/nnf_node_storage_controller.go +++ b/internal/controller/nnf_node_storage_controller.go @@ -27,7 +27,6 @@ import ( "github.com/go-logr/logr" apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -40,7 +39,6 @@ import ( dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2" "github.com/DataWorkflowServices/dws/utils/updater" - "github.com/NearNodeFlash/nnf-sos/api/v1alpha3" nnfv1alpha3 "github.com/NearNodeFlash/nnf-sos/api/v1alpha3" "github.com/NearNodeFlash/nnf-sos/internal/controller/metrics" ) @@ -243,60 +241,60 @@ func (r *NnfNodeStorageReconciler) deleteAllocation(ctx context.Context, nnfNode } // For Lustre, wait until OST0 is gone first so that PreUnmount can run before any allocations are deleted - doPreUnmount 
:= false - lustreOST0 := nnfNodeStorage.Spec.FileSystemType == "lustre" && nnfNodeStorage.Spec.LustreStorage.TargetType == "ost" && nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 - if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" { - doPreUnmount = true - - } else if nnfNodeStorage.Spec.FileSystemType == "lustre" { - waitForOST0Deletion := func() (bool, error) { - // Get the owner and directive index from labels - ownerKind, ownerExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerKindLabel] - ownerName, ownerNameExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNameLabel] - ownerNS, ownerNSExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNamespaceLabel] - if !ownerExists || !ownerNameExists || !ownerNSExists || ownerKind != "NnfStorage" { - return false, dwsv1alpha2.NewResourceError("expected NnfNodeStorage owner to be of kind NnfStorage and have the expected labels").WithMajor() - } + /* + doPreUnmount := false + if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" { + doPreUnmount = true + + } else if nnfNodeStorage.Spec.FileSystemType == "lustre" { + waitForOST0Deletion := func() (bool, error) { + // Get the owner and directive index from labels + ownerKind, ownerExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerKindLabel] + ownerName, ownerNameExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNameLabel] + ownerNS, ownerNSExists := nnfNodeStorage.Labels[dwsv1alpha2.OwnerNamespaceLabel] + if !ownerExists || !ownerNameExists || !ownerNSExists || ownerKind != "NnfStorage" { + return false, dwsv1alpha2.NewResourceError("expected NnfNodeStorage owner to be of kind NnfStorage and have the expected labels").WithMajor() + } - // Get the owner - storage := &v1alpha3.NnfStorage{ObjectMeta: metav1.ObjectMeta{ - Name: ownerName, - Namespace: ownerNS, - }} - if err := r.Get(ctx, client.ObjectKeyFromObject(storage), storage); err != nil { - return false, dwsv1alpha2.NewResourceError("unable retrieve NnfStorage resource").WithError(err).WithMajor() - } 
+ // Get the owner + storage := &v1alpha3.NnfStorage{ObjectMeta: metav1.ObjectMeta{ + Name: ownerName, + Namespace: ownerNS, + }} + if err := r.Get(ctx, client.ObjectKeyFromObject(storage), storage); err != nil { + return false, dwsv1alpha2.NewResourceError("unable retrieve NnfStorage resource").WithError(err).WithMajor() + } - // Get al the NnfNodeStorages for the OSTs - nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{} - matchLabels := dwsv1alpha2.MatchingOwner(storage) - matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost" + // Get al the NnfNodeStorages for the OSTs + nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{} + matchLabels := dwsv1alpha2.MatchingOwner(storage) + matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost" - listOptions := []client.ListOption{ - matchLabels, - } + listOptions := []client.ListOption{ + matchLabels, + } - if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil { - return false, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err) - } + if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil { + return false, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err) + } - // wait until OST0 no longer exists - for _, nnfNodeStorage := range nnfNodeStorageList.Items { - if nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 { - return false, nil + // wait until OST0 no longer exists + for _, nnfNodeStorage := range nnfNodeStorageList.Items { + if nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 { + return false, nil + } } - } - return true, nil - } + return true, nil + } - // Wait for OST0 to be deleted first so it can run PreUnmount - if deleted, err := waitForOST0Deletion(); err != nil { - return nil, dwsv1alpha2.NewResourceError("failed to wait for lustre OST0 deletion").WithError(err).WithMajor() - } else if !deleted { - return &ctrl.Result{Requeue: true}, nil - } - } + // Wait for OST0 to be deleted first so it can run PreUnmount + if deleted, err 
:= waitForOST0Deletion(); err != nil { + return nil, dwsv1alpha2.NewResourceError("failed to wait for lustre OST0 deletion").WithError(err).WithMajor() + } else if !deleted { + return &ctrl.Result{Requeue: true}, nil + } + }*/ if blockDeviceExists && nnfNodeStorage.Status.Allocations[index].Ready { ran, err := blockDevice.Activate(ctx) @@ -315,7 +313,9 @@ func (r *NnfNodeStorageReconciler) deleteAllocation(ctx context.Context, nnfNode log.Info("Activated file system", "allocation", index) } - if doPreUnmount { + // if doPreUnmount { + lustreOST0 := nnfNodeStorage.Spec.FileSystemType == "lustre" && nnfNodeStorage.Spec.LustreStorage.TargetType == "ost" && nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 + if lustreOST0 || nnfNodeStorage.Spec.FileSystemType != "lustre" { ran, err = fileSystem.PreUnmount(ctx) if err != nil { return nil, dwsv1alpha2.NewResourceError("could not run pre unmount for file system").WithError(err).WithMajor() diff --git a/internal/controller/nnf_storage_controller.go b/internal/controller/nnf_storage_controller.go index 3bbf2ef8..47083452 100644 --- a/internal/controller/nnf_storage_controller.go +++ b/internal/controller/nnf_storage_controller.go @@ -617,6 +617,10 @@ func (r *NnfStorageReconciler) createNodeStorage(ctx context.Context, storage *n labels := nnfNodeStorage.GetLabels() labels[nnfv1alpha3.AllocationSetLabel] = allocationSet.Name + if lustreOST && startIndex == 0 { + // TODO: use label from API + labels["nnf.cray.hpe.com/allocationset_ost0"] = "true" + } nnfNodeStorage.SetLabels(labels) nnfNodeStorage.Spec.BlockReference = corev1.ObjectReference{ @@ -1082,6 +1086,13 @@ func (r *NnfStorageReconciler) teardownStorage(ctx context.Context, storage *nnf &nnfv1alpha3.NnfNodeStorageList{}, } + // Delete OST0 first so that PreUnmount commands can happen + // TODO: use label from API + ost0DeleteStatus, err := dwsv1alpha2.DeleteChildrenWithLabels(ctx, r.Client, childObjects, storage, 
client.MatchingLabels{"nnf.cray.hpe.com/allocationset_ost0": "true"}) + if err != nil { + return nodeStoragesExist, err + } + ostDeleteStatus, err := dwsv1alpha2.DeleteChildrenWithLabels(ctx, r.Client, childObjects, storage, client.MatchingLabels{nnfv1alpha3.AllocationSetLabel: "ost"}) if err != nil { return nodeStoragesExist, err @@ -1101,7 +1112,7 @@ func (r *NnfStorageReconciler) teardownStorage(ctx context.Context, storage *nnf } } - if !ostDeleteStatus.Complete() || !mdtDeleteStatus.Complete() { + if !ost0DeleteStatus.Complete() || !ostDeleteStatus.Complete() || !mdtDeleteStatus.Complete() { return nodeStoragesExist, nil } @@ -1221,6 +1232,34 @@ func nnfNodeStorageName(storage *nnfv1alpha3.NnfStorage, allocationSetIndex int, return storage.Namespace + "-" + storage.Name + "-" + storage.Spec.AllocationSets[allocationSetIndex].Name + "-" + strconv.Itoa(duplicateRabbitIndex) } +// Get the NnfNodeStorage for Lustre OST0 for a given NnfStorage +func (r *NnfStorageReconciler) getLustreOST0(ctx context.Context, storage *nnfv1alpha3.NnfStorage) (*nnfv1alpha3.NnfNodeStorage, error) { + if storage.Spec.FileSystemType != "lustre" { + return nil, nil + } + + // Get all the NnfNodeStorages for the OSTs + nnfNodeStorageList := &nnfv1alpha3.NnfNodeStorageList{} + matchLabels := dwsv1alpha2.MatchingOwner(storage) + matchLabels[nnfv1alpha3.AllocationSetLabel] = "ost" + + listOptions := []client.ListOption{ + matchLabels, + } + + if err := r.List(ctx, nnfNodeStorageList, listOptions...); err != nil { + return nil, dwsv1alpha2.NewResourceError("could not list NnfNodeStorages").WithError(err) + } + + for _, nnfNodeStorage := range nnfNodeStorageList.Items { + if nnfNodeStorage.Spec.LustreStorage.StartIndex == 0 { + return &nnfNodeStorage, nil + } + } + + return nil, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *NnfStorageReconciler) SetupWithManager(mgr ctrl.Manager) error { r.ChildObjects = []dwsv1alpha2.ObjectList{