Detach volume groups before deleting machine
This commit works around a known problem with hypervisor-attached
volumes created by CSI 3.0. A hypervisor-attached volume results in a
volume group disk being attached to the VM on which the pod consuming
the PV is running. When a cluster is deleted before the PVC is deleted,
the Prism task that deletes the VM fails because a volume group is
still attached to the VM. This problem is restricted to CSI 3.0, which
requires a minimum Prism Central (PC) version of 2024.1, and the fix
relies on the volume group detach v4 API, which is also available
starting with 2024.1. We therefore make a v3 "get prism central info"
call to fetch the PC version; if the version is 2024 or greater, we
create a v4 client for the cluster as well. Before a machine is
deleted, we check whether the VM backing the machine has any disks
backed by volume groups. If it does, we detach those volume groups
before deleting the machine.
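
In outline, the delete-time flow this commit adds looks like the sketch
below. The machine-controller wiring lives in files truncated from this
view, so the reconcileDelete shape and the GetVmUUID accessor are
illustrative assumptions, not the commit's exact code:

	// Sketch only: detach CSI 3.0 volume groups before the VM is deleted.
	// rctx.NutanixClientV4 is assumed to be nil on PC versions older than
	// 2024, where hypervisor-attached volume groups cannot exist.
	func (r *NutanixMachineReconciler) reconcileDelete(rctx *nctx.MachineContext) error {
		vmUUID := rctx.NutanixMachine.GetVmUUID() // hypothetical accessor
		if rctx.NutanixClientV4 != nil {
			if err := detachVolumeGroupsFromVM(rctx.Context, rctx.NutanixClientV4, vmUUID); err != nil {
				return fmt.Errorf("failed to detach volume groups from VM %s: %w", vmUUID, err)
			}
		}
		// ...continue with the existing v3 VM deletion flow...
		return nil
	}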
thunderboltsid committed Jul 25, 2024
1 parent 9ebba60 commit bcc1ed5
Showing 25 changed files with 5,712 additions and 682 deletions.
2 changes: 2 additions & 0 deletions Makefile
@@ -261,6 +261,7 @@ cluster-e2e-templates-v1beta1: ## Generate cluster templates for v1beta1
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-kcp-remediation --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-kcp-remediation.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-kcp-scale-in.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-csi --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-csi.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-csi3 --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-csi3.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-failure-domains --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-failure-domains.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-clusterclass --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-clusterclass.yaml
kustomize build $(NUTANIX_E2E_TEMPLATES)/v1beta1/cluster-template-clusterclass --load-restrictor LoadRestrictionsNone > $(NUTANIX_E2E_TEMPLATES)/v1beta1/clusterclass-nutanix-quick-start.yaml
@@ -287,6 +288,7 @@ cluster-e2e-templates-no-kubeproxy: ##Generate cluster templates without kubeproxy
cluster-templates: ## Generate cluster templates for all flavors
kustomize build $(TEMPLATES_DIR)/base > $(TEMPLATES_DIR)/cluster-template.yaml
kustomize build $(TEMPLATES_DIR)/csi > $(TEMPLATES_DIR)/cluster-template-csi.yaml
kustomize build $(TEMPLATES_DIR)/csi3 > $(TEMPLATES_DIR)/cluster-template-csi3.yaml
kustomize build $(TEMPLATES_DIR)/clusterclass > $(TEMPLATES_DIR)/cluster-template-clusterclass.yaml
kustomize build $(TEMPLATES_DIR)/topology > $(TEMPLATES_DIR)/cluster-template-topology.yaml

182 changes: 179 additions & 3 deletions controllers/helpers.go
@@ -18,14 +18,24 @@ package controllers

import (
"context"
"errors"
"fmt"
"reflect"
"strconv"
"strings"
"time"

"github.com/google/uuid"
"github.com/nutanix-cloud-native/prism-go-client/utils"
prismclientv3 "github.com/nutanix-cloud-native/prism-go-client/v3"
prismclientv4 "github.com/nutanix-cloud-native/prism-go-client/v4"
prismcommonconfig "github.com/nutanix/ntnx-api-golang-clients/prism-go-client/v4/models/common/v1/config"
prismapi "github.com/nutanix/ntnx-api-golang-clients/prism-go-client/v4/models/prism/v4/config"
vmconfig "github.com/nutanix/ntnx-api-golang-clients/vmm-go-client/v4/models/vmm/v4/ahv/config"
prismconfig "github.com/nutanix/ntnx-api-golang-clients/volumes-go-client/v4/models/prism/v4/config"
volumesconfig "github.com/nutanix/ntnx-api-golang-clients/volumes-go-client/v4/models/volumes/v4/config"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/wait"
v1 "k8s.io/client-go/informers/core/v1"
capiv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
@@ -755,7 +765,7 @@ func GetFailureDomain(failureDomainName string, nutanixCluster *infrav1.NutanixC
return nil, fmt.Errorf("failed to find failure domain %s on nutanix cluster object", failureDomainName)
}

-func getPrismCentralClientForCluster(ctx context.Context, cluster *infrav1.NutanixCluster, secretInformer v1.SecretInformer, mapInformer v1.ConfigMapInformer) (*prismclientv3.Client, error) {
+func getPrismCentralV3ClientForCluster(ctx context.Context, cluster *infrav1.NutanixCluster, secretInformer v1.SecretInformer, mapInformer v1.ConfigMapInformer) (*prismclientv3.Client, error) {
log := ctrl.LoggerFrom(ctx)

clientHelper := nutanixclient.NewHelper(secretInformer, mapInformer)
@@ -766,7 +776,7 @@ func getPrismCentralClientForCluster(ctx context.Context, cluster *infrav1.Nutan
return nil, err
}

-v3Client, err := nutanixclient.NutanixClientCache.GetOrCreate(&nutanixclient.CacheParams{
+client, err := nutanixclient.NutanixClientCacheV3.GetOrCreate(&nutanixclient.CacheParams{
NutanixCluster: cluster,
PrismManagementEndpoint: managementEndpoint,
})
@@ -777,5 +787,171 @@ func getPrismCentralClientForCluster(ctx context.Context, cluster *infrav1.Nutan
}

conditions.MarkTrue(cluster, infrav1.PrismCentralClientCondition)
-return v3Client, nil
+return client, nil
}

func getPrismCentralV4ClientForCluster(ctx context.Context, cluster *infrav1.NutanixCluster, secretInformer v1.SecretInformer, mapInformer v1.ConfigMapInformer) (*prismclientv4.Client, error) {
log := ctrl.LoggerFrom(ctx)

clientHelper := nutanixclient.NewHelper(secretInformer, mapInformer)
managementEndpoint, err := clientHelper.BuildManagementEndpoint(ctx, cluster)
if err != nil {
log.Error(err, fmt.Sprintf("error occurred while getting management endpoint for cluster %q", cluster.GetNamespacedName()))
conditions.MarkFalse(cluster, infrav1.PrismCentralClientCondition, infrav1.PrismCentralClientInitializationFailed, capiv1.ConditionSeverityError, err.Error())
return nil, err
}

client, err := nutanixclient.NutanixClientCacheV4.GetOrCreate(&nutanixclient.CacheParams{
NutanixCluster: cluster,
PrismManagementEndpoint: managementEndpoint,
})
if err != nil {
log.Error(err, "error occurred while getting nutanix prism client from cache")
conditions.MarkFalse(cluster, infrav1.PrismCentralClientCondition, infrav1.PrismCentralClientInitializationFailed, capiv1.ConditionSeverityError, err.Error())
return nil, fmt.Errorf("nutanix prism client error: %w", err)
}

conditions.MarkTrue(cluster, infrav1.PrismCentralClientCondition)
return client, nil
}

// isPrismCentralV4Compatible checks if the Prism Central is v4 API compatible
func isPrismCentralV4Compatible(ctx context.Context, v3Client *prismclientv3.Client) (bool, error) {
pcVersion, err := getPrismCentralVersion(ctx, v3Client)
if err != nil {
return false, fmt.Errorf("failed to get Prism Central version: %w", err)
}

// Check if the version is v4 compatible
// PC versions look like pc.2024.1.0.1
// We can check if the version is greater than or equal to 2024

if pcVersion == "" {
return false, errors.New("version is empty")
}

// Remove the prefix "pc."
version := strings.TrimPrefix(pcVersion, "pc.")
// Split the version string by "." to extract the year part
parts := strings.Split(version, ".")
if len(parts) < 1 {
return false, errors.New("invalid version format")
}

// Convert the year part to an integer
year, err := strconv.Atoi(parts[0])
if err != nil {
return false, errors.New("invalid version: failed to parse year from PC version")
}

if year >= 2024 {
return true, nil
}

return false, nil
}

// getPrismCentralVersion returns the version of the Prism Central instance
func getPrismCentralVersion(ctx context.Context, v3Client *prismclientv3.Client) (string, error) {
pcInfo, err := v3Client.V3.GetPrismCentral(ctx)
if err != nil {
return "", err
}

if pcInfo.Resources == nil || pcInfo.Resources.Version == nil {
return "", fmt.Errorf("failed to get Prism Central version")
}

return *pcInfo.Resources.Version, nil
}

func detachVolumeGroupsFromVM(ctx context.Context, v4Client *prismclientv4.Client, vmUUID string) error {
log := ctrl.LoggerFrom(ctx)
vmResp, err := v4Client.VmApiInstance.GetVmById(&vmUUID)
if err != nil {
return fmt.Errorf("failed to get virtual machine: %w", err)
}

data := vmResp.GetData()
vm, ok := data.(vmconfig.Vm)
if !ok {
return fmt.Errorf("failed to cast response to VM")
}

volumeGroupsToDetach := make([]string, 0)
for _, disk := range vm.Disks {
backingInfo, ok := disk.GetBackingInfo().(vmconfig.ADSFVolumeGroupReference)
if !ok {
continue
}

volumeGroupsToDetach = append(volumeGroupsToDetach, *backingInfo.VolumeGroupExtId)
}

log.Info(fmt.Sprintf("detaching %d volume groups from virtual machine %s", len(volumeGroupsToDetach), vmUUID))

// Detach the volume groups from the virtual machine
for _, volumeGroup := range volumeGroupsToDetach {
log.Info(fmt.Sprintf("detaching volume group %s from virtual machine %s", volumeGroup, vmUUID))
body := &volumesconfig.VmAttachment{
ExtId: &vmUUID,
}
resp, err := v4Client.VolumeGroupsApiInstance.DetachVm(&volumeGroup, body)
if err != nil {
return fmt.Errorf("failed to detach volume group %s from virtual machine %s: %w", volumeGroup, vmUUID, err)
}

data := resp.GetData()
task, ok := data.(prismconfig.TaskReference)
if !ok {
return fmt.Errorf("failed to cast response to TaskReference")
}

// Wait for the task to complete
if _, err := waitForTaskCompletionV4(ctx, v4Client, *task.ExtId); err != nil {
return fmt.Errorf("failed to wait for task %s to complete: %w", *task.ExtId, err)
}
}

return nil
}

// waitForTaskCompletionV4 waits for a task to complete and returns the completion details
// of the task. The function will poll the task status every 100ms until the task is
// completed or the context is cancelled.
func waitForTaskCompletionV4(ctx context.Context, v4Client *prismclientv4.Client, taskID string) ([]prismcommonconfig.KVPair, error) {
var data []prismcommonconfig.KVPair

if err := wait.PollUntilContextCancel(
ctx,
100*time.Millisecond,
true,
func(ctx context.Context) (done bool, err error) {
task, err := v4Client.TasksApiInstance.GetTaskById(utils.StringPtr(taskID))
if err != nil {
return false, fmt.Errorf("failed to get task %s: %w", taskID, err)
}

taskData, ok := task.GetData().(prismapi.Task)
if !ok {
return false, fmt.Errorf("unexpected task data type %[1]T: %+[1]v", task.GetData())
}

if taskData.Status == nil {
return false, nil
}

if *taskData.Status != prismapi.TASKSTATUS_SUCCEEDED {
return false, nil
}

data = taskData.CompletionDetails

return true, nil
},
); err != nil {
return nil, fmt.Errorf("failed to wait for task %s to complete: %w", taskID, err)
}

return data, nil
}
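
The year-based gate in isPrismCentralV4Compatible keys on the leading
year of the PC version string (e.g. pc.2024.1.0.1). A minimal,
self-contained sketch of the same parsing rule, for illustration only
(not code from this commit):

	package main

	import (
		"fmt"
		"strconv"
		"strings"
	)

	// isV4Compatible mirrors the check above: strip the "pc." prefix, take
	// the first dot-separated field as the year, and require it to be >= 2024.
	func isV4Compatible(pcVersion string) bool {
		parts := strings.Split(strings.TrimPrefix(pcVersion, "pc."), ".")
		year, err := strconv.Atoi(parts[0])
		return err == nil && year >= 2024
	}

	func main() {
		for _, v := range []string{"pc.2024.1.0.1", "pc.2023.4", "not-a-version"} {
			fmt.Printf("%-14s -> %v\n", v, isV4Compatible(v)) // only pc.2024.1.0.1 prints true
		}
	}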
12 changes: 6 additions & 6 deletions controllers/helpers_test.go
@@ -157,7 +157,7 @@ func TestGetPrismCentralClientForCluster(t *testing.T) {
mapInformer := mockk8sclient.NewMockConfigMapInformer(ctrl)
secretInformer.EXPECT().Lister().Return(secretLister)

-_, err := getPrismCentralClientForCluster(ctx, cluster, secretInformer, mapInformer)
+_, err := getPrismCentralV3ClientForCluster(ctx, cluster, secretInformer, mapInformer)
assert.Error(t, err)
})

@@ -187,20 +187,20 @@ func TestGetPrismCentralClientForCluster(t *testing.T) {
mapInformer := mockk8sclient.NewMockConfigMapInformer(ctrl)
secretInformer.EXPECT().Lister().Return(secretLister)

-_, err = getPrismCentralClientForCluster(ctx, cluster, secretInformer, mapInformer)
+_, err = getPrismCentralV3ClientForCluster(ctx, cluster, secretInformer, mapInformer)
assert.Error(t, err)
})

t.Run("GetOrCreate succeeds", func(t *testing.T) {
ctrl := gomock.NewController(t)

-oldNutanixClientCache := nutanixclient.NutanixClientCache
+oldNutanixClientCache := nutanixclient.NutanixClientCacheV3
defer func() {
-nutanixclient.NutanixClientCache = oldNutanixClientCache
+nutanixclient.NutanixClientCacheV3 = oldNutanixClientCache
}()

// Create a new client cache with session auth disabled to avoid network calls in tests
-nutanixclient.NutanixClientCache = prismclientv3.NewClientCache()
+nutanixclient.NutanixClientCacheV3 = prismclientv3.NewClientCache()

creds := []credentialtypes.Credential{
{
Expand All @@ -225,7 +225,7 @@ func TestGetPrismCentralClientForCluster(t *testing.T) {
mapInformer := mockk8sclient.NewMockConfigMapInformer(ctrl)
secretInformer.EXPECT().Lister().Return(secretLister)

-_, err = getPrismCentralClientForCluster(ctx, cluster, secretInformer, mapInformer)
+_, err = getPrismCentralV3ClientForCluster(ctx, cluster, secretInformer, mapInformer)
assert.NoError(t, err)
})
}
33 changes: 24 additions & 9 deletions controllers/nutanixcluster_controller.go
@@ -45,7 +45,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"

infrav1 "github.com/nutanix-cloud-native/cluster-api-provider-nutanix/api/v1beta1"
nutanixClient "github.com/nutanix-cloud-native/cluster-api-provider-nutanix/pkg/client"
nutanixclient "github.com/nutanix-cloud-native/cluster-api-provider-nutanix/pkg/client"
nctx "github.com/nutanix-cloud-native/cluster-api-provider-nutanix/pkg/context"
)

@@ -189,17 +189,32 @@ func (r *NutanixClusterReconciler) Reconcile(ctx context.Context, req ctrl.Reque
return reconcile.Result{}, err
}

-v3Client, err := getPrismCentralClientForCluster(ctx, cluster, r.SecretInformer, r.ConfigMapInformer)
+v3Client, err := getPrismCentralV3ClientForCluster(ctx, cluster, r.SecretInformer, r.ConfigMapInformer)
if err != nil {
log.Error(err, "error occurred while fetching prism central client")
return reconcile.Result{}, err
}

rctx := &nctx.ClusterContext{
-Context:        ctx,
-Cluster:        capiCluster,
-NutanixCluster: cluster,
-NutanixClient:  v3Client,
+Context:         ctx,
+Cluster:         capiCluster,
+NutanixCluster:  cluster,
+NutanixClientV3: v3Client,
}

createV4Client, err := isPrismCentralV4Compatible(ctx, v3Client)
if err != nil {
log.Error(err, "error occurred while checking environment compatibility for Prism Central v4 APIs")
} else {
if createV4Client {
v4Client, err := getPrismCentralV4ClientForCluster(ctx, cluster, r.SecretInformer, r.ConfigMapInformer)
if err != nil {
log.Error(err, "error occurred while fetching Prism Central v4 client")
return reconcile.Result{}, err
}

rctx.NutanixClientV4 = v4Client
}
}

// Check for request action
Expand Down Expand Up @@ -236,7 +251,7 @@ func (r *NutanixClusterReconciler) reconcileDelete(rctx *nctx.ClusterContext) (r

// delete the client from the cache
log.Info(fmt.Sprintf("deleting nutanix prism client for cluster %s from cache", rctx.NutanixCluster.GetNamespacedName()))
-nutanixClient.NutanixClientCache.Delete(&nutanixClient.CacheParams{NutanixCluster: rctx.NutanixCluster})
+nutanixclient.NutanixClientCacheV3.Delete(&nutanixclient.CacheParams{NutanixCluster: rctx.NutanixCluster})

if err := r.reconcileCredentialRefDelete(rctx.Context, rctx.NutanixCluster); err != nil {
log.Error(err, fmt.Sprintf("error occurred while reconciling credential ref deletion for cluster %s", rctx.Cluster.Name))
Expand Down Expand Up @@ -319,7 +334,7 @@ func (r *NutanixClusterReconciler) reconcileCategories(rctx *nctx.ClusterContext
log := ctrl.LoggerFrom(rctx.Context)
log.Info("Reconciling categories for cluster")
defaultCategories := GetDefaultCAPICategoryIdentifiers(rctx.Cluster.Name)
-_, err := GetOrCreateCategories(rctx.Context, rctx.NutanixClient, defaultCategories)
+_, err := GetOrCreateCategories(rctx.Context, rctx.NutanixClientV3, defaultCategories)
if err != nil {
conditions.MarkFalse(rctx.NutanixCluster, infrav1.ClusterCategoryCreatedCondition, infrav1.ClusterCategoryCreationFailed, capiv1.ConditionSeverityError, err.Error())
return err
@@ -335,7 +350,7 @@ func (r *NutanixClusterReconciler) reconcileCategoriesDelete(rctx *nctx.ClusterC
conditions.GetReason(rctx.NutanixCluster, infrav1.ClusterCategoryCreatedCondition) == infrav1.DeletionFailed {
defaultCategories := GetDefaultCAPICategoryIdentifiers(rctx.Cluster.Name)
obsoleteCategories := GetObsoleteDefaultCAPICategoryIdentifiers(rctx.Cluster.Name)
-err := DeleteCategories(rctx.Context, rctx.NutanixClient, defaultCategories, obsoleteCategories)
+err := DeleteCategories(rctx.Context, rctx.NutanixClientV3, defaultCategories, obsoleteCategories)
if err != nil {
conditions.MarkFalse(rctx.NutanixCluster, infrav1.ClusterCategoryCreatedCondition, infrav1.DeletionFailed, capiv1.ConditionSeverityWarning, err.Error())
return err
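
The remaining 21 changed files, including the machine-controller hook
that invokes detachVolumeGroupsFromVM, are truncated from this view.
One design choice worth noting in the Reconcile change above: a failure
of isPrismCentralV4Compatible is logged but deliberately non-fatal, so
reconciliation continues against older Prism Central versions and
rctx.NutanixClientV4 stays nil. Any v4-only code path therefore has to
treat a nil client as "nothing to detach", along the lines of this
assumed guard:

	// Assumed guard in consumers of the optional v4 client; field name as
	// in the ClusterContext diff above.
	if rctx.NutanixClientV4 == nil {
		// Older Prism Central: CSI 3.0 volume groups cannot exist, so
		// there is nothing to detach.
		return nil
	}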