fix: use SNO check to disable node-removal controller
The node-removal controller isn't necessary at the end of an SNO node's lifecycle, since the entire cluster goes down with the node.

Signed-off-by: jakobmoellerdev <[email protected]>

fix: allow correct indexing for status objects and run it in the integration tests

Signed-off-by: jakobmoellerdev <[email protected]>

chore: update controller to tolerate a missing nodeStatus on a deviceClass, as otherwise deletion gets stuck

Signed-off-by: jakobmoellerdev <[email protected]>

fix: ensure correct rbac for new controller and refactor updateLVMClusterStatus to contain checks per deviceClass per Node

Signed-off-by: jakobmoellerdev <[email protected]>

fix: ensure LVMVolumeGroupNodeStatus is removed by dedicated cleanup controller in case of multi-node
jakobmoellerdev committed Aug 11, 2023
1 parent 6f10571 commit 840c0de
Showing 10 changed files with 206 additions and 28 deletions.
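
Condensed from the main.go hunk further below, the gating the commit message describes boils down to the following sketch (error handling around the setup client and the rest of the manager wiring are elided):

    // The node-removal controller is only registered on multi-node clusters; on SNO,
    // the whole cluster disappears together with its single node, so there is nothing left to clean up.
    snoCheck := cluster.NewMasterSNOCheck(setupClient)
    if !snoCheck.IsSNO(context.Background()) {
        if err := (&node.RemovalController{Client: mgr.GetClient()}).SetupWithManager(mgr); err != nil {
            setupLog.Error(err, "unable to create controller", "controller", "NodeRemovalController")
            os.Exit(1)
        }
    }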
2 changes: 2 additions & 0 deletions bundle/manifests/lvms-operator.clusterserviceversion.yaml
@@ -146,6 +146,8 @@ spec:
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
2 changes: 2 additions & 0 deletions config/rbac/role.yaml
@@ -52,6 +52,8 @@ rules:
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
11 changes: 10 additions & 1 deletion controllers/lvmcluster_controller_integration_test.go
@@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("LVMCluster controller", func() {
@@ -189,7 +190,15 @@ var _ = Describe("LVMCluster controller", func() {

// delete lvmVolumeGroupNodeStatus as it should be deleted by vgmanager
// and if it is present lvmcluster reconciler takes it as vg is present on node
Expect(k8sClient.Delete(ctx, lvmVolumeGroupNodeStatusIn)).Should(Succeed())

// we will now remove the node which will cause the LVM cluster status to also lose that vg
Expect(k8sClient.Delete(ctx, nodeIn)).Should(Succeed())
// the removal controller should now clean up the LVMVolumeGroupNodeStatus for the deleted node
Eventually(func() bool {
err := k8sClient.Get(ctx, client.ObjectKeyFromObject(lvmVolumeGroupNodeStatusIn),
&lvmv1alpha1.LVMVolumeGroupNodeStatus{})
return errors.IsNotFound(err)
}, timeout, interval).Should(BeTrue())

// deletion of LVMCluster CR
Eventually(func() bool {
6 changes: 6 additions & 0 deletions controllers/lvmcluster_controller_watches.go
@@ -20,7 +20,9 @@ import (
"context"

lvmv1alpha1 "github.com/openshift/lvm-operator/api/v1alpha1"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
@@ -44,6 +46,10 @@ func (r *LVMClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
&storagev1.StorageClass{},
handler.EnqueueRequestsFromMapFunc(r.getLVMClusterObjsForReconcile),
).
Watches(
&corev1.Node{},
handler.EnqueueRequestsFromMapFunc(r.getLVMClusterObjsForReconcile),
).
Complete(r)
}

108 changes: 108 additions & 0 deletions controllers/node/removal_controller.go
@@ -0,0 +1,108 @@
package node

import (
"context"
"fmt"

lvmv1alpha1 "github.com/openshift/lvm-operator/api/v1alpha1"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const cleanupFinalizer = "lvm.topolvm.io/node-removal-hook"
const fieldOwner = "lvms"

type RemovalController struct {
client.Client
}

//+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;patch;update;watch
//+kubebuilder:rbac:groups=lvm.topolvm.io,resources=lvmvolumegroupnodestatuses,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=lvm.topolvm.io,resources=lvmvolumegroupnodestatuses/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=lvm.topolvm.io,resources=lvmvolumegroupnodestatuses/finalizers,verbs=update

// Reconcile takes care of watching a node, adding a finalizer, and reacting to a removal request by deleting
// the leftover LVMVolumeGroupNodeStatus associated with the node before removing the finalizer again.
// On active Nodes it only ensures the finalizer is present. If it can be assumed that there will always be
// only one node (SNO), this controller should not be started.
func (r *RemovalController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)

node := &v1.Node{}
if err := r.Get(ctx, req.NamespacedName, node); err != nil {
// we'll ignore not-found errors, since they can't be fixed by an immediate
// requeue (we'll need to wait for a new notification), and we can get them
// on deleted requests.
return ctrl.Result{}, client.IgnoreNotFound(err)
}

if node.DeletionTimestamp.IsZero() {
// Add a finalizer in case the node is fresh or the controller newly deployed
if needsUpdate := controllerutil.AddFinalizer(node, cleanupFinalizer); needsUpdate {
if err := r.Update(ctx, node, client.FieldOwner(fieldOwner)); err != nil {
return ctrl.Result{}, fmt.Errorf("node finalizer could not be updated: %w", err)
}
}
// nothing to do here, the node exists and is happy,
// maybe there is an LVMVolumeGroupNodeStatus, but we don't care
return ctrl.Result{}, nil
}

logger.Info("node getting deleted, removing leftover LVMVolumeGroupNodeStatus")

vgNodeStatusList := &lvmv1alpha1.LVMVolumeGroupNodeStatusList{}
if err := r.Client.List(ctx, vgNodeStatusList, client.MatchingFields{"metadata.name": node.GetName()}); err != nil {
return ctrl.Result{}, fmt.Errorf("error retrieving fitting LVMVolumeGroupNodeStatus for Node %s: %w", node.GetName(), err)
}

if len(vgNodeStatusList.Items) == 0 {
logger.Info("LVMVolumeGroupNodeStatus already deleted")
return ctrl.Result{}, nil
}

for i := range vgNodeStatusList.Items {
if err := r.Client.Delete(ctx, &vgNodeStatusList.Items[i]); err != nil {
return ctrl.Result{}, fmt.Errorf("could not cleanup LVMVolumeGroupNodeStatus for Node %s: %w", node.GetName(), err)
}
}

logger.Info("every LVMVolumeGroupNodeStatus for node was removed, removing finalizer to allow node removal")
if needsUpdate := controllerutil.RemoveFinalizer(node, cleanupFinalizer); needsUpdate {
if err := r.Update(ctx, node, client.FieldOwner(fieldOwner)); err != nil {
return ctrl.Result{}, fmt.Errorf("node finalizer could not be removed: %w", err)
}
}

return ctrl.Result{}, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *RemovalController) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).For(&v1.Node{}).Watches(&lvmv1alpha1.LVMVolumeGroupNodeStatus{},
handler.EnqueueRequestsFromMapFunc(r.getNodeForLVMVolumeGroupNodeStatus)).Complete(r)
}

func (r *RemovalController) getNodeForLVMVolumeGroupNodeStatus(ctx context.Context, object client.Object) []reconcile.Request {
node := &v1.Node{}
node.SetName(object.GetName())

err := r.Client.Get(ctx, client.ObjectKeyFromObject(node), node)
if errors.IsNotFound(err) {
return []reconcile.Request{}
}

if err != nil {
log.FromContext(ctx).Error(err, "could not get Node for LVMVolumeGroupNodeStatus", "LVMVolumeGroupNodeStatus", object.GetName())
return []reconcile.Request{}
}

return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: node.GetName()}}}
}
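
To see the new controller's contract end to end without envtest, here is a unit-test sketch against controller-runtime's fake client. It is a sketch, not taken from this commit: it assumes fake.ClientBuilder.WithIndex is available (controller-runtime v0.15+), that the fake client honors finalizers on Delete, and the "openshift-storage" namespace is only a placeholder.

    package node_test

    import (
    	"context"
    	"testing"

    	lvmv1alpha1 "github.com/openshift/lvm-operator/api/v1alpha1"
    	"github.com/openshift/lvm-operator/controllers/node"

    	corev1 "k8s.io/api/core/v1"
    	"k8s.io/apimachinery/pkg/api/errors"
    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    	"k8s.io/apimachinery/pkg/runtime"
    	"k8s.io/apimachinery/pkg/types"
    	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    	ctrl "sigs.k8s.io/controller-runtime"
    	"sigs.k8s.io/controller-runtime/pkg/client"
    	"sigs.k8s.io/controller-runtime/pkg/client/fake"
    )

    func TestRemovalControllerCleansUpNodeStatus(t *testing.T) {
    	scheme := runtime.NewScheme()
    	_ = clientgoscheme.AddToScheme(scheme)
    	_ = lvmv1alpha1.AddToScheme(scheme)

    	workerNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "worker-0"}}
    	nodeStatus := &lvmv1alpha1.LVMVolumeGroupNodeStatus{
    		// namespace is illustrative
    		ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "openshift-storage"},
    	}

    	clnt := fake.NewClientBuilder().
    		WithScheme(scheme).
    		WithObjects(workerNode, nodeStatus).
    		// Reconcile lists node statuses via MatchingFields on metadata.name,
    		// so the fake client needs the same index the managers register.
    		WithIndex(&lvmv1alpha1.LVMVolumeGroupNodeStatus{}, "metadata.name", func(o client.Object) []string {
    			return []string{o.GetName()}
    		}).
    		Build()

    	r := &node.RemovalController{Client: clnt}
    	req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "worker-0"}}
    	ctx := context.Background()

    	// First reconcile: the node is alive, so only the cleanup finalizer is added.
    	if _, err := r.Reconcile(ctx, req); err != nil {
    		t.Fatal(err)
    	}

    	// Deleting the node only sets its deletion timestamp because the finalizer is present.
    	if err := clnt.Delete(ctx, workerNode); err != nil {
    		t.Fatal(err)
    	}

    	// Second reconcile: the leftover LVMVolumeGroupNodeStatus is deleted and the finalizer removed.
    	if _, err := r.Reconcile(ctx, req); err != nil {
    		t.Fatal(err)
    	}
    	err := clnt.Get(ctx, client.ObjectKeyFromObject(nodeStatus), &lvmv1alpha1.LVMVolumeGroupNodeStatus{})
    	if !errors.IsNotFound(err) {
    		t.Fatalf("expected LVMVolumeGroupNodeStatus to be cleaned up, got: %v", err)
    	}
    }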
11 changes: 11 additions & 0 deletions controllers/suite_test.go
@@ -35,6 +35,7 @@ import (
snapapi "github.com/kubernetes-csi/external-snapshotter/client/v4/apis/volumesnapshot/v1"
secv1client "github.com/openshift/client-go/security/clientset/versioned/typed/security/v1"
lvmv1alpha1 "github.com/openshift/lvm-operator/api/v1alpha1"
"github.com/openshift/lvm-operator/controllers/node"
topolvmv1 "github.com/topolvm/topolvm/api/v1"
//+kubebuilder:scaffold:imports
)
@@ -118,6 +119,16 @@ var _ = BeforeSuite(func() {
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

err = (&node.RemovalController{
Client: k8sManager.GetClient(),
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

err = k8sManager.GetFieldIndexer().IndexField(context.Background(), &lvmv1alpha1.LVMVolumeGroupNodeStatus{}, "metadata.name", func(object client.Object) []string {
return []string{object.GetName()}
})
Expect(err).ToNot(HaveOccurred(), "unable to create name index on LVMVolumeGroupNodeStatus")

go func() {
defer GinkgoRecover()
err = k8sManager.Start(ctx)
28 changes: 27 additions & 1 deletion main.go
@@ -20,6 +20,7 @@ import (
"context"
"flag"
"fmt"
"github.com/openshift/lvm-operator/controllers/node"
"os"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -32,6 +33,7 @@ import (
"k8s.io/klog/v2"

ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

@@ -89,7 +91,13 @@ func main() {
}
setupLog.Info("Watching namespace", "Namespace", operatorNamespace)

leaderElectionResolver, err := cluster.NewLeaderElectionResolver(ctrl.GetConfigOrDie(), scheme, enableLeaderElection, operatorNamespace)
setupClient, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme})
if err != nil {
setupLog.Error(err, "unable to initialize setup client for pre-manager startup checks")
os.Exit(1)
}
snoCheck := cluster.NewMasterSNOCheck(setupClient)
leaderElectionResolver, err := cluster.NewLeaderElectionResolver(snoCheck, enableLeaderElection, operatorNamespace)
if err != nil {
setupLog.Error(err, "unable to setup leader election")
os.Exit(1)
@@ -129,6 +137,24 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "LVMCluster")
os.Exit(1)
}

if !snoCheck.IsSNO(context.Background()) {
setupLog.Info("starting node-removal controller to observe node removal in MultiNode")
if err = (&node.RemovalController{
Client: mgr.GetClient(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeRemovalControlelr")
os.Exit(1)
}
}

if err = mgr.GetFieldIndexer().IndexField(context.Background(), &lvmv1alpha1.LVMVolumeGroupNodeStatus{}, "metadata.name", func(object client.Object) []string {
return []string{object.GetName()}
}); err != nil {
setupLog.Error(err, "unable to create name index on LVMVolumeGroupNodeStatus")
os.Exit(1)
}

if err = (&lvmv1alpha1.LVMCluster{}).SetupWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "LVMCluster")
os.Exit(1)
31 changes: 6 additions & 25 deletions pkg/cluster/leaderelection.go
@@ -2,14 +2,8 @@ package cluster

import (
"context"
"fmt"
configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/config/leaderelection"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/rest"
"os"
"sigs.k8s.io/controller-runtime/pkg/client"
log "sigs.k8s.io/controller-runtime/pkg/log"
)

@@ -25,45 +19,32 @@ type LeaderElectionResolver interface {
// on the number of master nodes discovered in the cluster. If there is exactly one control-plane/master node,
// the returned LeaderElectionResolver settings are optimized for SNO deployments.
func NewLeaderElectionResolver(
config *rest.Config,
scheme *runtime.Scheme,
snoCheck SNOCheck,
enableLeaderElection bool,
operatorNamespace string,
) (LeaderElectionResolver, error) {
leaderElectionClient, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
return nil, fmt.Errorf("cannot create leader election client: %w", err)
}

defaultElectionConfig := leaderelection.LeaderElectionDefaulting(configv1.LeaderElection{
Disable: !enableLeaderElection,
}, operatorNamespace, "1136b8a6.topolvm.io")

return &nodeLookupSNOLeaderElection{
clnt: leaderElectionClient,
snoCheck: snoCheck,
defaultElectionConfig: defaultElectionConfig,
}, nil
}

type nodeLookupSNOLeaderElection struct {
clnt client.Client
snoCheck SNOCheck
defaultElectionConfig configv1.LeaderElection
}

func (le *nodeLookupSNOLeaderElection) Resolve(ctx context.Context) (configv1.LeaderElection, error) {
logger := log.FromContext(ctx)
nodes := &corev1.NodeList{}
if err := le.clnt.List(context.Background(), nodes, client.MatchingLabels{
ControlPlaneIDLabel: "",
}); err != nil {
logger.Error(err, "unable to retrieve nodes for SNO check with lease configuration")
os.Exit(1)
}
if len(nodes.Items) != 1 {
if !le.snoCheck.IsSNO(ctx) {
logger.Info("Using default Multi-Node leader election settings optimized for high-availability")
return le.defaultElectionConfig, nil
}
logger.Info("Overwriting defaults with SNO leader election config as only a single node was discovered",
"node", nodes.Items[0].GetName())
logger.Info("Overwriting defaults with SNO leader election config as only a single node was discovered")
config := leaderelection.LeaderElectionSNOConfig(le.defaultElectionConfig)
logger.Info("leader election config setup succeeded",
"retry-period", config.RetryPeriod,
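
Putting the refactor together, the call path now reads roughly as follows (construction as in the main.go hunk above; how the resolved configuration is handed to the manager options is outside this diff and elided):

    snoCheck := cluster.NewMasterSNOCheck(setupClient)
    leaderElectionResolver, err := cluster.NewLeaderElectionResolver(snoCheck, enableLeaderElection, operatorNamespace)
    if err != nil {
        setupLog.Error(err, "unable to setup leader election")
        os.Exit(1)
    }
    // Resolve returns SNO-optimized lease settings when exactly one control-plane node exists,
    // and the high-availability defaults otherwise.
    leaderElectionConfig, err := leaderElectionResolver.Resolve(context.Background())
    if err != nil {
        setupLog.Error(err, "unable to resolve leader election configuration")
        os.Exit(1)
    }
    _ = leaderElectionConfig // handed to the manager options (not shown in this diff)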
2 changes: 1 addition & 1 deletion pkg/cluster/leaderelection_test.go
@@ -84,7 +84,7 @@ func Test_nodeLookupSNOLeaderElection_Resolve(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
clnt := fake.NewClientBuilder().WithObjects(tt.nodes...).Build()
le := &nodeLookupSNOLeaderElection{
clnt: clnt,
snoCheck: NewMasterSNOCheck(clnt),
defaultElectionConfig: leaderelection.LeaderElectionDefaulting(configv1.LeaderElection{},
"test", "test-leader-id"),
}
33 changes: 33 additions & 0 deletions pkg/cluster/sno.go
@@ -0,0 +1,33 @@
package cluster

import (
"context"
corev1 "k8s.io/api/core/v1"
"os"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

type SNOCheck interface {
IsSNO(ctx context.Context) bool
}

func NewMasterSNOCheck(clnt client.Client) SNOCheck {
return &masterSNOCheck{clnt: clnt}
}

type masterSNOCheck struct {
clnt client.Client
}

func (chk *masterSNOCheck) IsSNO(ctx context.Context) bool {
logger := log.FromContext(ctx)
nodes := &corev1.NodeList{}
if err := chk.clnt.List(ctx, nodes, client.MatchingLabels{
ControlPlaneIDLabel: "",
}); err != nil {
logger.Error(err, "unable to retrieve nodes for SNO check with lease configuration")
os.Exit(1)
}
return len(nodes.Items) == 1
}
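
A minimal usage sketch of the new check, again against controller-runtime's fake client for illustration; ControlPlaneIDLabel is defined elsewhere in the cluster package (its value is not part of this diff), and the node name is made up:

    package cluster_test

    import (
    	"context"
    	"testing"

    	"github.com/openshift/lvm-operator/pkg/cluster"

    	corev1 "k8s.io/api/core/v1"
    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    	"sigs.k8s.io/controller-runtime/pkg/client/fake"
    )

    func TestIsSNOWithSingleControlPlaneNode(t *testing.T) {
    	// A single node carrying the control-plane label should be detected as SNO.
    	controlPlane := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
    		Name:   "control-plane-0",
    		Labels: map[string]string{cluster.ControlPlaneIDLabel: ""},
    	}}
    	clnt := fake.NewClientBuilder().WithObjects(controlPlane).Build()

    	if !cluster.NewMasterSNOCheck(clnt).IsSNO(context.Background()) {
    		t.Fatal("expected a single control-plane node to be detected as SNO")
    	}
    }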
