Skip to content

Commit

Permalink
Add documentation for new safety checks, allow to set the minimum upt…
Browse files Browse the repository at this point in the history
…ime and make sure exclude and include calls are coordinated
  • Loading branch information
johscheuer committed Mar 15, 2024
1 parent 1dbfa7e commit 60ea96f
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 20 deletions.
10 changes: 10 additions & 0 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ type FoundationDBClusterReconciler struct {
GetTimeout time.Duration
PostTimeout time.Duration
MinimumRequiredUptimeCCBounce time.Duration
// MinimumRecoveryTimeForInclusion defines the duration in seconds that a cluster must be up
// before new inclusions are allowed. The operator issuing frequent inclusions in a short time window
// could cause instability for the cluster as each inclusion will/can cause a recovery. Delaying the inclusion
// of deleted process groups is not an issue as all the process groups that have no resources and are marked for
// deletion and are fully excluded, will be batched together in a single inclusion call.
MinimumRecoveryTimeForInclusion float64
// MinimumRecoveryTimeForExclusion defines the duration in seconds that a cluster must be up
// before new exclusions are allowed. The operator issuing frequent exclusions in a short time window
// could cause instability for the cluster as each exclusion will/can cause a recovery.
MinimumRecoveryTimeForExclusion float64
}

// NewFoundationDBClusterReconciler creates a new FoundationDBClusterReconciler with defaults.
Expand Down
22 changes: 21 additions & 1 deletion controllers/exclude_processes.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,28 @@ func (e excludeProcesses) reconcile(_ context.Context, r *FoundationDBClusterRec
return nil
}

// Make sure the exclusions are coordinated across multiple operator instances.
if cluster.ShouldUseLocks() {
lockClient, err := r.getLockClient(cluster)
if err != nil {
return &requeue{curError: err}
}

_, err = lockClient.TakeLock()
if err != nil {
return &requeue{curError: err, delayedRequeue: true}
}

defer func() {
err = lockClient.ReleaseLock()
if err != nil {
logger.Error(err, "could not release lock")
}
}()
}

// Make sure it's safe to exclude processes.
err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status)
err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, r.MinimumRecoveryTimeForExclusion)
if err != nil {
return &requeue{curError: err, delayedRequeue: true}
}
Expand Down
25 changes: 22 additions & 3 deletions controllers/remove_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust
}
}

remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status)
remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status, r.MinimumRecoveryTimeForExclusion)
if err != nil {
return &requeue{curError: err}
}
Expand Down Expand Up @@ -133,7 +133,6 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust

// This will return a map of the newly removed ProcessGroups and the ProcessGroups with the ResourcesTerminating condition
removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone])

err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status)
if err != nil {
return &requeue{curError: err}
Expand Down Expand Up @@ -261,8 +260,28 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD
return nil
}

// Make sure the inclusions are coordinated across multiple operator instances.
if cluster.ShouldUseLocks() {
lockClient, err := r.getLockClient(cluster)
if err != nil {
return err
}

_, err = lockClient.TakeLock()
if err != nil {
return err
}

defer func() {
err = lockClient.ReleaseLock()
if err != nil {
logger.Error(err, "could not release lock")
}
}()
}

// Make sure it's safe to include processes.
err = fdbstatus.CanSafelyIncludeProcesses(cluster, status)
err = fdbstatus.CanSafelyIncludeProcesses(cluster, status, r.MinimumRecoveryTimeForInclusion)
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/remove_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ var _ = Describe("remove_process_groups", func() {

It("should not remove the process group and should not exclude processes", func() {
Expect(result).NotTo(BeNil())
Expect(result.message).To(Equal("Reconciliation needs to exclude more processes"))
Expect(result.curError).To(HaveOccurred())
// Ensure resources are not deleted
removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup)
Expect(err).To(BeNil())
Expand Down
45 changes: 37 additions & 8 deletions docs/manual/technical_design.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,15 @@ The `ChooseRemovals` subreconciler flags processes for removal when the current
The `ExcludeProcesses` subreconciler runs an [exclude command](https://apple.github.io/foundationdb/administration.html#removing-machines-from-a-cluster) in `fdbcli` for any process group that is marked for removal and is not already being excluded.
The `exclude` command tells FoundationDB that a process should not serve any roles, and that any data on that process should be moved to other processes.
This exclusion can take a long time, but this subreconciler does not wait for exclusion to complete.
The operator will only run the exclude command if it is safe to run it.
The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks:

- There is a low number of active generations.
- The cluster is available from the client perspective.
- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago.

The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds.
Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients.

The operator will only trigger a replacement if the new processes are available.
In addition, the operator will not trigger any exclusion if any of the process groups with the same process class has the `MissingProcess` condition for less than 5 minutes.
Expand Down Expand Up @@ -304,18 +313,38 @@ The `RemoveProcessGroups` subreconciler deletes any pods that are marked for rem

This performs the following sequence of steps for every pod:

1. Confirm that the exclusion is complete
1. Trigger the deletion of the pod
1. Confirm that the pod is fully terminated
1. Trigger the deletion of the PVC
1. Confirm that the PVC is fully terminated
1. Trigger the deletion of the service
1. Confirm that the service is fully terminated
1. Confirm that the exclusion is complete.
1. Trigger the deletion of the pod.
1. Confirm that the pod is fully terminated.
1. Trigger the deletion of the PVC.
1. Confirm that the PVC is fully terminated.
1. Trigger the deletion of the service.
1. Confirm that the service is fully terminated.
1. Include the processes where all resources are deleted.
1. Remove the process group from the list of process groups.

If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation. However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one. This means that one pod being stuck in terminating should not block other pods from being deleted.
If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation.
However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one.
This means that one pod being stuck in terminating should not block other pods from being deleted.

This will not allow deleting any pods that are serving as coordinators.

The `RemoveProcessGroups` subreconciler has some additional safety checks to reduce the risk of successive recoveries.

The operator will only run the exclude command if it is safe to run it.
The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks:

- There is a low number of active generations.
- The cluster is available from the client perspective.
- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago.

The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds.
Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients.

The same is true for the include operation with the difference that `MinimumRecoveryTimeForInclusion` is used to determine the minimum uptime of the cluster.
The `MinimumRecoveryTimeForInclusion` parameter can be changed with the `--minimum-recovery-time-for-inclusion` argument and the default is `600.0` seconds.
The operator will batch all outstanding inclusions together into a single include call.

### UpdateStatus (again)

Once we have completed all other steps in reconciliation, we run the `UpdateStatus` subreconciler a second time to check that everything is in the desired state. If there is anything that is not in the desired state, the operator will requeue reconciliation.
Expand Down
5 changes: 5 additions & 0 deletions e2e/fixtures/fdb_operator_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ spec:
- --zap-log-level=debug
- --minimum-required-uptime-for-cc-bounce=60s
#- --server-side-apply
# We are setting low values here as the e2e test are taking down processes multiple times
# and having a high wait time between recoveries will increase the reliability of the cluster but also
# increase the time our e2e test take.
- --minimum-recovery-time-for-inclusion=1.0
- --minimum-recovery-time-for-exclusion=1.0
image: {{ .OperatorImage }}
name: manager
imagePullPolicy: Always
Expand Down
11 changes: 10 additions & 1 deletion internal/removals/remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,21 @@ func GetZonedRemovals(processGroupsToRemove []*fdbv1beta2.ProcessGroupStatus) (m
}

// GetRemainingMap returns a map that indicates if a process group is fully excluded in the cluster.
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) (map[string]bool, error) {
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) (map[string]bool, error) {
remainingMap, addresses := getAddressesToValidateBeforeRemoval(logger, cluster)
if len(addresses) == 0 {
return nil, nil
}

// The CanSafelyRemoveFromStatus will run the exclusion command for processes that are either assumed to be fully excluded
// and processes that are currently missing from the machine-readable status. If it's not safe to run the exclude command
// we will block all further checks and assume that those processes are not yet excluded. This should reduce the risk
// of successive recoveries because of the exclusion call.
err := fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, minRecoverySeconds)
if err != nil {
return nil, err
}

remaining, err := fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
if err != nil {
return nil, err
Expand Down
8 changes: 4 additions & 4 deletions pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,14 +551,14 @@ func recoveryStateAllowsInclusion(status *fdbv1beta2.FoundationDBStatus) bool {

// CanSafelyExcludeProcessesWithRecoveryState currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least minRecoverySeconds seconds ago.
// In the future this check might be extended to perform more specific checks.
func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, false, 120.0)
func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, false, minRecoverySeconds)
}

// CanSafelyIncludeProcesses currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least minRecoverySeconds seconds ago.
// In the future this check might be extended to perform more specific checks.
func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, true, 300.0)
func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, true, minRecoverySeconds)
}

// ConfigurationChangeAllowed will return an error if the configuration change is assumed to be unsafe. If no error
Expand Down
4 changes: 2 additions & 2 deletions pkg/fdbstatus/status_checks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1814,7 +1814,7 @@ var _ = Describe("status_checks", func() {
When("performing the exclude safety check.", func() {
DescribeTable("should return if the safety check is satisfied or not",
func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) {
err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status)
err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status, 120.0)
if expected == nil {
Expect(err).To(BeNil())
} else {
Expand Down Expand Up @@ -1950,7 +1950,7 @@ var _ = Describe("status_checks", func() {
When("performing the include safety check.", func() {
DescribeTable("should return if the safety check is satisfied or not",
func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) {
err := CanSafelyIncludeProcesses(cluster, status)
err := CanSafelyIncludeProcesses(cluster, status, 300.0)
if expected == nil {
Expect(err).To(BeNil())
} else {
Expand Down
6 changes: 6 additions & 0 deletions setup/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ type Options struct {
LogFileMaxSize int
LogFileMaxAge int
MaxNumberOfOldLogFiles int
MinimumRecoveryTimeForExclusion float64
MinimumRecoveryTimeForInclusion float64
LogFileMinAge time.Duration
GetTimeout time.Duration
PostTimeout time.Duration
Expand Down Expand Up @@ -130,6 +132,8 @@ func (o *Options) BindFlags(fs *flag.FlagSet) {
fs.BoolVar(&o.EnableRecoveryState, "enable-recovery-state", true, "This flag enables the use of the recovery state for the minimum uptime between bounced if the FDB version supports it.")
fs.BoolVar(&o.CacheDatabaseStatus, "cache-database-status", true, "Defines the default value for caching the database status.")
fs.BoolVar(&o.EnableNodeIndex, "enable-node-index", false, "Defines if the operator should add an index for accessing node objects. This requires a ClusterRoleBinding with node access. If the taint feature should be used, this setting should be set to true.")
fs.Float64Var(&o.MinimumRecoveryTimeForInclusion, "minimum-recovery-time-for-inclusion", 600.0, "Defines the minimum uptime of the cluster before inclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of inclusions.")
fs.Float64Var(&o.MinimumRecoveryTimeForExclusion, "minimum-recovery-time-for-exclusion", 120.0, "Defines the minimum uptime of the cluster before exclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of exclusions.")
}

// StartManager will start the FoundationDB operator manager.
Expand Down Expand Up @@ -248,6 +252,8 @@ func StartManager(
clusterReconciler.EnableRecoveryState = operatorOpts.EnableRecoveryState
clusterReconciler.CacheDatabaseStatusForReconciliationDefault = operatorOpts.CacheDatabaseStatus
clusterReconciler.MinimumRequiredUptimeCCBounce = operatorOpts.MinimumRequiredUptimeCCBounce
clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion
clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion

if err := clusterReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, operatorOpts.EnableNodeIndex, *labelSelector, watchedObjects...); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "FoundationDBCluster")
Expand Down

0 comments on commit 60ea96f

Please sign in to comment.