diff --git a/controllers/cluster_controller.go b/controllers/cluster_controller.go index 5efaee059..9aca9ef27 100644 --- a/controllers/cluster_controller.go +++ b/controllers/cluster_controller.go @@ -63,6 +63,16 @@ type FoundationDBClusterReconciler struct { GetTimeout time.Duration PostTimeout time.Duration MinimumRequiredUptimeCCBounce time.Duration + // MinimumRecoveryTimeForInclusion defines the duration in seconds that a cluster must be up + // before new inclusions are allowed. The operator issuing frequent inclusions in a short time window + // could cause instability for the cluster as each inclusion will/can cause a recovery. Delaying the inclusion + // of deleted process groups is not an issue as all the process groups that have no resources and are marked for + // deletion and are fully excluded, will be batched together in a single inclusion call. + MinimumRecoveryTimeForInclusion float64 + // MinimumRecoveryTimeForExclusion defines the duration in seconds that a cluster must be up + // before new exclusions are allowed. The operator issuing frequent exclusions in a short time window + // could cause instability for the cluster as each exclusion will/can cause a recovery. + MinimumRecoveryTimeForExclusion float64 } // NewFoundationDBClusterReconciler creates a new FoundationDBClusterReconciler with defaults. diff --git a/controllers/exclude_processes.go b/controllers/exclude_processes.go index a1ea64e07..bb6ac02b1 100644 --- a/controllers/exclude_processes.go +++ b/controllers/exclude_processes.go @@ -71,8 +71,28 @@ func (e excludeProcesses) reconcile(_ context.Context, r *FoundationDBClusterRec return nil } + // Make sure the exclusions are coordinated across multiple operator instances. 
+ if cluster.ShouldUseLocks() { + lockClient, err := r.getLockClient(cluster) + if err != nil { + return &requeue{curError: err} + } + + _, err = lockClient.TakeLock() + if err != nil { + return &requeue{curError: err, delayedRequeue: true} + } + + defer func() { + err = lockClient.ReleaseLock() + if err != nil { + logger.Error(err, "could not release lock") + } + }() + } + // Make sure it's safe to exclude processes. - err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status) + err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, r.MinimumRecoveryTimeForExclusion) if err != nil { return &requeue{curError: err, delayedRequeue: true} } diff --git a/controllers/remove_process_groups.go b/controllers/remove_process_groups.go index 5b2057fb1..901b96954 100644 --- a/controllers/remove_process_groups.go +++ b/controllers/remove_process_groups.go @@ -63,7 +63,7 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust } } - remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status) + remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status, r.MinimumRecoveryTimeForExclusion) if err != nil { return &requeue{curError: err} } @@ -133,7 +133,6 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust // This will return a map of the newly removed ProcessGroups and the ProcessGroups with the ResourcesTerminating condition removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone]) - err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status) if err != nil { return &requeue{curError: err} } @@ -261,8 +260,28 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD return nil } + // Make sure the inclusions are coordinated across multiple operator instances. 
+ if cluster.ShouldUseLocks() { + lockClient, err := r.getLockClient(cluster) + if err != nil { + return err + } + + _, err = lockClient.TakeLock() + if err != nil { + return err + } + + defer func() { + err = lockClient.ReleaseLock() + if err != nil { + logger.Error(err, "could not release lock") + } + }() + } + // Make sure it's safe to include processes. - err = fdbstatus.CanSafelyIncludeProcesses(cluster, status) + err = fdbstatus.CanSafelyIncludeProcesses(cluster, status, r.MinimumRecoveryTimeForInclusion) if err != nil { return err } diff --git a/controllers/remove_process_groups_test.go b/controllers/remove_process_groups_test.go index 2f3896b62..78e50a9c4 100644 --- a/controllers/remove_process_groups_test.go +++ b/controllers/remove_process_groups_test.go @@ -183,7 +183,7 @@ var _ = Describe("remove_process_groups", func() { It("should not remove the process group and should not exclude processes", func() { Expect(result).NotTo(BeNil()) - Expect(result.message).To(Equal("Reconciliation needs to exclude more processes")) + Expect(result.curError).To(HaveOccurred()) // Ensure resources are not deleted removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup) Expect(err).To(BeNil()) diff --git a/docs/manual/technical_design.md b/docs/manual/technical_design.md index c79c8d8c3..b13c04db1 100644 --- a/docs/manual/technical_design.md +++ b/docs/manual/technical_design.md @@ -234,6 +234,15 @@ The `ChooseRemovals` subreconciler flags processes for removal when the current The `ExcludeProcesses` subreconciler runs an [exclude command](https://apple.github.io/foundationdb/administration.html#removing-machines-from-a-cluster) in `fdbcli` for any process group that is marked for removal and is not already being excluded. The `exclude` command tells FoundationDB that a process should not serve any roles, and that any data on that process should be moved to other processes. 
This exclusion can take a long time, but this subreconciler does not wait for exclusion to complete. +The operator will only run the exclude command if it is safe to run it. +The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks: + +- There is a low number of active generations. +- The cluster is available from the client perspective. +- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago. + +The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds. +Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients. The operator will only trigger a replacement if the new processes are available. In addition the operator will not trigger any exclusion if any of the process groups with the same process clas has the `MissingProcess` condition for less than 5 minutes. @@ -304,18 +313,38 @@ The `RemoveProcessGroups` subreconciler deletes any pods that are marked for rem This performs the following sequence of steps for every pod: -1. Confirm that the exclusion is complete -1. Trigger the deletion of the pod -1. Confirm that the pod is fully terminated -1. Trigger the deletion of the PVC -1. Confirm that the PVC is fully terminated -1. Trigger the deletion of the service -1. Confirm that the service is fully terminated +1. Confirm that the exclusion is complete. +1. Trigger the deletion of the pod. +1. Confirm that the pod is fully terminated. +1. Trigger the deletion of the PVC. +1. Confirm that the PVC is fully terminated. +1. Trigger the deletion of the service. +1. Confirm that the service is fully terminated. +1. Include the processes where all resources are deleted. +1. Remove the process group from the list of process groups. 
-If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation. However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one. This means that one pod being stuck in terminating should not block other pods from being deleted. +If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation. +However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one. +This means that one pod being stuck in terminating should not block other pods from being deleted. This will not allow deleting any pods that are serving as coordinators. +The `RemoveProcessGroups` subreconciler has some additional safety checks to reduce the risk of successive recoveries. + +The operator will only run the exclude command if it is safe to run it. +The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks: + +- There is a low number of active generations. +- The cluster is available from the client perspective. +- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago. + +The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds. +Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients. + +The same is true for the include operation with the difference that `MinimumRecoveryTimeForInclusion` is used to determine the minimum uptime of the cluster. +The `MinimumRecoveryTimeForInclusion` parameter can be changed with the `--minimum-recovery-time-for-inclusion` argument and the default is `600.0` seconds. 
+The operator will batch all outstanding inclusions together into a single include call. + ### UpdateStatus (again) Once we have completed all other steps in reconciliation, we run the `UpdateStatus` subreconciler a second time to check that everything is in the desired state. If there is anything that is not in the desired state, the operator will requeue reconciliation. diff --git a/e2e/fixtures/fdb_operator_client.go b/e2e/fixtures/fdb_operator_client.go index c6d28dfc7..eddbe3e82 100644 --- a/e2e/fixtures/fdb_operator_client.go +++ b/e2e/fixtures/fdb_operator_client.go @@ -259,6 +259,11 @@ spec: - --zap-log-level=debug - --minimum-required-uptime-for-cc-bounce=60s #- --server-side-apply + # We are setting low values here as the e2e tests are taking down processes multiple times + # and having a high wait time between recoveries will increase the reliability of the cluster but also + # increase the time our e2e tests take. + - --minimum-recovery-time-for-inclusion=1.0 + - --minimum-recovery-time-for-exclusion=1.0 image: {{ .OperatorImage }} name: manager imagePullPolicy: Always diff --git a/internal/removals/remove.go b/internal/removals/remove.go index d4da32e24..4fe4066dd 100644 --- a/internal/removals/remove.go +++ b/internal/removals/remove.go @@ -124,12 +124,21 @@ func GetZonedRemovals(processGroupsToRemove []*fdbv1beta2.ProcessGroupStatus) (m } // GetRemainingMap returns a map that indicates if a process group is fully excluded in the cluster. 
-func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) (map[string]bool, error) { +func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) (map[string]bool, error) { remainingMap, addresses := getAddressesToValidateBeforeRemoval(logger, cluster) if len(addresses) == 0 { return nil, nil } + // The CanSafelyRemoveFromStatus will run the exclusion command for processes that are either assumed to be fully excluded + // or processes that are currently missing from the machine-readable status. If it's not safe to run the exclude command, + // we will block all further checks and assume that those processes are not yet excluded. This should reduce the risk + // of successive recoveries because of the exclusion call. + err := fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, minRecoverySeconds) + if err != nil { + return nil, err + } + remaining, err := fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status) if err != nil { return nil, err } diff --git a/pkg/fdbstatus/status_checks.go b/pkg/fdbstatus/status_checks.go index 4a92253d4..d2bc2f8f0 100644 --- a/pkg/fdbstatus/status_checks.go +++ b/pkg/fdbstatus/status_checks.go @@ -551,14 +551,14 @@ func recoveryStateAllowsInclusion(status *fdbv1beta2.FoundationDBStatus) bool { // CanSafelyExcludeProcessesWithRecoveryState currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least 60 seconds ago. // In the future this check might be extended to perform more specific checks. 
-func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error { - return canSafelyExcludeOrIncludeProcesses(cluster, status, false, 120.0) +func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error { + return canSafelyExcludeOrIncludeProcesses(cluster, status, false, minRecoverySeconds) } // CanSafelyIncludeProcesses currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least 60 seconds ago. // In the future this check might be extended to perform more specific checks. -func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error { - return canSafelyExcludeOrIncludeProcesses(cluster, status, true, 300.0) +func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error { + return canSafelyExcludeOrIncludeProcesses(cluster, status, true, minRecoverySeconds) } // ConfigurationChangeAllowed will return an error if the configuration change is assumed to be unsafe. 
If no error diff --git a/pkg/fdbstatus/status_checks_test.go b/pkg/fdbstatus/status_checks_test.go index 405d898fb..5091482c9 100644 --- a/pkg/fdbstatus/status_checks_test.go +++ b/pkg/fdbstatus/status_checks_test.go @@ -1814,7 +1814,7 @@ var _ = Describe("status_checks", func() { When("performing the exclude safety check.", func() { DescribeTable("should return if the safety check is satisfied or not", func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) { - err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status) + err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status, 120.0) if expected == nil { Expect(err).To(BeNil()) } else { @@ -1950,7 +1950,7 @@ var _ = Describe("status_checks", func() { When("performing the include safety check.", func() { DescribeTable("should return if the safety check is satisfied or not", func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) { - err := CanSafelyIncludeProcesses(cluster, status) + err := CanSafelyIncludeProcesses(cluster, status, 300.0) if expected == nil { Expect(err).To(BeNil()) } else { diff --git a/setup/setup.go b/setup/setup.go index 6fbf58546..e35c8a09d 100644 --- a/setup/setup.go +++ b/setup/setup.go @@ -77,6 +77,8 @@ type Options struct { LogFileMaxSize int LogFileMaxAge int MaxNumberOfOldLogFiles int + MinimumRecoveryTimeForExclusion float64 + MinimumRecoveryTimeForInclusion float64 LogFileMinAge time.Duration GetTimeout time.Duration PostTimeout time.Duration @@ -130,6 +132,8 @@ func (o *Options) BindFlags(fs *flag.FlagSet) { fs.BoolVar(&o.EnableRecoveryState, "enable-recovery-state", true, "This flag enables the use of the recovery state for the minimum uptime between bounced if the FDB version supports it.") fs.BoolVar(&o.CacheDatabaseStatus, "cache-database-status", true, "Defines the default value for caching the database status.") fs.BoolVar(&o.EnableNodeIndex, "enable-node-index", false, 
"Defines if the operator should add an index for accessing node objects. This requires a ClusterRoleBinding with node access. If the taint feature should be used, this setting should be set to true.") + fs.Float64Var(&o.MinimumRecoveryTimeForInclusion, "minimum-recovery-time-for-inclusion", 600.0, "Defines the minimum uptime of the cluster before inclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of inclusions.") + fs.Float64Var(&o.MinimumRecoveryTimeForExclusion, "minimum-recovery-time-for-exclusion", 120.0, "Defines the minimum uptime of the cluster before exclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of exclusions.") } // StartManager will start the FoundationDB operator manager. @@ -248,6 +252,8 @@ func StartManager( clusterReconciler.EnableRecoveryState = operatorOpts.EnableRecoveryState clusterReconciler.CacheDatabaseStatusForReconciliationDefault = operatorOpts.CacheDatabaseStatus clusterReconciler.MinimumRequiredUptimeCCBounce = operatorOpts.MinimumRequiredUptimeCCBounce + clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion + clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion if err := clusterReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, operatorOpts.EnableNodeIndex, *labelSelector, watchedObjects...); err != nil { setupLog.Error(err, "unable to create controller", "controller", "FoundationDBCluster")