Skip to content

Commit

Permalink
Add documentation for new safety checks, allow to set the minimum upt…
Browse files Browse the repository at this point in the history
…ime and make sure exclude and include calls are coordinated
  • Loading branch information
johscheuer committed Mar 15, 2024
1 parent 1dbfa7e commit 60ea96f
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 20 deletions.
10 changes: 10 additions & 0 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ type FoundationDBClusterReconciler struct {
GetTimeout time.Duration
PostTimeout time.Duration
MinimumRequiredUptimeCCBounce time.Duration
// MinimumRecoveryTimeForInclusion defines the duration in seconds that a cluster must be up
// before new inclusions are allowed. The operator issuing frequent inclusions in a short time window
// could cause instability for the cluster as each inclusion will/can cause a recovery. Delaying the inclusion
// of deleted process groups is not an issue as all the process groups that have no resources and are marked for
// deletion and are fully excluded, will be batched together in a single inclusion call.
MinimumRecoveryTimeForInclusion float64
// MinimumRecoveryTimeForExclusion defines the duration in seconds that a cluster must be up
// before new exclusions are allowed. The operator issuing frequent exclusions in a short time window
// could cause instability for the cluster as each exclusion will/can cause a recovery.
MinimumRecoveryTimeForExclusion float64
}

// NewFoundationDBClusterReconciler creates a new FoundationDBClusterReconciler with defaults.
Expand Down
22 changes: 21 additions & 1 deletion controllers/exclude_processes.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,28 @@ func (e excludeProcesses) reconcile(_ context.Context, r *FoundationDBClusterRec
return nil
}

// Make sure the exclusions are coordinated across multiple operator instances.
if cluster.ShouldUseLocks() {
lockClient, err := r.getLockClient(cluster)
if err != nil {
return &requeue{curError: err}
}

_, err = lockClient.TakeLock()
if err != nil {
return &requeue{curError: err, delayedRequeue: true}
}

defer func() {
err = lockClient.ReleaseLock()
if err != nil {
logger.Error(err, "could not release lock")
}
}()
}

// Make sure it's safe to exclude processes.
err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status)
err = fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, r.MinimumRecoveryTimeForExclusion)
if err != nil {
return &requeue{curError: err, delayedRequeue: true}
}
Expand Down
25 changes: 22 additions & 3 deletions controllers/remove_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust
}
}

remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status)
remainingMap, err := removals.GetRemainingMap(logger, adminClient, cluster, status, r.MinimumRecoveryTimeForExclusion)
if err != nil {
return &requeue{curError: err}
}
Expand Down Expand Up @@ -133,7 +133,6 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust

// This will return a map of the newly removed ProcessGroups and the ProcessGroups with the ResourcesTerminating condition
removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone])

err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status)
if err != nil {
return &requeue{curError: err}
Expand Down Expand Up @@ -261,8 +260,28 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD
return nil
}

// Make sure the inclusions are coordinated across multiple operator instances.
if cluster.ShouldUseLocks() {
lockClient, err := r.getLockClient(cluster)
if err != nil {
return err
}

_, err = lockClient.TakeLock()
if err != nil {
return err
}

defer func() {
err = lockClient.ReleaseLock()
if err != nil {
logger.Error(err, "could not release lock")
}
}()
}

// Make sure it's safe to include processes.
err = fdbstatus.CanSafelyIncludeProcesses(cluster, status)
err = fdbstatus.CanSafelyIncludeProcesses(cluster, status, r.MinimumRecoveryTimeForInclusion)
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/remove_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ var _ = Describe("remove_process_groups", func() {

It("should not remove the process group and should not exclude processes", func() {
Expect(result).NotTo(BeNil())
Expect(result.message).To(Equal("Reconciliation needs to exclude more processes"))
Expect(result.curError).To(HaveOccurred())
// Ensure resources are not deleted
removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup)
Expect(err).To(BeNil())
Expand Down
45 changes: 37 additions & 8 deletions docs/manual/technical_design.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,15 @@ The `ChooseRemovals` subreconciler flags processes for removal when the current
The `ExcludeProcesses` subreconciler runs an [exclude command](https://apple.github.io/foundationdb/administration.html#removing-machines-from-a-cluster) in `fdbcli` for any process group that is marked for removal and is not already being excluded.
The `exclude` command tells FoundationDB that a process should not serve any roles, and that any data on that process should be moved to other processes.
This exclusion can take a long time, but this subreconciler does not wait for exclusion to complete.
The operator will only run the exclude command if it is safe to run it.
The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks:

- There is a low number of active generations.
- The cluster is available from the client perspective.
- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago.

The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds.
Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients.

The operator will only trigger a replacement if the new processes are available.
In addition, the operator will not trigger any exclusion if any of the process groups with the same process class has the `MissingProcess` condition for less than 5 minutes.
Expand Down Expand Up @@ -304,18 +313,38 @@ The `RemoveProcessGroups` subreconciler deletes any pods that are marked for rem

This performs the following sequence of steps for every pod:

1. Confirm that the exclusion is complete
1. Trigger the deletion of the pod
1. Confirm that the pod is fully terminated
1. Trigger the deletion of the PVC
1. Confirm that the PVC is fully terminated
1. Trigger the deletion of the service
1. Confirm that the service is fully terminated
1. Confirm that the exclusion is complete.
1. Trigger the deletion of the pod.
1. Confirm that the pod is fully terminated.
1. Trigger the deletion of the PVC.
1. Confirm that the PVC is fully terminated.
1. Trigger the deletion of the service.
1. Confirm that the service is fully terminated.
1. Include the processes where all resources are deleted.
1. Remove the process group from the list of process groups.

If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation. However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one. This means that one pod being stuck in terminating should not block other pods from being deleted.
If any process group is marked for removal but cannot complete the sequence above, this will requeue reconciliation.
However, we will always run through this sequence on all of the process groups that need to be removed, getting as far as we can for each one.
This means that one pod being stuck in terminating should not block other pods from being deleted.

This will not allow deleting any pods that are serving as coordinators.

The `RemoveProcessGroups` subreconciler has some additional safety checks to reduce the risk of successive recoveries.

The operator will only run the exclude command if it is safe to run it.
The safety checks are defined in the [status_checks.go](../../pkg/fdbstatus/status_checks.go) file and include the following checks:

- There is a low number of active generations.
- The cluster is available from the client perspective.
- The last recovery was at least `MinimumRecoveryTimeForExclusion` seconds ago.

The `MinimumRecoveryTimeForExclusion` parameter can be changed with the `--minimum-recovery-time-for-exclusion` argument and the default is `120.0` seconds.
Having a wait time between the exclusions will reduce the risk of successive recoveries which might cause issues to clients.

The same is true for the include operation with the difference that `MinimumRecoveryTimeForInclusion` is used to determine the minimum uptime of the cluster.
The `MinimumRecoveryTimeForInclusion` parameter can be changed with the `--minimum-recovery-time-for-inclusion` argument and the default is `600.0` seconds.
The operator will batch all outstanding inclusions together into a single include call.

### UpdateStatus (again)

Once we have completed all other steps in reconciliation, we run the `UpdateStatus` subreconciler a second time to check that everything is in the desired state. If there is anything that is not in the desired state, the operator will requeue reconciliation.
Expand Down
5 changes: 5 additions & 0 deletions e2e/fixtures/fdb_operator_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ spec:
- --zap-log-level=debug
- --minimum-required-uptime-for-cc-bounce=60s
#- --server-side-apply
# We are setting low values here as the e2e test are taking down processes multiple times
# and having a high wait time between recoveries will increase the reliability of the cluster but also
# increase the time our e2e test take.
- --minimum-recovery-time-for-inclusion=1.0
- --minimum-recovery-time-for-exclusion=1.0
image: {{ .OperatorImage }}
name: manager
imagePullPolicy: Always
Expand Down
11 changes: 10 additions & 1 deletion internal/removals/remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,21 @@ func GetZonedRemovals(processGroupsToRemove []*fdbv1beta2.ProcessGroupStatus) (m
}

// GetRemainingMap returns a map that indicates if a process group is fully excluded in the cluster.
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) (map[string]bool, error) {
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) (map[string]bool, error) {
remainingMap, addresses := getAddressesToValidateBeforeRemoval(logger, cluster)
if len(addresses) == 0 {
return nil, nil
}

// The CanSafelyRemoveFromStatus will run the exclusion command for processes that are either assumed to be fully excluded
// and processes that are currently missing from the machine-readable status. If it's not safe to run the exclude command
// we will block all further checks and assume that those processes are not yet excluded. This should reduce the risk
// of successive recoveries because of the exclusion call.
err := fdbstatus.CanSafelyExcludeProcessesWithRecoveryState(cluster, status, minRecoverySeconds)
if err != nil {
return nil, err
}

remaining, err := fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
if err != nil {
return nil, err
Expand Down
8 changes: 4 additions & 4 deletions pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,14 +551,14 @@ func recoveryStateAllowsInclusion(status *fdbv1beta2.FoundationDBStatus) bool {

// CanSafelyExcludeProcessesWithRecoveryState currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least minRecoverySeconds seconds ago.
// In the future this check might be extended to perform more specific checks.
func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, false, 120.0)
func CanSafelyExcludeProcessesWithRecoveryState(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, false, minRecoverySeconds)
}

// CanSafelyIncludeProcesses currently performs the DefaultSafetyChecks and makes sure that the last recovery was at least minRecoverySeconds seconds ago.
// In the future this check might be extended to perform more specific checks.
func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, true, 300.0)
func CanSafelyIncludeProcesses(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, minRecoverySeconds float64) error {
return canSafelyExcludeOrIncludeProcesses(cluster, status, true, minRecoverySeconds)
}

// ConfigurationChangeAllowed will return an error if the configuration change is assumed to be unsafe. If no error
Expand Down
4 changes: 2 additions & 2 deletions pkg/fdbstatus/status_checks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1814,7 +1814,7 @@ var _ = Describe("status_checks", func() {
When("performing the exclude safety check.", func() {
DescribeTable("should return if the safety check is satisfied or not",
func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) {
err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status)
err := CanSafelyExcludeProcessesWithRecoveryState(cluster, status, 120.0)
if expected == nil {
Expect(err).To(BeNil())
} else {
Expand Down Expand Up @@ -1950,7 +1950,7 @@ var _ = Describe("status_checks", func() {
When("performing the include safety check.", func() {
DescribeTable("should return if the safety check is satisfied or not",
func(cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, expected error) {
err := CanSafelyIncludeProcesses(cluster, status)
err := CanSafelyIncludeProcesses(cluster, status, 300.0)
if expected == nil {
Expect(err).To(BeNil())
} else {
Expand Down
6 changes: 6 additions & 0 deletions setup/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ type Options struct {
LogFileMaxSize int
LogFileMaxAge int
MaxNumberOfOldLogFiles int
MinimumRecoveryTimeForExclusion float64
MinimumRecoveryTimeForInclusion float64
LogFileMinAge time.Duration
GetTimeout time.Duration
PostTimeout time.Duration
Expand Down Expand Up @@ -130,6 +132,8 @@ func (o *Options) BindFlags(fs *flag.FlagSet) {
fs.BoolVar(&o.EnableRecoveryState, "enable-recovery-state", true, "This flag enables the use of the recovery state for the minimum uptime between bounced if the FDB version supports it.")
fs.BoolVar(&o.CacheDatabaseStatus, "cache-database-status", true, "Defines the default value for caching the database status.")
fs.BoolVar(&o.EnableNodeIndex, "enable-node-index", false, "Defines if the operator should add an index for accessing node objects. This requires a ClusterRoleBinding with node access. If the taint feature should be used, this setting should be set to true.")
fs.Float64Var(&o.MinimumRecoveryTimeForInclusion, "minimum-recovery-time-for-inclusion", 600.0, "Defines the minimum uptime of the cluster before inclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of inclusions.")
fs.Float64Var(&o.MinimumRecoveryTimeForExclusion, "minimum-recovery-time-for-exclusion", 120.0, "Defines the minimum uptime of the cluster before exclusions are allowed. For clusters after 7.1 this will use the recovery state. This should reduce the risk of frequent recoveries because of exclusions.")
}

// StartManager will start the FoundationDB operator manager.
Expand Down Expand Up @@ -248,6 +252,8 @@ func StartManager(
clusterReconciler.EnableRecoveryState = operatorOpts.EnableRecoveryState
clusterReconciler.CacheDatabaseStatusForReconciliationDefault = operatorOpts.CacheDatabaseStatus
clusterReconciler.MinimumRequiredUptimeCCBounce = operatorOpts.MinimumRequiredUptimeCCBounce
clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion
clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion

if err := clusterReconciler.SetupWithManager(mgr, operatorOpts.MaxConcurrentReconciles, operatorOpts.EnableNodeIndex, *labelSelector, watchedObjects...); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "FoundationDBCluster")
Expand Down

0 comments on commit 60ea96f

Please sign in to comment.