Skip to content

Commit

Permalink
Disable recovery monitor before recovery start
Browse files Browse the repository at this point in the history
We do nontrivial amounts of work before we start a peer recovery,
particularly recovering from the local translog up to its global
checkpoint. Today the recovery monitor runs during this time, and will
(repeatedly) fail the recovery if this local work takes more than 30
minutes to complete. With this commit we disable the recovery monitor
until this local work has completed.

Closes elastic#93542
  • Loading branch information
DaveCTurner committed Feb 7, 2023
1 parent 3f47fe9 commit d382a3d
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.elasticsearch.core.CheckedFunction;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.engine.RecoveryEngineException;
Expand Down Expand Up @@ -219,6 +220,7 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi
final RecoveryState recoveryState = recoveryTarget.state();
final RecoveryState.Timer timer = recoveryState.getTimer();
final IndexShard indexShard = recoveryTarget.indexShard();
final Releasable onCompletion = Releasables.wrap(recoveryTarget.disableRecoveryMonitor(), recoveryRef);

final var failureHandler = ActionListener.notifyOnce(ActionListener.runBefore(ActionListener.noop().delegateResponse((l, e) -> {
// this will be logged as warning later on...
Expand All @@ -228,7 +230,7 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi
new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e),
true
);
}), recoveryRef::close));
}), onCompletion::close));

if (indexShard.routingEntry().isPromotableToPrimary() == false) {
assert preExistingRequest == null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,11 @@ public boolean cancelRecoveriesForShard(ShardId shardId, String reason) {
}

/**
* a reference to {@link RecoveryTarget}, which implements {@link AutoCloseable}. closing the reference
* a reference to {@link RecoveryTarget}, which implements {@link Releasable}. closing the reference
* causes {@link RecoveryTarget#decRef()} to be called. This makes sure that the underlying resources
* will not be freed until {@link RecoveryRef#close()} is called.
*/
public static class RecoveryRef implements AutoCloseable {
public static class RecoveryRef implements Releasable {

private final RecoveryTarget status;
private final AtomicBoolean closed = new AtomicBoolean(false);
Expand Down

0 comments on commit d382a3d

Please sign in to comment.