Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Automatically retry the build if a remote cache eviction error is encountered
Browse files Browse the repository at this point in the history
coeuvre committed Mar 10, 2023
1 parent f9008f6 commit 1a43dcb
Showing 8 changed files with 215 additions and 68 deletions.
Original file line number Diff line number Diff line change
@@ -32,7 +32,9 @@
import com.google.devtools.build.lib.analysis.test.TestProvider;
import com.google.devtools.build.lib.bugreport.BugReporter;
import com.google.devtools.build.lib.buildtool.buildevent.ExecutionProgressReceiverAvailableEvent;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.Reporter;
import com.google.devtools.build.lib.exec.ExecutionOptions;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.SilentCloseable;
import com.google.devtools.build.lib.runtime.KeepGoingOption;
@@ -44,6 +46,7 @@
import com.google.devtools.build.lib.util.AbruptExitException;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.DetailedExitCode.DetailedExitCodeComparator;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.vfs.ModifiedFileSet;
import com.google.devtools.build.skyframe.EvaluationResult;
import com.google.devtools.common.options.OptionsProvider;
@@ -100,12 +103,6 @@ public void buildArtifacts(
TopLevelArtifactContext topLevelArtifactContext,
boolean trustRemoteArtifacts)
throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException {
BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class);
// TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant
// but plumbing the flag through is hard.
int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads;
skyframeExecutor.detectModifiedOutputFiles(
modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads);
try (SilentCloseable c = Profiler.instance().profile("configureActionExecutor")) {
skyframeExecutor.configureActionExecutor(fileCache, actionInputPrefetcher);
}
@@ -119,9 +116,6 @@ public void buildArtifacts(
.getEventBus()
.post(new ExecutionProgressReceiverAvailableEvent(executionProgressReceiver));

List<DetailedExitCode> detailedExitCodes = new ArrayList<>();
EvaluationResult<?> result;

ActionExecutionStatusReporter statusReporter = ActionExecutionStatusReporter.create(
reporter, skyframeExecutor.getEventBus());

@@ -141,70 +135,126 @@ public void buildArtifacts(
parallelTests = Sets.difference(parallelTests, targetsToSkip);
exclusiveTests = Sets.difference(exclusiveTests, targetsToSkip);

var remoteCacheEvictionRetries =
options.getOptions(ExecutionOptions.class).remoteCacheEvictionRetries;
try {
result =
skyframeExecutor.buildArtifacts(
while (true) {
try {
buildArtifactsOnce(
reporter,
resourceManager,
executor,
artifacts,
targetsToBuild,
aspects,
parallelTests,
exclusiveTests,
targetsToBuild,
aspects,
executor,
options,
actionCacheChecker,
lastExecutionTimeRange,
topLevelArtifactContext,
trustRemoteArtifacts,
executionProgressReceiver,
isBuildingExclusiveArtifacts);
break;
} catch (BuildFailedException e) {
if (e.getDetailedExitCode().getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
if (remoteCacheEvictionRetries > 0) {
--remoteCacheEvictionRetries;
reporter.handle(
Event.warn("Found remote cache eviction error, retrying the build..."));
continue;
}
}
throw e;
}
}
} finally {
watchdog.stop();
skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null);
statusReporter.unregisterFromEventBus();
}
}

private void buildArtifactsOnce(
Reporter reporter,
Set<Artifact> artifacts,
Set<ConfiguredTarget> parallelTests,
Set<ConfiguredTarget> exclusiveTests,
Set<ConfiguredTarget> targetsToBuild,
ImmutableSet<AspectKey> aspects,
Executor executor,
OptionsProvider options,
@Nullable Range<Long> lastExecutionTimeRange,
TopLevelArtifactContext topLevelArtifactContext,
boolean trustRemoteArtifacts,
ExecutionProgressReceiver executionProgressReceiver,
AtomicBoolean isBuildingExclusiveArtifacts)
throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException {
BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class);
// TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant
// but plumbing the flag through is hard.
int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads;
skyframeExecutor.detectModifiedOutputFiles(
modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads);

List<DetailedExitCode> detailedExitCodes = new ArrayList<>();
EvaluationResult<?> result =
skyframeExecutor.buildArtifacts(
reporter,
resourceManager,
executor,
artifacts,
targetsToBuild,
aspects,
parallelTests,
exclusiveTests,
options,
actionCacheChecker,
executionProgressReceiver,
topLevelArtifactContext);
// progressReceiver is finished, so unsynchronized access to builtTargets is now safe.
DetailedExitCode detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}

// Run exclusive tests: either tagged as "exclusive" or is run in an invocation with
// --test_output=streamed.
isBuildingExclusiveArtifacts.set(true);
for (ConfiguredTarget exclusiveTest : exclusiveTests) {
// Since only one artifact is being built at a time, we don't worry about an artifact being
// built and then the build being interrupted.
result =
skyframeExecutor.runExclusiveTest(
reporter,
resourceManager,
executor,
exclusiveTest,
options,
actionCacheChecker,
topLevelArtifactContext);
// progressReceiver is finished, so unsynchronized access to builtTargets is now safe.
DetailedExitCode detailedExitCode =
detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);
Preconditions.checkState(
detailedExitCode != null || !result.keyNames().isEmpty(),
"Build reported as successful but test %s not executed: %s",
exclusiveTest,
result);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}

// Run exclusive tests: either tagged as "exclusive" or is run in an invocation with
// --test_output=streamed.
isBuildingExclusiveArtifacts.set(true);
for (ConfiguredTarget exclusiveTest : exclusiveTests) {
// Since only one artifact is being built at a time, we don't worry about an artifact being
// built and then the build being interrupted.
result =
skyframeExecutor.runExclusiveTest(
reporter,
resourceManager,
executor,
exclusiveTest,
options,
actionCacheChecker,
topLevelArtifactContext);
detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);
Preconditions.checkState(
detailedExitCode != null || !result.keyNames().isEmpty(),
"Build reported as successful but test %s not executed: %s",
exclusiveTest,
result);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}
}
} finally {
watchdog.stop();
skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null);
statusReporter.unregisterFromEventBus();
}

if (detailedExitCodes.isEmpty()) {
Original file line number Diff line number Diff line change
@@ -495,6 +495,16 @@ public boolean usingLocalTestJobs() {
+ "test log. Otherwise, Bazel generates a test.xml as part of the test action.")
public boolean splitXmlGeneration;

/**
 * Upper bound on how many times the build is retried after it fails with a remote cache
 * eviction error; set via {@code --experimental_remote_cache_eviction_retries}.
 *
 * <p>Defaults to 0, i.e. no automatic retry.
 *
 * <p>NOTE(review): nothing in this declaration rejects negative values; presumably they are
 * treated the same as 0 by the consumer — confirm.
 */
@Option(
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction error.")
public int remoteCacheEvictionRetries;


/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
SUMMARY, // Provide summary output only.
Original file line number Diff line number Diff line change
@@ -684,7 +684,9 @@ public void flushOutputTree() throws InterruptedException {
downloadCache.awaitInProgressTasks();
}

public ImmutableSet<ActionInput> getMissingActionInputs() {
return ImmutableSet.copyOf(missingActionInputs);
public ImmutableSet<ActionInput> takeMissingActionInputs() {
var result = ImmutableSet.copyOf(missingActionInputs);
missingActionInputs.removeAll(result);
return result;
}
}
Original file line number Diff line number Diff line change
@@ -152,10 +152,7 @@ protected Completable onErrorResumeNext(Throwable error) {
new EnvironmentalExecException(
(BulkTransferException) error,
FailureDetail.newBuilder()
.setMessage(
"Failed to fetch blobs because they do not exist remotely."
+ " Build without the Bytes does not work if your remote"
+ " cache evicts blobs during builds")
.setMessage("Failed to fetch blobs because they do not exist remotely")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.build());
}
Original file line number Diff line number Diff line change
@@ -20,13 +20,13 @@
import com.google.common.collect.ImmutableMap;
import com.google.common.eventbus.Subscribe;
import com.google.devtools.build.lib.actions.Action;
import com.google.devtools.build.lib.actions.ActionCompletionEvent;
import com.google.devtools.build.lib.actions.ActionInputMap;
import com.google.devtools.build.lib.actions.Artifact;
import com.google.devtools.build.lib.actions.ArtifactPathResolver;
import com.google.devtools.build.lib.actions.FilesetOutputSymlink;
import com.google.devtools.build.lib.actions.cache.MetadataHandler;
import com.google.devtools.build.lib.actions.cache.MetadataInjector;
import com.google.devtools.build.lib.buildtool.buildevent.ExecutionPhaseCompleteEvent;
import com.google.devtools.build.lib.events.EventHandler;
import com.google.devtools.build.lib.util.AbruptExitException;
import com.google.devtools.build.lib.vfs.BatchStat;
@@ -115,9 +115,9 @@ public void finalizeBuild(boolean buildSuccessful) {
}

@Subscribe
public void onExecutionPhaseCompleteEvent(ExecutionPhaseCompleteEvent event) {
public void onActionCompletion(ActionCompletionEvent event) {
if (leaseService != null && actionInputFetcher != null) {
leaseService.handleMissingInputs(actionInputFetcher.getMissingActionInputs());
leaseService.handleMissingInputs(actionInputFetcher.takeMissingActionInputs());
}
}

Original file line number Diff line number Diff line change
@@ -54,6 +54,7 @@
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
import com.google.devtools.build.lib.util.LoggingUtil;
import com.google.devtools.build.lib.util.Pair;
Original file line number Diff line number Diff line change
@@ -596,7 +596,7 @@ public void missingInputs_addedToList() {
assertThrows(
Exception.class, () -> wait(prefetcher.prefetchFiles(metadata.keySet(), metadataProvider)));

assertThat(prefetcher.getMissingActionInputs()).contains(a);
assertThat(prefetcher.takeMissingActionInputs()).contains(a);
}

protected static void wait(ListenableFuture<Void> future)
Loading

0 comments on commit 1a43dcb

Please sign in to comment.