Retry ensureInputsPresent/execute/download This observably removes any ill effect of CAS transience. Closes #5229. PiperOrigin-RevId: 204010317
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java index e86b41e..145ae9b 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java
@@ -40,6 +40,7 @@ private final CommandEnvironment env; @Nullable private final AbstractRemoteActionCache cache; @Nullable private final GrpcRemoteExecutor executor; + private final RemoteRetrier retrier; private final DigestUtil digestUtil; private final Path logDir; @@ -47,11 +48,13 @@ CommandEnvironment env, @Nullable AbstractRemoteActionCache cache, @Nullable GrpcRemoteExecutor executor, + RemoteRetrier retrier, DigestUtil digestUtil, Path logDir) { this.env = env; this.executor = executor; this.cache = cache; + this.retrier = retrier; this.digestUtil = digestUtil; this.logDir = logDir; } @@ -88,6 +91,7 @@ commandId, cache, executor, + retrier, digestUtil, logDir); return ImmutableList.of(new RemoteSpawnStrategy(env.getExecRoot(), spawnRunner));
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java index 58a6248..a12ee30 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java
@@ -28,6 +28,7 @@ import com.google.devtools.build.lib.buildtool.BuildRequest; import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.exec.ExecutorBuilder; +import com.google.devtools.build.lib.remote.Retrier.RetryException; import com.google.devtools.build.lib.remote.logging.LoggingInterceptor; import com.google.devtools.build.lib.remote.util.DigestUtil; import com.google.devtools.build.lib.runtime.BlazeModule; @@ -43,11 +44,18 @@ import com.google.devtools.common.options.OptionsBase; import com.google.devtools.common.options.OptionsProvider; import com.google.devtools.remoteexecution.v1test.Digest; +import com.google.protobuf.Any; +import com.google.protobuf.InvalidProtocolBufferException; +import com.google.rpc.PreconditionFailure; +import com.google.rpc.PreconditionFailure.Violation; import io.grpc.Channel; import io.grpc.ClientInterceptors; +import io.grpc.Status.Code; +import io.grpc.protobuf.StatusProto; import java.io.IOException; import java.util.Map; import java.util.concurrent.Executors; +import java.util.function.Predicate; import java.util.logging.Logger; /** RemoteModule provides distributed cache and remote execution for Bazel. */ @@ -114,6 +122,40 @@ "remote"); } + private static final String VIOLATION_TYPE_MISSING = "MISSING"; + + private static final Predicate<? super Exception> RETRIABLE_EXEC_ERRORS = + e -> { + if (e instanceof CacheNotFoundException || e.getCause() instanceof CacheNotFoundException) { + return true; + } + if (!(e instanceof RetryException) + || !RemoteRetrierUtils.causedByStatus((RetryException) e, Code.FAILED_PRECONDITION)) { + return false; + } + com.google.rpc.Status status = StatusProto.fromThrowable(e); + if (status == null || status.getDetailsCount() == 0) { + return false; + } + for (Any details : status.getDetailsList()) { + PreconditionFailure f; + try { + f = details.unpack(PreconditionFailure.class); + } catch (InvalidProtocolBufferException protoEx) { + return false; + } + if (f.getViolationsCount() == 0) { + return false; // Generally shouldn't happen + } + for (Violation v : f.getViolationsList()) { + if (!v.getType().equals(VIOLATION_TYPE_MISSING)) { + return false; + } + } + } + return true; // if *all* > 0 violations have type MISSING + }; + @Override public void beforeCommand(CommandEnvironment env) throws AbruptExitException { env.getEventBus().register(this); @@ -167,6 +209,7 @@ logger = new LoggingInterceptor(rpcLogFile, env.getRuntime().getClock()); } + final RemoteRetrier executeRetrier; final AbstractRemoteActionCache cache; if (enableBlobStoreCache) { Retrier retrier = @@ -175,6 +218,7 @@ (e) -> false, retryScheduler, Retrier.ALLOW_ALL_CALLS); + executeRetrier = null; cache = new SimpleBlobStoreActionCache( remoteOptions, @@ -197,6 +241,7 @@ RemoteRetrier.RETRIABLE_GRPC_ERRORS, retryScheduler, Retrier.ALLOW_ALL_CALLS); + executeRetrier = createExecuteRetrier(remoteOptions, retryScheduler); cache = new GrpcRemoteCache( ch, @@ -205,6 +250,7 @@ retrier, digestUtil); } else { + executeRetrier = null; cache = null; } @@ -230,7 +276,7 @@ executor = null; } actionContextProvider = - new RemoteActionContextProvider(env, cache, executor, digestUtil, logDir); + new RemoteActionContextProvider(env, cache, executor, executeRetrier, digestUtil, logDir); } catch (IOException e) { env.getReporter().handle(Event.error(e.getMessage())); env.getBlazeModuleEnvironment() @@ -272,4 +318,15 @@ return SimpleBlobStoreFactory.isRemoteCacheOptions(options) || GrpcRemoteCache.isRemoteCacheOptions(options); } + + public static RemoteRetrier createExecuteRetrier( + RemoteOptions options, ListeningScheduledExecutorService retryService) { + return new RemoteRetrier( + options.experimentalRemoteRetry + ? () -> new Retrier.ZeroBackoff(options.experimentalRemoteRetryMaxAttempts) + : () -> Retrier.RETRIES_DISABLED, + RemoteModule.RETRIABLE_EXEC_ERRORS, + retryService, + Retrier.ALLOW_ALL_CALLS); + } }
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 1198c2f..95511d0 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
@@ -88,6 +88,7 @@ @Nullable private final Reporter cmdlineReporter; @Nullable private final AbstractRemoteActionCache remoteCache; @Nullable private final GrpcRemoteExecutor remoteExecutor; + @Nullable private final RemoteRetrier retrier; private final String buildRequestId; private final String commandId; private final DigestUtil digestUtil; @@ -107,6 +108,7 @@ String commandId, @Nullable AbstractRemoteActionCache remoteCache, @Nullable GrpcRemoteExecutor remoteExecutor, + @Nullable RemoteRetrier retrier, DigestUtil digestUtil, Path logDir) { this.execRoot = execRoot; @@ -119,6 +121,7 @@ this.cmdlineReporter = cmdlineReporter; this.buildRequestId = buildRequestId; this.commandId = commandId; + this.retrier = retrier; this.digestUtil = digestUtil; this.logDir = logDir; } @@ -194,34 +197,26 @@ return execLocally(spawn, context, inputMap, uploadLocalResults, remoteCache, actionKey); } + ExecuteRequest request = + ExecuteRequest.newBuilder() + .setInstanceName(remoteOptions.remoteInstanceName) + .setAction(action) + .setSkipCacheLookup(!acceptCachedResult) + .build(); try { - // Upload the command and all the inputs into the remote cache. - remoteCache.ensureInputsPresent(repository, execRoot, inputRoot, command); - } catch (IOException e) { - return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e); - } + return retrier.execute( + () -> { + // Upload the command and all the inputs into the remote cache. + remoteCache.ensureInputsPresent(repository, execRoot, inputRoot, command); - final ActionResult result; - boolean remoteCacheHit = false; - try { - ExecuteRequest.Builder request = - ExecuteRequest.newBuilder() - .setInstanceName(remoteOptions.remoteInstanceName) - .setAction(action) - .setSkipCacheLookup(!acceptCachedResult); - ExecuteResponse reply = remoteExecutor.executeRemotely(request.build()); - maybeDownloadServerLogs(reply, actionKey); - result = reply.getResult(); - remoteCacheHit = reply.getCachedResult(); - } catch (IOException e) { - return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e); - } + ExecuteResponse reply = remoteExecutor.executeRemotely(request); + maybeDownloadServerLogs(reply, actionKey); - try { - return downloadRemoteResults(result, context.getFileOutErr()) - .setRunnerName(remoteCacheHit ? "remote cache hit" : getName()) - .setCacheHit(remoteCacheHit) - .build(); + return downloadRemoteResults(reply.getResult(), context.getFileOutErr()) + .setRunnerName(reply.getCachedResult() ? "remote cache hit" : getName()) + .setCacheHit(reply.getCachedResult()) + .build(); + }); } catch (IOException e) { return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e); }
diff --git a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java index a329c04..9d7228d 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java +++ b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
@@ -185,6 +185,31 @@ } }; + /** No backoff. */ + public static class ZeroBackoff implements Backoff { + + private final int maxRetries; + private int retries; + + public ZeroBackoff(int maxRetries) { + this.maxRetries = maxRetries; + } + + @Override + public long nextDelayMillis() { + if (retries >= maxRetries) { + return -1; + } + retries++; + return 0; + } + + @Override + public int getRetryAttempts() { + return retries; + } + } + private final Supplier<Backoff> backoffSupplier; private final Predicate<? super Exception> shouldRetry; private final CircuitBreaker circuitBreaker;
diff --git a/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java b/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java index 5c45d45..849259f 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java
@@ -277,6 +277,7 @@ "command-id", remoteCache, executor, + retrier, DIGEST_UTIL, logDir); inputDigest = fakeFileCache.createScratchInput(simpleSpawn.getInputFiles().get(0), "xyz");
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java index 283cc95..6b024b5 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
@@ -30,6 +30,8 @@ import com.google.common.collect.ImmutableMap; import com.google.common.eventbus.EventBus; import com.google.common.io.ByteStreams; +import com.google.common.util.concurrent.ListeningScheduledExecutorService; +import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.SettableFuture; import com.google.devtools.build.lib.actions.ActionInput; import com.google.devtools.build.lib.actions.Artifact.ArtifactExpander; @@ -79,7 +81,10 @@ import java.time.Duration; import java.util.Collection; import java.util.SortedMap; +import java.util.concurrent.Executors; +import org.junit.AfterClass; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -94,6 +99,7 @@ private static final ImmutableMap<String, String> NO_CACHE = ImmutableMap.of(ExecutionRequirements.NO_CACHE, ""); + private static ListeningScheduledExecutorService retryService; private Path execRoot; private Path logDir; @@ -101,6 +107,9 @@ private FakeActionInputFileCache fakeFileCache; private FileOutErr outErr; + private RemoteOptions options; + private RemoteRetrier retrier; + @Mock private AbstractRemoteActionCache cache; @Mock @@ -113,6 +122,11 @@ private final String simpleActionId = "eb45b20cc979d504f96b9efc9a08c48103c6f017afa09c0df5c70a5f92a98ea8"; + @BeforeClass + public static void beforeEverything() { + retryService = MoreExecutors.listeningDecorator(Executors.newScheduledThreadPool(1)); + } + @Before public final void setUp() throws Exception { MockitoAnnotations.initMocks(this); @@ -128,6 +142,14 @@ FileSystemUtils.createDirectoryAndParents(stdout.getParentDirectory()); FileSystemUtils.createDirectoryAndParents(stderr.getParentDirectory()); outErr = new FileOutErr(stdout, stderr); + + options = Options.getDefaults(RemoteOptions.class); + retrier = RemoteModule.createExecuteRetrier(options, retryService); + } + + @AfterClass + public static void afterEverything() { + retryService.shutdownNow(); } @Test @@ -137,7 +159,6 @@ // It should be executed remotely, but marked non-cacheable to remote execution, so that // the action result is not saved in the remote cache. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteAcceptCached = true; options.remoteLocalFallback = false; options.remoteUploadLocalResults = true; @@ -154,6 +175,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -197,7 +219,6 @@ // Test that if a spawn is executed locally, due to the local fallback, that its result is not // uploaded to the remote cache. However, the artifacts should still be uploaded. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteAcceptCached = true; options.remoteLocalFallback = true; options.remoteUploadLocalResults = true; @@ -214,6 +235,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir); @@ -252,7 +274,6 @@ // Test that the outputs of a failed locally executed action are uploaded to a remote cache, // but the action result itself is not. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteUploadLocalResults = true; RemoteSpawnRunner runner = @@ -268,6 +289,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir)); @@ -318,6 +340,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir)); @@ -334,7 +357,6 @@ public void printWarningIfCacheIsDown() throws Exception { // If we try to upload to a local cache, that is down a warning should be printed. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteUploadLocalResults = true; options.remoteLocalFallback = true; @@ -354,6 +376,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir); @@ -396,7 +419,6 @@ public void noRemoteExecutorFallbackFails() throws Exception { // Errors from the fallback runner should be propogated out of the remote runner. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteUploadLocalResults = true; options.remoteLocalFallback = true; @@ -412,6 +434,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir); @@ -437,7 +460,6 @@ public void remoteCacheErrorFallbackFails() throws Exception { // Errors from the fallback runner should be propogated out of the remote runner. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteUploadLocalResults = true; options.remoteLocalFallback = true; @@ -453,6 +475,7 @@ "command-id", cache, null, + retrier, digestUtil, logDir); @@ -476,7 +499,6 @@ @Test public void testLocalFallbackFailureRemoteExecutorFailure() throws Exception { - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = true; RemoteSpawnRunner runner = @@ -491,6 +513,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -527,6 +550,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -568,6 +592,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -612,6 +637,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -651,6 +677,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -680,8 +707,6 @@ public void cacheDownloadFailureTriggersRemoteExecution() throws Exception { // If downloading a cached action fails, remote execution should be tried. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); - RemoteSpawnRunner runner = new RemoteSpawnRunner( execRoot, @@ -694,6 +719,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -722,7 +748,6 @@ public void testRemoteExecutionTimeout() throws Exception { // If remote execution times out the SpawnResult status should be TIMEOUT. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = false; RemoteSpawnRunner runner = @@ -737,6 +762,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -771,7 +797,6 @@ // If remote execution times out the SpawnResult status should be TIMEOUT, regardess of local // fallback option. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = true; RemoteSpawnRunner runner = @@ -786,6 +811,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -818,7 +844,6 @@ @Test public void testRemoteExecutionCommandFailureDoesNotTriggerFallback() throws Exception { - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = true; RemoteSpawnRunner runner = @@ -833,6 +858,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -860,7 +886,6 @@ // If we get a failure due to the remote cache not working, the exit code should be // ExitCode.REMOTE_ERROR. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = false; RemoteSpawnRunner runner = @@ -875,6 +900,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -898,7 +924,6 @@ // If we get a failure due to the remote executor not working, the exit code should be // ExitCode.REMOTE_ERROR. - RemoteOptions options = Options.getDefaults(RemoteOptions.class); options.remoteLocalFallback = false; RemoteSpawnRunner runner = @@ -913,6 +938,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir); @@ -947,6 +973,7 @@ "command-id", cache, executor, + retrier, digestUtil, logDir);
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java index 624d074..7dde0c6 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
@@ -28,6 +28,7 @@ import com.google.devtools.build.lib.remote.Retrier.CircuitBreaker.State; import com.google.devtools.build.lib.remote.Retrier.CircuitBreakerException; import com.google.devtools.build.lib.remote.Retrier.RetryException; +import com.google.devtools.build.lib.remote.Retrier.ZeroBackoff; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; @@ -318,28 +319,4 @@ state = State.TRIAL_CALL; } } - - private static class ZeroBackoff implements Backoff { - - private final int maxRetries; - private int retries; - - public ZeroBackoff(int maxRetries) { - this.maxRetries = maxRetries; - } - - @Override - public long nextDelayMillis() { - if (retries >= maxRetries) { - return -1; - } - retries++; - return 0; - } - - @Override - public int getRetryAttempts() { - return retries; - } - } }