Retry ensureInputsPresent/execute/download

This observably removes any ill effect of CAS transience.

Closes #5229.

PiperOrigin-RevId: 204010317
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java
index e86b41e..145ae9b 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionContextProvider.java
@@ -40,6 +40,7 @@
   private final CommandEnvironment env;
   @Nullable private final AbstractRemoteActionCache cache;
   @Nullable private final GrpcRemoteExecutor executor;
+  private final RemoteRetrier retrier;
   private final DigestUtil digestUtil;
   private final Path logDir;
 
@@ -47,11 +48,13 @@
       CommandEnvironment env,
       @Nullable AbstractRemoteActionCache cache,
       @Nullable GrpcRemoteExecutor executor,
+      RemoteRetrier retrier,
       DigestUtil digestUtil,
       Path logDir) {
     this.env = env;
     this.executor = executor;
     this.cache = cache;
+    this.retrier = retrier;
     this.digestUtil = digestUtil;
     this.logDir = logDir;
   }
@@ -88,6 +91,7 @@
               commandId,
               cache,
               executor,
+              retrier,
               digestUtil,
               logDir);
       return ImmutableList.of(new RemoteSpawnStrategy(env.getExecRoot(), spawnRunner));
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java
index 58a6248..a12ee30 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java
@@ -28,6 +28,7 @@
 import com.google.devtools.build.lib.buildtool.BuildRequest;
 import com.google.devtools.build.lib.events.Event;
 import com.google.devtools.build.lib.exec.ExecutorBuilder;
+import com.google.devtools.build.lib.remote.Retrier.RetryException;
 import com.google.devtools.build.lib.remote.logging.LoggingInterceptor;
 import com.google.devtools.build.lib.remote.util.DigestUtil;
 import com.google.devtools.build.lib.runtime.BlazeModule;
@@ -43,11 +44,18 @@
 import com.google.devtools.common.options.OptionsBase;
 import com.google.devtools.common.options.OptionsProvider;
 import com.google.devtools.remoteexecution.v1test.Digest;
+import com.google.protobuf.Any;
+import com.google.protobuf.InvalidProtocolBufferException;
+import com.google.rpc.PreconditionFailure;
+import com.google.rpc.PreconditionFailure.Violation;
 import io.grpc.Channel;
 import io.grpc.ClientInterceptors;
+import io.grpc.Status.Code;
+import io.grpc.protobuf.StatusProto;
 import java.io.IOException;
 import java.util.Map;
 import java.util.concurrent.Executors;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
 
 /** RemoteModule provides distributed cache and remote execution for Bazel. */
@@ -114,6 +122,40 @@
         "remote");
   }
 
+  private static final String VIOLATION_TYPE_MISSING = "MISSING";
+
+  private static final Predicate<? super Exception> RETRIABLE_EXEC_ERRORS =
+      e -> {
+        if (e instanceof CacheNotFoundException || e.getCause() instanceof CacheNotFoundException) {
+          return true;
+        }
+        if (!(e instanceof RetryException)
+            || !RemoteRetrierUtils.causedByStatus((RetryException) e, Code.FAILED_PRECONDITION)) {
+          return false;
+        }
+        com.google.rpc.Status status = StatusProto.fromThrowable(e);
+        if (status == null || status.getDetailsCount() == 0) {
+          return false;
+        }
+        for (Any details : status.getDetailsList()) {
+          PreconditionFailure f;
+          try {
+            f = details.unpack(PreconditionFailure.class);
+          } catch (InvalidProtocolBufferException protoEx) {
+            return false;
+          }
+          if (f.getViolationsCount() == 0) {
+            return false; // Generally shouldn't happen
+          }
+          for (Violation v : f.getViolationsList()) {
+            if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
+              return false;
+            }
+          }
+        }
+        return true; // if *all* > 0 violations have type MISSING
+      };
+
   @Override
   public void beforeCommand(CommandEnvironment env) throws AbruptExitException {
     env.getEventBus().register(this);
@@ -167,6 +209,7 @@
         logger = new LoggingInterceptor(rpcLogFile, env.getRuntime().getClock());
       }
 
+      final RemoteRetrier executeRetrier;
       final AbstractRemoteActionCache cache;
       if (enableBlobStoreCache) {
         Retrier retrier =
@@ -175,6 +218,7 @@
                 (e) -> false,
                 retryScheduler,
                 Retrier.ALLOW_ALL_CALLS);
+        executeRetrier = null;
         cache =
             new SimpleBlobStoreActionCache(
                 remoteOptions,
@@ -197,6 +241,7 @@
                 RemoteRetrier.RETRIABLE_GRPC_ERRORS,
                 retryScheduler,
                 Retrier.ALLOW_ALL_CALLS);
+        executeRetrier = createExecuteRetrier(remoteOptions, retryScheduler);
         cache =
             new GrpcRemoteCache(
                 ch,
@@ -205,6 +250,7 @@
                 retrier,
                 digestUtil);
       } else {
+        executeRetrier = null;
         cache = null;
       }
 
@@ -230,7 +276,7 @@
         executor = null;
       }
       actionContextProvider =
-          new RemoteActionContextProvider(env, cache, executor, digestUtil, logDir);
+          new RemoteActionContextProvider(env, cache, executor, executeRetrier, digestUtil, logDir);
     } catch (IOException e) {
       env.getReporter().handle(Event.error(e.getMessage()));
       env.getBlazeModuleEnvironment()
@@ -272,4 +318,15 @@
     return SimpleBlobStoreFactory.isRemoteCacheOptions(options)
         || GrpcRemoteCache.isRemoteCacheOptions(options);
   }
+
+  public static RemoteRetrier createExecuteRetrier(
+      RemoteOptions options, ListeningScheduledExecutorService retryService) {
+    return new RemoteRetrier(
+        options.experimentalRemoteRetry
+            ? () -> new Retrier.ZeroBackoff(options.experimentalRemoteRetryMaxAttempts)
+            : () -> Retrier.RETRIES_DISABLED,
+        RemoteModule.RETRIABLE_EXEC_ERRORS,
+        retryService,
+        Retrier.ALLOW_ALL_CALLS);
+  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
index 1198c2f..95511d0 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
@@ -88,6 +88,7 @@
   @Nullable private final Reporter cmdlineReporter;
   @Nullable private final AbstractRemoteActionCache remoteCache;
   @Nullable private final GrpcRemoteExecutor remoteExecutor;
+  @Nullable private final RemoteRetrier retrier;
   private final String buildRequestId;
   private final String commandId;
   private final DigestUtil digestUtil;
@@ -107,6 +108,7 @@
       String commandId,
       @Nullable AbstractRemoteActionCache remoteCache,
       @Nullable GrpcRemoteExecutor remoteExecutor,
+      @Nullable RemoteRetrier retrier,
       DigestUtil digestUtil,
       Path logDir) {
     this.execRoot = execRoot;
@@ -119,6 +121,7 @@
     this.cmdlineReporter = cmdlineReporter;
     this.buildRequestId = buildRequestId;
     this.commandId = commandId;
+    this.retrier = retrier;
     this.digestUtil = digestUtil;
     this.logDir = logDir;
   }
@@ -194,34 +197,26 @@
         return execLocally(spawn, context, inputMap, uploadLocalResults, remoteCache, actionKey);
       }
 
+      ExecuteRequest request =
+          ExecuteRequest.newBuilder()
+              .setInstanceName(remoteOptions.remoteInstanceName)
+              .setAction(action)
+              .setSkipCacheLookup(!acceptCachedResult)
+              .build();
       try {
-        // Upload the command and all the inputs into the remote cache.
-        remoteCache.ensureInputsPresent(repository, execRoot, inputRoot, command);
-      } catch (IOException e) {
-        return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e);
-      }
+        return retrier.execute(
+            () -> {
+              // Upload the command and all the inputs into the remote cache.
+              remoteCache.ensureInputsPresent(repository, execRoot, inputRoot, command);
 
-      final ActionResult result;
-      boolean remoteCacheHit = false;
-      try {
-        ExecuteRequest.Builder request =
-            ExecuteRequest.newBuilder()
-                .setInstanceName(remoteOptions.remoteInstanceName)
-                .setAction(action)
-                .setSkipCacheLookup(!acceptCachedResult);
-        ExecuteResponse reply = remoteExecutor.executeRemotely(request.build());
-        maybeDownloadServerLogs(reply, actionKey);
-        result = reply.getResult();
-        remoteCacheHit = reply.getCachedResult();
-      } catch (IOException e) {
-        return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e);
-      }
+              ExecuteResponse reply = remoteExecutor.executeRemotely(request);
+              maybeDownloadServerLogs(reply, actionKey);
 
-      try {
-        return downloadRemoteResults(result, context.getFileOutErr())
-            .setRunnerName(remoteCacheHit ? "remote cache hit" : getName())
-            .setCacheHit(remoteCacheHit)
-            .build();
+              return downloadRemoteResults(reply.getResult(), context.getFileOutErr())
+                  .setRunnerName(reply.getCachedResult() ? "remote cache hit" : getName())
+                  .setCacheHit(reply.getCachedResult())
+                  .build();
+            });
       } catch (IOException e) {
         return execLocallyOrFail(spawn, context, inputMap, actionKey, uploadLocalResults, e);
       }
diff --git a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
index a329c04..9d7228d 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java
@@ -185,6 +185,31 @@
         }
       };
 
+  /** No backoff. */
+  public static class ZeroBackoff implements Backoff {
+
+    private final int maxRetries;
+    private int retries;
+
+    public ZeroBackoff(int maxRetries) {
+      this.maxRetries = maxRetries;
+    }
+
+    @Override
+    public long nextDelayMillis() {
+      if (retries >= maxRetries) {
+        return -1;
+      }
+      retries++;
+      return 0;
+    }
+
+    @Override
+    public int getRetryAttempts() {
+      return retries;
+    }
+  }
+
   private final Supplier<Backoff> backoffSupplier;
   private final Predicate<? super Exception> shouldRetry;
   private final CircuitBreaker circuitBreaker;
diff --git a/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java b/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java
index 5c45d45..849259f 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutionClientTest.java
@@ -277,6 +277,7 @@
             "command-id",
             remoteCache,
             executor,
+            retrier,
             DIGEST_UTIL,
             logDir);
     inputDigest = fakeFileCache.createScratchInput(simpleSpawn.getInputFiles().get(0), "xyz");
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
index 283cc95..6b024b5 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
@@ -30,6 +30,8 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.eventbus.EventBus;
 import com.google.common.io.ByteStreams;
+import com.google.common.util.concurrent.ListeningScheduledExecutorService;
+import com.google.common.util.concurrent.MoreExecutors;
 import com.google.common.util.concurrent.SettableFuture;
 import com.google.devtools.build.lib.actions.ActionInput;
 import com.google.devtools.build.lib.actions.Artifact.ArtifactExpander;
@@ -79,7 +81,10 @@
 import java.time.Duration;
 import java.util.Collection;
 import java.util.SortedMap;
+import java.util.concurrent.Executors;
+import org.junit.AfterClass;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -94,6 +99,7 @@
 
   private static final ImmutableMap<String, String> NO_CACHE =
       ImmutableMap.of(ExecutionRequirements.NO_CACHE, "");
+  private static ListeningScheduledExecutorService retryService;
 
   private Path execRoot;
   private Path logDir;
@@ -101,6 +107,9 @@
   private FakeActionInputFileCache fakeFileCache;
   private FileOutErr outErr;
 
+  private RemoteOptions options;
+  private RemoteRetrier retrier;
+
   @Mock private AbstractRemoteActionCache cache;
 
   @Mock
@@ -113,6 +122,11 @@
   private final String simpleActionId =
       "eb45b20cc979d504f96b9efc9a08c48103c6f017afa09c0df5c70a5f92a98ea8";
 
+  @BeforeClass
+  public static void beforeEverything() {
+    retryService = MoreExecutors.listeningDecorator(Executors.newScheduledThreadPool(1));
+  }
+
   @Before
   public final void setUp() throws Exception {
     MockitoAnnotations.initMocks(this);
@@ -128,6 +142,14 @@
     FileSystemUtils.createDirectoryAndParents(stdout.getParentDirectory());
     FileSystemUtils.createDirectoryAndParents(stderr.getParentDirectory());
     outErr = new FileOutErr(stdout, stderr);
+
+    options = Options.getDefaults(RemoteOptions.class);
+    retrier = RemoteModule.createExecuteRetrier(options, retryService);
+  }
+
+  @AfterClass
+  public static void afterEverything() {
+    retryService.shutdownNow();
   }
 
   @Test
@@ -137,7 +159,6 @@
     // It should be executed remotely, but marked non-cacheable to remote execution, so that
     // the action result is not saved in the remote cache.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteAcceptCached = true;
     options.remoteLocalFallback = false;
     options.remoteUploadLocalResults = true;
@@ -154,6 +175,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -197,7 +219,6 @@
     // Test that if a spawn is executed locally, due to the local fallback, that its result is not
     // uploaded to the remote cache. However, the artifacts should still be uploaded.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteAcceptCached = true;
     options.remoteLocalFallback = true;
     options.remoteUploadLocalResults = true;
@@ -214,6 +235,7 @@
             "command-id",
             cache,
             null,
+            retrier,
             digestUtil,
             logDir);
 
@@ -252,7 +274,6 @@
     // Test that the outputs of a failed locally executed action are uploaded to a remote cache,
     // but the action result itself is not.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteUploadLocalResults = true;
 
     RemoteSpawnRunner runner =
@@ -268,6 +289,7 @@
                 "command-id",
                 cache,
                 null,
+                retrier,
                 digestUtil,
                 logDir));
 
@@ -318,6 +340,7 @@
                 "command-id",
                 cache,
                 null,
+                retrier,
                 digestUtil,
                 logDir));
 
@@ -334,7 +357,6 @@
   public void printWarningIfCacheIsDown() throws Exception {
     // If we try to upload to a local cache, that is down a warning should be printed.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteUploadLocalResults = true;
     options.remoteLocalFallback = true;
 
@@ -354,6 +376,7 @@
             "command-id",
             cache,
             null,
+            retrier,
             digestUtil,
             logDir);
 
@@ -396,7 +419,6 @@
   public void noRemoteExecutorFallbackFails() throws Exception {
     // Errors from the fallback runner should be propogated out of the remote runner.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteUploadLocalResults = true;
     options.remoteLocalFallback = true;
 
@@ -412,6 +434,7 @@
             "command-id",
             cache,
             null,
+            retrier,
             digestUtil,
             logDir);
 
@@ -437,7 +460,6 @@
   public void remoteCacheErrorFallbackFails() throws Exception {
     // Errors from the fallback runner should be propogated out of the remote runner.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteUploadLocalResults = true;
     options.remoteLocalFallback = true;
 
@@ -453,6 +475,7 @@
             "command-id",
             cache,
             null,
+            retrier,
             digestUtil,
             logDir);
 
@@ -476,7 +499,6 @@
 
   @Test
   public void testLocalFallbackFailureRemoteExecutorFailure() throws Exception {
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = true;
 
     RemoteSpawnRunner runner =
@@ -491,6 +513,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -527,6 +550,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -568,6 +592,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -612,6 +637,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -651,6 +677,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -680,8 +707,6 @@
   public void cacheDownloadFailureTriggersRemoteExecution() throws Exception {
     // If downloading a cached action fails, remote execution should be tried.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
-
     RemoteSpawnRunner runner =
         new RemoteSpawnRunner(
             execRoot,
@@ -694,6 +719,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -722,7 +748,6 @@
   public void testRemoteExecutionTimeout() throws Exception {
     // If remote execution times out the SpawnResult status should be TIMEOUT.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = false;
 
     RemoteSpawnRunner runner =
@@ -737,6 +762,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -771,7 +797,6 @@
     // If remote execution times out the SpawnResult status should be TIMEOUT, regardess of local
     // fallback option.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = true;
 
     RemoteSpawnRunner runner =
@@ -786,6 +811,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -818,7 +844,6 @@
 
   @Test
   public void testRemoteExecutionCommandFailureDoesNotTriggerFallback() throws Exception {
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = true;
 
     RemoteSpawnRunner runner =
@@ -833,6 +858,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -860,7 +886,6 @@
     // If we get a failure due to the remote cache not working, the exit code should be
     // ExitCode.REMOTE_ERROR.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = false;
 
     RemoteSpawnRunner runner =
@@ -875,6 +900,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -898,7 +924,6 @@
     // If we get a failure due to the remote executor not working, the exit code should be
     // ExitCode.REMOTE_ERROR.
 
-    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
     options.remoteLocalFallback = false;
 
     RemoteSpawnRunner runner =
@@ -913,6 +938,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
@@ -947,6 +973,7 @@
             "command-id",
             cache,
             executor,
+            retrier,
             digestUtil,
             logDir);
 
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
index 624d074..7dde0c6 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java
@@ -28,6 +28,7 @@
 import com.google.devtools.build.lib.remote.Retrier.CircuitBreaker.State;
 import com.google.devtools.build.lib.remote.Retrier.CircuitBreakerException;
 import com.google.devtools.build.lib.remote.Retrier.RetryException;
+import com.google.devtools.build.lib.remote.Retrier.ZeroBackoff;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -318,28 +319,4 @@
       state = State.TRIAL_CALL;
     }
   }
-
-  private static class ZeroBackoff implements Backoff {
-
-    private final int maxRetries;
-    private int retries;
-
-    public ZeroBackoff(int maxRetries) {
-      this.maxRetries = maxRetries;
-    }
-
-    @Override
-    public long nextDelayMillis() {
-      if (retries >= maxRetries) {
-        return -1;
-      }
-      retries++;
-      return 0;
-    }
-
-    @Override
-    public int getRetryAttempts() {
-      return retries;
-    }
-  }
 }