Add the --experimental_local_retries_on_crash option.

This is intended to be used with --experimental_local_lockfree_output,
which causes many more local processes to be eagerly terminated, and is
an attempt to work around an odd OSXFUSE bug that introduces data corruption
on reads, for which we have neither a diagnosis nor a fix yet.

Part of #7818.

RELNOTES: None.
PiperOrigin-RevId: 315485365
diff --git a/src/main/java/com/google/devtools/build/lib/exec/local/LocalExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/local/LocalExecutionOptions.java
index f404845..e436223 100644
--- a/src/main/java/com/google/devtools/build/lib/exec/local/LocalExecutionOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/exec/local/LocalExecutionOptions.java
@@ -90,6 +90,19 @@
       help = "Helper to roll out the process-wrapper's --wait_fix bug fix in a controlled manner.")
   public boolean processWrapperWaitFix;
 
+  @Option(
+      name = "experimental_local_retries_on_crash",
+      defaultValue = "0",
+      documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
+      effectTags = {OptionEffectTag.EXECUTION},
+      help =
+          "Number of times to retry a local action when we detect that it crashed. This exists "
+              + "to workaround a bug in OSXFUSE which is tickled by the use of the dynamic "
+              + "scheduler and --experimental_local_lockfree_output due to constant process "
+              + "churn. The bug can be triggered by a cancelled process that ran *before* the "
+              + "process we are trying to run, introducing corruption in its file reads.")
+  public int localRetriesOnCrash;
+
   public Duration getLocalSigkillGraceSeconds() {
     // TODO(ulfjack): Change localSigkillGraceSeconds type to Duration.
     return Duration.ofSeconds(localSigkillGraceSeconds);
diff --git a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
index 3cf1e63..675ae40 100644
--- a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
+++ b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
@@ -187,6 +187,32 @@
     }
 
     public SpawnResult run() throws InterruptedException, IOException {
+      // A non-positive retry count means no retries; a negative one must not retry forever.
+      if (localExecutionOptions.localRetriesOnCrash <= 0) {
+        return runOnce();
+      }
+      int attempts = 0;
+      while (true) {
+        // Assume that any exception from runOnce() comes from the Java side of things, not the
+        // subprocess, so let it bubble up on first occurrence. In particular, this must hold for
+        // InterruptedException so that the dynamic scheduler can stop us quickly.
+        SpawnResult result = runOnce();
+        if (attempts >= localExecutionOptions.localRetriesOnCrash
+            || !TerminationStatus.crashed(result.exitCode())) {
+          return result;
+        }
+        stepLog(
+            SEVERE,
+            "Retrying crashed subprocess due to exit code %s (attempt %s)",
+            result.exitCode(),
+            attempts);
+        // Linear backoff, no delay before the first retry; long math avoids int overflow.
+        Thread.sleep(attempts * 1000L);
+        attempts++;
+      }
+    }
+
+    private SpawnResult runOnce() throws InterruptedException, IOException {
       try {
         return start();
       } catch (InterruptedException | InterruptedIOException e) {
diff --git a/src/main/java/com/google/devtools/build/lib/shell/TerminationStatus.java b/src/main/java/com/google/devtools/build/lib/shell/TerminationStatus.java
index 7c520e4..009cb85 100644
--- a/src/main/java/com/google/devtools/build/lib/shell/TerminationStatus.java
+++ b/src/main/java/com/google/devtools/build/lib/shell/TerminationStatus.java
@@ -14,6 +14,7 @@
 
 package com.google.devtools.build.lib.shell;
 
+import com.google.common.annotations.VisibleForTesting;
 import java.time.Duration;
 import java.util.Optional;
 
@@ -133,12 +134,26 @@
   // that waitResult is the exit status when the process returns normally, or
   // 128+signalnumber when the process is terminated by a signal.  We further
   // assume that value signal numbers fall in the interval [1, 63].
-  private static final int SIGNAL_1  = 128 + 1;
-  private static final int SIGNAL_63 = 128 + 63;
+  @VisibleForTesting static final int SIGNAL_1 = 128 + 1;
+  @VisibleForTesting static final int SIGNAL_63 = 128 + 63;
+  @VisibleForTesting static final int SIGNAL_SIGABRT = 128 + 6;
+  @VisibleForTesting static final int SIGNAL_SIGKILL = 128 + 9;
+  @VisibleForTesting static final int SIGNAL_SIGBUS = 128 + 10;
+  @VisibleForTesting static final int SIGNAL_SIGTERM = 128 + 15;
 
   /**
-   * Returns true iff the process exited normally.
+   * Returns true if the raw exit status is {@code SIGNAL_SIGABRT} (128 + 6) or {@code
+   * SIGNAL_SIGBUS} (128 + 10). NOTE(review): signal 10 is SIGBUS on macOS/BSD but SIGUSR1 on
+   * Linux, so this presumably targets macOS only; confirm.
+   *
+   * <p>This is static and takes a raw status because that is all the information available at its
+   * single use; passing a {@link TerminationStatus} there would be costly. Reevaluate if reused.
    */
+  public static boolean crashed(int rawStatus) {
+    return rawStatus == SIGNAL_SIGABRT || rawStatus == SIGNAL_SIGBUS;
+  }
+
+  /** Returns true iff the process exited normally. */
   public boolean exited() {
     return !timedOut && (waitResult < SIGNAL_1 || waitResult > SIGNAL_63);
   }
diff --git a/src/test/java/com/google/devtools/build/lib/shell/TerminationStatusTest.java b/src/test/java/com/google/devtools/build/lib/shell/TerminationStatusTest.java
index fbbd238..7eb77eb 100644
--- a/src/test/java/com/google/devtools/build/lib/shell/TerminationStatusTest.java
+++ b/src/test/java/com/google/devtools/build/lib/shell/TerminationStatusTest.java
@@ -14,6 +14,7 @@
 
 package com.google.devtools.build.lib.shell;
 
+import static com.google.common.truth.Truth.assertThat;
 import static com.google.common.truth.Truth8.assertThat;
 import static org.junit.Assert.assertThrows;
 
@@ -27,6 +28,27 @@
 public final class TerminationStatusTest {
 
   @Test
+  public void testCrashed_exitCodesReturnFalse() {
+    assertThat(TerminationStatus.crashed(0)).isFalse();
+    assertThat(TerminationStatus.crashed(1)).isFalse();
+    assertThat(TerminationStatus.crashed(127)).isFalse();
+  }
+
+  @Test
+  public void testCrashed_terminationSignalsReturnFalse() {
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_1)).isFalse();
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_63)).isFalse();
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_SIGKILL)).isFalse();
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_SIGTERM)).isFalse();
+  }
+
+  @Test
+  public void testCrashed_abruptSignalsReturnTrue() {
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_SIGABRT)).isTrue();
+    assertThat(TerminationStatus.crashed(TerminationStatus.SIGNAL_SIGBUS)).isTrue();
+  }
+
+  @Test
   public void testBuilder_WithNoWaitResponse() {
     assertThrows(
         IllegalStateException.class, () -> TerminationStatus.builder().setTimedOut(false).build());