Mitigate unordered dirtying of rewound nodes by allowing repeats

When a Skyframe node is restarted, and it dirties some of its transitive
dependencies, it does not dirty those dependencies as a single atomic
action, nor does it dirty them in a reverse topological order (i.e.
starting from the leaves of the dirtied subgraph and proceeding to the
subgraph's root(s)).

This leaves rewinding vulnerable to a race where e.g. a sibling action
requests the to-be-dirtied subgraph's root artifact node (corresponding
to the lost input) after it's been dirtied, but before its child
generating action node has been dirtied.

That artifact node may then evaluate to completion, and become done,
before its child is dirtied. Then, when the failed action node (which
requested the rewinding) restarts, and requests its inputs, it will find
the lost input's node is done and proceed with its evaluation, despite
that its lost input's generating action has yet to run.

A proper fix will ensure that no node in a rewinding's to-be-dirtied
subgraph is scheduled for evaluation before its transitive deps in that
subgraph are dirtied.

Drive-by limiting of action toString length in rewinding log statements.

RELNOTES: None.
PiperOrigin-RevId: 235625329
diff --git a/src/main/java/com/google/devtools/build/lib/skyframe/ActionRewindStrategy.java b/src/main/java/com/google/devtools/build/lib/skyframe/ActionRewindStrategy.java
index 9cdee97..e004a2a 100644
--- a/src/main/java/com/google/devtools/build/lib/skyframe/ActionRewindStrategy.java
+++ b/src/main/java/com/google/devtools/build/lib/skyframe/ActionRewindStrategy.java
@@ -15,13 +15,14 @@
 package com.google.devtools.build.lib.skyframe;
 
 import com.google.auto.value.AutoValue;
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ConcurrentHashMultiset;
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
 import com.google.devtools.build.lib.actions.Action;
 import com.google.devtools.build.lib.actions.ActionExecutionException;
 import com.google.devtools.build.lib.actions.ActionInput;
@@ -49,11 +50,13 @@
  */
 public class ActionRewindStrategy {
   private static final Logger logger = Logger.getLogger(ActionRewindStrategy.class.getName());
+  @VisibleForTesting public static final int MAX_REPEATED_LOST_INPUTS = 20;
 
   // Note that this reference is mutated only outside of Skyframe evaluations, and accessed only
   // inside of them. Its visibility piggybacks on Skyframe evaluation synchronizations, like
   // ActionExecutionFunction's stateMap does.
-  private Set<LostInputRecord> lostInputRecords = Sets.newConcurrentHashSet();
+  private ConcurrentHashMultiset<LostInputRecord> lostInputRecords =
+      ConcurrentHashMultiset.create();
 
   /**
    * Returns a {@link RewindPlan} specifying:
@@ -81,7 +84,7 @@
       ActionInputDepOwners runfilesDepOwners,
       Environment env)
       throws ActionExecutionException, InterruptedException {
-    checkIfActionLostInputTwice(actionLookupData, failedAction, lostInputsException);
+    checkIfActionLostInputTooManyTimes(actionLookupData, failedAction, lostInputsException);
 
     ImmutableList<ActionInput> lostInputs = lostInputsException.getLostInputs().values().asList();
 
@@ -128,34 +131,47 @@
 
   /** Clear the history of failed actions' lost inputs. */
   void reset() {
-    lostInputRecords = Sets.newConcurrentHashSet();
+    lostInputRecords = ConcurrentHashMultiset.create();
   }
 
-  private void checkIfActionLostInputTwice(
+  private void checkIfActionLostInputTooManyTimes(
       ActionLookupData actionLookupData,
       Action failedAction,
       LostInputsActionExecutionException lostInputsException)
       throws ActionExecutionException {
     ImmutableMap<String, ActionInput> lostInputsByDigest = lostInputsException.getLostInputs();
     for (String digest : lostInputsByDigest.keySet()) {
-      // The same action losing the same input twice is unexpected. The action should have waited
-      // until the depended-on action which generates the lost input is (re)run before trying
-      // again.
+      // The same action losing the same input more than once is unexpected [*]. The action should
+      // have waited until the depended-on action which generates the lost input is (re)run before
+      // trying again.
       //
       // Note that we could enforce a stronger check: if action A, which depends on an input N
       // previously detected as lost (by any action, not just A), discovers that N is still lost,
       // and action A started after the re-evaluation of N's generating action, then something has
       // gone wrong. Administering that check would be more complex (e.g., the start/completion
       // times of actions would need tracking), so we punt on it for now.
-      if (!lostInputRecords.add(LostInputRecord.create(actionLookupData, digest))) {
+      //
+      // [*], TODO(b/123993876): To mitigate a race condition (believed to be) caused by
+      // non-topological Skyframe dirtying of depended-on nodes, this check fails the build only if
+      // the same input is repeatedly lost.
+      int priorLosses =
+          lostInputRecords.add(
+              LostInputRecord.create(actionLookupData, digest), /*occurrences=*/ 1);
+      if (MAX_REPEATED_LOST_INPUTS <= priorLosses) {
         BugReport.sendBugReport(
             new IllegalStateException(
                 String.format(
-                    "lost input twice for the same action. lostInput: %s, lostInput digest: %s, "
-                        + "failedAction: %s",
-                    lostInputsByDigest.get(digest), digest, failedAction)));
+                    "lost input too many times (#%s) for the same action. lostInput: %s, "
+                        + "lostInput digest: %s, failedAction: %.10000s",
+                    priorLosses + 1, lostInputsByDigest.get(digest), digest, failedAction)));
         throw new ActionExecutionException(
             lostInputsException, failedAction, /*catastrophe=*/ false);
+      } else if (0 < priorLosses) {
+        logger.info(
+            String.format(
+                "lost input again (#%s) for the same action. lostInput: %s, "
+                    + "lostInput digest: %s, failedAction: %.10000s",
+                priorLosses + 1, lostInputsByDigest.get(digest), digest, failedAction));
       }
     }
   }
@@ -173,7 +189,7 @@
       logger.info(
           String.format(
               "lostArtifact unexpectedly source. lostArtifact: %s, lostInputs for artifact: %s, "
-                  + "failedAction: %s",
+                  + "failedAction: %.10000s",
               lostArtifact, associatedLostInputs, failedAction));
       // Launder the LostInputs exception as a plain ActionExecutionException so that it may be
       // processed by SkyframeActionExecutor without short-circuiting.
@@ -210,7 +226,7 @@
         Preconditions.checkState(
             lostInput instanceof Artifact,
             "unexpected non-artifact lostInput which is a dep of the current action. "
-                + "lostInput: %s, failedAction: %s",
+                + "lostInput: %s, failedAction: %.10000s",
             lostInput,
             failedActionForLogging);
         lostInputsByDepOwners.put((Artifact) lostInput, lostInput);
@@ -256,7 +272,7 @@
           String.format(
               "lostInput not a dep of the failed action, and can't be associated with such a dep. "
                   + "lostInput: %s, owner: %s, runfilesDepOwner: %s, runfilesDepTransitiveOwner: %s"
-                  + ", failedAction: %s",
+                  + ", failedAction: %.10000s",
               lostInput,
               owner,
               runfilesDepOwner,