// blob: bcca7c7eecd6f0dd89531b3b5279a7910bc48bef [file] [log] [blame]
// Copyright 2018 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.skyframe;
import com.google.auto.value.AutoValue;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.devtools.build.lib.actions.Action;
import com.google.devtools.build.lib.actions.ActionExecutionException;
import com.google.devtools.build.lib.actions.ActionInput;
import com.google.devtools.build.lib.actions.ActionInputDepOwners;
import com.google.devtools.build.lib.actions.ActionLookupData;
import com.google.devtools.build.lib.actions.Artifact;
import com.google.devtools.build.lib.actions.LostInputsExecException;
import com.google.devtools.build.lib.actions.LostInputsExecException.LostInputsActionExecutionException;
import com.google.devtools.build.lib.bugreport.BugReport;
import com.google.devtools.build.skyframe.SkyFunction.Environment;
import com.google.devtools.build.skyframe.SkyFunction.Restart;
import com.google.devtools.build.skyframe.SkyKey;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import javax.annotation.Nullable;
/**
* Given an action that failed to execute because of lost inputs which were generated by other
* actions, this finds the actions which generated them and the set of Skyframe nodes which must be
* restarted in order to recreate the lost inputs.
*/
public class ActionRewindStrategy {
// Used to log lost inputs that cannot be handled by ordinary rewinding (e.g. a lost source
// artifact, or a lost input that cannot be associated with a direct dep of the failed action).
private static final Logger logger = Logger.getLogger(ActionRewindStrategy.class.getName());
// The (failed action, lost input digest) pairs observed so far. checkIfActionLostInputTwice adds
// to this set and treats a failed add as the same action losing the same input again; reset()
// replaces the set with a fresh, empty one.
//
// Note that this reference is mutated only outside of Skyframe evaluations, and accessed only
// inside of them. Its visibility piggybacks on Skyframe evaluation synchronizations, like
// ActionExecutionFunction's stateMap does.
private Set<LostInputRecord> lostInputRecords = Sets.newConcurrentHashSet();
/**
 * Builds the {@link RewindPlan} for an action whose execution failed because inputs were lost.
 *
 * <p>The plan specifies:
 *
 * <ol>
 *   <li>the Skyframe nodes to restart to recreate the lost inputs specified by {@code
 *       lostInputsException}
 *   <li>the actions whose execution state (in {@link SkyframeActionExecutor}) must be reset
 *       (aside from failedAction, which the caller already knows must be reset)
 * </ol>
 *
 * <p>Every Skyframe node between the failed action's node and the nodes of the actions which
 * create the lost inputs, inclusive, has to be restarted. That way, reevaluating the failed
 * action's node also reevaluates the nodes which will recreate the lost inputs.
 *
 * @throws ActionExecutionException if this action has already reported one of these lost inputs
 *     (by digest) as lost, or if a lost input is attributed to a source artifact, which cannot
 *     be regenerated by rerunning an action
 */
RewindPlan getRewindPlan(
    Action failedAction,
    ActionLookupData actionLookupData,
    Iterable<? extends SkyKey> failedActionDeps,
    LostInputsActionExecutionException lostInputsException,
    ActionInputDepOwners runfilesDepOwners,
    Environment env)
    throws ActionExecutionException, InterruptedException {
  // Bail out before doing any planning if this action already lost one of these inputs.
  checkIfActionLostInputTwice(actionLookupData, failedAction, lostInputsException);

  // Associate each lost input with the direct dep of the failed action that accounts for it.
  HashMultimap<Artifact, ActionInput> lostInputsByOwningDep =
      getLostInputsByDepOwners(
          lostInputsException.getLostInputs().values().asList(),
          lostInputsException.getInputOwners(),
          runfilesDepOwners,
          ImmutableSet.copyOf(failedActionDeps),
          failedAction);

  // The Skyframe nodes which must be restarted.
  HashSet<SkyKey> nodesToRestart = new HashSet<>();
  // SkyframeActionExecutor has to re-execute the restarted actions, so it must be told to evict
  // its cached results for them. The failed action itself is not collected here because the
  // caller already knows that it must be restarted.
  ImmutableList.Builder<Action> actionsToReset = ImmutableList.builder();

  for (Map.Entry<Artifact, Collection<ActionInput>> ownerAndInputs :
      lostInputsByOwningDep.asMap().entrySet()) {
    Artifact owningDep = ownerAndInputs.getKey();
    checkIfLostArtifactIsSource(
        owningDep, failedAction, lostInputsException, ownerAndInputs.getValue());

    // The artifact's own node is restarted regardless of what is learned about its actions.
    nodesToRestart.add(owningDep);

    Map<ActionLookupData, Action> generatingActions = getActionsForLostArtifact(owningDep, env);
    if (generatingActions != null) {
      ImmutableList<Action> newlyNotedActions =
          noteDepsAndActionsToRestartAndGetActionsToCheck(
              generatingActions, nodesToRestart, actionsToReset);
      checkActions(newlyNotedActions, env, nodesToRestart, actionsToReset);
    }
    // Otherwise some deps of the artifact are not done: another rewind must be in-flight, and
    // there is no need to restart the shared deps twice.
  }

  Restart restart = Restart.selfAnd(ImmutableList.copyOf(nodesToRestart));
  return new RewindPlan(restart, actionsToReset.build());
}
/**
 * Forgets every lost input recorded for previously failed actions by swapping in a new, empty
 * concurrent set.
 */
void reset() {
  this.lostInputRecords = Sets.newConcurrentHashSet();
}
/**
 * Records each lost input's digest against the failed action, and fails the action if any of
 * them was already recorded for it.
 *
 * <p>The same action losing the same input twice is unexpected: the action should have waited
 * until the depended-on action which generates the lost input was (re)run before trying again.
 *
 * <p>A stronger check is conceivable. If action A depends on an input N which was previously
 * detected as lost (by any action, not just A), finds N still lost, and started after the
 * re-evaluation of N's generating action, then something has gone wrong. Enforcing that would
 * need more bookkeeping (e.g., tracking the start/completion times of actions), so it is not
 * attempted here.
 *
 * @throws ActionExecutionException wrapping {@code lostInputsException}, after a bug report has
 *     been sent, on the first digest found to have been recorded before for this action
 */
private void checkIfActionLostInputTwice(
    ActionLookupData actionLookupData,
    Action failedAction,
    LostInputsActionExecutionException lostInputsException)
    throws ActionExecutionException {
  for (Map.Entry<String, ActionInput> lost : lostInputsException.getLostInputs().entrySet()) {
    String digest = lost.getKey();
    boolean firstTimeLost =
        lostInputRecords.add(LostInputRecord.create(actionLookupData, digest));
    if (firstTimeLost) {
      continue;
    }
    String message =
        String.format(
            "lost input twice for the same action. lostInput: %s, lostInput digest: %s, "
                + "failedAction: %s",
            lost.getValue(), digest, failedAction);
    BugReport.sendBugReport(new IllegalStateException(message), ImmutableList.of());
    throw new ActionExecutionException(
        lostInputsException, failedAction, /*catastrophe=*/ false);
  }
}
/**
 * Fails the action if {@code lostArtifact} is a source artifact; does nothing otherwise.
 *
 * <p>Source artifacts cannot be rewound. They should not be losable, but their apparent loss
 * (say, from some kind of infrastructure failure) is tolerated by failing the build instead of
 * crashing.
 *
 * @throws ActionExecutionException wrapping {@code lostInputsException} if {@code lostArtifact}
 *     is a source artifact
 */
private void checkIfLostArtifactIsSource(
    Artifact lostArtifact,
    Action failedAction,
    LostInputsActionExecutionException lostInputsException,
    Collection<ActionInput> associatedLostInputs)
    throws ActionExecutionException {
  if (!lostArtifact.isSourceArtifact()) {
    return;
  }
  String description =
      String.format(
          "lostArtifact unexpectedly source. lostArtifact: %s, lostInputs for artifact: %s, "
              + "failedAction: %s",
          lostArtifact, associatedLostInputs, failedAction);
  logger.info(description);
  // The LostInputs exception is rethrown as a plain ActionExecutionException so that
  // SkyframeActionExecutor may process it without short-circuiting.
  throw new ActionExecutionException(lostInputsException, failedAction, /*catastrophe=*/ false);
}
/**
 * Adds every key of {@code actionMap} to {@code depsToRestart}. For each key that was not
 * already present there, the corresponding action is appended to {@code
 * additionalActionsToRestart} and included in the returned list.
 *
 * @return the actions noted for the first time by this call, in {@code actionMap}'s iteration
 *     order
 */
private ImmutableList<Action> noteDepsAndActionsToRestartAndGetActionsToCheck(
    Map<ActionLookupData, Action> actionMap,
    Set<SkyKey> depsToRestart,
    ImmutableList.Builder<Action> additionalActionsToRestart) {
  ImmutableList.Builder<Action> firstTimeNoted =
      ImmutableList.builderWithExpectedSize(actionMap.size());
  for (Map.Entry<ActionLookupData, Action> keyAndAction : actionMap.entrySet()) {
    boolean alreadyNoted = !depsToRestart.add(keyAndAction.getKey());
    if (alreadyNoted) {
      continue;
    }
    Action newlyNoted = keyAndAction.getValue();
    additionalActionsToRestart.add(newlyNoted);
    firstTimeNoted.add(newlyNoted);
  }
  return firstTimeNoted.build();
}
/**
 * Groups {@code lostInputs} by the direct dep of the failed action that accounts for each one.
 *
 * <p>For every lost input these candidates are tried in order, and the first one contained in
 * {@code failedActionDeps} becomes the key under which the lost input is stored:
 *
 * <ol>
 *   <li>the lost input itself
 *   <li>its owner according to {@code inputOwners}
 *   <li>its dep owner according to {@code runfilesDepOwners}
 *   <li>the dep owner, according to {@code runfilesDepOwners}, of its owner
 * </ol>
 *
 * <p>A lost input for which no candidate matches is logged and omitted from the result.
 */
private HashMultimap<Artifact, ActionInput> getLostInputsByDepOwners(
    ImmutableList<ActionInput> lostInputs,
    LostInputsExecException.InputOwners inputOwners,
    ActionInputDepOwners runfilesDepOwners,
    ImmutableSet<SkyKey> failedActionDeps,
    Action failedActionForLogging) {
  HashMultimap<Artifact, ActionInput> grouped = HashMultimap.create();
  for (ActionInput lostInput : lostInputs) {
    if (failedActionDeps.contains(lostInput)) {
      // The lost input is itself a direct dep of the action.
      Preconditions.checkState(
          lostInput instanceof Artifact,
          "unexpected non-artifact lostInput which is a dep of the current action. "
              + "lostInput: %s, failedAction: %s",
          lostInput,
          failedActionForLogging);
      grouped.put((Artifact) lostInput, lostInput);
      continue;
    }

    Artifact owner = inputOwners.getOwner(lostInput);
    Artifact runfilesDepOwner = null;
    Artifact runfilesDepTransitiveOwner = null;
    Artifact responsibleDep = null;
    if (owner != null && failedActionDeps.contains(owner)) {
      // The lost input is included in a tree artifact or fileset that the action directly
      // depends on.
      responsibleDep = owner;
    } else {
      runfilesDepOwner = runfilesDepOwners.getDepOwner(lostInput);
      if (runfilesDepOwner != null && failedActionDeps.contains(runfilesDepOwner)) {
        // The lost input is included in a runfiles middleman that the action directly depends
        // on.
        responsibleDep = runfilesDepOwner;
      } else if (owner != null) {
        runfilesDepTransitiveOwner = runfilesDepOwners.getDepOwner(owner);
        if (runfilesDepTransitiveOwner != null
            && failedActionDeps.contains(runfilesDepTransitiveOwner)) {
          // The lost input is included in a tree artifact or fileset which is included in a
          // runfiles middleman that the action directly depends on.
          responsibleDep = runfilesDepTransitiveOwner;
        }
      }
    }

    if (responsibleDep != null) {
      grouped.put(responsibleDep, lostInput);
      continue;
    }

    // Rewinding can't do anything about a lost input that can't be associated with a direct dep
    // of the failed action. This may happen if the action consists of a sequence of spawns
    // where an output generated by one spawn is consumed by another but was lost in-between. In
    // that case, reevaluating the failed action (and no other deps) may help, because doing so
    // may rerun the generating spawn.
    //
    // In other cases, such as with bugs, the action losing the same input a second time makes
    // checkIfActionLostInputTwice send a bug report and fail the action. The occurrence is
    // logged here.
    logger.info(
        String.format(
            "lostInput not a dep of the failed action, and can't be associated with such a dep. "
                + "lostInput: %s, owner: %s, runfilesDepOwner: %s, runfilesDepTransitiveOwner: %s"
                + ", failedAction: %s",
            lostInput,
            owner,
            runfilesDepOwner,
            runfilesDepTransitiveOwner,
            failedActionForLogging));
  }
  return grouped;
}
/**
 * Examines each action in {@code actionsToCheck} to decide whether further artifacts, actions,
 * and (for {@link SkyframeAwareAction}s) other Skyframe nodes must be restarted as well. Actions
 * discovered along the way are queued and examined in turn, so the check is transitive.
 *
 * <p>Both {@code depsToRestart} and {@code additionalActionsToRestart} are added to in place.
 */
private void checkActions(
    ImmutableList<Action> actionsToCheck,
    Environment env,
    HashSet<SkyKey> depsToRestart,
    ImmutableList.Builder<Action> additionalActionsToRestart)
    throws InterruptedException {
  ArrayDeque<Action> pending = new ArrayDeque<>(actionsToCheck);
  for (Action current = pending.pollFirst(); current != null; current = pending.pollFirst()) {
    if (current instanceof SkyframeAwareAction) {
      SkyframeAwareAction skyframeAware = (SkyframeAwareAction) current;
      depsToRestart.addAll(skyframeAware.getSkyframeDependenciesForRewinding());
    }

    if (!current.mayInsensitivelyPropagateInputs()) {
      continue;
    }

    // Restarting this action alone will not recreate the missing input. Its non-source inputs,
    // and the actions which created those inputs, have to be restarted too.
    //
    // The artifacts returned by Action#getAllowedDerivedInputs need no consideration, because
    // these two sets have no overlap:
    //   1) the set of actions with non-throwing implementations of getAllowedDerivedInputs
    //   2) the set of actions that "mayInsensitivelyPropagateInputs", plus SkyframeAwareActions
    for (Artifact input : current.getInputs()) {
      if (input.isSourceArtifact()) {
        continue;
      }
      // Restarting every derived input of a propagating action is overkill. Ideally only the
      // inputs corresponding to the known lost outputs would be restarted; the information
      // needed for that is probably present in the ActionInputs held by getRewindPlan's
      // lostInputsByDepOwners.
      //
      // Rewinding is expected to be rare, so refining this may not be necessary.
      depsToRestart.add(input);
      Map<ActionLookupData, Action> generatingActions = getActionsForLostArtifact(input, env);
      if (generatingActions != null) {
        pending.addAll(
            noteDepsAndActionsToRestartAndGetActionsToCheck(
                generatingActions, depsToRestart, additionalActionsToRestart));
      }
    }
  }
}
/**
 * Maps each of {@code lostInput}'s action execution keys to the action it identifies.
 *
 * @return the key-to-action map, or {@code null} if {@link #getActionExecutionDeps} returned
 *     {@code null} for {@code lostInput}
 */
@Nullable
private Map<ActionLookupData, Action> getActionsForLostArtifact(
    Artifact lostInput, Environment env) throws InterruptedException {
  Set<ActionLookupData> executionKeys = getActionExecutionDeps(lostInput, env);
  if (executionKeys == null) {
    return null;
  }
  Map<ActionLookupData, Action> actionsByKey =
      Maps.newHashMapWithExpectedSize(executionKeys.size());
  for (ActionLookupData executionKey : executionKeys) {
    Action generatingAction = ActionExecutionFunction.getActionForLookupData(env, executionKey);
    actionsByKey.put(executionKey, generatingAction);
  }
  return actionsByKey;
}
/**
 * Finds the action execution keys that {@code lostInput} depends on during the execution phase.
 *
 * @return the set of keys, or {@code null} if any of those dependencies are not done
 */
@Nullable
private Set<ActionLookupData> getActionExecutionDeps(Artifact lostInput, Environment env)
    throws InterruptedException {
  ArtifactFunction.ArtifactDependencies dependencies =
      ArtifactFunction.ArtifactDependencies.discoverDependencies(lostInput, env);
  if (dependencies == null) {
    return null;
  }

  if (!dependencies.isTemplateActionForTreeArtifact()) {
    // The non-template case: a single action execution key.
    return ImmutableSet.of(dependencies.getNontemplateActionExecutionKey());
  }

  ArtifactFunction.ActionTemplateExpansion expansion =
      dependencies.getActionTemplateExpansion(env);
  if (expansion == null) {
    return null;
  }
  // The template artifact's ActionTemplateExpansionKey dependency is deliberately left out,
  // because rewinding it is never expected to be necessary.
  return ImmutableSet.copyOf(expansion.getExpandedActionExecutionKeys());
}
/**
 * The result of {@link ActionRewindStrategy#getRewindPlan}: the Skyframe nodes to restart,
 * together with the actions (other than the failed one) that are being restarted.
 */
static class RewindPlan {
  private final Restart nodesToRestart;
  private final ImmutableList<Action> additionalActionsToRestart;

  /** Creates a plan from the given restart request and list of additional actions. */
  RewindPlan(Restart nodesToRestart, ImmutableList<Action> additionalActionsToRestart) {
    this.nodesToRestart = nodesToRestart;
    this.additionalActionsToRestart = additionalActionsToRestart;
  }

  /** Returns the {@link Restart} naming the Skyframe nodes to restart. */
  Restart getNodesToRestart() {
    return this.nodesToRestart;
  }

  /** Returns the actions, other than the failed action, that are being restarted. */
  ImmutableList<Action> getAdditionalActionsToRestart() {
    return this.additionalActionsToRestart;
  }
}
/**
* A record indicating that a Skyframe action execution failed because it lost an input with the
* specified digest.
*
* <p>Records are kept in the enclosing class's {@code lostInputRecords} set, where a failed
* {@code add} signals that the same action lost the same input before. That relies on the
* value-based {@code equals}/{@code hashCode} which {@link AutoValue} generates.
*/
@AutoValue
abstract static class LostInputRecord {
/** The lookup data identifying the action whose execution failed. */
abstract ActionLookupData failedActionLookupData();
/** The digest of the input which that action lost. */
abstract String lostInputDigest();
/** Creates a record pairing the failed action's lookup data with the lost input's digest. */
static LostInputRecord create(ActionLookupData failedActionLookupData, String lostInputDigest) {
return new AutoValue_ActionRewindStrategy_LostInputRecord(
failedActionLookupData, lostInputDigest);
}
}
}