Wipe tree artifact directories when cancelling a local spawn.
When we enable dynamic execution without sandboxing, the local spawn is
allowed to run until the remote spawn is done executing (only when
--experimental_local_lockfree_output is given) and we are ready to
download its outputs. At that point, we cancel the local spawn and then
proceed to download the remote outputs.
This is fine in the general case but breaks with tree artifacts: the
local spawn may create extra files in the output directory during its
execution (think temporary files) that are deleted immediately by the
spawn itself before exiting. When running remotely, these files never
become part of the tree artifact manifest, but when running locally,
these files will stay behind if the local spawn is interrupted. Once
this happens, we fail tree artifact metadata validation later on because
we encounter undeclared files on disk.
To resolve this, clean up any files written by the local spawn as part
of its interrupt.
RELNOTES: None.
PiperOrigin-RevId: 275441715
diff --git a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
index 707684e..2bb1676 100644
--- a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
+++ b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java
@@ -22,6 +22,7 @@
import com.google.common.base.Preconditions;
import com.google.devtools.build.lib.actions.ActionExecutionMetadata;
import com.google.devtools.build.lib.actions.ActionInput;
+import com.google.devtools.build.lib.actions.Artifact;
import com.google.devtools.build.lib.actions.CommandLines.ParamFileActionInput;
import com.google.devtools.build.lib.actions.ExecException;
import com.google.devtools.build.lib.actions.ResourceManager;
@@ -196,6 +197,12 @@
private State currentState = State.INITIALIZING;
private final Map<State, Long> stateTimes = new EnumMap<>(State.class);
+ /**
+ * If true, the local subprocess has already started, which means we need to clean up the output
+ * tree once we get interrupted.
+ */
+ private boolean needCleanup = false;
+
private final int id;
public SubprocessHandler(Spawn spawn, SpawnExecutionContext context) {
@@ -209,6 +216,12 @@
public SpawnResult run() throws InterruptedException, IOException {
try {
return start();
+ } catch (InterruptedException e) {
+ maybeCleanupOnInterrupt();
+ // Logging the exception causes a lot of noise in builds using the dynamic scheduler, and
+ // the information is not very interesting, so avoid that.
+ stepLog(SEVERE, "Interrupted (and cleanup finished)");
+ throw e;
} catch (Error e) {
stepLog(SEVERE, e, UNHANDLED_EXCEPTION_MSG);
throw e;
@@ -367,6 +380,7 @@
try (SilentCloseable c =
Profiler.instance()
.profile(ProfilerTask.PROCESS_TIME, spawn.getResourceOwner().getMnemonic())) {
+ needCleanup = true;
Subprocess subprocess = subprocessBuilder.start();
subprocess.getOutputStream().close();
try {
@@ -445,6 +459,43 @@
private boolean wasTimeout(Duration timeout, Duration wallTime) {
return !timeout.isZero() && wallTime.compareTo(timeout) > 0;
}
+
+ /**
+ * Clean up any known side-effects that the running spawn may have had on the output tree.
+ *
+ * <p>This is supposed to leave the output tree as it was right after {@link
+ * com.google.devtools.build.lib.skyframe.SkyframeActionExecutor} created the output directories
+ * for the spawn, which means that any outputs have to be deleted but any top-level directory
+ * for tree artifacts has to be kept behind (and empty).
+ */
+ private void maybeCleanupOnInterrupt() {
+ if (!localExecutionOptions.localLockfreeOutput) {
+ // If we don't allow lockfree executions of local subprocesses, there is no need to clean up
+ // anything: we would have already locked the output tree upfront, so we "own" it.
+ return;
+ }
+ if (!needCleanup) {
+ // If the subprocess has not yet started, there is no need to worry about checking on-disk
+ // state.
+ return;
+ }
+
+ for (ActionInput output : spawn.getOutputFiles()) {
+ Path path = context.getPathResolver().toPath(output);
+ try {
+ if (path.exists()) {
+ stepLog(INFO, "Clearing output %s after interrupt", path);
+ if (output instanceof Artifact && ((Artifact) output).isTreeArtifact()) {
+ path.deleteTreesBelow();
+ } else {
+ path.deleteTree();
+ }
+ }
+ } catch (IOException e) {
+ stepLog(SEVERE, e, "Cannot delete local output %s after interrupt", path);
+ }
+ }
+ }
}
private enum State {
diff --git a/src/test/shell/integration/execution_strategies_test.sh b/src/test/shell/integration/execution_strategies_test.sh
index cb6981b..8f6f96c 100755
--- a/src/test/shell/integration/execution_strategies_test.sh
+++ b/src/test/shell/integration/execution_strategies_test.sh
@@ -44,6 +44,19 @@
source "$(rlocation "io_bazel/src/test/shell/integration_test_setup.sh")" \
|| { echo "integration_test_setup.sh not found!" >&2; exit 1; }
+case "$(uname -s | tr [:upper:] [:lower:])" in
+msys*|mingw*|cygwin*)
+ declare -r is_windows=true
+ ;;
+*)
+ declare -r is_windows=false
+ ;;
+esac
+
+if "$is_windows"; then
+ export MSYS_NO_PATHCONV=1
+ export MSYS2_ARG_CONV_EXCL="*"
+fi
# Tests that you can set the spawn strategy flags to a list of strategies.
function test_multiple_strategies() {
@@ -80,4 +93,100 @@
expect_not_log "\"FooBar\" = "
}
+# Runs a build, waits for the given dir and file to appear, and then kills
+# Bazel to check what happens with said files.
+function build_and_interrupt() {
+ local dir="${1}"; shift
+ local file="${1}"; shift
+
+ bazel clean
+ bazel build --genrule_strategy=local --nolegacy_spawn_scheduler \
+ "${@}" //pkg &> $TEST_log &
+ local pid=$!
+ while [[ ! -e "${dir}" && ! -e "${file}" ]]; do
+ echo "Still waiting for action to create outputs" >>$TEST_log
+ sleep 1
+ done
+ kill "${pid}"
+ wait || true
+}
+
+function test_local_deletes_plain_outputs_on_interrupt() {
+ if "$is_windows"; then
+ cat 1>&2 <<EOF
+This test is known to be broken on Windows because the kill+wait sequence
+in build_and_interrupt doesn't seem to do the right thing.
+Skipping...
+EOF
+ return 0
+ fi
+
+ mkdir -p pkg
+ cat >pkg/BUILD <<'EOF'
+genrule(
+ name = "pkg",
+ srcs = ["pkg.txt"],
+ outs = ["dir", "file"],
+ cmd = ("d=$(location :dir) f=$(location :file); "
+ + "mkdir -p $$d; touch $$d/subfile $$f; sleep 60"),
+)
+EOF
+ touch pkg/pkg.txt
+ local dir="bazel-genfiles/pkg/dir"
+ local file="bazel-genfiles/pkg/file"
+
+ build_and_interrupt "${dir}" "${file}" --noexperimental_local_lockfree_output
+ [[ -d "${dir}" ]] || fail "Expected directory output to not exist"
+ [[ -f "${file}" ]] || fail "Expected regular output to exist"
+
+ build_and_interrupt "${dir}" "${file}" --experimental_local_lockfree_output
+ if [[ -d "${dir}" ]]; then
+ fail "Expected directory output to not exist"
+ fi
+ if [[ -f "${file}" ]]; then
+ fail "Expected regular output to not exist"
+ fi
+}
+
+function test_local_deletes_tree_artifacts_on_interrupt() {
+ if "$is_windows"; then
+ cat 1>&2 <<EOF
+This test is known to be broken on Windows because the kill+wait sequence
+in build_and_interrupt doesn't seem to do the right thing.
+Skipping...
+EOF
+ return 0
+ fi
+
+ mkdir -p pkg
+ cat >pkg/rules.bzl <<'EOF'
+def _test_tree_artifact_impl(ctx):
+ tree = ctx.actions.declare_directory(ctx.attr.name + ".dir")
+ cmd = """
+mkdir -p {path} && touch {path}/file && sleep 60
+""".format(path = tree.path)
+ ctx.actions.run_shell(outputs = [tree], command = cmd)
+ return DefaultInfo(files = depset([tree]))
+
+test_tree_artifact = rule(
+ implementation = _test_tree_artifact_impl,
+)
+EOF
+ cat >pkg/BUILD <<'EOF'
+load(":rules.bzl", "test_tree_artifact")
+test_tree_artifact(name = "pkg")
+EOF
+ local dir="bazel-bin/pkg/pkg.dir"
+ local file="bazel-bin/pkg/pkg.dir/file"
+
+ build_and_interrupt "${dir}" "${file}" --noexperimental_local_lockfree_output
+ [[ -d "${dir}" ]] || fail "Expected tree artifact root to exist"
+
+ build_and_interrupt "${dir}" "${file}" --experimental_local_lockfree_output
+ [[ -d "${dir}" ]] || fail "Expected tree artifact root to exist"
+ if [[ -f "${file}" ]]; then
+ fail "Expected tree artifact contents to not exist"
+ fi
+}
+
run_suite "Tests for the execution strategy selection."