Add option to start locally scheduling actions above a certain age in dynamic scheduling. This allows handling when remote systems get stuck for whatever reason.
Currently, local actions are scheduled LIFO, so we have the best chance of actually doing something faster locally, rather than getting cancelled when the remote branch finishes shortly after. With this change, we switch to FIFO for actions that are above a certain age, where we guess the remote is not going to answer in a reasonable time.
We can't tell what timeouts to expect, since they could be caused by many different parts of the remote system. Instead, this allows adjusting based on experience. If only very few remote calls end up timing out, setting this to ~half the seen timeout should be fine. The more often remote calls time out, the lower you want to set this.
Setting this flag can hide problems in your remote system. Make sure to have proper monitoring and SLOs in place for your remote system before papering it over with this flag.
PiperOrigin-RevId: 422434741
diff --git a/src/main/java/com/google/devtools/build/lib/dynamic/DynamicExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/dynamic/DynamicExecutionOptions.java
index 7081c03..1167fff 100644
--- a/src/main/java/com/google/devtools/build/lib/dynamic/DynamicExecutionOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/dynamic/DynamicExecutionOptions.java
@@ -146,4 +146,14 @@
"If set, dynamic execution is turned off until there has been at least one successful"
+ " build.")
public boolean skipFirstBuild;
+
+ @Option(
+ name = "experimental_dynamic_slow_remote_time",
+ documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
+ effectTags = {OptionEffectTag.UNKNOWN},
+ defaultValue = "0",
+ help =
+ "If >0, the number of seconds a dynamically run action must run remote-only before we"
+ + " prioritize its local execution to avoid remote timeouts.")
+ public int slowRemoteTime;
}
diff --git a/src/main/java/com/google/devtools/build/lib/dynamic/DynamicSpawnStrategy.java b/src/main/java/com/google/devtools/build/lib/dynamic/DynamicSpawnStrategy.java
index 5314378..b7328c4 100644
--- a/src/main/java/com/google/devtools/build/lib/dynamic/DynamicSpawnStrategy.java
+++ b/src/main/java/com/google/devtools/build/lib/dynamic/DynamicSpawnStrategy.java
@@ -44,6 +44,7 @@
import com.google.devtools.build.lib.server.FailureDetails.FailureDetail;
import com.google.errorprone.annotations.FormatMethod;
import com.google.errorprone.annotations.FormatString;
+import java.time.Duration;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
@@ -238,7 +239,6 @@
waitingLocalJobs.add(localBranch);
tryScheduleLocalJob();
}
-
remoteBranch.execute(executorService);
try {
@@ -273,7 +273,18 @@
private void tryScheduleLocalJob() {
synchronized (waitingLocalJobs) {
while (!waitingLocalJobs.isEmpty() && threadLimiter.tryAcquire()) {
- LocalBranch job = waitingLocalJobs.pollLast();
+ LocalBranch job;
+ // TODO(b/120910324): Prioritize jobs where the remote branch has already failed.
+ if (options.slowRemoteTime > 0
+ && waitingLocalJobs
+ .peekFirst()
+ .getAge()
+ .compareTo(Duration.ofSeconds(options.slowRemoteTime))
+ > 0) {
+ job = waitingLocalJobs.pollFirst();
+ } else {
+ job = waitingLocalJobs.pollLast();
+ }
job.execute(executorService);
}
}
diff --git a/src/main/java/com/google/devtools/build/lib/dynamic/LocalBranch.java b/src/main/java/com/google/devtools/build/lib/dynamic/LocalBranch.java
index 70feb2b..a6dfdfd 100644
--- a/src/main/java/com/google/devtools/build/lib/dynamic/LocalBranch.java
+++ b/src/main/java/com/google/devtools/build/lib/dynamic/LocalBranch.java
@@ -32,6 +32,8 @@
import com.google.devtools.build.lib.actions.SpawnStrategy;
import com.google.devtools.build.lib.dynamic.DynamicExecutionModule.IgnoreFailureCheck;
import com.google.devtools.build.lib.util.io.FileOutErr;
+import java.time.Duration;
+import java.time.Instant;
import java.util.Optional;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -51,6 +53,7 @@
private final IgnoreFailureCheck ignoreFailureCheck;
private final Function<Spawn, Optional<Spawn>> getExtraSpawnForLocalExecution;
private final AtomicBoolean delayLocalExecution;
+ private final Instant creationTime = Instant.now();
public LocalBranch(
ActionExecutionContext actionExecutionContext,
@@ -71,6 +74,10 @@
return LOCAL;
}
+ public Duration getAge() {
+ return Duration.between(creationTime, Instant.now());
+ }
+
/**
* Try to run the given spawn locally.
*