Allow users to override Bazel's 120s connect timeout
Bazel constrains starting up and connecting to a local server to 120s. This
occasionally causes problems for us on heavily loaded, high numcpu machines,
because the Bazel client+server may end up starved out via simple CPU
contention. (Where we observed startup timeouts, machines had plenty of RAM,
I/O looked normal---procs weren't stuck in disk wait---but the run queues were
60+.)
Mitigate this by introducing a new startup option, `--local_startup_timeout_secs`,
which allows users to specify their own timeout values. (Note: I primarily
used `connect_timeout_secs` as a reference.)
TODO: Consult Bazel team to add test case per comments in
[bazel_startup_options_test.cc].
Resolves [#8988].
[bazel_startup_options_test.cc]: https://github.com/bazelbuild/bazel/blob/8075057af6108ebc23c146f18eecec911d4b8c00/src/test/cpp/bazel_startup_options_test.cc#L79-L81
[#8988]: https://github.com/bazelbuild/bazel/issues/8988
Testing Done:
- Induced high load on my MBP, then
```console
$ vbazel --local_startup_timeout_secs=1 info release
Starting local Bazel server and connecting to it...
FATAL: couldn't connect to server (16290) after 1 seconds.
```
Closes #11391.
PiperOrigin-RevId: 338729701
diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc
index e0cbd41..f42a4f8 100644
--- a/src/main/cpp/blaze.cc
+++ b/src/main/cpp/blaze.cc
@@ -778,7 +778,9 @@
// Give the server two minutes to start up. That's enough to connect with a
// debugger.
const auto start_time = std::chrono::system_clock::now();
- const auto try_until_time = start_time + std::chrono::seconds(120);
+ const auto try_until_time =
+ start_time +
+ std::chrono::seconds(startup_options.local_startup_timeout_secs);
// Print an update at most once every 10 seconds if we are still trying to
// connect.
const auto min_message_interval = std::chrono::seconds(10);
@@ -820,7 +822,8 @@
}
}
BAZEL_DIE(blaze_exit_code::INTERNAL_ERROR)
- << "couldn't connect to server (" << server_pid << ") after 120 seconds.";
+ << "couldn't connect to server (" << server_pid << ") after "
+ << startup_options.local_startup_timeout_secs << " seconds.";
}
// Ensures that any server previously associated with `server_dir` is no longer
@@ -1048,7 +1051,7 @@
// server command line difference logic can be simplified then.
static const std::set<string> volatile_startup_options = {
"--option_sources=", "--max_idle_secs=", "--connect_timeout_secs=",
- "--client_debug="};
+ "--local_startup_timeout_secs=", "--client_debug="};
// Split arg based on the first "=" if one exists in arg.
const string::size_type eq_pos = arg.find_first_of('=');
diff --git a/src/main/cpp/startup_options.cc b/src/main/cpp/startup_options.cc
index 3e36151..5ee43ba 100644
--- a/src/main/cpp/startup_options.cc
+++ b/src/main/cpp/startup_options.cc
@@ -82,6 +82,7 @@
fatal_event_bus_exceptions(false),
command_port(0),
connect_timeout_secs(30),
+ local_startup_timeout_secs(120),
have_invocation_policy_(false),
client_debug(false),
java_logging_formatter(
@@ -147,6 +148,7 @@
&windows_enable_symlinks);
RegisterUnaryStartupFlag("command_port");
RegisterUnaryStartupFlag("connect_timeout_secs");
+ RegisterUnaryStartupFlag("local_startup_timeout_secs");
RegisterUnaryStartupFlag("digest_function");
RegisterUnaryStartupFlag("unix_digest_hash_attribute_name");
RegisterUnaryStartupFlag("server_javabase");
@@ -354,6 +356,18 @@
return blaze_exit_code::BAD_ARGV;
}
option_sources["connect_timeout_secs"] = rcfile;
+ } else if ((value = GetUnaryOption(arg, next_arg,
+ "--local_startup_timeout_secs")) != NULL) {
+ if (!blaze_util::safe_strto32(value, &local_startup_timeout_secs) ||
+ local_startup_timeout_secs < 1) {
+ blaze_util::StringPrintf(
+ error,
+ "Invalid argument to --local_startup_timeout_secs: '%s'.\n"
+ "Must be a positive integer.\n",
+ value);
+ return blaze_exit_code::BAD_ARGV;
+ }
+ option_sources["local_startup_timeout_secs"] = rcfile;
} else if ((value = GetUnaryOption(arg, next_arg, "--digest_function")) !=
NULL) {
digest_function = value;
diff --git a/src/main/cpp/startup_options.h b/src/main/cpp/startup_options.h
index bc5daf8..41aac66 100644
--- a/src/main/cpp/startup_options.h
+++ b/src/main/cpp/startup_options.h
@@ -234,6 +234,9 @@
// Connection timeout for each gRPC connection attempt.
int connect_timeout_secs;
+ // How long, in seconds, the client waits for a local server to start up.
+ int local_startup_timeout_secs;
+
// Invocation policy proto, or an empty string.
std::string invocation_policy;
// Invocation policy can only be specified once.
diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeServerStartupOptions.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeServerStartupOptions.java
index c8b525e..a9f976e 100644
--- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeServerStartupOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeServerStartupOptions.java
@@ -383,6 +383,14 @@
help = "The amount of time the client waits for each attempt to connect to the server")
public int connectTimeoutSecs;
+ @Option(
+ name = "local_startup_timeout_secs",
+ defaultValue = "120", // NOTE: only for documentation, value is set and used by the client.
+ documentationCategory = OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS,
+ effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
+ help = "The maximum amount of time the client waits to connect to the server")
+ public int localStartupTimeoutSecs;
+
// TODO(b/109764197): Add OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS & remove the
// experimental tag once this has been tested and is ready for use.
@Option(
diff --git a/src/test/cpp/bazel_startup_options_test.cc b/src/test/cpp/bazel_startup_options_test.cc
index 998bebf..91f1c14 100644
--- a/src/test/cpp/bazel_startup_options_test.cc
+++ b/src/test/cpp/bazel_startup_options_test.cc
@@ -117,6 +117,7 @@
ExpectIsUnaryOption(options, "install_base");
ExpectIsUnaryOption(options, "invocation_policy");
ExpectIsUnaryOption(options, "io_nice_level");
+ ExpectIsUnaryOption(options, "local_startup_timeout_secs");
ExpectIsUnaryOption(options, "macos_qos_class");
ExpectIsUnaryOption(options, "max_idle_secs");
ExpectIsUnaryOption(options, "output_base");
diff --git a/src/test/shell/integration/client_test.sh b/src/test/shell/integration/client_test.sh
index 1609ee0..9b696e7 100755
--- a/src/test/shell/integration/client_test.sh
+++ b/src/test/shell/integration/client_test.sh
@@ -180,6 +180,34 @@
expect_log "Usage: b\\(laze\\|azel\\)"
}
+function test_local_startup_timeout() {
+ local output_base=$(bazel info output_base 2>"$TEST_log") ||
+ fail "bazel info failed"
+
+ # Like --host_jvm_debug, a JDWP agent with suspend=y causes the server to
+ # block, forcing the client into the timeout condition.
+ bazel --host_jvm_args="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=localhost:41687" \
+ --local_startup_timeout_secs=1 2>"$TEST_log" &
+ local timeout=20
+ while true; do
+ local jobs_output=$(jobs)
+ [[ $jobs_output =~ Exit ]] && break
+ [[ $jobs_output =~ Done ]] && fail "bazel should have exited non-zero"
+
+ timeout="$(( ${timeout} - 1 ))"
+ [[ "${timeout}" -gt 0 ]] || {
+ kill -9 %1
+ wait %1
+ fail "--local_startup_timeout_secs was not respected"
+ }
+ # Wait for the client to exit.
+ sleep 1
+ done
+
+ expect_log "Starting local.*server and connecting to it"
+ expect_log "FATAL: couldn't connect to server"
+}
+
function test_max_idle_secs() {
# TODO(https://github.com/bazelbuild/bazel/issues/6773): Remove when fixed.
bazel shutdown