Allow users to override Bazel's 120s connect timeout
Bazel constrains starting up and connecting to a local server to 120s. This
occasionally causes problems for us on heavily loaded, high numcpu machines,
because the Bazel client+server may end up starved out via simple CPU
contention. (Where we observed startup timeouts, machines had plenty of RAM,
I/O looked normal---procs weren't stuck in disk wait---but the run queues were
60+.)
Mitigate this by introducing a new startup option, `--local_startup_timeout_secs`,
which allows users to specify their own timeout values. (Note: I primarily
used `connect_timeout_secs` as a reference.)
TODO: Consult Bazel team to add test case per comments in
[bazel_startup_options_test.cc].
Resolves [#8988].
[bazel_startup_options_test.cc]: https://github.com/bazelbuild/bazel/blob/8075057af6108ebc23c146f18eecec911d4b8c00/src/test/cpp/bazel_startup_options_test.cc#L79-L81
[#8988]: https://github.com/bazelbuild/bazel/issues/8988
Testing Done:
- Induced high load on my MBP, then
```console
$ vbazel --local_startup_timeout_secs=1 info release
Starting local Bazel server and connecting to it...
FATAL: couldn't connect to server (16290) after 1 seconds.
```
Closes #11391.
PiperOrigin-RevId: 338729701
diff --git a/src/test/cpp/bazel_startup_options_test.cc b/src/test/cpp/bazel_startup_options_test.cc
index 998bebf..91f1c14 100644
--- a/src/test/cpp/bazel_startup_options_test.cc
+++ b/src/test/cpp/bazel_startup_options_test.cc
@@ -117,6 +117,7 @@
ExpectIsUnaryOption(options, "install_base");
ExpectIsUnaryOption(options, "invocation_policy");
ExpectIsUnaryOption(options, "io_nice_level");
+ ExpectIsUnaryOption(options, "local_startup_timeout_secs");
ExpectIsUnaryOption(options, "macos_qos_class");
ExpectIsUnaryOption(options, "max_idle_secs");
ExpectIsUnaryOption(options, "output_base");
diff --git a/src/test/shell/integration/client_test.sh b/src/test/shell/integration/client_test.sh
index 1609ee0..9b696e7 100755
--- a/src/test/shell/integration/client_test.sh
+++ b/src/test/shell/integration/client_test.sh
@@ -180,6 +180,34 @@
expect_log "Usage: b\\(laze\\|azel\\)"
}
+# Checks that --local_startup_timeout_secs limits how long the client waits
+# for a local server. The client is started against a server that blocks
+# during startup; it must exit non-zero on its own and log that it could not
+# connect. The test fails if the client exits zero or is still running once
+# the polling budget below is exhausted.
+function test_local_startup_timeout() {
+  # Run one ordinary invocation first and abort the test if it fails. Only the
+  # exit status matters, so stdout is discarded rather than captured with
+  # `local var=$(...)`, which would replace that status with `local`'s own
+  # and make the `|| fail` unreachable.
+  bazel info output_base >/dev/null 2>"$TEST_log" ||
+    fail "bazel info failed"
+
+  # Starting the server JVM with a JDWP agent and suspend=y (similar to what
+  # --host_jvm_debug does) will cause the server to block, forcing the client
+  # into the timeout condition.
+  bazel --host_jvm_args="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=localhost:41687" \
+      --local_startup_timeout_secs=1 2>"$TEST_log" &
+  # Track the client by PID instead of assuming it is job %1.
+  local client_pid=$!
+
+  # Poll about once per second until the client has exited or the budget is
+  # used up.
+  local timeout=20
+  while kill -0 "${client_pid}" 2>/dev/null; do
+    timeout=$(( timeout - 1 ))
+    if (( timeout <= 0 )); then
+      kill -9 "${client_pid}"
+      # Reap the killed client; its exit status is irrelevant here and must
+      # not abort the script before the failure is reported.
+      wait "${client_pid}" 2>/dev/null || true
+      fail "--local_startup_timeout_secs was not respected"
+    fi
+    # Wait for the client to exit.
+    sleep 1
+  done
+
+  # The client has exited; `wait` returns its exit status, which must be
+  # non-zero.
+  if wait "${client_pid}"; then
+    fail "bazel should have exited non-zero"
+  fi
+
+  expect_log "Starting local.*server and connecting to it"
+  expect_log "FATAL: couldn't connect to server"
+}
+
function test_max_idle_secs() {
# TODO(https://github.com/bazelbuild/bazel/issues/6773): Remove when fixed.
bazel shutdown