Fix duration measurement issues in the client
Even with `CLOCK_MONOTONIC`, we are still seeing Bazel servers fail to start up occasionally due to start time being larger than end time. Make this non-fatal by truncating to 0 and emitting a warning with start and end time to facilitate further investigation.
Also flip the conditions for command and extraction wait time, which previously were only included if *not* known.
Closes #24344.
PiperOrigin-RevId: 697932531
Change-Id: Ia124a1e10640fde909ec377ab493f1654b5933e7
diff --git a/src/main/cpp/archive_utils.cc b/src/main/cpp/archive_utils.cc
index f999e91..e54d6f2 100644
--- a/src/main/cpp/archive_utils.cc
+++ b/src/main/cpp/archive_utils.cc
@@ -15,11 +15,13 @@
#include <functional>
#include <memory>
+#include <optional>
#include <set>
#include <string>
#include <thread> // NOLINT
#include <vector>
+#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
#include "src/main/cpp/startup_options.h"
#include "src/main/cpp/util/errors.h"
@@ -110,11 +112,10 @@
// it is in place. Concurrency during extraction is handled by
// extracting in a tmp dir and then renaming it into place where it
// becomes visible atomically at the new path.
-ExtractionDurationMillis ExtractData(const string &self_path,
- const vector<string> &archive_contents,
- const string &expected_install_md5,
- const StartupOptions &startup_options,
- LoggingInfo *logging_info) {
+std::optional<DurationMillis> ExtractData(
+ const string &self_path, const vector<string> &archive_contents,
+ const string &expected_install_md5, const StartupOptions &startup_options,
+ LoggingInfo *logging_info) {
const string &install_base = startup_options.install_base;
// If the install dir doesn't exist, create it, if it does, we know it's good.
if (!blaze_util::PathExists(install_base)) {
@@ -126,8 +127,7 @@
BlessFiles(tmp_install);
uint64_t et = GetMillisecondsMonotonic();
- const ExtractionDurationMillis extract_data_duration(
- et - st, /*archived_extracted=*/true);
+ const DurationMillis extract_data_duration(st, et);
// Now rename the completed installation to its final name.
int attempts = 0;
@@ -197,7 +197,7 @@
<< expected_install_md5
<< "). Remove it or specify a different --install_base.";
}
- return ExtractionDurationMillis();
+ return std::nullopt;
}
}
diff --git a/src/main/cpp/archive_utils.h b/src/main/cpp/archive_utils.h
index 18856ab..15177b8 100644
--- a/src/main/cpp/archive_utils.h
+++ b/src/main/cpp/archive_utils.h
@@ -15,9 +15,11 @@
#ifndef BAZEL_SRC_MAIN_CPP_ARCHIVE_UTILS_H_
#define BAZEL_SRC_MAIN_CPP_ARCHIVE_UTILS_H_
+#include <optional>
#include <string>
#include <vector>
+#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/startup_options.h"
#include "src/main/cpp/util/logging.h"
@@ -29,28 +31,6 @@
std::vector<std::string> *files,
std::string *install_md5);
-struct DurationMillis {
- public:
- const uint64_t millis;
-
- DurationMillis() : millis(kUnknownDuration) {}
- DurationMillis(const uint64_t ms) : millis(ms) {}
-
- bool IsUnknown() const { return millis == kUnknownDuration; }
-
- private:
- // Value representing that a timing event never occurred or is unknown.
- static constexpr uint64_t kUnknownDuration = 0;
-};
-
-// DurationMillis that tracks if an archive was extracted.
-struct ExtractionDurationMillis : DurationMillis {
- const bool archive_extracted;
- ExtractionDurationMillis() : DurationMillis(), archive_extracted(false) {}
- ExtractionDurationMillis(const uint64_t ms, const bool archive_extracted)
- : DurationMillis(ms), archive_extracted(archive_extracted) {}
-};
-
// The reason for a blaze server restart.
// Keep in sync with logging.proto.
enum RestartReason {
@@ -94,7 +74,7 @@
// BlessFiles. If the install base, the location the archive is unpacked,
// already exists, extraction is skipped. Kills the client if an error is
// encountered.
-ExtractionDurationMillis ExtractData(
+std::optional<DurationMillis> ExtractData(
const std::string &self_path,
const std::vector<std::string> &archive_contents,
const std::string &expected_install_md5,
diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc
index 0bb3795..7e22563 100644
--- a/src/main/cpp/blaze.cc
+++ b/src/main/cpp/blaze.cc
@@ -197,8 +197,8 @@
explicit BlazeServer(const StartupOptions &startup_options);
// Acquire a lock for the output base this server is running in.
- // Returns the number of milliseconds spent waiting for the lock.
- uint64_t AcquireLock();
+ // If we had to wait for the lock, returns the time we spent waiting.
+ std::optional<DurationMillis> AcquireLock();
// Whether there is an active connection to a server.
bool Connected() const { return client_.get(); }
@@ -215,10 +215,9 @@
const std::string &command, const std::vector<std::string> &command_args,
const std::string &invocation_policy,
const std::vector<RcStartupFlag> &original_startup_options,
- const LoggingInfo &logging_info,
- const DurationMillis client_startup_duration,
- const DurationMillis extract_data_duration,
- const DurationMillis command_wait_duration_ms);
+ const LoggingInfo &logging_info, DurationMillis client_startup_duration,
+ std::optional<DurationMillis> extract_data_duration,
+ std::optional<DurationMillis> command_wait_duration);
// Disconnects and kills an existing server. Only call this when this object
// is in connected state.
@@ -275,7 +274,7 @@
// _exit(2) (attributed with ATTRIBUTE_NORETURN) meaning we have to delete the
// objects before those.
-uint64_t BlazeServer::AcquireLock() {
+std::optional<DurationMillis> BlazeServer::AcquireLock() {
if (output_base_lock_.has_value()) {
BAZEL_DIE(blaze_exit_code::INTERNAL_ERROR)
<< "AcquireLock() called but the lock is already held.";
@@ -284,11 +283,11 @@
// commands may not run against the same output base. Note that this lock will
// be released by ReleaseLock() once the server is running, as it can handle
// concurrent clients on its own.
- uint64_t wait_time;
- output_base_lock_ = blaze::AcquireLock(
- "output base", output_base_.GetRelative("lock"), LockMode::kExclusive,
- batch_, block_for_lock_, &wait_time);
- return wait_time;
+ auto lock_and_time =
+ blaze::AcquireLock("output base", output_base_.GetRelative("lock"),
+ LockMode::kExclusive, batch_, block_for_lock_);
+ output_base_lock_ = lock_and_time.first;
+ return lock_and_time.second;
}
void BlazeServer::ReleaseLock() {
@@ -545,11 +544,12 @@
}
// Add common command options for logging to the given argument array.
-static void AddLoggingArgs(const LoggingInfo &logging_info,
- const DurationMillis client_startup_duration,
- const DurationMillis extract_data_duration,
- const DurationMillis command_wait_duration_ms,
- vector<string> *args) {
+static void AddLoggingArgs(
+ const LoggingInfo &logging_info,
+ const DurationMillis client_startup_duration,
+ const std::optional<DurationMillis> extract_data_duration,
+ const std::optional<DurationMillis> command_wait_duration,
+ vector<string> *args) {
// The time in ms the launcher spends before sending the request to the blaze
// server.
args->push_back("--startup_time=" +
@@ -557,17 +557,18 @@
// The time in ms a command had to wait on a busy Blaze server process.
// This is part of startup_time.
- if (command_wait_duration_ms.IsUnknown()) {
+ if (command_wait_duration.has_value()) {
args->push_back("--command_wait_time=" +
- blaze_util::ToString(command_wait_duration_ms.millis));
+ blaze_util::ToString(command_wait_duration->millis));
}
// The time in ms spent on extracting the new blaze version.
// This is part of startup_time.
- if (extract_data_duration.IsUnknown()) {
+ if (extract_data_duration.has_value()) {
args->push_back("--extract_data_time=" +
- blaze_util::ToString(extract_data_duration.millis));
+ blaze_util::ToString(extract_data_duration->millis));
}
+
if (logging_info.restart_reason != NO_RESTART) {
args->push_back(string("--restart_reason=") +
ReasonString(logging_info.restart_reason));
@@ -651,14 +652,15 @@
const WorkspaceLayout &workspace_layout, const string &workspace,
const OptionProcessor &option_processor,
const StartupOptions &startup_options, LoggingInfo *logging_info,
- const DurationMillis extract_data_duration,
- const DurationMillis command_wait_duration_ms, BlazeServer *server) {
+ const std::optional<DurationMillis> extract_data_duration,
+ const std::optional<DurationMillis> command_wait_duration,
+ BlazeServer *server) {
if (server->Connected()) {
server->KillRunningServer();
}
- const DurationMillis client_startup_duration(GetMillisecondsMonotonic() -
- logging_info->start_time_ms);
+ const DurationMillis client_startup_duration(logging_info->start_time_ms,
+ GetMillisecondsMonotonic());
BAZEL_LOG(INFO) << "Starting " << startup_options.product_name
<< " in batch mode.";
@@ -683,7 +685,7 @@
if (!command.empty()) {
jvm_args_vector.push_back(command);
AddLoggingArgs(*logging_info, client_startup_duration,
- extract_data_duration, command_wait_duration_ms,
+ extract_data_duration, command_wait_duration,
&jvm_args_vector);
}
@@ -1051,9 +1053,9 @@
const blaze_util::Path &server_dir, const WorkspaceLayout &workspace_layout,
const string &workspace, const OptionProcessor &option_processor,
const StartupOptions &startup_options, LoggingInfo *logging_info,
- const DurationMillis extract_data_duration,
- const DurationMillis command_wait_duration_ms, BlazeServer *server,
- const string &build_label) {
+ const std::optional<DurationMillis> extract_data_duration,
+ const std::optional<DurationMillis> command_wait_duration,
+ BlazeServer *server, const string &build_label) {
while (true) {
if (!server->Connected()) {
StartServerAndConnect(server_exe, server_exe_args, server_dir,
@@ -1095,8 +1097,8 @@
<< server->ProcessInfo().server_pid_ << ").";
// Wall clock time since process startup.
- const DurationMillis client_startup_duration =
- (GetMillisecondsMonotonic() - logging_info->start_time_ms);
+ const DurationMillis client_startup_duration(logging_info->start_time_ms,
+ GetMillisecondsMonotonic());
SignalHandler::Get().Install(startup_options.product_name,
startup_options.output_base,
@@ -1105,8 +1107,7 @@
option_processor.GetCommand(), option_processor.GetCommandArguments(),
startup_options.invocation_policy,
startup_options.original_startup_options_, *logging_info,
- client_startup_duration, extract_data_duration,
- command_wait_duration_ms));
+ client_startup_duration, extract_data_duration, command_wait_duration));
}
// Parse the options.
@@ -1399,13 +1400,16 @@
const string &workspace, LoggingInfo *logging_info) {
blaze_server = new BlazeServer(startup_options);
- const DurationMillis command_wait_duration_ms(blaze_server->AcquireLock());
- BAZEL_LOG(INFO) << "Acquired the client lock, waited "
- << command_wait_duration_ms.millis << " milliseconds";
+ const std::optional<DurationMillis> command_wait_duration =
+ blaze_server->AcquireLock();
+ const uint64_t wait_ms =
+ command_wait_duration.has_value() ? command_wait_duration->millis : 0;
+ BAZEL_LOG(INFO) << "Acquired the client lock, waited " << wait_ms
+ << " milliseconds";
WarnFilesystemType(startup_options.output_base);
- const ExtractionDurationMillis extract_data_duration = ExtractData(
+ const std::optional<DurationMillis> extract_data_duration = ExtractData(
self_path, archive_contents, install_md5, startup_options, logging_info);
blaze_server->Connect();
@@ -1462,14 +1466,14 @@
} else if (startup_options.batch) {
RunBatchMode(server_exe, server_exe_args, workspace_layout, workspace,
option_processor, startup_options, logging_info,
- extract_data_duration, command_wait_duration_ms, blaze_server);
+ extract_data_duration, command_wait_duration, blaze_server);
} else {
string build_label;
ExtractBuildLabel(self_path, &build_label);
RunClientServerMode(server_exe, server_exe_args, server_dir,
workspace_layout, workspace, option_processor,
startup_options, logging_info, extract_data_duration,
- command_wait_duration_ms, blaze_server, build_label);
+ command_wait_duration, blaze_server, build_label);
}
}
@@ -1838,8 +1842,8 @@
const vector<RcStartupFlag> &original_startup_options,
const LoggingInfo &logging_info,
const DurationMillis client_startup_duration,
- const DurationMillis extract_data_duration,
- const DurationMillis command_wait_duration_ms) {
+ const std::optional<DurationMillis> extract_data_duration,
+ const std::optional<DurationMillis> command_wait_duration) {
assert(Connected());
assert(process_info_.server_pid_ > 0);
@@ -1847,7 +1851,7 @@
if (!command.empty()) {
arg_vector.push_back(command);
AddLoggingArgs(logging_info, client_startup_duration, extract_data_duration,
- command_wait_duration_ms, &arg_vector);
+ command_wait_duration, &arg_vector);
}
arg_vector.insert(arg_vector.end(), command_args.begin(), command_args.end());
diff --git a/src/main/cpp/blaze_util.h b/src/main/cpp/blaze_util.h
index 78738b0..e8dd9a9 100644
--- a/src/main/cpp/blaze_util.h
+++ b/src/main/cpp/blaze_util.h
@@ -131,6 +131,24 @@
~WithEnvVars();
};
+struct DurationMillis {
+ public:
+ const uint64_t millis;
+
+ DurationMillis(const uint64_t start, const uint64_t end)
+ : millis(ComputeDuration(start, end)) {}
+
+ private:
+ static uint64_t ComputeDuration(const uint64_t start, const uint64_t end) {
+ if (end < start) {
+ BAZEL_LOG(WARNING) << "Invalid duration: start=" << start
+ << ", end=" << end;
+ return 0;
+ }
+ return end - start;
+ }
+};
+
} // namespace blaze
#endif // BAZEL_SRC_MAIN_CPP_BLAZE_UTIL_H_
diff --git a/src/main/cpp/blaze_util_platform.h b/src/main/cpp/blaze_util_platform.h
index 17b8bc9..08dadc5 100644
--- a/src/main/cpp/blaze_util_platform.h
+++ b/src/main/cpp/blaze_util_platform.h
@@ -19,12 +19,13 @@
#include <map>
#include <memory>
+#include <optional>
#include <string>
+#include <utility>
#include <vector>
#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/server_process_info.h"
-#include "src/main/cpp/util/path.h"
#include "src/main/cpp/util/port.h"
namespace blaze {
@@ -208,12 +209,12 @@
// Acquires a `mode` lock on `path`, busy-waiting until it becomes available if
// `block` is true, and releasing it on exec if `batch_mode` is false.
// Crashes if the lock cannot be acquired. Returns a handle that can be
-// subsequently passed to ReleaseLock. Sets `wait_time` to the number of
-// milliseconds spent waiting for the lock. The `name` argument is used to
-// distinguish it from other locks in human-readable error messages.
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
- LockMode mode, bool batch_mode, bool block,
- uint64_t* wait_time);
+// subsequently passed to ReleaseLock as well as the time spent waiting for the
+// lock, if any. The `name` argument is used to distinguish it from other locks
+// in human-readable error messages.
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+ const std::string& name, const blaze_util::Path& path, LockMode mode,
+ bool batch_mode, bool block);
// Releases a lock previously obtained from AcquireLock.
void ReleaseLock(LockHandle lock_handle);
diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc
index 74759b5..b5bc003 100644
--- a/src/main/cpp/blaze_util_posix.cc
+++ b/src/main/cpp/blaze_util_posix.cc
@@ -39,15 +39,19 @@
#include <cstdlib>
#include <fstream>
#include <iterator>
+#include <map>
+#include <optional>
#include <set>
#include <string>
+#include <utility>
+#include <vector>
#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
#include "src/main/cpp/startup_options.h"
#include "src/main/cpp/util/errors.h"
#include "src/main/cpp/util/exit_code.h"
-#include "src/main/cpp/util/file.h"
+#include "src/main/cpp/util/file_platform.h"
#include "src/main/cpp/util/logging.h"
#include "src/main/cpp/util/md5.h"
#include "src/main/cpp/util/numbers.h"
@@ -632,9 +636,9 @@
return -1;
}
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
- LockMode mode, bool batch_mode, bool block,
- uint64_t* wait_time) {
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+ const std::string& name, const blaze_util::Path& path, LockMode mode,
+ bool batch_mode, bool block) {
int flags = O_CREAT;
switch (mode) {
case LockMode::kShared:
@@ -712,7 +716,10 @@
// avoid unnecessary noise in the logs. In this metric, we are only
// interested in knowing how long it took for other commands to complete, not
// how fast acquiring a lock is.
- const uint64_t elapsed_time = !multiple_attempts ? 0 : end_time - start_time;
+ const auto elapsed_time =
+ multiple_attempts
+ ? std::make_optional(DurationMillis(start_time, end_time))
+ : std::nullopt;
// If taking an exclusive lock, identify ourselves in the lockfile.
// The contents are printed for human consumption when another client
@@ -731,8 +738,7 @@
}
}
- *wait_time = elapsed_time;
- return static_cast<LockHandle>(fd);
+ return std::make_pair(static_cast<LockHandle>(fd), elapsed_time);
}
void ReleaseLock(LockHandle lock_handle) {
diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc
index 94e8f3f..acefea4 100644
--- a/src/main/cpp/blaze_util_windows.cc
+++ b/src/main/cpp/blaze_util_windows.cc
@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// clang-format off
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
+// clang-format on
#include <fcntl.h>
#include <io.h>
@@ -28,10 +30,12 @@
#include <cstdlib>
#include <memory>
#include <mutex> // NOLINT
+#include <optional>
#include <set>
#include <sstream>
#include <thread> // NOLINT (to silence Google-internal linter)
#include <type_traits> // static_assert
+#include <utility>
#include <vector>
#include "src/main/cpp/blaze_util.h"
@@ -1087,9 +1091,9 @@
return GetMillisecondsAsLargeInt(kFrequency).QuadPart;
}
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
- LockMode mode, bool batch_mode, bool block,
- uint64_t* wait_time) {
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+ const std::string& name, const blaze_util::Path& path, LockMode mode,
+ bool batch_mode, bool block) {
DWORD desired_access = GENERIC_READ;
if (mode == LockMode::kExclusive) {
desired_access |= GENERIC_WRITE;
@@ -1164,8 +1168,8 @@
// a concurrent process can read and display it. On Windows we can't do so
// because locks are mandatory, thus we cannot read the file concurrently.
- *wait_time = GetMillisecondsMonotonic() - start_time;
- return reinterpret_cast<LockHandle>(handle);
+ return std::make_pair(reinterpret_cast<LockHandle>(handle),
+ DurationMillis(start_time, GetMillisecondsMonotonic()));
}
void ReleaseLock(LockHandle lock_handle) {
diff --git a/src/test/cpp/blaze_archive_test.cc b/src/test/cpp/blaze_archive_test.cc
index b7a58db..15399d9 100644
--- a/src/test/cpp/blaze_archive_test.cc
+++ b/src/test/cpp/blaze_archive_test.cc
@@ -13,8 +13,13 @@
// limitations under the License.
#include <stdlib.h>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
#include <vector>
+#include "file/base/filesystem.h"
#include "file/base/helpers.h"
#include "file/base/path.h"
#include "file/util/temp_path.h"
@@ -23,9 +28,14 @@
#include "src/main/cpp/bazel_startup_options.h"
#include "googlemock/include/gmock/gmock.h"
#include "googletest/include/gtest/gtest.h"
-#include "src/main/cpp/blaze.h"
+#include "absl/status/statusor.h"
+#include "absl/time/time.h"
+#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
+#include "src/main/cpp/util/exit_code.h"
#include "src/main/cpp/util/file_platform.h"
+#include "src/main/cpp/workspace_layout.h"
+#include "util/task/status_macros.h"
using ::testing::Gt;
using ::testing::status::IsOkAndHolds;
@@ -121,11 +131,11 @@
set_startup_options(startup_options, blaze_path, output_dir);
LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic());
- ExtractionDurationMillis extraction_time =
+ std::optional<DurationMillis> extraction_time =
ExtractData(blaze_path, archive_contents, expected_install_md5,
startup_options, &logging_info);
- ASSERT_TRUE(extraction_time.archive_extracted);
+ ASSERT_TRUE(extraction_time.has_value());
const std::string foo_path = file::JoinPath(output_dir, "foo");
const std::string bar_path = file::JoinPath(output_dir, "bar");
@@ -155,15 +165,14 @@
set_startup_options(startup_options, blaze_path, output_dir);
LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic());
- ExtractionDurationMillis extraction_time_one =
+ std::optional<DurationMillis> extraction_time_one =
ExtractData(blaze_path, archive_contents, expected_install_md5,
startup_options, &logging_info);
- ASSERT_TRUE(extraction_time_one.archive_extracted);
+ ASSERT_TRUE(extraction_time_one.has_value());
- ExtractionDurationMillis extraction_time_two =
+ std::optional<DurationMillis> extraction_time_two =
ExtractData(blaze_path, archive_contents, expected_install_md5,
startup_options, &logging_info);
-
- ASSERT_FALSE(extraction_time_two.archive_extracted);
+ ASSERT_FALSE(extraction_time_two.has_value());
}
} // namespace blaze