Fix duration measurement issues in the client

Even with `CLOCK_MONOTONIC`, we are still seeing Bazel servers fail to start up occasionally due to start time being larger than end time. Make this non-fatal by truncating to 0 and emitting a warning with start and end time to facilitate further investigation.

Also flip the conditions for command and extraction wait time, which previously were only included if *not* known.

Closes #24344.

PiperOrigin-RevId: 697932531
Change-Id: Ia124a1e10640fde909ec377ab493f1654b5933e7
diff --git a/src/main/cpp/archive_utils.cc b/src/main/cpp/archive_utils.cc
index f999e91..e54d6f2 100644
--- a/src/main/cpp/archive_utils.cc
+++ b/src/main/cpp/archive_utils.cc
@@ -15,11 +15,13 @@
 
 #include <functional>
 #include <memory>
+#include <optional>
 #include <set>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "src/main/cpp/blaze_util.h"
 #include "src/main/cpp/blaze_util_platform.h"
 #include "src/main/cpp/startup_options.h"
 #include "src/main/cpp/util/errors.h"
@@ -110,11 +112,10 @@
 // it is in place. Concurrency during extraction is handled by
 // extracting in a tmp dir and then renaming it into place where it
 // becomes visible atomically at the new path.
-ExtractionDurationMillis ExtractData(const string &self_path,
-                                     const vector<string> &archive_contents,
-                                     const string &expected_install_md5,
-                                     const StartupOptions &startup_options,
-                                     LoggingInfo *logging_info) {
+std::optional<DurationMillis> ExtractData(
+    const string &self_path, const vector<string> &archive_contents,
+    const string &expected_install_md5, const StartupOptions &startup_options,
+    LoggingInfo *logging_info) {
   const string &install_base = startup_options.install_base;
   // If the install dir doesn't exist, create it, if it does, we know it's good.
   if (!blaze_util::PathExists(install_base)) {
@@ -126,8 +127,7 @@
     BlessFiles(tmp_install);
 
     uint64_t et = GetMillisecondsMonotonic();
-    const ExtractionDurationMillis extract_data_duration(
-        et - st, /*archived_extracted=*/true);
+    const DurationMillis extract_data_duration(st, et);
 
     // Now rename the completed installation to its final name.
     int attempts = 0;
@@ -197,7 +197,7 @@
           << expected_install_md5
           << ").  Remove it or specify a different --install_base.";
     }
-    return ExtractionDurationMillis();
+    return std::nullopt;
   }
 }
 
diff --git a/src/main/cpp/archive_utils.h b/src/main/cpp/archive_utils.h
index 18856ab..15177b8 100644
--- a/src/main/cpp/archive_utils.h
+++ b/src/main/cpp/archive_utils.h
@@ -15,9 +15,11 @@
 #ifndef BAZEL_SRC_MAIN_CPP_ARCHIVE_UTILS_H_
 #define BAZEL_SRC_MAIN_CPP_ARCHIVE_UTILS_H_
 
+#include <optional>
 #include <string>
 #include <vector>
 
+#include "src/main/cpp/blaze_util.h"
 #include "src/main/cpp/startup_options.h"
 #include "src/main/cpp/util/logging.h"
 
@@ -29,28 +31,6 @@
                               std::vector<std::string> *files,
                               std::string *install_md5);
 
-struct DurationMillis {
- public:
-  const uint64_t millis;
-
-  DurationMillis() : millis(kUnknownDuration) {}
-  DurationMillis(const uint64_t ms) : millis(ms) {}
-
-  bool IsUnknown() const { return millis == kUnknownDuration; }
-
- private:
-  // Value representing that a timing event never occurred or is unknown.
-  static constexpr uint64_t kUnknownDuration = 0;
-};
-
-// DurationMillis that tracks if an archive was extracted.
-struct ExtractionDurationMillis : DurationMillis {
-  const bool archive_extracted;
-  ExtractionDurationMillis() : DurationMillis(), archive_extracted(false) {}
-  ExtractionDurationMillis(const uint64_t ms, const bool archive_extracted)
-      : DurationMillis(ms), archive_extracted(archive_extracted) {}
-};
-
 // The reason for a blaze server restart.
 // Keep in sync with logging.proto.
 enum RestartReason {
@@ -94,7 +74,7 @@
 // BlessFiles. If the install base, the location the archive is unpacked,
 // already exists, extraction is skipped. Kills the client if an error is
 // encountered.
-ExtractionDurationMillis ExtractData(
+std::optional<DurationMillis> ExtractData(
     const std::string &self_path,
     const std::vector<std::string> &archive_contents,
     const std::string &expected_install_md5,
diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc
index 0bb3795..7e22563 100644
--- a/src/main/cpp/blaze.cc
+++ b/src/main/cpp/blaze.cc
@@ -197,8 +197,8 @@
   explicit BlazeServer(const StartupOptions &startup_options);
 
   // Acquire a lock for the output base this server is running in.
-  // Returns the number of milliseconds spent waiting for the lock.
-  uint64_t AcquireLock();
+  // If we had to wait for the lock, returns the time we spent waiting.
+  std::optional<DurationMillis> AcquireLock();
 
   // Whether there is an active connection to a server.
   bool Connected() const { return client_.get(); }
@@ -215,10 +215,9 @@
       const std::string &command, const std::vector<std::string> &command_args,
       const std::string &invocation_policy,
       const std::vector<RcStartupFlag> &original_startup_options,
-      const LoggingInfo &logging_info,
-      const DurationMillis client_startup_duration,
-      const DurationMillis extract_data_duration,
-      const DurationMillis command_wait_duration_ms);
+      const LoggingInfo &logging_info, DurationMillis client_startup_duration,
+      std::optional<DurationMillis> extract_data_duration,
+      std::optional<DurationMillis> command_wait_duration);
 
   // Disconnects and kills an existing server. Only call this when this object
   // is in connected state.
@@ -275,7 +274,7 @@
 // _exit(2) (attributed with ATTRIBUTE_NORETURN) meaning we have to delete the
 // objects before those.
 
-uint64_t BlazeServer::AcquireLock() {
+std::optional<DurationMillis> BlazeServer::AcquireLock() {
   if (output_base_lock_.has_value()) {
     BAZEL_DIE(blaze_exit_code::INTERNAL_ERROR)
         << "AcquireLock() called but the lock is already held.";
@@ -284,11 +283,11 @@
   // commands may not run against the same output base. Note that this lock will
   // be released by ReleaseLock() once the server is running, as it can handle
   // concurrent clients on its own.
-  uint64_t wait_time;
-  output_base_lock_ = blaze::AcquireLock(
-      "output base", output_base_.GetRelative("lock"), LockMode::kExclusive,
-      batch_, block_for_lock_, &wait_time);
-  return wait_time;
+  auto lock_and_time =
+      blaze::AcquireLock("output base", output_base_.GetRelative("lock"),
+                         LockMode::kExclusive, batch_, block_for_lock_);
+  output_base_lock_ = lock_and_time.first;
+  return lock_and_time.second;
 }
 
 void BlazeServer::ReleaseLock() {
@@ -545,11 +544,12 @@
 }
 
 // Add common command options for logging to the given argument array.
-static void AddLoggingArgs(const LoggingInfo &logging_info,
-                           const DurationMillis client_startup_duration,
-                           const DurationMillis extract_data_duration,
-                           const DurationMillis command_wait_duration_ms,
-                           vector<string> *args) {
+static void AddLoggingArgs(
+    const LoggingInfo &logging_info,
+    const DurationMillis client_startup_duration,
+    const std::optional<DurationMillis> extract_data_duration,
+    const std::optional<DurationMillis> command_wait_duration,
+    vector<string> *args) {
   // The time in ms the launcher spends before sending the request to the blaze
   // server.
   args->push_back("--startup_time=" +
@@ -557,17 +557,18 @@
 
   // The time in ms a command had to wait on a busy Blaze server process.
   // This is part of startup_time.
-  if (command_wait_duration_ms.IsUnknown()) {
+  if (command_wait_duration.has_value()) {
     args->push_back("--command_wait_time=" +
-                    blaze_util::ToString(command_wait_duration_ms.millis));
+                    blaze_util::ToString(command_wait_duration->millis));
   }
 
   // The time in ms spent on extracting the new blaze version.
   // This is part of startup_time.
-  if (extract_data_duration.IsUnknown()) {
+  if (extract_data_duration.has_value()) {
     args->push_back("--extract_data_time=" +
-                    blaze_util::ToString(extract_data_duration.millis));
+                    blaze_util::ToString(extract_data_duration->millis));
   }
+
   if (logging_info.restart_reason != NO_RESTART) {
     args->push_back(string("--restart_reason=") +
                     ReasonString(logging_info.restart_reason));
@@ -651,14 +652,15 @@
     const WorkspaceLayout &workspace_layout, const string &workspace,
     const OptionProcessor &option_processor,
     const StartupOptions &startup_options, LoggingInfo *logging_info,
-    const DurationMillis extract_data_duration,
-    const DurationMillis command_wait_duration_ms, BlazeServer *server) {
+    const std::optional<DurationMillis> extract_data_duration,
+    const std::optional<DurationMillis> command_wait_duration,
+    BlazeServer *server) {
   if (server->Connected()) {
     server->KillRunningServer();
   }
 
-  const DurationMillis client_startup_duration(GetMillisecondsMonotonic() -
-                                               logging_info->start_time_ms);
+  const DurationMillis client_startup_duration(logging_info->start_time_ms,
+                                               GetMillisecondsMonotonic());
 
   BAZEL_LOG(INFO) << "Starting " << startup_options.product_name
                   << " in batch mode.";
@@ -683,7 +685,7 @@
   if (!command.empty()) {
     jvm_args_vector.push_back(command);
     AddLoggingArgs(*logging_info, client_startup_duration,
-                   extract_data_duration, command_wait_duration_ms,
+                   extract_data_duration, command_wait_duration,
                    &jvm_args_vector);
   }
 
@@ -1051,9 +1053,9 @@
     const blaze_util::Path &server_dir, const WorkspaceLayout &workspace_layout,
     const string &workspace, const OptionProcessor &option_processor,
     const StartupOptions &startup_options, LoggingInfo *logging_info,
-    const DurationMillis extract_data_duration,
-    const DurationMillis command_wait_duration_ms, BlazeServer *server,
-    const string &build_label) {
+    const std::optional<DurationMillis> extract_data_duration,
+    const std::optional<DurationMillis> command_wait_duration,
+    BlazeServer *server, const string &build_label) {
   while (true) {
     if (!server->Connected()) {
       StartServerAndConnect(server_exe, server_exe_args, server_dir,
@@ -1095,8 +1097,8 @@
                   << server->ProcessInfo().server_pid_ << ").";
 
   // Wall clock time since process startup.
-  const DurationMillis client_startup_duration =
-      (GetMillisecondsMonotonic() - logging_info->start_time_ms);
+  const DurationMillis client_startup_duration(logging_info->start_time_ms,
+                                               GetMillisecondsMonotonic());
 
   SignalHandler::Get().Install(startup_options.product_name,
                                startup_options.output_base,
@@ -1105,8 +1107,7 @@
       option_processor.GetCommand(), option_processor.GetCommandArguments(),
       startup_options.invocation_policy,
       startup_options.original_startup_options_, *logging_info,
-      client_startup_duration, extract_data_duration,
-      command_wait_duration_ms));
+      client_startup_duration, extract_data_duration, command_wait_duration));
 }
 
 // Parse the options.
@@ -1399,13 +1400,16 @@
                         const string &workspace, LoggingInfo *logging_info) {
   blaze_server = new BlazeServer(startup_options);
 
-  const DurationMillis command_wait_duration_ms(blaze_server->AcquireLock());
-  BAZEL_LOG(INFO) << "Acquired the client lock, waited "
-                  << command_wait_duration_ms.millis << " milliseconds";
+  const std::optional<DurationMillis> command_wait_duration =
+      blaze_server->AcquireLock();
+  const uint64_t wait_ms =
+      command_wait_duration.has_value() ? command_wait_duration->millis : 0;
+  BAZEL_LOG(INFO) << "Acquired the client lock, waited " << wait_ms
+                  << " milliseconds";
 
   WarnFilesystemType(startup_options.output_base);
 
-  const ExtractionDurationMillis extract_data_duration = ExtractData(
+  const std::optional<DurationMillis> extract_data_duration = ExtractData(
       self_path, archive_contents, install_md5, startup_options, logging_info);
 
   blaze_server->Connect();
@@ -1462,14 +1466,14 @@
   } else if (startup_options.batch) {
     RunBatchMode(server_exe, server_exe_args, workspace_layout, workspace,
                  option_processor, startup_options, logging_info,
-                 extract_data_duration, command_wait_duration_ms, blaze_server);
+                 extract_data_duration, command_wait_duration, blaze_server);
   } else {
     string build_label;
     ExtractBuildLabel(self_path, &build_label);
     RunClientServerMode(server_exe, server_exe_args, server_dir,
                         workspace_layout, workspace, option_processor,
                         startup_options, logging_info, extract_data_duration,
-                        command_wait_duration_ms, blaze_server, build_label);
+                        command_wait_duration, blaze_server, build_label);
   }
 }
 
@@ -1838,8 +1842,8 @@
     const vector<RcStartupFlag> &original_startup_options,
     const LoggingInfo &logging_info,
     const DurationMillis client_startup_duration,
-    const DurationMillis extract_data_duration,
-    const DurationMillis command_wait_duration_ms) {
+    const std::optional<DurationMillis> extract_data_duration,
+    const std::optional<DurationMillis> command_wait_duration) {
   assert(Connected());
   assert(process_info_.server_pid_ > 0);
 
@@ -1847,7 +1851,7 @@
   if (!command.empty()) {
     arg_vector.push_back(command);
     AddLoggingArgs(logging_info, client_startup_duration, extract_data_duration,
-                   command_wait_duration_ms, &arg_vector);
+                   command_wait_duration, &arg_vector);
   }
 
   arg_vector.insert(arg_vector.end(), command_args.begin(), command_args.end());
diff --git a/src/main/cpp/blaze_util.h b/src/main/cpp/blaze_util.h
index 78738b0..e8dd9a9 100644
--- a/src/main/cpp/blaze_util.h
+++ b/src/main/cpp/blaze_util.h
@@ -131,6 +131,24 @@
   ~WithEnvVars();
 };
 
+struct DurationMillis {
+ public:
+  const uint64_t millis;
+
+  DurationMillis(const uint64_t start, const uint64_t end)
+      : millis(ComputeDuration(start, end)) {}
+
+ private:
+  static uint64_t ComputeDuration(const uint64_t start, const uint64_t end) {
+    if (end < start) {
+      BAZEL_LOG(WARNING) << "Invalid duration: start=" << start
+                         << ", end=" << end;
+      return 0;
+    }
+    return end - start;
+  }
+};
+
 }  // namespace blaze
 
 #endif  // BAZEL_SRC_MAIN_CPP_BLAZE_UTIL_H_
diff --git a/src/main/cpp/blaze_util_platform.h b/src/main/cpp/blaze_util_platform.h
index 17b8bc9..08dadc5 100644
--- a/src/main/cpp/blaze_util_platform.h
+++ b/src/main/cpp/blaze_util_platform.h
@@ -19,12 +19,13 @@
 
 #include <map>
 #include <memory>
+#include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "src/main/cpp/blaze_util.h"
 #include "src/main/cpp/server_process_info.h"
-#include "src/main/cpp/util/path.h"
 #include "src/main/cpp/util/port.h"
 
 namespace blaze {
@@ -208,12 +209,12 @@
 // Acquires a `mode` lock on `path`, busy-waiting until it becomes available if
 // `block` is true, and releasing it on exec if `batch_mode` is false.
 // Crashes if the lock cannot be acquired. Returns a handle that can be
-// subsequently passed to ReleaseLock. Sets `wait_time` to the number of
-// milliseconds spent waiting for the lock. The `name` argument is used to
-// distinguish it from other locks in human-readable error messages.
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
-                       LockMode mode, bool batch_mode, bool block,
-                       uint64_t* wait_time);
+// subsequently passed to ReleaseLock as well as the time spent waiting for the
+// lock, if any. The `name` argument is used to distinguish it from other locks
+// in human-readable error messages.
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+    const std::string& name, const blaze_util::Path& path, LockMode mode,
+    bool batch_mode, bool block);
 
 // Releases a lock previously obtained from AcquireLock.
 void ReleaseLock(LockHandle lock_handle);
diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc
index 74759b5..b5bc003 100644
--- a/src/main/cpp/blaze_util_posix.cc
+++ b/src/main/cpp/blaze_util_posix.cc
@@ -39,15 +39,19 @@
 #include <cstdlib>
 #include <fstream>
 #include <iterator>
+#include <map>
+#include <optional>
 #include <set>
 #include <string>
+#include <utility>
+#include <vector>
 
 #include "src/main/cpp/blaze_util.h"
 #include "src/main/cpp/blaze_util_platform.h"
 #include "src/main/cpp/startup_options.h"
 #include "src/main/cpp/util/errors.h"
 #include "src/main/cpp/util/exit_code.h"
-#include "src/main/cpp/util/file.h"
+#include "src/main/cpp/util/file_platform.h"
 #include "src/main/cpp/util/logging.h"
 #include "src/main/cpp/util/md5.h"
 #include "src/main/cpp/util/numbers.h"
@@ -632,9 +636,9 @@
   return -1;
 }
 
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
-                       LockMode mode, bool batch_mode, bool block,
-                       uint64_t* wait_time) {
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+    const std::string& name, const blaze_util::Path& path, LockMode mode,
+    bool batch_mode, bool block) {
   int flags = O_CREAT;
   switch (mode) {
     case LockMode::kShared:
@@ -712,7 +716,10 @@
   // avoid unnecessary noise in the logs.  In this metric, we are only
   // interested in knowing how long it took for other commands to complete, not
   // how fast acquiring a lock is.
-  const uint64_t elapsed_time = !multiple_attempts ? 0 : end_time - start_time;
+  const auto elapsed_time =
+      multiple_attempts
+          ? std::make_optional(DurationMillis(start_time, end_time))
+          : std::nullopt;
 
   // If taking an exclusive lock, identify ourselves in the lockfile.
   // The contents are printed for human consumption when another client
@@ -731,8 +738,7 @@
     }
   }
 
-  *wait_time = elapsed_time;
-  return static_cast<LockHandle>(fd);
+  return std::make_pair(static_cast<LockHandle>(fd), elapsed_time);
 }
 
 void ReleaseLock(LockHandle lock_handle) {
diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc
index 94e8f3f..acefea4 100644
--- a/src/main/cpp/blaze_util_windows.cc
+++ b/src/main/cpp/blaze_util_windows.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
+// clang-format on
 
 #include <fcntl.h>
 #include <io.h>
@@ -28,10 +30,12 @@
 #include <cstdlib>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <optional>
 #include <set>
 #include <sstream>
 #include <thread>       // NOLINT (to silence Google-internal linter)
 #include <type_traits>  // static_assert
+#include <utility>
 #include <vector>
 
 #include "src/main/cpp/blaze_util.h"
@@ -1087,9 +1091,9 @@
   return GetMillisecondsAsLargeInt(kFrequency).QuadPart;
 }
 
-LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path,
-                       LockMode mode, bool batch_mode, bool block,
-                       uint64_t* wait_time) {
+std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
+    const std::string& name, const blaze_util::Path& path, LockMode mode,
+    bool batch_mode, bool block) {
   DWORD desired_access = GENERIC_READ;
   if (mode == LockMode::kExclusive) {
     desired_access |= GENERIC_WRITE;
@@ -1164,8 +1168,8 @@
   // a concurrent process can read and display it. On Windows we can't do so
   // because locks are mandatory, thus we cannot read the file concurrently.
 
-  *wait_time = GetMillisecondsMonotonic() - start_time;
-  return reinterpret_cast<LockHandle>(handle);
+  return std::make_pair(reinterpret_cast<LockHandle>(handle),
+                        DurationMillis(start_time, GetMillisecondsMonotonic()));
 }
 
 void ReleaseLock(LockHandle lock_handle) {
diff --git a/src/test/cpp/blaze_archive_test.cc b/src/test/cpp/blaze_archive_test.cc
index b7a58db..15399d9 100644
--- a/src/test/cpp/blaze_archive_test.cc
+++ b/src/test/cpp/blaze_archive_test.cc
@@ -13,8 +13,13 @@
 // limitations under the License.
 #include <stdlib.h>
 
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
 #include <vector>
 
+#include "file/base/filesystem.h"
 #include "file/base/helpers.h"
 #include "file/base/path.h"
 #include "file/util/temp_path.h"
@@ -23,9 +28,14 @@
 #include "src/main/cpp/bazel_startup_options.h"
 #include "googlemock/include/gmock/gmock.h"
 #include "googletest/include/gtest/gtest.h"
-#include "src/main/cpp/blaze.h"
+#include "absl/status/statusor.h"
+#include "absl/time/time.h"
+#include "src/main/cpp/blaze_util.h"
 #include "src/main/cpp/blaze_util_platform.h"
+#include "src/main/cpp/util/exit_code.h"
 #include "src/main/cpp/util/file_platform.h"
+#include "src/main/cpp/workspace_layout.h"
+#include "util/task/status_macros.h"
 
 using ::testing::Gt;
 using ::testing::status::IsOkAndHolds;
@@ -121,11 +131,11 @@
   set_startup_options(startup_options, blaze_path, output_dir);
   LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic());
 
-  ExtractionDurationMillis extraction_time =
+  std::optional<DurationMillis> extraction_time =
       ExtractData(blaze_path, archive_contents, expected_install_md5,
                   startup_options, &logging_info);
 
-  ASSERT_TRUE(extraction_time.archive_extracted);
+  ASSERT_TRUE(extraction_time.has_value());
 
   const std::string foo_path = file::JoinPath(output_dir, "foo");
   const std::string bar_path = file::JoinPath(output_dir, "bar");
@@ -155,15 +165,14 @@
   set_startup_options(startup_options, blaze_path, output_dir);
   LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic());
 
-  ExtractionDurationMillis extraction_time_one =
+  std::optional<DurationMillis> extraction_time_one =
       ExtractData(blaze_path, archive_contents, expected_install_md5,
                   startup_options, &logging_info);
-  ASSERT_TRUE(extraction_time_one.archive_extracted);
+  ASSERT_TRUE(extraction_time_one.has_value());
 
-  ExtractionDurationMillis extraction_time_two =
+  std::optional<DurationMillis> extraction_time_two =
       ExtractData(blaze_path, archive_contents, expected_install_md5,
                   startup_options, &logging_info);
-
-  ASSERT_FALSE(extraction_time_two.archive_extracted);
+  ASSERT_FALSE(extraction_time_two.has_value());
 }
 }  // namespace blaze