BES: make uploader retry attempts configurable

Depends on different Build Event Service setup, there could be different failure modes that may tolerate less or more failures for Build Events uploading.

Allow users to tweak the number without having to use a custom JVM args or shipping a fork of Bazel with these number tweaked.

Closes #16305.

PiperOrigin-RevId: 482303303
Change-Id: I71d9aeaf7527b0ff1a81af069390eedee2c22aa0
diff --git a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceUploader.java b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceUploader.java
index 610c339..af0becd 100644
--- a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceUploader.java
+++ b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceUploader.java
@@ -93,12 +93,6 @@
 public final class BuildEventServiceUploader implements Runnable {
   private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();
 
-  /** Configuration knobs related to RPC retries. Values chosen by good judgement. */
-  private static final int MAX_NUM_RETRIES =
-      Integer.parseInt(System.getProperty("BAZEL_BES_NUM_RETRIES_ON_RPC_FAILURE", "4"));
-
-  private static final int DELAY_MILLIS = 1000;
-
   private final BuildEventServiceClient besClient;
   private final BuildEventArtifactUploader buildEventUploader;
   private final BuildEventServiceProtoUtil besProtoUtil;
@@ -544,7 +538,7 @@
                     BuildProgress.Code.BES_STREAM_NOT_RETRYING_FAILURE,
                     message);
               }
-              if (retryAttempt == MAX_NUM_RETRIES) {
+              if (retryAttempt == buildEventProtocolOptions.besUploadMaxRetries) {
                 String message =
                     String.format(
                         "Not retrying publishBuildEvents, no more attempts left: status='%s'",
@@ -629,7 +623,7 @@
       throws DetailedStatusException, InterruptedException {
     int retryAttempt = 0;
     StatusException cause = null;
-    while (retryAttempt <= MAX_NUM_RETRIES) {
+    while (retryAttempt <= this.buildEventProtocolOptions.besUploadMaxRetries) {
       try {
         besClient.publish(request);
         return;
@@ -656,7 +650,7 @@
     throw withFailureDetail(
         cause,
         BuildProgress.Code.BES_UPLOAD_RETRY_LIMIT_EXCEEDED_FAILURE,
-        "All retry attempts failed.");
+        String.format("All %d retry attempts failed.", retryAttempt - 1));
   }
 
   private void ensureUploadThreadStarted() {
@@ -723,9 +717,12 @@
         && !status.getCode().equals(Code.FAILED_PRECONDITION);
   }
 
-  private static long retrySleepMillis(int attempt) {
+  private long retrySleepMillis(int attempt) {
+    Preconditions.checkArgument(attempt >= 0, "attempt must be nonnegative: %s", attempt);
     // This somewhat matches the backoff used for gRPC connection backoffs.
-    return (long) (DELAY_MILLIS * Math.pow(1.6, attempt));
+    return (long)
+        (this.buildEventProtocolOptions.besUploadRetryInitialDelay.toMillis()
+            * Math.pow(1.6, attempt));
   }
 
   private DetailedStatusException withFailureDetail(
diff --git a/src/main/java/com/google/devtools/build/lib/buildeventstream/BuildEventProtocolOptions.java b/src/main/java/com/google/devtools/build/lib/buildeventstream/BuildEventProtocolOptions.java
index 0333f0c..76c3fa0 100644
--- a/src/main/java/com/google/devtools/build/lib/buildeventstream/BuildEventProtocolOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/buildeventstream/BuildEventProtocolOptions.java
@@ -18,6 +18,7 @@
 import com.google.devtools.common.options.OptionDocumentationCategory;
 import com.google.devtools.common.options.OptionEffectTag;
 import com.google.devtools.common.options.OptionsBase;
+import java.time.Duration;
 
 /** Options used to configure the build event protocol. */
 public class BuildEventProtocolOptions extends OptionsBase {
@@ -34,15 +35,32 @@
   public boolean legacyImportantOutputs;
 
   @Option(
-    name = "experimental_build_event_upload_strategy",
-    defaultValue = "null",
-    documentationCategory = OptionDocumentationCategory.LOGGING,
-    effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
-    help = "Selects how to upload artifacts referenced in the build event protocol."
-  )
+      name = "experimental_build_event_upload_strategy",
+      defaultValue = "null",
+      documentationCategory = OptionDocumentationCategory.LOGGING,
+      effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
+      help = "Selects how to upload artifacts referenced in the build event protocol.")
   public String buildEventUploadStrategy;
 
   @Option(
+      name = "experimental_build_event_upload_max_retries",
+      defaultValue = "4",
+      documentationCategory = OptionDocumentationCategory.LOGGING,
+      effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
+      help = "The maximum number of times Bazel should retry uploading a build event.")
+  public int besUploadMaxRetries;
+
+  @Option(
+      name = "experimental_build_event_upload_retry_minimum_delay",
+      defaultValue = "1s",
+      documentationCategory = OptionDocumentationCategory.LOGGING,
+      effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
+      help =
+          "Initial, minimum delay for exponential backoff retries when BEP upload fails. (exponent:"
+              + " 1.6)")
+  public Duration besUploadRetryInitialDelay;
+
+  @Option(
       name = "experimental_stream_log_file_uploads",
       defaultValue = "false",
       documentationCategory = OptionDocumentationCategory.LOGGING,
diff --git a/src/test/java/com/google/devtools/build/lib/buildeventservice/BazelBuildEventServiceModuleTest.java b/src/test/java/com/google/devtools/build/lib/buildeventservice/BazelBuildEventServiceModuleTest.java
index 8063ae0..50b5433 100644
--- a/src/test/java/com/google/devtools/build/lib/buildeventservice/BazelBuildEventServiceModuleTest.java
+++ b/src/test/java/com/google/devtools/build/lib/buildeventservice/BazelBuildEventServiceModuleTest.java
@@ -233,6 +233,16 @@
   }
 
   @Test
+  public void testRetryCount() throws Exception {
+    runBuildWithOptions(
+        "--bes_backend=does.not.exist:1234", "--experimental_build_event_upload_max_retries=3");
+    afterBuildCommand();
+
+    events.assertContainsError(
+        "The Build Event Protocol upload failed: All 3 retry attempts failed");
+  }
+
+  @Test
   public void testConnectivityFailureDisablesBesStreaming() throws Exception {
     class FailingConnectivityStatusProvider extends BlazeModule
         implements ConnectivityStatusProvider {