Add --experimental_repository_cache_urls_as_default_canonical_id to help detect broken repository URLs

This new flag forces a redownload whenever a repository's URLs change. Without it, a broken URL can be masked by an existing repository cache entry with the same hash. An explicitly specified `canonical_id` still takes precedence, exactly as before.

Closes #14128.

Closes #14268.

PiperOrigin-RevId: 420976730
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/BazelRepositoryModule.java b/src/main/java/com/google/devtools/build/lib/bazel/BazelRepositoryModule.java
index 0569527..6ae11fe 100644
--- a/src/main/java/com/google/devtools/build/lib/bazel/BazelRepositoryModule.java
+++ b/src/main/java/com/google/devtools/build/lib/bazel/BazelRepositoryModule.java
@@ -279,6 +279,7 @@
       if (repoOptions.repositoryDownloaderRetries >= 0) {
         downloadManager.setRetries(repoOptions.repositoryDownloaderRetries);
       }
+      downloadManager.setUrlsAsDefaultCanonicalId(repoOptions.urlsAsDefaultCanonicalId);
 
       repositoryCache.setHardlink(repoOptions.useHardlinks);
       if (repoOptions.experimentalScaleTimeouts > 0.0) {
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/RepositoryOptions.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/RepositoryOptions.java
index 5f28b96..6a0c5c6 100644
--- a/src/main/java/com/google/devtools/build/lib/bazel/repository/RepositoryOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/RepositoryOptions.java
@@ -215,6 +215,19 @@
               + " to escalate it to a resolution failure.")
   public CheckDirectDepsMode checkDirectDependencies;
 
+  @Option(
+      name = "experimental_repository_cache_urls_as_default_canonical_id",
+      defaultValue = "false",
+      documentationCategory = OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS,
+      effectTags = {OptionEffectTag.LOADING_AND_ANALYSIS},
+      metadataTags = {OptionMetadataTag.EXPERIMENTAL},
+      help =
+          "If true, use a string derived from the URLs of repository downloads as the canonical_id "
+              + "if not specified. This causes a change in the URLs to result in a redownload even "
+              + "if the cache contains a download with the same hash. This can be used to verify "
+              + "that URL changes don't result in broken repositories being masked by the cache.")
+  public boolean urlsAsDefaultCanonicalId;
+
   /** An enum for specifying different modes for checking direct dependency accuracy. */
   public enum CheckDirectDepsMode {
     OFF, // Don't check direct dependency accuracy.
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/downloader/DownloadManager.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/downloader/DownloadManager.java
index 00aaee6..399872f 100644
--- a/src/main/java/com/google/devtools/build/lib/bazel/repository/downloader/DownloadManager.java
+++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/downloader/DownloadManager.java
@@ -35,6 +35,7 @@
 import java.net.URL;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 import javax.annotation.Nullable;
 
 /**
@@ -51,6 +52,7 @@
   private final Downloader downloader;
   private boolean disableDownload = false;
   private int retries = 0;
+  private boolean urlsAsDefaultCanonicalId;
 
   public DownloadManager(RepositoryCache repositoryCache, Downloader downloader) {
     this.repositoryCache = repositoryCache;
@@ -74,6 +76,10 @@
     this.retries = retries;
   }
 
+  public void setUrlsAsDefaultCanonicalId(boolean urlsAsDefaultCanonicalId) {
+    this.urlsAsDefaultCanonicalId = urlsAsDefaultCanonicalId;
+  }
+
   /**
    * Downloads file to disk and returns path.
    *
@@ -108,6 +114,10 @@
       throw new InterruptedException();
     }
 
+    if (Strings.isNullOrEmpty(canonicalId) && urlsAsDefaultCanonicalId) {
+      canonicalId = originalUrls.stream().map(URL::toExternalForm).collect(Collectors.joining(" "));
+    }
+
     List<URL> rewrittenUrls = originalUrls;
     Map<URI, Map<String, String>> rewrittenAuthHeaders = authHeaders;
 
diff --git a/src/test/shell/bazel/bazel_repository_cache_test.sh b/src/test/shell/bazel/bazel_repository_cache_test.sh
index 11b9eed..28235a0 100755
--- a/src/test/shell/bazel/bazel_repository_cache_test.sh
+++ b/src/test/shell/bazel/bazel_repository_cache_test.sh
@@ -465,4 +465,68 @@
   expect_log "Error downloading"
 }
 
+function test_break_url() {
+  setup_repository
+
+  bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
+    || fail "Expected initial fetch to succeed"
+
+  shutdown_server
+
+  bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
+    || fail "Expected fetch to succeed after the server was shut down"
+
+  # Recreate WORKSPACE with the same sha256 but a different URL path (bleh.broken).
+  rm WORKSPACE
+  cat >> $(create_workspace_with_default_repos WORKSPACE) <<EOF
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = 'endangered',
+    url = 'http://localhost:$nc_port/bleh.broken',
+    sha256 = '$sha256',
+    type = 'zip',
+)
+EOF
+
+  # Without the flag the cache entry still matches by sha256, so the changed URL must not matter.
+  bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
+    || fail "Expected fetch to succeed from the cache despite the changed url"
+}
+
+function test_experimental_repository_cache_urls_as_default_canonical_id() {
+  setup_repository
+
+  bazel fetch --repository_cache="$repo_cache_dir" \
+        --experimental_repository_cache_urls_as_default_canonical_id \
+        //zoo:breeding-program >& $TEST_log \
+    || fail "Expected initial fetch to succeed"
+
+  shutdown_server
+
+  bazel fetch --repository_cache="$repo_cache_dir" \
+        --experimental_repository_cache_urls_as_default_canonical_id \
+        //zoo:breeding-program >& $TEST_log \
+    || fail "Expected fetch with unchanged urls to succeed after the server was shut down"
+
+  # Recreate WORKSPACE with the same sha256 but a different URL path (bleh.broken).
+  rm WORKSPACE
+  cat >> $(create_workspace_with_default_repos WORKSPACE) <<EOF
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = 'endangered',
+    url = 'http://localhost:$nc_port/bleh.broken',
+    sha256 = '$sha256',
+    type = 'zip',
+)
+EOF
+
+  # With the flag, the cache key should depend on the urls, so this fetch is expected to fail.
+  bazel fetch --repository_cache="$repo_cache_dir" \
+        --experimental_repository_cache_urls_as_default_canonical_id \
+        //zoo:breeding-program >& $TEST_log \
+    && fail "expected failure" || :
+}
+
 run_suite "repository cache tests"