Mirror attestations during BCR post-submit. (#2185)

Progress towards
https://github.com/bazelbuild/continuous-integration/issues/2177
diff --git a/buildkite/bazel-central-registry/bcr_postsubmit.py b/buildkite/bazel-central-registry/bcr_postsubmit.py
index 570a02d..dd852b9 100755
--- a/buildkite/bazel-central-registry/bcr_postsubmit.py
+++ b/buildkite/bazel-central-registry/bcr_postsubmit.py
@@ -23,25 +23,156 @@
   - Sync the bazel_registry.json and modules/ directory in the main branch of the BCR to https://bcr.bazel.build
 """
 
+import base64
+import hashlib
+import json
+import os
+import requests
 import subprocess
 import sys
+import tempfile
 
 BCR_BUCKET = "gs://bcr.bazel.build/"
+ATTESTATION_METADATA_FILE = "attestations.json"  # basename of the metadata files searched for in the git diff
+FILES_WITH_ATTESTATIONS = ("source.json", "MODULE.bazel")  # keys looked up under "attestations" in that file
+
+# Basename of the file that contains the most recent commit
+# that passed through the post-submit pipeline successfully.
+LAST_GREEN_FILE = "last_green.txt"
+
+
+class AttestationError(Exception):
+    """Raised when an attestation cannot be looked up, downloaded, verified or written."""
 
 def print_expanded_group(name):
     print("\n\n+++ {0}\n\n".format(name))
 
+def get_output(command):
+    """Run `command` and return its stdout decoded as UTF-8.
+
+    Raises subprocess.CalledProcessError if the command exits with a non-zero status.
+    """
+    return subprocess.run(command, encoding="utf-8", stdout=subprocess.PIPE, check=True).stdout
+
+def check_and_write_new_attestations():
+    """Check and mirror the attestations of every changed attestations.json file."""
+    print_expanded_group(":cop::copybara: Check & write attestations")
+    changed_metadata_files = get_new_attestations_json_paths()
+    if not changed_metadata_files:
+        # TODO: turn this into an error
+        print(f"No {ATTESTATION_METADATA_FILE} files were changed.")
+    # An empty list makes this loop a no-op.
+    for metadata_path in changed_metadata_files:
+        check_and_write_module_attestations(metadata_path)
+
+
+def get_new_attestations_json_paths():
+    """Return the paths of the attestations.json files changed by the commit(s) being processed.
+
+    Names reported by git are joined onto the current working directory.
+    """
+    repo_root = os.getcwd()
+    # last_green should be the parent commit, but infra failures can make the
+    # pipeline skip commits; diffing against last_green picks those up as well.
+    base = get_last_green()
+    revisions = [base, get_commit()] if base else [get_commit()]
+    diff_cmd = ["git", "diff-tree", "--no-commit-id", "--name-only", "-r"] + revisions
+    changed = get_output(diff_cmd).split("\n")
+    return [os.path.join(repo_root, p) for p in changed if p.endswith(f"/{ATTESTATION_METADATA_FILE}")]
+
+
+def get_last_green():
+    """Return the commit recorded in last_green.txt, or "" if the file cannot be fetched.
+
+    Surrounding whitespace is stripped so the value can be passed to git as a revision.
+    """
+    bucket_url = BCR_BUCKET.replace("gs://", "https://storage.googleapis.com/").rstrip("/")
+    with requests.get(f"{bucket_url}/{LAST_GREEN_FILE}") as response:
+        # Any non-200 status is treated as "no last green commit".
+        return response.content.decode("utf-8").strip() if response.status_code == 200 else ""
+
+
+def get_commit():
+    return os.environ.get("BUILDKITE_COMMIT")  # None when BUILDKITE_COMMIT is unset
+
+
+def check_and_write_module_attestations(attestations_json_path):
+    """Verify and store the attestations that the given attestations.json lists for each attested file."""
+    print(f"Checking {attestations_json_path}...")
+    with open(attestations_json_path, "rb") as metadata_file:
+        metadata = json.load(metadata_file)
+    module_dir = os.path.dirname(attestations_json_path)
+    for name in FILES_WITH_ATTESTATIONS:
+        # Wrap every failure so the error names both the metadata file and the entry.
+        try:
+            info = metadata["attestations"][name]
+            check_and_write_single_attestation(info["url"], info["integrity"], module_dir)
+        except Exception as ex:
+            raise AttestationError(f"{attestations_json_path} - {name}: {ex}") from ex
+    print("Done!")
+
+def check_and_write_single_attestation(url, integrity, dest_dir):
+    """Download the attestation at `url`, check it against `integrity` and save it in `dest_dir`."""
+    print(f"\tFound attestation @ {url}")
+    with requests.get(url) as response:
+        if response.status_code != 200:
+            raise AttestationError(f"{url}: HTTP {response.status_code}")
+        payload = response.content
+    # Verify first so that content failing the check is never written.
+    check_integrity(payload, integrity)
+    print("\t\tIntegrity: OK")
+    # Save under the canonical name, not the possibly prefixed release basename.
+    target = os.path.join(dest_dir, get_canonical_basename(url))
+    print(f"\t\tWriting attestation to {target}...")
+    with open(target, "wb") as out:
+        out.write(payload)
+
+def check_integrity(data, expected):
+    """Raise AttestationError unless `data` hashes to the SRI value `expected` ("<algorithm>-<base64>")."""
+    algorithm, _, _ = expected.partition("-")
+    if algorithm not in {"sha224", "sha256", "sha384", "sha512"}:  # not an assert: those vanish under -O
+        raise AttestationError("Unsupported SRI algorithm")
+    digest = getattr(hashlib, algorithm)(data).digest()
+    actual = f"{algorithm}-{base64.b64encode(digest).decode()}"
+    if actual != expected:
+        raise AttestationError(f"Expected checksum {expected}, got {actual}.")
+
+# Attestation files in GitHub releases may have prefixes in their basename
+# to avoid conflicts when multiple modules are released together
+# (e.g. rules_python and rules_python_gazelle_plugin).
+# In this case we need to get the canonical basename.
+def get_canonical_basename(url):
+    """Return "<file>.intoto.jsonl" for the first attested file named in the URL's basename."""
+    release_asset = os.path.basename(url)
+    matches = [name for name in FILES_WITH_ATTESTATIONS if name in release_asset]
+    if not matches:
+        raise AttestationError(f"Invalid basename of {url}.")
+    return f"{matches[0]}.intoto.jsonl"
+
+
 def sync_bcr_content():
     print_expanded_group(":gcloud: Sync BCR content")
     subprocess.check_output(
         ["gsutil", "-h", "Cache-Control:no-cache", "cp", "./bazel_registry.json", BCR_BUCKET]
     )
     subprocess.check_output(
-        ["gsutil", "-h", "Cache-Control:no-cache", "-m", "rsync", "-d", "-r", "./modules", BCR_BUCKET + "modules"]
+        ["gsutil", "-h", "Cache-Control:no-cache", "-m", "rsync", "-r", "./modules", BCR_BUCKET + "modules"]
     )
 
+
+def update_last_green():
+    """Upload the current commit to the bucket as the new last_green.txt."""
+    with tempfile.TemporaryDirectory() as tmp_dir:  # removed on exit, unlike mkdtemp()
+        path = os.path.join(tmp_dir, LAST_GREEN_FILE)
+        with open(path, "wt") as f:
+            f.write(get_commit())
+        subprocess.check_output(["gsutil", "cp", path, BCR_BUCKET + LAST_GREEN_FILE])
+
+
 def main():
+    check_and_write_new_attestations()
     sync_bcr_content()
+    update_last_green()
     return 0
 
 if __name__ == "__main__":