Implement auto-retry on infra flake and a global job timeout of 8h.
diff --git a/buildkite/bazelci.py b/buildkite/bazelci.py
index 5fdb173..989b04c 100755
--- a/buildkite/bazelci.py
+++ b/buildkite/bazelci.py
@@ -1521,6 +1521,12 @@
         step["label"] += " (shard %n)"
         step["parallelism"] = shards
 
+    # Enforce a global 8 hour job timeout.
+    step["timeout_in_minutes"] = 8 * 60
+
+    # Retry automatically when an agent is lost (exit status -1); create "retry" if absent.
+    step.setdefault("retry", {})["automatic"] = {"exit_status": -1, "limit": 3}
+
     return step