Implement automatic retry when an agent is lost (usually an infra flake) and enforce a global job timeout of 8 hours.
diff --git a/buildkite/bazelci.py b/buildkite/bazelci.py
index 5fdb173..989b04c 100755
--- a/buildkite/bazelci.py
+++ b/buildkite/bazelci.py
@@ -1521,6 +1521,12 @@
step["label"] += " (shard %n)"
step["parallelism"] = shards
+ # Enforce a global 8 hour job timeout.
+ step["timeout_in_minutes"] = 8 * 60
+
+ # Automatically retry when an agent got lost (usually due to an infra flake).
+ step["retry"]["automatic"] = {"exit_status": -1, "limit": 3}
+
return step