Print top N slowest targets... (#1989)
...when --print_shard_summary is set.
This change adds a new annotation that lists the 20 slowest
tests (or test shards).
Example:
https://buildkite.com/bazel/bazel-bazel-macos-ninja/builds/1939#annotation-slowest_targets
(Note: The latest version of the code drops the "in 1 out of 1 run(s)"
part.)
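
For reference, the annotation body is assembled roughly like this (a
minimal sketch with made-up sample entries; the real tuples come from
TestShard.get_metrics):

    # Hypothetical perf_stats entries: (total test time in millis, markdown description).
    perf_stats = [
        (93000, "//foo:slow_test (shard 2/4) on macOS: PASSED in 1m 33s"),
        (455000, "//bar:big_test on Ubuntu: FLAKY in 1 out of 2 runs in 7m 35s"),
        (12000, "//baz:quick_test on Windows: PASSED in 12s"),
    ]
    # Sort descending by total time and render a numbered markdown list
    # for the `buildkite-agent annotate` call.
    slowest = sorted(perf_stats, key=lambda x: -x[0])
    message = "\n".join(
        f"{rank}. {desc}" for rank, (_, desc) in enumerate(slowest[:20], 1)
    )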
diff --git a/buildkite/bazelci.py b/buildkite/bazelci.py
index e73ff82..4603ce9 100755
--- a/buildkite/bazelci.py
+++ b/buildkite/bazelci.py
@@ -657,6 +657,7 @@
_TEST_BEP_FILE = "test_bep.json"
_SHARD_RE = re.compile(r"(.+) \(shard (\d+)\)")
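+# Number of slowest test (shard) executions listed in the Buildkite annotation.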
+_SLOWEST_N_TARGETS = 20
class BuildkiteException(Exception):
@@ -1439,7 +1440,9 @@
                project=project,
            )
        except Exception as ex:
-            print_collapsed_group(":rotating_light: Test analytics disabled due to an error :warning:")
+            print_collapsed_group(
+                ":rotating_light: Test analytics disabled due to an error :warning:"
+            )
            eprint(ex)

    test_bep_file = os.path.join(tmpdir, _TEST_BEP_FILE)
@@ -3833,8 +3836,9 @@
    tmpdir = tempfile.mkdtemp()
    try:
        print_collapsed_group("Fetching test artifacts...")
-        all_test_artifacts = get_artifacts_for_failing_tests()
+        all_test_artifacts = get_test_artifacts()
        print_collapsed_group("Downloading & parsing BEP files...")
+        perf_stats = []
        for base_task, current_test_artifacts in all_test_artifacts.items():
            failures = []
            for test_artifact in current_test_artifacts:
@@ -3847,8 +3851,13 @@
                    continue

                for test_execution in parse_bep(local_bep_path):
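+                    # Record timing for every shard so the slowest targets can be ranked later.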
+                    for shard in test_execution.shards:
+                        perf_stats.append(shard.get_metrics(test_execution.label, test_artifact))
+
                    if test_execution.overall_status == "PASSED":
                        continue
+                    if len(test_execution.shards) == 1:
+                        continue

                    failures.append(
                        test_execution.Format(test_artifact.job_id, test_artifact.is_windows)
@@ -3866,6 +3875,22 @@
f"{base_task}",
]
)
+
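+        # Each perf_stats entry is a (total_millis, description) tuple; rank them slowest-first.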
+        if perf_stats:
+            slowest = sorted(perf_stats, key=lambda x: -x[0])
+            message = "\n".join(
+                f"{rank}. {tpl[1]}" for rank, tpl in enumerate(slowest[:_SLOWEST_N_TARGETS], 1)
+            )
+            execute_command(
+                [
+                    "buildkite-agent",
+                    "annotate",
+                    "--style=warning",
+                    f"**Slowest {_SLOWEST_N_TARGETS} targets**\n\n{message}",
+                    "--context",
+                    "slowest_targets",
+                ]
+            )
    except Exception as ex:
        eprint(f"Failed to print shard summary: {ex}")
    finally:
@@ -3885,7 +3910,7 @@
    return path.replace("/", ESCAPED_BACKSLASH) if is_windows else path


-def get_artifacts_for_failing_tests():
+def get_test_artifacts():
    org_slug = os.getenv("BUILDKITE_ORGANIZATION_SLUG")
    pipeline_slug = os.getenv("BUILDKITE_PIPELINE_SLUG")
    build_number = os.getenv("BUILDKITE_BUILD_NUMBER")
@@ -3895,22 +3920,23 @@
    paths = collections.defaultdict(list)
    for job in build_info["jobs"]:
-        if job.get("state") in (None, "passed"):
+        state = job.get("state")
+        if not state:  # Wait steps etc.
            continue

+        job_name = job.get("name", "")
        # This is a bit hacky, but saves us one API request per job (to check for BUILDKITE_PARALLEL_JOB)
-        match = _SHARD_RE.search(job.get("name", ""))
-        if not match:
-            continue
+        match = _SHARD_RE.search(job_name)
+        base_task = match.group(1) if match else job_name

        relative_bep_path, relative_log_paths = get_test_file_paths(job["id"])
-        # TODO: show build failures in the annotation, too?
        if not relative_bep_path:
            continue

-        base_task = match.group(1)
        ta = TestArtifacts(
            job_id=job["id"],
+            job_name=job_name,
+            job_url=job.get("web_url"),
            relative_bep_path=relative_bep_path,
            relative_log_paths=relative_log_paths,
        )
@@ -3920,9 +3946,10 @@
class TestArtifacts:
-
-    def __init__(self, job_id, relative_bep_path, relative_log_paths) -> None:
+    def __init__(self, job_id, job_name, job_url, relative_bep_path, relative_log_paths) -> None:
        self.job_id = job_id
+        self.job_name = job_name
+        self.job_url = job_url
        self.relative_bep_path = relative_bep_path
        self.relative_log_paths = relative_log_paths
@@ -4011,8 +4038,9 @@
class TestShard:
-    def __init__(self, number, attempts) -> None:
+    def __init__(self, number, total_shards, attempts) -> None:
        self.number = number
+        self.total_shards = total_shards
        self.attempts = attempts

    def _get_detailed_overall_status(self):
@@ -4032,9 +4060,19 @@
    def get_details(self):
        overall, bad_runs, total_runs = self._get_detailed_overall_status()
        qualifier = "" if not bad_runs else f"{bad_runs} out of "
-        time = f" over {format_millis(self.attempt_millis)}" if self.attempt_millis else ""
+        runs = f" in {qualifier}{total_runs} runs" if total_runs > 1 else ""
+        time = f" in {format_millis(self.attempt_millis)}" if self.attempt_millis else ""
        cause = f" because of {self.root_cause}" if self.root_cause else ""
-        return overall, (f"in {qualifier}{total_runs} run(s){time}{cause}")
+        return overall, f"{runs}{time}{cause}"
+
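+    # Returns a (total_millis, markdown summary) tuple consumed by print_shard_summary.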
+    def get_metrics(self, label, test_artifact):
+        total_time = sum(self.attempt_millis)
+        shard_info = "" if self.total_shards == 1 else f" (shard {self.number}/{self.total_shards})"
+        overall, stats = self.get_details()
+        return (
+            total_time,
+            f"{label}{shard_info} on {test_artifact.job_name}: {format_test_status(overall)} {stats} ([log]({test_artifact.job_url}))",
+        )

    @property
    def overall_status(self):
@@ -4131,7 +4169,9 @@
    tests = []
    for test, attempts_per_shard in data.items():
        shards = [
-            TestShard(number=shard, attempts=attempts_per_shard[shard])
+            TestShard(
+                number=shard, total_shards=len(data[test]), attempts=attempts_per_shard[shard]
+            )
            for shard in sorted(attempts_per_shard.keys())
        ]
        tests.append(TestExecution(label=test, shards=shards))
@@ -4149,8 +4189,6 @@
        if test_result:
            if "testResult" not in data:
-                # No testResult field means "aborted" -> NO_STATUS
-                # TODO: show these targets in the UI?
                continue

            yield (