Add metric for tracking zombie VMs. (#775)
A zombie VM is a GCE instance that has been online for longer than a configurable grace period but is not running a Buildkite agent.
WARNING: The corresponding SQL table has not been created yet; it must be created before a new version of the service is deployed.
diff --git a/metrics/main.go b/metrics/main.go
index b5e1db0..af8478c 100644
--- a/metrics/main.go
+++ b/metrics/main.go
@@ -59,6 +59,11 @@
}
bk := clients.CreateCachedBuildkiteClient(bkAPI, time.Duration(settings.BuildkiteCacheTimeoutMinutes)*time.Minute)
+ computeClient, err := clients.CreateComputeEngineClient()
+ if err != nil {
+ log.Fatalf("Cannot create Compute Engine client: %v", err)
+ }
+
storageClient, err := clients.CreateCloudStorageClient()
if err != nil {
log.Fatalf("Cannot create Cloud Storage client: %v", err)
@@ -123,6 +128,10 @@
workerAvailability := metrics.CreateWorkerAvailability(bk, settings.BuildkiteOrgs...)
srv.AddMetric(workerAvailability, minutes(5), defaultPublisher)
+ // TODO(fweikert): Read gracePeriod from Datastore
+ zombieInstances := metrics.CreateZombieInstances(computeClient, settings.CloudProjects, bk, settings.BuildkiteOrgs, minutes(3))
+ srv.AddMetric(zombieInstances, minutes(5), defaultPublisher)
+
if *testMode {
logInTestMode("Running all jobs exactly once...")
srv.RunJobsOnce()