Add metric for tracking zombie VMs. (#775)

Zombie VM = a GCE instance that has been online for a certain amount of time, but does not run a Buildkite agent.

WARNING: The corresponding SQL table hasn't been created yet - that's something we need to do before deploying a new version of the service.
diff --git a/metrics/README.md b/metrics/README.md
index 0858e5f..1ccd813 100644
--- a/metrics/README.md
+++ b/metrics/README.md
@@ -20,6 +20,7 @@
 CREATE TABLE platform_usage (org VARCHAR(255), pipeline VARCHAR(255), build INT, platform VARCHAR(255), usage_seconds FLOAT, PRIMARY KEY(org, pipeline, build, platform));
 CREATE TABLE release_downloads (release_name VARCHAR(255), artifact VARCHAR(255), downloads INT, PRIMARY KEY(release_name, artifact));
 CREATE TABLE worker_availability (timestamp DATETIME, org VARCHAR(255), platform VARCHAR(255), idle_count INT, busy_count INT, PRIMARY KEY(timestamp, org, platform));
+CREATE TABLE zombie_instances (timestamp DATETIME, cloud_project VARCHAR(255), zone VARCHAR(255), instance VARCHAR(255), status VARCHAR(255), seconds_online FLOAT, PRIMARY KEY(timestamp, cloud_project, zone, instance));
 ```
 
 ## Service Deployment
diff --git a/metrics/clients/compute_engine.go b/metrics/clients/compute_engine.go
new file mode 100644
index 0000000..2a9e82e
--- /dev/null
+++ b/metrics/clients/compute_engine.go
@@ -0,0 +1,68 @@
+package clients
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	compute "google.golang.org/api/compute/v1"
+)
+
+type ComputeInstance struct {
+	Name         string
+	Zone         string
+	Project      string
+	Status       string
+	CreationTime time.Time
+}
+
+type ComputeEngineClient struct {
+	service *compute.Service
+}
+
+func CreateComputeEngineClient() (*ComputeEngineClient, error) {
+	ctx := context.Background()
+	service, err := compute.NewService(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return &ComputeEngineClient{service: service}, nil
+}
+
+func (c *ComputeEngineClient) GetAllInstances(projects []string) ([]*ComputeInstance, error) {
+	allInstances := make([]*ComputeInstance, 0)
+	for _, project := range projects {
+		projectInstances, err := c.getAllInstancesForProject(project)
+		if err != nil {
+			return nil, err
+		}
+		allInstances = append(allInstances, projectInstances...)
+	}
+	return allInstances, nil
+}
+
+func (c *ComputeEngineClient) getAllInstancesForProject(project string) ([]*ComputeInstance, error) {
+	instances := make([]*ComputeInstance, 0)
+	request := c.service.Instances.AggregatedList(project)
+	for {
+		response, err := request.Do()
+		if err != nil {
+			return nil, fmt.Errorf("Could not retrieve instances for project '%s': %v", project, err)
+		}
+		for zone, instanceList := range response.Items {
+			for _, instance := range instanceList.Instances {
+				creationTime, error := time.Parse(time.RFC3339, instance.CreationTimestamp)
+				if error != nil {
+					return nil, fmt.Errorf("Failed to parse creation time for instance %s/%s: %v", zone, instance.Name, err)
+				}
+				instances = append(instances, &ComputeInstance{Name: instance.Name, Zone: zone, Project: project, Status: instance.Status, CreationTime: creationTime})
+			}
+		}
+		if response.NextPageToken == "" {
+			break
+		} else {
+			request.PageToken(response.NextPageToken)
+		}
+	}
+	return instances, nil
+}
diff --git a/metrics/main.go b/metrics/main.go
index b5e1db0..af8478c 100644
--- a/metrics/main.go
+++ b/metrics/main.go
@@ -59,6 +59,11 @@
 	}
 	bk := clients.CreateCachedBuildkiteClient(bkAPI, time.Duration(settings.BuildkiteCacheTimeoutMinutes)*time.Minute)
 
+	computeClient, err := clients.CreateComputeEngineClient()
+	if err != nil {
+		log.Fatalf("Cannot create Compute Engine client: %v", err)
+	}
+
 	storageClient, err := clients.CreateCloudStorageClient()
 	if err != nil {
 		log.Fatalf("Cannot create Cloud Storage client: %v", err)
@@ -123,6 +128,10 @@
 	workerAvailability := metrics.CreateWorkerAvailability(bk, settings.BuildkiteOrgs...)
 	srv.AddMetric(workerAvailability, minutes(5), defaultPublisher)
 
+	// TODO(fweikert): Read gracePeriod from Datastore
+	zombieInstances := metrics.CreateZombieInstances(computeClient, settings.CloudProjects, bk, settings.BuildkiteOrgs, minutes(3))
+	srv.AddMetric(zombieInstances, minutes(5), defaultPublisher)
+
 	if *testMode {
 		logInTestMode("Running all jobs exactly once...")
 		srv.RunJobsOnce()
diff --git a/metrics/metrics/zombie_instances.go b/metrics/metrics/zombie_instances.go
new file mode 100644
index 0000000..647a663
--- /dev/null
+++ b/metrics/metrics/zombie_instances.go
@@ -0,0 +1,98 @@
+package metrics
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/bazelbuild/continuous-integration/metrics/clients"
+	"github.com/bazelbuild/continuous-integration/metrics/data"
+)
+
+const ciWorkerNamePrefix = "bk-"
+
+type ZombieInstances struct {
+	computeClient *clients.ComputeEngineClient
+	cloudProjects []string
+
+	bkClient clients.BuildkiteClient
+	bkOrgs   []string
+
+	gracePeriod time.Duration
+	columns     []Column
+}
+
+func (zi *ZombieInstances) Name() string {
+	return "zombie_instances"
+}
+
+func (zi *ZombieInstances) Columns() []Column {
+	return zi.columns
+}
+
+func (zi *ZombieInstances) Collect() (data.DataSet, error) {
+	agentHostNameIndex, err := zi.getAgentHostNameIndex()
+	if err != nil {
+		return nil, fmt.Errorf("Failed to fetch Buildkite agents: %v", err)
+	}
+
+	instances, err := zi.getInstances()
+	if err != nil {
+		return nil, fmt.Errorf("Failed to fetch GCE instances: %v", err)
+	}
+
+	result := data.CreateDataSet(GetColumnNames(zi.columns))
+	for _, instance := range instances {
+		if _, ok := agentHostNameIndex[instance.Name]; ok {
+			// Agent is up and running
+			continue
+		}
+		if instance.Status == "STOPPING" {
+			continue
+		}
+		onlineTime := time.Since(instance.CreationTime)
+		if onlineTime < zi.gracePeriod {
+			// VM was started only very recently
+			continue
+		}
+		err = result.AddRow(time.Now(), instance.Project, instance.Zone, instance.Name, instance.Status, onlineTime.Seconds())
+		if err != nil {
+			return nil, err
+		}
+	}
+	return result, nil
+}
+
+func (zi *ZombieInstances) getInstances() ([]*clients.ComputeInstance, error) {
+	ciInstances := make([]*clients.ComputeInstance, 0)
+	allInstances, err := zi.computeClient.GetAllInstances(zi.cloudProjects)
+	if err != nil {
+		return nil, err
+	}
+	for _, instance := range allInstances {
+		if strings.HasPrefix(instance.Name, ciWorkerNamePrefix) {
+			ciInstances = append(ciInstances, instance)
+		}
+	}
+	return ciInstances, nil
+}
+
+func (zi *ZombieInstances) getAgentHostNameIndex() (map[string]bool, error) {
+	hostNameIndex := make(map[string]bool)
+	for _, org := range zi.bkOrgs {
+		agents, err := zi.bkClient.GetAgents(org)
+		if err != nil {
+			return nil, err
+		}
+		for _, agent := range agents {
+			hostNameIndex[*agent.Hostname] = false
+		}
+	}
+	return hostNameIndex, nil
+}
+
+// CREATE TABLE zombie_instances (timestamp DATETIME, cloud_project VARCHAR(255), zone VARCHAR(255), instance VARCHAR(255), status VARCHAR(255), seconds_online FLOAT, PRIMARY KEY(timestamp, cloud_project, zone, instance));
+func CreateZombieInstances(computeClient *clients.ComputeEngineClient, cloudProjects []string, bkClient clients.BuildkiteClient, bkOrgs []string, gracePeriod time.Duration) *ZombieInstances {
+	columns := []Column{Column{"timestamp", true}, Column{"cloud_project", true}, Column{"zone", true}, Column{"instance", true}, Column{"status", false}, Column{"seconds_online", false}}
+	return &ZombieInstances{computeClient: computeClient, cloudProjects: cloudProjects, bkClient: bkClient, bkOrgs: bkOrgs, columns: columns, gracePeriod: gracePeriod}
+}
diff --git a/metrics/settings.go b/metrics/settings.go
index 9dc98b7..b048159 100644
--- a/metrics/settings.go
+++ b/metrics/settings.go
@@ -24,6 +24,7 @@
 	CloudSqlInstance             string
 	CloudSqlDatabase             string
 	CloudSqlLocalPort            int
+	CloudProjects                []string
 }
 
 func ReadSettingsFromDatastore(projectID, settingsName string) (*Settings, error) {