Add metric for tracking zombie VMs. (#775)
Zombie VM = a GCE instance that has been online for longer than a grace period, but does not run a Buildkite agent.
WARNING: The corresponding SQL table hasn't been created yet - that's something we need to do before deploying a new version of the service.
diff --git a/metrics/README.md b/metrics/README.md
index 0858e5f..1ccd813 100644
--- a/metrics/README.md
+++ b/metrics/README.md
@@ -20,6 +20,7 @@
CREATE TABLE platform_usage (org VARCHAR(255), pipeline VARCHAR(255), build INT, platform VARCHAR(255), usage_seconds FLOAT, PRIMARY KEY(org, pipeline, build, platform));
CREATE TABLE release_downloads (release_name VARCHAR(255), artifact VARCHAR(255), downloads INT, PRIMARY KEY(release_name, artifact));
CREATE TABLE worker_availability (timestamp DATETIME, org VARCHAR(255), platform VARCHAR(255), idle_count INT, busy_count INT, PRIMARY KEY(timestamp, org, platform));
+CREATE TABLE zombie_instances (timestamp DATETIME, cloud_project VARCHAR(255), zone VARCHAR(255), instance VARCHAR(255), status VARCHAR(255), seconds_online FLOAT, PRIMARY KEY(timestamp, cloud_project, zone, instance));
```
## Service Deployment
diff --git a/metrics/clients/compute_engine.go b/metrics/clients/compute_engine.go
new file mode 100644
index 0000000..2a9e82e
--- /dev/null
+++ b/metrics/clients/compute_engine.go
@@ -0,0 +1,68 @@
+package clients
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ compute "google.golang.org/api/compute/v1"
+)
+
+// ComputeInstance is a trimmed-down description of a single Compute Engine
+// VM, holding only the values this package copies out of the API response.
+type ComputeInstance struct {
+	// Name and Status are taken verbatim from the listed instance. Zone is the
+	// key under which the instance appeared in the aggregated listing, and
+	// Project is the project that was queried.
+	Name, Zone, Project, Status string
+	// CreationTime is the instance's creation timestamp, parsed as RFC 3339.
+	CreationTime time.Time
+}
+
+// ComputeEngineClient wraps the Compute Engine API service that is used to
+// list instances.
+type ComputeEngineClient struct {
+ service *compute.Service
+}
+
+// CreateComputeEngineClient builds a ComputeEngineClient on top of a new
+// Compute Engine service. The service is created with a background context
+// and without any client options; an error from the service constructor is
+// returned unchanged.
+func CreateComputeEngineClient() (*ComputeEngineClient, error) {
+	svc, err := compute.NewService(context.Background())
+	if err != nil {
+		return nil, err
+	}
+	client := &ComputeEngineClient{service: svc}
+	return client, nil
+}
+
+// GetAllInstances returns the instances of all given projects, concatenated
+// in the order in which the projects are passed. The first project that
+// cannot be listed aborts the call, in which case no instances are returned.
+// The result is an empty, non-nil slice when nothing was found.
+func (c *ComputeEngineClient) GetAllInstances(projects []string) ([]*ComputeInstance, error) {
+	collected := []*ComputeInstance{}
+	for i := range projects {
+		found, err := c.getAllInstancesForProject(projects[i])
+		if err != nil {
+			return nil, err
+		}
+		collected = append(collected, found...)
+	}
+	return collected, nil
+}
+
+// getAllInstancesForProject lists every instance of the given project via the
+// aggregated instance listing, requesting further pages until the response no
+// longer carries a next-page token.
+//
+// It returns an error if a listing request fails or if an instance's creation
+// timestamp is not valid RFC 3339; no partial result is returned in that case.
+func (c *ComputeEngineClient) getAllInstancesForProject(project string) ([]*ComputeInstance, error) {
+	instances := make([]*ComputeInstance, 0)
+	request := c.service.Instances.AggregatedList(project)
+	for {
+		response, err := request.Do()
+		if err != nil {
+			return nil, fmt.Errorf("Could not retrieve instances for project '%s': %v", project, err)
+		}
+		// NOTE(review): the map key is stored as the instance's Zone as-is. The
+		// API presumably keys this map by scope names such as "zones/<zone>"
+		// rather than bare zone names - confirm this is the intended value.
+		for zone, instanceList := range response.Items {
+			for _, instance := range instanceList.Instances {
+				// The parse error gets its own name: reporting the outer err here
+				// would always print <nil>, because the request itself succeeded.
+				creationTime, parseErr := time.Parse(time.RFC3339, instance.CreationTimestamp)
+				if parseErr != nil {
+					return nil, fmt.Errorf("Failed to parse creation time for instance %s/%s: %v", zone, instance.Name, parseErr)
+				}
+				instances = append(instances, &ComputeInstance{Name: instance.Name, Zone: zone, Project: project, Status: instance.Status, CreationTime: creationTime})
+			}
+		}
+		if response.NextPageToken == "" {
+			return instances, nil
+		}
+		// The return value is discarded, so this relies on PageToken configuring
+		// the existing call for the next Do().
+		request.PageToken(response.NextPageToken)
+	}
+}
diff --git a/metrics/main.go b/metrics/main.go
index b5e1db0..af8478c 100644
--- a/metrics/main.go
+++ b/metrics/main.go
@@ -59,6 +59,11 @@
}
bk := clients.CreateCachedBuildkiteClient(bkAPI, time.Duration(settings.BuildkiteCacheTimeoutMinutes)*time.Minute)
+ computeClient, err := clients.CreateComputeEngineClient()
+ if err != nil {
+ log.Fatalf("Cannot create Compute Engine client: %v", err)
+ }
+
storageClient, err := clients.CreateCloudStorageClient()
if err != nil {
log.Fatalf("Cannot create Cloud Storage client: %v", err)
@@ -123,6 +128,10 @@
workerAvailability := metrics.CreateWorkerAvailability(bk, settings.BuildkiteOrgs...)
srv.AddMetric(workerAvailability, minutes(5), defaultPublisher)
+ // TODO(fweikert): Read gracePeriod from Datastore
+ zombieInstances := metrics.CreateZombieInstances(computeClient, settings.CloudProjects, bk, settings.BuildkiteOrgs, minutes(3))
+ srv.AddMetric(zombieInstances, minutes(5), defaultPublisher)
+
if *testMode {
logInTestMode("Running all jobs exactly once...")
srv.RunJobsOnce()
diff --git a/metrics/metrics/zombie_instances.go b/metrics/metrics/zombie_instances.go
new file mode 100644
index 0000000..647a663
--- /dev/null
+++ b/metrics/metrics/zombie_instances.go
@@ -0,0 +1,98 @@
+package metrics
+
+import (
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/bazelbuild/continuous-integration/metrics/clients"
+ "github.com/bazelbuild/continuous-integration/metrics/data"
+)
+
+// ciWorkerNamePrefix is the instance name prefix used to select the VMs that
+// are checked for a matching Buildkite agent; other instances are ignored.
+const ciWorkerNamePrefix = "bk-"
+
+// ZombieInstances is a metric that reports CI worker VMs whose name does not
+// match the hostname of any Buildkite agent.
+type ZombieInstances struct {
+ // Compute Engine client and the cloud projects whose instances are inspected.
+ computeClient *clients.ComputeEngineClient
+ cloudProjects []string
+
+ // Buildkite client and the organizations whose agents are looked up.
+ bkClient clients.BuildkiteClient
+ bkOrgs []string
+
+ // gracePeriod is the minimum time since creation before an instance is reported.
+ gracePeriod time.Duration
+ // columns describes the columns of the data set produced by Collect.
+ columns []Column
+}
+
+// Name returns the identifier of this metric, "zombie_instances".
+func (zi *ZombieInstances) Name() string {
+ return "zombie_instances"
+}
+
+// Columns returns the column descriptions that were set when the metric was
+// created.
+func (zi *ZombieInstances) Columns() []Column {
+ return zi.columns
+}
+
+// Collect returns one row per CI worker instance that is considered a zombie:
+// an instance whose name does not match the hostname of any Buildkite agent,
+// whose status is not "STOPPING", and that was created at least gracePeriod
+// ago.
+//
+// Each row holds the collection timestamp, project, zone, instance name,
+// status and the number of seconds since the instance was created. An error
+// is returned if the agents or the instances cannot be fetched, or if a row
+// cannot be added to the data set.
+func (zi *ZombieInstances) Collect() (data.DataSet, error) {
+	agentHostNameIndex, err := zi.getAgentHostNameIndex()
+	if err != nil {
+		return nil, fmt.Errorf("Failed to fetch Buildkite agents: %v", err)
+	}
+
+	instances, err := zi.getInstances()
+	if err != nil {
+		return nil, fmt.Errorf("Failed to fetch GCE instances: %v", err)
+	}
+
+	// Take the time once so that every row of this collection carries the same
+	// timestamp and all instance ages are measured against the same instant.
+	now := time.Now()
+
+	result := data.CreateDataSet(GetColumnNames(zi.columns))
+	for _, instance := range instances {
+		// Only the presence of the key matters here, not the stored value.
+		if _, ok := agentHostNameIndex[instance.Name]; ok {
+			// Agent is up and running
+			continue
+		}
+		// NOTE(review): only STOPPING instances are excluded; other statuses
+		// are reported - confirm that this is intended.
+		if instance.Status == "STOPPING" {
+			continue
+		}
+		// The age is measured from the creation time of the instance.
+		onlineTime := now.Sub(instance.CreationTime)
+		if onlineTime < zi.gracePeriod {
+			// VM was started only very recently
+			continue
+		}
+		if err := result.AddRow(now, instance.Project, instance.Zone, instance.Name, instance.Status, onlineTime.Seconds()); err != nil {
+			return nil, err
+		}
+	}
+	return result, nil
+}
+
+// getInstances fetches the instances of all configured cloud projects and
+// keeps only those whose name starts with ciWorkerNamePrefix. A failure to
+// list the instances is passed on unchanged. The result is an empty, non-nil
+// slice when no instance matches.
+func (zi *ZombieInstances) getInstances() ([]*clients.ComputeInstance, error) {
+	everything, err := zi.computeClient.GetAllInstances(zi.cloudProjects)
+	if err != nil {
+		return nil, err
+	}
+	workers := []*clients.ComputeInstance{}
+	for _, candidate := range everything {
+		if !strings.HasPrefix(candidate.Name, ciWorkerNamePrefix) {
+			continue
+		}
+		workers = append(workers, candidate)
+	}
+	return workers, nil
+}
+
+// getAgentHostNameIndex returns the set of hostnames of all agents in the
+// configured Buildkite organizations, represented as a map keyed by hostname.
+//
+// Every hostname that is present maps to true, so both a plain index lookup
+// and the comma-ok form report membership correctly. Agents without a
+// hostname are skipped. The first failing agent lookup aborts the call and its
+// error is returned unchanged.
+func (zi *ZombieInstances) getAgentHostNameIndex() (map[string]bool, error) {
+	hostNameIndex := make(map[string]bool)
+	for _, org := range zi.bkOrgs {
+		agents, err := zi.bkClient.GetAgents(org)
+		if err != nil {
+			return nil, err
+		}
+		for _, agent := range agents {
+			if agent.Hostname == nil {
+				// Dereferencing a nil hostname would panic, and an agent
+				// without a hostname cannot match any instance name anyway.
+				continue
+			}
+			hostNameIndex[*agent.Hostname] = true
+		}
+	}
+	return hostNameIndex, nil
+}
+
+// CreateZombieInstances builds the zombie instance metric from the given
+// Compute Engine client and cloud projects, Buildkite client and
+// organizations, and the grace period.
+//
+// The boolean in each column description is true exactly for the columns that
+// form the primary key in the statement below (presumably it is a key flag -
+// confirm against the Column definition).
+//
+// CREATE TABLE zombie_instances (timestamp DATETIME, cloud_project VARCHAR(255), zone VARCHAR(255), instance VARCHAR(255), status VARCHAR(255), seconds_online FLOAT, PRIMARY KEY(timestamp, cloud_project, zone, instance));
+func CreateZombieInstances(computeClient *clients.ComputeEngineClient, cloudProjects []string, bkClient clients.BuildkiteClient, bkOrgs []string, gracePeriod time.Duration) *ZombieInstances {
+	columns := []Column{
+		{"timestamp", true},
+		{"cloud_project", true},
+		{"zone", true},
+		{"instance", true},
+		{"status", false},
+		{"seconds_online", false},
+	}
+	return &ZombieInstances{
+		computeClient: computeClient,
+		cloudProjects: cloudProjects,
+		bkClient:      bkClient,
+		bkOrgs:        bkOrgs,
+		gracePeriod:   gracePeriod,
+		columns:       columns,
+	}
+}
diff --git a/metrics/settings.go b/metrics/settings.go
index 9dc98b7..b048159 100644
--- a/metrics/settings.go
+++ b/metrics/settings.go
@@ -24,6 +24,7 @@
CloudSqlInstance string
CloudSqlDatabase string
CloudSqlLocalPort int
+ CloudProjects []string
}
func ReadSettingsFromDatastore(projectID, settingsName string) (*Settings, error) {