Improve Buildkite agent imaging scripts.
Add support for a 'trusted' worker that we'll run release steps on.
diff --git a/buildkite/create_images.py b/buildkite/create_images.py
index 96f88e3..44e9431 100755
--- a/buildkite/create_images.py
+++ b/buildkite/create_images.py
@@ -36,18 +36,16 @@
# Find the newest FreeBSD 11 image via:
# gcloud compute images list --project freebsd-org-cloud-dev \
# --no-standard-images
- # 'buildkite-freebsd11': {
+ # ('buildkite-freebsd11',): {
# 'source_image': 'https://www.googleapis.com/compute/v1/projects/freebsd-org-cloud-dev/global/images/freebsd-11-1-stable-amd64-2017-12-28',
- # 'target_image_family': 'bazel-freebsd11',
# 'scripts': [
# 'setup-freebsd.sh',
# 'install-buildkite-agent.sh'
# ]
# },
- 'buildkite-ubuntu1404': {
+ ('buildkite-ubuntu1404',): {
'source_image_project': 'ubuntu-os-cloud',
'source_image_family': 'ubuntu-1404-lts',
- 'target_image_family': 'buildkite-ubuntu1404',
'scripts': [
'shell-utils.sh',
'setup-ubuntu.sh',
@@ -64,10 +62,9 @@
'https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx'
]
},
- 'buildkite-ubuntu1604': {
+ ('buildkite-ubuntu1604', 'buildkite-trusted-ubuntu1604', 'buildkite-pipeline-ubuntu1604'): {
'source_image_project': 'ubuntu-os-cloud',
'source_image_family': 'ubuntu-1604-lts',
- 'target_image_family': 'buildkite-ubuntu1604',
'scripts': [
'shell-utils.sh',
'setup-ubuntu.sh',
@@ -84,43 +81,9 @@
'https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx'
]
},
- 'philwo-ubuntu1604': {
- 'source_image_project': 'ubuntu-os-cloud',
- 'source_image_family': 'ubuntu-1604-lts',
- 'target_image_family': 'philwo-ubuntu1604',
- 'scripts': [
- 'shell-utils.sh',
- 'setup-ubuntu.sh',
- 'install-azul-zulu.sh',
- 'install-bazel.sh',
- 'install-buildkite-agent.sh',
- 'install-docker.sh',
- 'install-nodejs.sh',
- 'install-python36.sh',
- 'install-android-sdk.sh',
- 'shutdown.sh'
- ],
- 'licenses': [
- 'https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx'
- ]
- },
- 'buildkite-pipeline-ubuntu1604': {
- 'source_image_project': 'ubuntu-os-cloud',
- 'source_image_family': 'ubuntu-1604-lts',
- 'target_image_family': 'buildkite-pipeline-ubuntu1604',
- 'scripts': [
- 'shell-utils.sh',
- 'setup-ubuntu.sh',
- 'install-azul-zulu.sh',
- 'install-buildkite-agent.sh',
- 'install-python36.sh',
- 'shutdown.sh'
- ]
- },
- 'buildkite-windows': {
+ ('buildkite-windows',): {
'source_image_project': 'windows-cloud',
'source_image_family': 'windows-1709-core',
- 'target_image_family': 'buildkite-windows',
'scripts': [
'setup-windows-manual.ps1'
]
@@ -163,7 +126,8 @@
'image-family': params['source_image_family']
}
- gcloud.create_instance(instance_name,
+ gcloud.create_instance(
+ instance_name,
zone=LOCATION,
machine_type='n1-standard-8',
network='buildkite',
@@ -228,8 +192,9 @@
gcloud_utils.wait_for_instance(instance_name, zone=LOCATION, status='TERMINATED')
# Create a new image from our VM.
- gcloud.create_image(instance_name,
- family=params['target_image_family'],
+ gcloud.create_image(
+ instance_name,
+ family=name,
source_disk=instance_name,
source_disk_zone=LOCATION,
licenses=params.get('licenses', []))
@@ -253,13 +218,14 @@
argv = sys.argv[1:]
# Put VM creation instructions into the work queue.
- for name, params in IMAGE_CREATION_VMS.items():
- if argv and name not in argv:
- continue
- WORK_QUEUE.put({
- 'name': name,
- 'params': params
- })
+ for names, params in IMAGE_CREATION_VMS.items():
+ for name in names:
+ if argv and name not in argv:
+ continue
+ WORK_QUEUE.put({
+ 'name': name,
+ 'params': params
+ })
# Spawn worker threads that will create the VMs.
threads = []
diff --git a/buildkite/create_instances.py b/buildkite/create_instances.py
index 3caf5a3..72c8e6a 100755
--- a/buildkite/create_instances.py
+++ b/buildkite/create_instances.py
@@ -62,6 +62,14 @@
'local_ssd': 'interface=nvme',
'metadata_from_file': 'startup-script=startup-ubuntu.sh',
},
+ 'buildkite-trusted-ubuntu1604': {
+ 'count': 1,
+ 'image_family': 'buildkite-trusted-ubuntu1604',
+ 'local_ssd': 'interface=nvme',
+ 'machine_type': 'n1-standard-8',
+ 'metadata_from_file': 'startup-script=startup-ubuntu.sh',
+ 'service_account': 'bazel-release-process@bazel-public.iam.gserviceaccount.com',
+ },
'buildkite-windows': {
'count': 4,
'image_family': 'buildkite-windows',
@@ -87,10 +95,6 @@
'metadata_from_file': 'startup-script=startup-ubuntu.sh',
'persistent_disk': 'name={0},device-name={0},mode=rw,boot=no'.format('testing-ubuntu1604-persistent'),
},
- 'philwo-ubuntu1604': {
- 'image_family': 'buildkite-ubuntu1604',
- 'metadata_from_file': 'startup-script=startup-ubuntu.sh',
- },
'testing-windows': {
'boot_disk_size': '500GB',
'image_family': 'buildkite-windows',
@@ -117,7 +121,7 @@
if gcloud.delete_instance_group(instance_group_name, zone=LOCATION).returncode == 0:
print('Deleted existing instance group: {}'.format(instance_group_name))
- if gcloud.delete_instance_template(template_name, zone=LOCATION).returncode == 0:
+ if gcloud.delete_instance_template(template_name).returncode == 0:
print('Deleted existing VM template: {}'.format(template_name))
gcloud.create_instance_template(template_name, **kwargs)
diff --git a/buildkite/gcloud_utils.py b/buildkite/gcloud_utils.py
index 3d7f41d..e946b9e 100644
--- a/buildkite/gcloud_utils.py
+++ b/buildkite/gcloud_utils.py
@@ -74,6 +74,9 @@
try:
result = gcloud.get_serial_port_output(instance_name, zone=zone, start=next_start)
except subprocess.CalledProcessError as e:
+ if e.stderr and 'Could not fetch serial port output: TIMEOUT' in e.stderr:  # stderr is None when it was not captured
+ gcloud.debug('tail_serial_console: Retrying after TIMEOUT')
+ continue
gcloud.debug('tail_serial_console: Done, because got exception: {}'.format(e))
if e.stdout:
gcloud.debug('stdout: ' + e.stdout)
diff --git a/buildkite/install-buildkite-agent.sh b/buildkite/install-buildkite-agent.sh
index d743878..be404bb 100755
--- a/buildkite/install-buildkite-agent.sh
+++ b/buildkite/install-buildkite-agent.sh
@@ -14,43 +14,45 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# Deduce the operating system from the hostname and put it into the metadata.
+###
+### Install the Buildkite agent.
+###
+
case $(hostname) in
- *pipeline*)
- AGENT_TAGS="os=pipeline,pipeline=true"
+ *ubuntu*)
+ apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 \
+ --recv-keys 32A37959C2FA5C3C99EFBC32A79206696452D198 &> /dev/null
+ add-apt-repository -y "deb https://apt.buildkite.com/buildkite-agent unstable main"
+ apt-get -qqy update > /dev/null
+ apt-get -qqy install buildkite-agent > /dev/null
;;
- *ubuntu1404*)
- AGENT_TAGS="os=ubuntu1404"
- ;;
- *ubuntu1604*)
- AGENT_TAGS="os=ubuntu1604"
- ;;
- default)
- echo "Could not deduce operating system from hostname: $(hostname)!"
+ *)
+ echo "Don't know how to install the Buildkite agent on this host: $(hostname)!"
exit 1
esac
-if [[ $(hostname) == *ubuntu* ]]; then
- apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 \
- --recv-keys 32A37959C2FA5C3C99EFBC32A79206696452D198 &> /dev/null
- add-apt-repository -y "deb https://apt.buildkite.com/buildkite-agent unstable main"
- apt-get -qqy update > /dev/null
- apt-get -qqy install buildkite-agent > /dev/null
-fi
+###
+### /etc/buildkite-agent/buildkite-agent.cfg
+###
-# Add the Buildkite agent hooks.
-cat > /etc/buildkite-agent/hooks/environment <<'EOF'
-#!/bin/bash
-
-set -eu
-
-export ANDROID_HOME="/opt/android-sdk-linux"
-export ANDROID_NDK_HOME="/opt/android-ndk-r15c"
-export BUILDKITE_ARTIFACT_UPLOAD_DESTINATION="gs://bazel-buildkite-artifacts/$BUILDKITE_JOB_ID"
-export BUILDKITE_GS_ACL="publicRead"
-EOF
-chmod 0500 /etc/buildkite-agent/hooks/*
-chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent
+# Deduce the operating system from the hostname and put it into the metadata.
+case $(hostname) in
+ *pipeline*)
+ AGENT_TAGS="kind=pipeline,os=pipeline,pipeline=true"
+ ;;
+ *trusted*)
+ AGENT_TAGS="kind=trusted,os=trusted"
+ ;;
+ *ubuntu1404*)
+ AGENT_TAGS="kind=worker,os=ubuntu1404"
+ ;;
+ *ubuntu1604*)
+ AGENT_TAGS="kind=worker,os=ubuntu1604"
+ ;;
+ *)
+ echo "Could not deduce operating system from hostname: $(hostname)!"
+ exit 1
+esac
# Write the Buildkite agent configuration.
cat > /etc/buildkite-agent/buildkite-agent.cfg <<EOF
@@ -62,23 +64,89 @@
hooks-path="/etc/buildkite-agent/hooks"
plugins-path="/etc/buildkite-agent/plugins"
EOF
+
+# Stop the agent after each job on stateless worker machines.
if [[ $(hostname) != *pipeline* ]]; then
- # Stop the agent after each job on stateless worker machines.
cat >> /etc/buildkite-agent/buildkite-agent.cfg <<EOF
disconnect-after-job=true
disconnect-after-job-timeout=86400
EOF
fi
-chmod 0400 /etc/buildkite-agent/buildkite-agent.cfg
-chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent
+
+###
+### /etc/buildkite-agent/hooks/environment
+###
+
+# Add the Buildkite agent hooks.
+cat > /etc/buildkite-agent/hooks/environment <<'EOF'
+#!/bin/bash
+
+set -euo pipefail
+
+export ANDROID_HOME="/opt/android-sdk-linux"
+echo "Android SDK is at $ANDROID_HOME"
+
+export ANDROID_NDK_HOME="$(echo /opt/android-ndk-*)"  # Expand the glob here; a quoted "*" would be exported literally.
+echo "Android NDK is at $ANDROID_NDK_HOME"
+
+export BUILDKITE_ARTIFACT_UPLOAD_DESTINATION="gs://bazel-buildkite-artifacts/$BUILDKITE_JOB_ID"
+export BUILDKITE_GS_ACL="publicRead"
+EOF
+
+# The trusted worker machine may only execute certain whitelisted builds.
+if [[ $(hostname) == *trusted* ]]; then
+ cat >> /etc/buildkite-agent/hooks/environment <<'EOF'
+case ${BUILDKITE_BUILD_CREATOR_EMAIL} in
+ *@google.com)
+ ;;
+ *)
+ echo "Build creator not allowed: ${BUILDKITE_BUILD_CREATOR_EMAIL}"
+ exit 1
+esac
+
+case ${BUILDKITE_REPO} in
+ https://github.com/bazelbuild/bazel.git|\
+ https://github.com/bazelbuild/continuous-integration.git)
+ ;;
+ *)
+ echo "Repository not allowed: ${BUILDKITE_REPO}"
+ exit 1
+esac
+
+case ${BUILDKITE_ORGANIZATION_SLUG} in
+ bazel) # The only Buildkite organization allowed to run on the trusted worker.
+ ;;
+ *)
+ echo "Organization not allowed: ${BUILDKITE_ORGANIZATION_SLUG}"
+ exit 1
+esac
+
+case ${BUILDKITE_PIPELINE_SLUG} in
+ google-bazel-presubmit-metrics|\
+ release)
+ ;;
+ *)
+ echo "Pipeline not allowed: ${BUILDKITE_PIPELINE_SLUG}"
+ exit 1
+esac
+
+BUILDKITE_API_TOKEN=$(gsutil cat "gs://bazel-encrypted-secrets/buildkite-api-token.enc" | \
+ gcloud kms decrypt --location "global" --keyring "buildkite" --key "buildkite-api-token" --plaintext-file "-" --ciphertext-file "-")
+export BUILDKITE_API_TOKEN  # Exported separately so a failed download/decrypt aborts the hook under "set -e".
+EOF
+fi
+
+###
+### Service configuration.
+###
# Some notes about our service config:
#
# - All Buildkite agents except the pipeline agent are stateless and need a special service config
-# that kills remaining processes and deletes temporary files.
+# that kills remaining processes and deletes temporary files.
#
# - We set the service to not launch automatically, as the startup script will start it once it is
-# done with setting up the local SSD and writing the agent configuration.
+# done with setting up the local SSD and writing the agent configuration.
if [[ $(hostname) == *pipeline* ]]; then
# This is a pipeline worker machine.
systemctl disable buildkite-agent
diff --git a/buildkite/startup-ubuntu.sh b/buildkite/startup-ubuntu.sh
index 5900cf2..aecaea5 100755
--- a/buildkite/startup-ubuntu.sh
+++ b/buildkite/startup-ubuntu.sh
@@ -78,12 +78,18 @@
# being used by someone for testing / development).
if [[ $(hostname) == buildkite* ]]; then
# Get the Buildkite Token from GCS and decrypt it using KMS.
- BUILDKITE_TOKEN=$(curl -sS "https://storage.googleapis.com/bazel-encrypted-secrets/buildkite-agent-token.enc" | \
+ BUILDKITE_TOKEN=$(gsutil cat "gs://bazel-encrypted-secrets/buildkite-agent-token.enc" | \
gcloud kms decrypt --location global --keyring buildkite --key buildkite-agent-token --ciphertext-file - --plaintext-file -)
# Insert the Buildkite Token into the agent configuration.
sed -i "s/token=\"xxx\"/token=\"${BUILDKITE_TOKEN}\"/" /etc/buildkite-agent/buildkite-agent.cfg
+ # Fix permissions of the Buildkite agent configuration files and hooks.
+ chmod 0400 /etc/buildkite-agent/buildkite-agent.cfg
+ chmod 0500 /etc/buildkite-agent/hooks/*
+ chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent
+
+ # Start the Buildkite agent service.
if [[ $(hostname) == *pipeline* ]]; then
# Start 8 instances of the Buildkite agent.
for i in $(seq 8); do