Update to infrastructure scripts.
diff --git a/buildkite/create_instances.py b/buildkite/create_instances.py
index 5d28b54..cfaf184 100755
--- a/buildkite/create_instances.py
+++ b/buildkite/create_instances.py
@@ -25,6 +25,16 @@
LOCATION = 'europe-west1-d'
+# Note that the hostnames are parsed and trigger specific behavior for different use cases.
+# The following parts have a special meaning:
+#
+# - "buildkite": This is a normal production VM running the Buildkite agent.
+# - "pipeline": This is a special production VM that only runs pipeline setup scripts.
+# - "testing": This is a shared VM that can be used by project members for experiments.
+# It does not run the Buildkite agent.
+# - "$USER": This is a VM used by one specific engineer for tests. It does not run the Buildkite
+# agent.
+#
INSTANCE_GROUPS = {
'buildkite-ubuntu1404': {
'count': 8,
@@ -52,14 +62,37 @@
'machine_type': 'n1-standard-8',
'persistent_disk': 'buildkite-pipeline-persistent'
},
+ 'testing-ubuntu1404': {
+ 'image_family': 'buildkite-ubuntu1404',
+ 'startup_script': 'startup-ubuntu.sh',
+ 'machine_type': 'n1-standard-32',
+ 'persistent_disk': 'testing-ubuntu1404-persistent'
+ },
+ 'testing-ubuntu1604': {
+ 'image_family': 'buildkite-ubuntu1604',
+ 'startup_script': 'startup-ubuntu.sh',
+ 'machine_type': 'n1-standard-32',
+ 'persistent_disk': 'testing-ubuntu1604-persistent'
+ },
+ 'testing-windows': {
+ 'image_family': 'buildkite-windows',
+ 'machine_type': 'n1-standard-32',
+ 'boot_disk_size': '500GB'
+ },
+ '{}-ubuntu1404'.format(getpass.getuser()): {
+ 'image_family': 'buildkite-ubuntu1404',
+ 'startup_script': 'startup-ubuntu.sh',
+ 'machine_type': 'n1-standard-32',
+ 'local_ssd': 'interface=nvme',
+ },
'{}-ubuntu1604'.format(getpass.getuser()): {
- 'image': 'buildkite-ubuntu1604',
+ 'image_family': 'buildkite-ubuntu1604',
'startup_script': 'startup-ubuntu.sh',
'machine_type': 'n1-standard-32',
'local_ssd': 'interface=nvme',
},
'{}-windows'.format(getpass.getuser()): {
- 'image': 'buildkite-windows',
+ 'image_family': 'buildkite-windows',
'startup_script': 'startup-windows.ps1',
'machine_type': 'n1-standard-32',
'local_ssd': 'interface=scsi',
@@ -83,10 +116,12 @@
def flags_for_instance(image_family, params):
cmd = ['--machine-type', params['machine_type']]
cmd.extend(['--network', 'buildkite'])
- if 'windows' in image_family:
- cmd.extend(['--metadata-from-file', 'windows-startup-script-ps1=' + params['startup_script']])
- else:
- cmd.extend(['--metadata-from-file', 'startup-script=' + params['startup_script']])
+ if 'startup_script' in params:
+ if 'windows' in image_family:
+ cmd.extend(['--metadata-from-file',
+ 'windows-startup-script-ps1=' + params['startup_script']])
+ else:
+ cmd.extend(['--metadata-from-file', 'startup-script=' + params['startup_script']])
cmd.extend(['--min-cpu-platform', 'Intel Skylake'])
cmd.extend(['--boot-disk-type', 'pd-ssd'])
cmd.extend(['--boot-disk-size', params.get('boot_disk_size', '50GB')])
@@ -120,11 +155,18 @@
def delete_instance(instance_name):
- return run(['gcloud', 'compute', 'instances', 'delete', '--quiet', instance_name])
+ cmd = ['gcloud', 'compute', 'instances', 'delete', '--quiet', instance_name]
+ result = run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
+ if result.returncode != 0:
+ # It's not an error if 'delete' failed because the instance didn't exist in the first place.
+ # But we do want to error out on other unexpected errors.
+ if not re.search(r'The resource .* was not found', result.stdout):
+ raise Exception('"gcloud compute instance delete" returned unexpected error:\n{}'.format(result.stdout))
+ return result
def create_instance(instance_name, image_family, params):
- cmd = ['gcloud', 'compute', 'instance', 'create', instance_name]
+ cmd = ['gcloud', 'compute', 'instances', 'create', instance_name]
cmd.extend(['--zone', LOCATION])
cmd.extend(flags_for_instance(image_family, params))
run(cmd)
@@ -208,8 +250,12 @@
})
for instance_name, params in SINGLE_INSTANCES.items():
+ # If the user specified instance (group) names on the command-line, we process only these
+ # instances; otherwise, we process all of them.
if argv and instance_name not in argv:
continue
+ # Do not automatically create user-specific instances. These must be specified explicitly
+ # on the command-line.
if instance_name.startswith(getpass.getuser()) and instance_name not in argv:
continue
WORK_QUEUE.put({
diff --git a/buildkite/install-buildkite-agent.sh b/buildkite/install-buildkite-agent.sh
index d276bd1..0d5d486 100755
--- a/buildkite/install-buildkite-agent.sh
+++ b/buildkite/install-buildkite-agent.sh
@@ -17,13 +17,13 @@
# Deduce the operating system from the hostname and put it into the metadata.
case $(hostname) in
*pipeline*)
- AGENT_TAGS="osname=pipeline,pipeline=true"
+ AGENT_TAGS="os=pipeline,pipeline=true"
;;
*ubuntu1404*)
- AGENT_TAGS="osname=ubuntu1404"
+ AGENT_TAGS="os=ubuntu1404"
;;
*ubuntu1604*)
- AGENT_TAGS="osname=ubuntu1604"
+ AGENT_TAGS="os=ubuntu1604"
;;
default)
echo "Could not deduce operating system from hostname: $(hostname)!"
@@ -55,7 +55,7 @@
# Write the Buildkite agent configuration.
cat > /etc/buildkite-agent/buildkite-agent.cfg <<EOF
token="xxx"
-name="%hostname"
+name="%hostname-%n"
tags="${AGENT_TAGS}"
tags-from-gcp=true
build-path="/var/lib/buildkite-agent/builds"
diff --git a/buildkite/setup-ubuntu.sh b/buildkite/setup-ubuntu.sh
index a07b1a4..8ba7d3c 100755
--- a/buildkite/setup-ubuntu.sh
+++ b/buildkite/setup-ubuntu.sh
@@ -60,5 +60,8 @@
# Dependencies for TensorFlow.
libcurl3-dev
swig
+
+ # Infra stuff.
+ lvm2
)
apt-get -qqy install "${packages[@]}" > /dev/null
diff --git a/buildkite/setup-windows-manual.ps1 b/buildkite/setup-windows-manual.ps1
index 87ba0c6..9f204c2 100755
--- a/buildkite/setup-windows-manual.ps1
+++ b/buildkite/setup-windows-manual.ps1
@@ -16,6 +16,8 @@
}
[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "Machine")
[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "Machine")
+[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "User")
+[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "User")
$env:TEMP = [Environment]::GetEnvironmentVariable("TEMP", "Machine")
$env:TMP = [Environment]::GetEnvironmentVariable("TMP", "Machine")
@@ -250,7 +252,7 @@
## Create a service wrapper script for the Buildkite agent.
Write-Host "Creating Buildkite agent environment hook..."
$buildkite_environment_hook = @"
-SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/`$BUILDKITE_JOB_ID
+SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/%BUILDKITE_JOB_ID%
SET BUILDKITE_GS_ACL=publicRead
SET JAVA_HOME=${env:JAVA_HOME}
SET PATH=${env:PATH}
@@ -325,6 +327,8 @@
nssm set "buildkite-monitor" "AppStdout" "c:\buildkite\logs\buildkite-monitor.log"
nssm set "buildkite-monitor" "AppStderr" "c:\buildkite\logs\buildkite-monitor.log"
nssm set "buildkite-monitor" "AppRotateFiles" "1"
+nssm set "buildkite-monitor" "AppRotateSeconds" 86400
+nssm set "buildkite-monitor" "AppRotateBytes" 1048576
Write-Host "Creating Buildkite Agent service..."
nssm install "buildkite-agent" `
@@ -338,6 +342,8 @@
nssm set "buildkite-agent" "AppStdout" "c:\buildkite\logs\buildkite-agent.log"
nssm set "buildkite-agent" "AppStderr" "c:\buildkite\logs\buildkite-agent.log"
nssm set "buildkite-agent" "AppRotateFiles" "1"
+nssm set "buildkite-agent" "AppRotateSeconds" 86400
+nssm set "buildkite-agent" "AppRotateBytes" 1048576
Write-Host "All done, adding GCESysprep to RunOnce and rebooting..."
Set-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\RunOnce" -Name "GCESysprep" -Value "c:\Program Files\Google\Compute Engine\sysprep\gcesysprep.bat"
diff --git a/buildkite/setup-windows.ps1 b/buildkite/setup-windows.ps1
index b7ee08f..7dbc465 100755
--- a/buildkite/setup-windows.ps1
+++ b/buildkite/setup-windows.ps1
@@ -15,6 +15,8 @@
}
[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "Machine")
[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "Machine")
+[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "User")
+[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "User")
$env:TEMP = [Environment]::GetEnvironmentVariable("TEMP", "Machine")
$env:TMP = [Environment]::GetEnvironmentVariable("TMP", "Machine")
@@ -249,7 +251,7 @@
## Create a service wrapper script for the Buildkite agent.
Write-Host "Creating Buildkite agent environment hook..."
$buildkite_environment_hook = @"
-SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/`$BUILDKITE_JOB_ID
+SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/%BUILDKITE_JOB_ID%
SET BUILDKITE_GS_ACL=publicRead
SET JAVA_HOME=${env:JAVA_HOME}
SET PATH=${env:PATH}
diff --git a/buildkite/startup-ubuntu.sh b/buildkite/startup-ubuntu.sh
index 8f4aa8a..cf4fe75 100755
--- a/buildkite/startup-ubuntu.sh
+++ b/buildkite/startup-ubuntu.sh
@@ -16,20 +16,43 @@
set -euxo pipefail
-# Use a local SSD if available, otherwise use a RAM disk for our builds.
-# if [ -e /dev/nvme0n1 ]; then
-# mkfs.ext4 -F /dev/nvme0n1
-# # TODO(philwo) add 'discard' option again, when b/68062163 is fixed.
-# mount -o defaults,nobarrier /dev/nvme0n1 /var/lib/buildkite-agent
-# chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent
-# chmod 0755 /var/lib/buildkite-agent
-# mkdir /var/lib/buildkite-agent/docker
-# chown root:root /var/lib/buildkite-agent/docker
-# chmod 0711 /var/lib/buildkite-agent/docker
-# fi
+# If available: Use a persistent disk as a use-case specific data volume.
+if [[ -e /dev/sdb ]]; then
+ if [[ ! -e /dev/vg0 ]]; then
+ pvcreate /dev/sdb
+ vgcreate vg0 /dev/sdb
+ fi
-# Use the local SSD as swap space.
-if [ -e /dev/nvme0n1 ]; then
+ if [[ $(hostname) == *testing* ]]; then
+ # On "testing" machines, we create big /var/lib/docker and /home directories so that everyone
+ # has enough space to try out stuff.
+ if [[ ! -e /dev/vg0/docker ]]; then
+ lvcreate -n docker -l25%FREE vg0
+ mkfs.ext4 /dev/vg0/docker
+ fi
+ mount /dev/vg0/docker /var/lib/docker
+ chmod 0711 /var/lib/docker
+
+ if [[ ! -e /dev/vg0/home ]]; then
+ lvcreate -n home -l100%FREE vg0
+ mkfs.ext4 /dev/vg0/home
+ fi
+ mount /dev/vg0/home /home
+ chmod 0755 /home
+ elif [[ $(hostname) == *pipeline* ]]; then
+ # On "pipeline" machines, we create a big /var/lib/buildkite-agent directory, because these
+ # machines check out a lot of different Git repositories.
+ if [[ ! -e /dev/vg0/buildkite-agent ]]; then
+ lvcreate -n buildkite-agent -l100%FREE vg0
+ mkfs.ext4 /dev/vg0/buildkite-agent
+ fi
+ mount /dev/vg0/buildkite-agent /var/lib/buildkite-agent
+ chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent
+ fi
+fi
+
+# If available: Use the local SSD as swap space.
+if [[ -e /dev/nvme0n1 ]]; then
mkswap -f /dev/nvme0n1
swapon /dev/nvme0n1
@@ -39,7 +62,7 @@
mount -t tmpfs -o mode=0755,uid=buildkite-agent,gid=buildkite-agent,size=$((100 * 1024 * 1024 * 1024)) tmpfs /var/lib/buildkite-agent
fi
-# Start Docker.
+# Start Docker if it's installed.
if [[ $(docker --version 2>/dev/null) ]]; then
if [[ $(systemctl --version 2>/dev/null) ]]; then
systemctl start docker
@@ -48,17 +71,22 @@
fi
fi
-# Get the Buildkite Token from GCS and decrypt it using KMS.
-BUILDKITE_TOKEN=$(curl -sS "https://storage.googleapis.com/bazel-encrypted-secrets/buildkite-agent-token.enc" | \
- gcloud kms decrypt --location global --keyring buildkite --key buildkite-agent-token --ciphertext-file - --plaintext-file -)
-
-# Insert the Buildkite Token into the agent configuration.
-sed -i "s/token=\"xxx\"/token=\"${BUILDKITE_TOKEN}\"/" /etc/buildkite-agent/buildkite-agent.cfg
-
# Only start the Buildkite Agent if this is a worker node (as opposed to a VM
# being used by someone for testing / development).
if [[ $(hostname) == buildkite* ]]; then
- if [[ -e /bin/systemctl ]]; then
+ # Get the Buildkite Token from GCS and decrypt it using KMS.
+ BUILDKITE_TOKEN=$(curl -sS "https://storage.googleapis.com/bazel-encrypted-secrets/buildkite-agent-token.enc" | \
+ gcloud kms decrypt --location global --keyring buildkite --key buildkite-agent-token --ciphertext-file - --plaintext-file -)
+
+ # Insert the Buildkite Token into the agent configuration.
+ sed -i "s/token=\"xxx\"/token=\"${BUILDKITE_TOKEN}\"/" /etc/buildkite-agent/buildkite-agent.cfg
+
+ if [[ $(hostname) == *pipeline* ]]; then
+ # Start 8 instances of the Buildkite agent.
+ for i in $(seq 8); do
+ systemctl start buildkite-agent@$i
+ done
+ elif [[ -e /bin/systemctl ]]; then
systemctl start buildkite-agent
else
service buildkite-agent start