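Update infrastructure scripts: add shared "testing" VMs, LVM-backed persistent data volumes, multiple agents on pipeline VMs, and Windows log rotation.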
diff --git a/buildkite/create_instances.py b/buildkite/create_instances.py
index 5d28b54..cfaf184 100755
--- a/buildkite/create_instances.py
+++ b/buildkite/create_instances.py
@@ -25,6 +25,16 @@
 
 LOCATION = 'europe-west1-d'
 
+# Note that the hostnames are parsed and trigger specific behavior for different use cases.
+# The following parts have a special meaning:
+#
+# - "buildkite": This is a normal production VM running the Buildkite agent.
+# - "pipeline": This is a special production VM that only runs pipeline setup scripts.
+# - "testing": This is a shared VM that can be used by project members for experiments.
+#              It does not run the Buildkite agent.
+# - "$USER": This is a VM used by one specific engineer for tests. It does not run the Buildkite
+#            agent.
+#
 INSTANCE_GROUPS = {
     'buildkite-ubuntu1404': {
         'count': 8,
@@ -52,14 +62,37 @@
         'machine_type': 'n1-standard-8',
         'persistent_disk': 'buildkite-pipeline-persistent'
     },
+    'testing-ubuntu1404': {
+        'image_family': 'buildkite-ubuntu1404',
+        'startup_script': 'startup-ubuntu.sh',
+        'machine_type': 'n1-standard-32',
+        'persistent_disk': 'testing-ubuntu1404-persistent'
+    },
+    'testing-ubuntu1604': {
+        'image_family': 'buildkite-ubuntu1604',
+        'startup_script': 'startup-ubuntu.sh',
+        'machine_type': 'n1-standard-32',
+        'persistent_disk': 'testing-ubuntu1604-persistent'
+    },
+    'testing-windows': {
+        'image_family': 'buildkite-windows',
+        'machine_type': 'n1-standard-32',
+        'boot_disk_size': '500GB'
+    },
+    '{}-ubuntu1404'.format(getpass.getuser()): {
+        'image_family': 'buildkite-ubuntu1404',
+        'startup_script': 'startup-ubuntu.sh',
+        'machine_type': 'n1-standard-32',
+        'local_ssd': 'interface=nvme',
+    },
     '{}-ubuntu1604'.format(getpass.getuser()): {
-        'image': 'buildkite-ubuntu1604',
+        'image_family': 'buildkite-ubuntu1604',
         'startup_script': 'startup-ubuntu.sh',
         'machine_type': 'n1-standard-32',
         'local_ssd': 'interface=nvme',
     },
     '{}-windows'.format(getpass.getuser()): {
-        'image': 'buildkite-windows',
+        'image_family': 'buildkite-windows',
         'startup_script': 'startup-windows.ps1',
         'machine_type': 'n1-standard-32',
         'local_ssd': 'interface=scsi',
@@ -83,10 +116,12 @@
 def flags_for_instance(image_family, params):
     cmd = ['--machine-type', params['machine_type']]
     cmd.extend(['--network', 'buildkite'])
-    if 'windows' in image_family:
-        cmd.extend(['--metadata-from-file', 'windows-startup-script-ps1=' + params['startup_script']])
-    else:
-        cmd.extend(['--metadata-from-file', 'startup-script=' + params['startup_script']])
+    if 'startup_script' in params:
+        if 'windows' in image_family:
+            cmd.extend(['--metadata-from-file',
+                        'windows-startup-script-ps1=' + params['startup_script']])
+        else:
+            cmd.extend(['--metadata-from-file', 'startup-script=' + params['startup_script']])
     cmd.extend(['--min-cpu-platform', 'Intel Skylake'])
     cmd.extend(['--boot-disk-type', 'pd-ssd'])
     cmd.extend(['--boot-disk-size', params.get('boot_disk_size', '50GB')])
@@ -120,11 +155,18 @@
 
 
 def delete_instance(instance_name):
-    return run(['gcloud', 'compute', 'instances', 'delete', '--quiet', instance_name])
+    cmd = ['gcloud', 'compute', 'instances', 'delete', '--quiet', instance_name]
+    result = run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
+    if result.returncode != 0:
+        # It's not an error if 'delete' failed because the instance didn't exist in the first place.
+        # But we do want to error out on any other, unexpected failure (stderr is merged into stdout).
+        if not re.search(r'The resource .* was not found', result.stdout):
+            raise Exception('"gcloud compute instances delete" returned unexpected error:\n{}'.format(result.stdout))
+    return result
 
 
 def create_instance(instance_name, image_family, params):
-    cmd = ['gcloud', 'compute', 'instance', 'create', instance_name]
+    cmd = ['gcloud', 'compute', 'instances', 'create', instance_name]
     cmd.extend(['--zone', LOCATION])
     cmd.extend(flags_for_instance(image_family, params))
     run(cmd)
@@ -208,8 +250,12 @@
         })
 
     for instance_name, params in SINGLE_INSTANCES.items():
+        # If the user specified instance (group) names on the command-line, we process only these
+        # instances, otherwise we process all (except user-specific ones, which are opt-in).
         if argv and instance_name not in argv:
             continue
+        # Do not automatically create user-specific instances. These must be specified explicitly
+        # on the command-line.
         if instance_name.startswith(getpass.getuser()) and instance_name not in argv:
             continue
         WORK_QUEUE.put({
diff --git a/buildkite/install-buildkite-agent.sh b/buildkite/install-buildkite-agent.sh
index d276bd1..0d5d486 100755
--- a/buildkite/install-buildkite-agent.sh
+++ b/buildkite/install-buildkite-agent.sh
@@ -17,13 +17,13 @@
 # Deduce the operating system from the hostname and put it into the metadata.
 case $(hostname) in
   *pipeline*)
-    AGENT_TAGS="osname=pipeline,pipeline=true"
+    AGENT_TAGS="os=pipeline,pipeline=true"
     ;;
   *ubuntu1404*)
-    AGENT_TAGS="osname=ubuntu1404"
+    AGENT_TAGS="os=ubuntu1404"
     ;;
   *ubuntu1604*)
-    AGENT_TAGS="osname=ubuntu1604"
+    AGENT_TAGS="os=ubuntu1604"
     ;;
   default)
     echo "Could not deduce operating system from hostname: $(hostname)!"
@@ -55,7 +55,7 @@
 # Write the Buildkite agent configuration.
 cat > /etc/buildkite-agent/buildkite-agent.cfg <<EOF
 token="xxx"
-name="%hostname"
+name="%hostname-%n"
 tags="${AGENT_TAGS}"
 tags-from-gcp=true
 build-path="/var/lib/buildkite-agent/builds"
diff --git a/buildkite/setup-ubuntu.sh b/buildkite/setup-ubuntu.sh
index a07b1a4..8ba7d3c 100755
--- a/buildkite/setup-ubuntu.sh
+++ b/buildkite/setup-ubuntu.sh
@@ -60,5 +60,8 @@
   # Dependencies for TensorFlow.
   libcurl3-dev
   swig
+
+  # Infra stuff.
+  lvm2
 )
 apt-get -qqy install "${packages[@]}" > /dev/null
diff --git a/buildkite/setup-windows-manual.ps1 b/buildkite/setup-windows-manual.ps1
index 87ba0c6..9f204c2 100755
--- a/buildkite/setup-windows-manual.ps1
+++ b/buildkite/setup-windows-manual.ps1
@@ -16,6 +16,8 @@
 }

 [Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "Machine")

 [Environment]::SetEnvironmentVariable("TMP", "C:\temp", "Machine")

+[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "User")

+[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "User")

 $env:TEMP = [Environment]::GetEnvironmentVariable("TEMP", "Machine")

 $env:TMP = [Environment]::GetEnvironmentVariable("TMP", "Machine")

 

@@ -250,7 +252,7 @@
 ## Create a service wrapper script for the Buildkite agent.

 Write-Host "Creating Buildkite agent environment hook..."

 $buildkite_environment_hook = @"

-SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/`$BUILDKITE_JOB_ID

+SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/%BUILDKITE_JOB_ID%

 SET BUILDKITE_GS_ACL=publicRead

 SET JAVA_HOME=${env:JAVA_HOME}

 SET PATH=${env:PATH}

@@ -325,6 +327,8 @@
 nssm set "buildkite-monitor" "AppStdout" "c:\buildkite\logs\buildkite-monitor.log"

 nssm set "buildkite-monitor" "AppStderr" "c:\buildkite\logs\buildkite-monitor.log"

 nssm set "buildkite-monitor" "AppRotateFiles" "1"

+nssm set "buildkite-monitor" "AppRotateSeconds" 86400

+nssm set "buildkite-monitor" "AppRotateBytes" 1048576

 

 Write-Host "Creating Buildkite Agent service..."

 nssm install "buildkite-agent" `

@@ -338,6 +342,8 @@
 nssm set "buildkite-agent" "AppStdout" "c:\buildkite\logs\buildkite-agent.log"

 nssm set "buildkite-agent" "AppStderr" "c:\buildkite\logs\buildkite-agent.log"

 nssm set "buildkite-agent" "AppRotateFiles" "1"

+nssm set "buildkite-agent" "AppRotateSeconds" 86400

+nssm set "buildkite-agent" "AppRotateBytes" 1048576

 

 Write-Host "All done, adding GCESysprep to RunOnce and rebooting..."

 Set-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\RunOnce" -Name "GCESysprep" -Value "c:\Program Files\Google\Compute Engine\sysprep\gcesysprep.bat"

diff --git a/buildkite/setup-windows.ps1 b/buildkite/setup-windows.ps1
index b7ee08f..7dbc465 100755
--- a/buildkite/setup-windows.ps1
+++ b/buildkite/setup-windows.ps1
@@ -15,6 +15,8 @@
 }

 [Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "Machine")

 [Environment]::SetEnvironmentVariable("TMP", "C:\temp", "Machine")

+[Environment]::SetEnvironmentVariable("TEMP", "C:\temp", "User")

+[Environment]::SetEnvironmentVariable("TMP", "C:\temp", "User")

 $env:TEMP = [Environment]::GetEnvironmentVariable("TEMP", "Machine")

 $env:TMP = [Environment]::GetEnvironmentVariable("TMP", "Machine")

 

@@ -249,7 +251,7 @@
 ## Create a service wrapper script for the Buildkite agent.

 Write-Host "Creating Buildkite agent environment hook..."

 $buildkite_environment_hook = @"

-SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/`$BUILDKITE_JOB_ID

+SET BUILDKITE_ARTIFACT_UPLOAD_DESTINATION=gs://bazel-buildkite-artifacts/%BUILDKITE_JOB_ID%

 SET BUILDKITE_GS_ACL=publicRead

 SET JAVA_HOME=${env:JAVA_HOME}

 SET PATH=${env:PATH}

diff --git a/buildkite/startup-ubuntu.sh b/buildkite/startup-ubuntu.sh
index 8f4aa8a..cf4fe75 100755
--- a/buildkite/startup-ubuntu.sh
+++ b/buildkite/startup-ubuntu.sh
@@ -16,20 +16,43 @@
 
 set -euxo pipefail
 
-# Use a local SSD if available, otherwise use a RAM disk for our builds.
-# if [ -e /dev/nvme0n1 ]; then
-#   mkfs.ext4 -F /dev/nvme0n1
-#   # TODO(philwo) add 'discard' option again, when b/68062163 is fixed.
-#   mount -o defaults,nobarrier /dev/nvme0n1 /var/lib/buildkite-agent
-#   chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent
-#   chmod 0755 /var/lib/buildkite-agent
-#   mkdir /var/lib/buildkite-agent/docker
-#   chown root:root /var/lib/buildkite-agent/docker
-#   chmod 0711 /var/lib/buildkite-agent/docker
-# fi
+# If available: Use a persistent disk as a use-case specific data volume.
+if [[ -e /dev/sdb ]]; then
+  if [[ ! -e /dev/vg0 ]]; then
+    pvcreate /dev/sdb
+    vgcreate vg0 /dev/sdb
+  fi
 
-# Use the local SSD as swap space.
-if [ -e /dev/nvme0n1 ]; then
+  if [[ $(hostname) == *testing* ]]; then
+    # On "testing" machines, we create big /var/lib/docker and /home directories so that everyone
+    # has enough space to try out stuff.
+    if [[ ! -e /dev/vg0/docker ]]; then
+      lvcreate -n docker -l25%FREE vg0
+      mkfs.ext4 /dev/vg0/docker
+    fi
+    mount /dev/vg0/docker /var/lib/docker
+    chmod 0711 /var/lib/docker
+
+    if [[ ! -e /dev/vg0/home ]]; then
+      lvcreate -n home -l100%FREE vg0
+      mkfs.ext4 /dev/vg0/home
+    fi
+    mount /dev/vg0/home /home
+    chmod 0755 /home
+  elif [[ $(hostname) == *pipeline* ]]; then
+    # On "pipeline" machines, we create a big /var/lib/buildkite-agent directory, because these
+    # machines check out a lot of different Git repositories.
+    if [[ ! -e /dev/vg0/buildkite-agent ]]; then
+      lvcreate -n buildkite-agent -l100%FREE vg0
+      mkfs.ext4 /dev/vg0/buildkite-agent
+    fi
+    mount /dev/vg0/buildkite-agent /var/lib/buildkite-agent
+    chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent
+  fi
+fi
+
+# If available: Use the local SSD as swap space.
+if [[ -e /dev/nvme0n1 ]]; then
   mkswap -f /dev/nvme0n1
   swapon /dev/nvme0n1
 
@@ -39,7 +62,7 @@
   mount -t tmpfs -o mode=0755,uid=buildkite-agent,gid=buildkite-agent,size=$((100 * 1024 * 1024 * 1024)) tmpfs /var/lib/buildkite-agent
 fi
 
-# Start Docker.
+# Start Docker if it's installed.
 if [[ $(docker --version 2>/dev/null) ]]; then
   if [[ $(systemctl --version 2>/dev/null) ]]; then
     systemctl start docker
@@ -48,17 +71,22 @@
   fi
 fi
 
-# Get the Buildkite Token from GCS and decrypt it using KMS.
-BUILDKITE_TOKEN=$(curl -sS "https://storage.googleapis.com/bazel-encrypted-secrets/buildkite-agent-token.enc" | \
-  gcloud kms decrypt --location global --keyring buildkite --key buildkite-agent-token --ciphertext-file - --plaintext-file -)
-
-# Insert the Buildkite Token into the agent configuration.
-sed -i "s/token=\"xxx\"/token=\"${BUILDKITE_TOKEN}\"/" /etc/buildkite-agent/buildkite-agent.cfg
-
 # Only start the Buildkite Agent if this is a worker node (as opposed to a VM
 # being used by someone for testing / development).
 if [[ $(hostname) == buildkite* ]]; then
-  if [[ -e /bin/systemctl ]]; then
+  # Get the Buildkite Token from GCS and decrypt it using KMS.
+  BUILDKITE_TOKEN=$(curl -sS "https://storage.googleapis.com/bazel-encrypted-secrets/buildkite-agent-token.enc" | \
+    gcloud kms decrypt --location global --keyring buildkite --key buildkite-agent-token --ciphertext-file - --plaintext-file -)
+
+  # Insert the Buildkite Token into the agent configuration.
+  sed -i "s/token=\"xxx\"/token=\"${BUILDKITE_TOKEN}\"/" /etc/buildkite-agent/buildkite-agent.cfg
+
+  if [[ $(hostname) == *pipeline* ]]; then
+    # Start 8 instances of the Buildkite agent.
+    for i in $(seq 8); do
+      systemctl start buildkite-agent@$i
+    done
+  elif [[ -e /bin/systemctl ]]; then
     systemctl start buildkite-agent
   else
     service buildkite-agent start