Automate GCP VM image creation (#2130)
With this change, updating the CI Linux & Windows images is as simple as
triggering builds in the following pipelines:
- https://buildkite.com/bazel-trusted/create-windows-vm-image
- https://buildkite.com/bazel-trusted/create-linux-vm-image
Check the documentation for more details.
diff --git a/buildkite/create_images.py b/buildkite/create_images.py
index 8d41a61..2d7f594 100755
--- a/buildkite/create_images.py
+++ b/buildkite/create_images.py
@@ -15,13 +15,9 @@
# limitations under the License.
from datetime import datetime
-import json
import os
-import queue
-import subprocess
import sys
import tempfile
-import threading
import gcloud
import gcloud_utils
@@ -59,12 +55,6 @@
},
}
-WORK_QUEUE = queue.Queue()
-
-
-def run(args, **kwargs):
- return subprocess.run(args, **kwargs)
-
def preprocess_setup_script(setup_script, is_windows):
output_file = tempfile.mkstemp()[1]
@@ -77,6 +67,7 @@
if is_windows:
f.write("'@\n")
f.write('[System.IO.File]::WriteAllLines("c:\\setup.ps1", $setup_script)\n')
+ f.write('Start-Process -FilePath "powershell.exe" -ArgumentList "-File c:\\setup.ps1" -RedirectStandardOutput "c:\\setup-stdout.log" -RedirectStandardError "c:\\setup-stderr.log" -NoNewWindow\n')
return output_file
@@ -112,47 +103,6 @@
os.remove(setup_script)
-# https://stackoverflow.com/a/25802742
-def write_to_clipboard(output):
- process = subprocess.Popen("pbcopy", env={"LANG": "en_US.UTF-8"}, stdin=subprocess.PIPE)
- process.communicate(output.encode("utf-8"))
-
-
-def print_windows_instructions(project, zone, instance_name):
- tail_start = gcloud_utils.tail_serial_console(
- instance_name, project=project, zone=zone, until="Finished running startup scripts"
- )
-
- pw = json.loads(
- gcloud.reset_windows_password(
- instance_name, format="json", project=project, zone=zone
- ).stdout
- )
- rdp_file = tempfile.mkstemp(suffix=".rdp")[1]
- with open(rdp_file, "w") as f:
- f.write("full address:s:" + pw["ip_address"] + "\n")
- f.write("username:s:" + pw["username"] + "\n")
- print("Opening ", rdp_file)
- subprocess.run(["open", rdp_file])
- write_to_clipboard(pw["password"])
- with gcloud.PRINT_LOCK:
- print("Use this password to connect to the Windows VM: " + pw["password"])
- print("Please run the setup script C:\\setup.ps1 once you're logged in.")
-
- # Wait until the VM reboots once, then open RDP again.
- tail_start = gcloud_utils.tail_serial_console(
- instance_name,
- project=project,
- zone=zone,
- start=tail_start,
- until="GCEGuestAgent: GCE Agent Started",
- )
- print("Connecting via RDP a second time to finish the setup...")
- write_to_clipboard(pw["password"])
- run(["open", rdp_file])
- return tail_start
-
-
def workflow(name, params):
instance_name = "%s-image-%s" % (name, int(datetime.now().timestamp()))
project = params["project"]
@@ -164,16 +114,8 @@
# Wait for the VM to become ready.
gcloud_utils.wait_for_instance(instance_name, project=project, zone=zone, status="RUNNING")
- if "windows" in instance_name:
- # Wait for VM to be ready, then print setup instructions.
- tail_start = print_windows_instructions(project, zone, instance_name)
- # Continue printing the serial console until the VM shuts down.
- gcloud_utils.tail_serial_console(
- instance_name, project=project, zone=zone, start=tail_start
- )
- else:
- # Continuously print the serial console.
- gcloud_utils.tail_serial_console(instance_name, project=project, zone=zone)
+ # Continuously print the serial console.
+ gcloud_utils.tail_serial_console(instance_name, project=project, zone=zone)
# Wait for the VM to completely shutdown.
gcloud_utils.wait_for_instance(
@@ -194,17 +136,6 @@
gcloud.delete_instance(instance_name, project=project, zone=zone)
-def worker():
- while True:
- item = WORK_QUEUE.get()
- if not item:
- break
- try:
- workflow(**item)
- finally:
- WORK_QUEUE.task_done()
-
-
def main(argv=None):
if argv is None:
argv = sys.argv[1:]
@@ -222,27 +153,11 @@
)
return 1
- # Put VM creation instructions into the work queue.
- for name in argv:
- WORK_QUEUE.put({"name": name, "params": IMAGE_CREATION_VMS[name]})
+ if len(argv) > 1:
+ print("Only one platform can be created at a time.")
+ return 1
- # Spawn worker threads that will create the VMs.
- threads = []
- for _ in range(WORK_QUEUE.qsize()):
- t = threading.Thread(target=worker)
- t.start()
- threads.append(t)
-
- # Wait for all VMs to be created.
- WORK_QUEUE.join()
-
- # Signal worker threads to exit.
- for _ in range(len(threads)):
- WORK_QUEUE.put(None)
-
- # Wait for worker threads to exit.
- for t in threads:
- t.join()
+ workflow(argv[0], IMAGE_CREATION_VMS[argv[0]])
return 0
diff --git a/buildkite/gcloud_utils.py b/buildkite/gcloud_utils.py
index 9649c5b..6dac2df 100755
--- a/buildkite/gcloud_utils.py
+++ b/buildkite/gcloud_utils.py
@@ -49,12 +49,12 @@
# Then drop the common prefix to make the output easier to read.
# For unknown platforms, we just take every line unmodified.
if "ubuntu" in instance_name or "docker" in instance_name:
- match = re.match(r".*GCEMetadataScripts: startup-script: (.*)", line)
+ match = re.match(r".*: startup-script: (.*)", line)
if not match:
continue
line = match.group(1)
elif "windows" in instance_name:
- match = re.match(r".*windows-startup-script-ps1: (.*)", line)
+ match = re.match(r".*\[setup-windows.ps1\]: (.*)", line)
if not match:
continue
line = match.group(1)
diff --git a/buildkite/setup-windows.ps1 b/buildkite/setup-windows.ps1
index b36d722..56942c0 100755
--- a/buildkite/setup-windows.ps1
+++ b/buildkite/setup-windows.ps1
@@ -11,6 +11,21 @@
## Use TLS1.2 for HTTPS (fixes an issue where later steps can't connect to github.com)
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
+## If choco is already installed, this is the second time the VM has started up: run GCESysprep and then shut down.
+if (Get-Command choco -ErrorAction SilentlyContinue) {
+ $port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
+ $port.Open()
+ $port.WriteLine("[setup-windows.ps1]: choco is already installed, this is the second time the VM starts up, running GCESysprep and then shutdown...")
+ $port.Close()
+ GCESysprep
+ exit 0
+}
+
+$port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
+$port.Open()
+$port.WriteLine("[setup-windows.ps1]: Starting to setup windows... This could take up to one hour, check C:/setup-stdout.log on the VM for progress.")
+$port.Close()
+
## Create C:\temp
Write-Host "Creating temporary folder C:\temp..."
if (-Not (Test-Path "c:\temp")) {
@@ -337,6 +352,11 @@
$pagefile.MaximumSize = 64 * 1024;
$pagefile.Put();
-Write-Host "All done, adding GCESysprep to RunOnce and rebooting..."
-Set-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\RunOnce" -Name "GCESysprep" -Value "c:\Program Files\Google\Compute Engine\sysprep\gcesysprep.bat"
+Write-Host "All done, rebooting..."
+
+$port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
+$port.Open()
+$port.WriteLine("[setup-windows.ps1]: Setup windows done, rebooting...")
+$port.Close()
+
Restart-Computer
diff --git a/docs/ci-playbook.md b/docs/ci-playbook.md
index e300aeb..68c9e2f 100644
--- a/docs/ci-playbook.md
+++ b/docs/ci-playbook.md
@@ -6,60 +6,55 @@
## Deploying new CI worker images
-Our Linux and Windows CI workers run on GCE instances. The basic update process consists of the following two steps:
+Our **Linux** and **Windows** CI workers run on GCE instances. The basic update process consists of the following two steps and has been **automated** in the bazel-trusted BuildKite org:
1. Run [create_images](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/create_images.py) to create new VM images. This step starts a temporary VM, configures it as a CI worker, and then saves its image in GCE before destroying the temporary VM. This step does not affect running builds.
1. Run [create_instances](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/create_instances.py) to deploy instances with the new VM images. This step deletes the existing instances, then reads the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml) to determine how many instances are needed, and finally creates new instances with the new images. As a result, any running builds will be interrupted.
Note: Many changes to the Linux workers don't require these two steps since we run Docker containers on Linux. See below for a description on how to create and deploy new Docker images.
-### Prerequesites
+For **macOS**, follow the internal playbook (go/bazel-ci-playbook).
-All steps require `git` and the [Google Cloud SDK](https://cloud.google.com/sdk/install) to be installed on your machine.
+### Windows & Linux
-### Windows
+You will need to be a member of the `bazel-trusted` BuildKite org.
-You need a machine with a recent version of MacOS and Microsoft Remote Desktop (10) installed.
+1. Submit your change to this repository.
+1. Initiate a build on the [Create Windows VM image](https://buildkite.com/bazel-trusted/create-windows-vm-image) or [Create Linux VM image](https://buildkite.com/bazel-trusted/create-linux-vm-image) pipeline.
+1. Wait for the first build step to finish. This will create a new VM image.
+1. Deploy the new image to the `bazel-testing` org by unblocking the next step.
+1. Initiate a new build on the [Bazel](https://buildkite.com/bazel-testing/bazel-bazel) pipeline to test the new image.
+1. Push the image to prod by unblocking the next step (e.g. `bk-testing-windows` to `bk-windows`).
+1. Wait for the VMs to be recreated in the bazel and bazel-trusted orgs and the new image to be deployed.
-1. First, create new images.
- 1. Clone the continuous-integration repository.
- 1. `cd` into the `continuous-integration/buildkite` directory.
- 1. Create new images by running `create_images.py <platform1> <platform2> <...>`. For Windows, this usually means to include `bk-windows` and `bk-trusted-windows`, whereas the `windows-playground` platform is optional. Hint: You can see a list of available platforms by running the script without any arguments.
- 1. The script opens Microsoft Remote Desktop to establish a connection to the VM that is used for building the image. Accept any popups and log into the machine by pasting the password into the password field (the script already copied into the clipboard).
- 1. Run the setup script by executing `\setup.ps1`.
- 1. Wait until the script has finished. At one point the VM will be rebooted, so the script has to open the remote connection again. The whole process can take up to 30 minutes.
- 1. Login into the Google Cloud Console and check that the created images are no longer busy. Make sure to select the project that matches the image (e.g. `bazel-public` for `trusted` images, `bazel-untrusted` for "normal" images).
- 1. If something fails, you can always run `create_images` again.
-1. Deploy CI workers with the newly created image by running `create_instances.py --local_config <instance_group1> <instance_group2> <...>`. The available instances group names can be found in the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml). Moreover, you can run the script without any arguments to get a list of available instance groups or check the configuration file. For Windows you would usually pass `bk-windows bk-trusted-windows` to the script.
+Note: if anything goes wrong with the new image, you can always revert to the previous image by deleting the new image in the GCP console and re-creating the VMs.
-### Linux
+### Deploy new Docker images for Linux
-Most changes can be rolled out by creating and deploying new Docker images. This step requires that Docker is installed and set up, and you need permissions to access the container registry in our GCP project.
+Most changes can be rolled out by creating and deploying new Docker images. This step requires that
+
+- You are on a Linux machine (images built on macOS may cause problems).
+- Docker is installed and set up.
+- You have permissions to access the container registry in our GCP project.
+
+Follow these steps to build and deploy a new Docker image:
1. Clear your local Docker cache via `docker builder prune -a -f`.
1. Clone the continuous-integration repository.
1. `cd` into the `continuous-integration/buildkite/docker` directory.
1. Run `build.sh`.
+1. Run `push.sh`.
-If you need to create and deploy new VM images, you can follow these steps:
-
-1. Clone the continuous-integration repository.
-1. `cd` into the `continuous-integration/buildkite` directory.
-1. Create new images by running `python3.6 create_images.py <platform1> <platform2> <...>`. For Linux, this usually means to include `bk-docker` and `bk-trusted-docker`. Hint: You can see a list of available platforms by running the script without any arguments.
-1. Deploy CI workers with the newly created image by running `python3.6 create_instances.py --local_config <instance_group1> <instance_group2> <...>`. The available instances group names can be found in the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml). For Linux you would usually pass `bk-docker bk-trusted-docker` to the script.
-
-### MacOS
-
-We are operating a number of physical Mac machines in our office. Please see [go/bazel-ci-playbook](http://go/bazel-ci-playbook) if you're in the Google network.
+If you are on the `testing` branch, the new image will be pushed to the `gcr.io/bazel-public/testing` repository. If you are on the `master` branch, the new image will be pushed to the `gcr.io/bazel-public` repository.
## Deploying a new Bazelisk version
1. Create a [new Bazelisk release](https://github.com/bazelbuild/bazelisk/releases). This step has to be done on a Mac machine (due to [cross-compilation problems](https://github.com/golang/go/issues/22510)), and requires permissions to create a release.
1. To deploy this release on MacOS:
1. Update the [Bazelisk Homebrew formula](https://github.com/fweikert/homebrew-tap/blob/master/Formula/bazelisk.rb).
- 1. SSH into the machines and update them via Homebrew (see internal instructions for more details).
+ 1. Update the startup script for macOS VMs to install the latest Bazelisk version.
1. To deploy this release on Linux:
1. Update the [Dockerfile](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/docker/Dockerfile).
- 1. Follow the instructions [here](#linux) to deploy new Docker images.
+ 1. Follow the above instructions to deploy new Docker images.
1. To deploy this release on Windows:
- 1. Create and deploy new VM images by following the [instructions](#windows). There is no need to update any files manually since the [setup script](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/setup-windows.ps1) always fetches the latest version of Bazelisk
+ 1. Create and deploy new VM images by following the above instructions. There is no need to update any files manually since the [setup script](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/setup-windows.ps1) always fetches the latest version of Bazelisk.
diff --git a/pipelines/publish-vm-image.yml b/pipelines/publish-vm-image.yml
new file mode 100644
index 0000000..71be2e5
--- /dev/null
+++ b/pipelines/publish-vm-image.yml
@@ -0,0 +1,97 @@
+---
+steps:
+ - command: |-
+ cd buildkite
+ ./create_images.py ${BAZEL_TEST_VM_NAME}
+ label: ":pipeline:"
+ agents:
+ - "queue=default"
+ plugins:
+ - docker#v3.8.0:
+ always-pull: true
+ environment:
+ - "ANDROID_HOME"
+ - "ANDROID_NDK_HOME"
+ - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
+ image: "gcr.io/bazel-public/ubuntu2204"
+ network: "host"
+ privileged: true
+ propagate-environment: true
+ propagate-uid-gid: true
+ volumes:
+ - "/etc/group:/etc/group:ro"
+ - "/etc/passwd:/etc/passwd:ro"
+ - "/etc/shadow:/etc/shadow:ro"
+ - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
+ - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
+ - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
+ - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+
+ - wait
+
+ - block: ":arrows_counterclockwise: Re-create instance group in the bazel-testing org"
+
+ - command: |-
+ cd buildkite
+ ./create_instances.py ${BAZEL_TEST_VM_NAME}
+ label: ":pipeline:"
+ agents:
+ - "queue=default"
+ plugins:
+ - docker#v3.8.0:
+ always-pull: true
+ environment:
+ - "ANDROID_HOME"
+ - "ANDROID_NDK_HOME"
+ - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
+ image: "gcr.io/bazel-public/ubuntu2204"
+ network: "host"
+ privileged: true
+ propagate-environment: true
+ propagate-uid-gid: true
+ volumes:
+ - "/etc/group:/etc/group:ro"
+ - "/etc/passwd:/etc/passwd:ro"
+ - "/etc/shadow:/etc/shadow:ro"
+ - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
+ - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
+ - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
+ - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+
+ - wait
+
+ - block: ":white_check_mark: Confirm you have tested the new VM image in the bazel-testing org"
+
+ - wait
+
+ - block: ":rocket: Promote the VM image to Prod"
+
+ - command: |-
+ cd buildkite
+ ./promote_images.py ${BAZEL_VM_NAME}
+ label: ":pipeline:"
+ agents:
+ - "queue=default"
+ plugins:
+ - docker#v3.8.0:
+ always-pull: true
+ environment:
+ - "ANDROID_HOME"
+ - "ANDROID_NDK_HOME"
+ - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
+ image: "gcr.io/bazel-public/ubuntu2204"
+ network: "host"
+ privileged: true
+ propagate-environment: true
+ propagate-uid-gid: true
+ volumes:
+ - "/etc/group:/etc/group:ro"
+ - "/etc/passwd:/etc/passwd:ro"
+ - "/etc/shadow:/etc/shadow:ro"
+ - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
+ - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
+ - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
+ - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
+ - "/var/run/docker.sock:/var/run/docker.sock"