# Configure the Google Cloud Storage backend to keep the Terraform state. The
# state is shared across all users and contains all previously configured
# resources. Accessing GCS requires that the environment variable
# `GOOGLE_CLOUD_KEYFILE_JSON` points to your credential file, e.g.
# ~/.config/gcloud/legacy_credentials/<your email>/adc.json
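# For example, assuming gcloud stored your credentials in the default location:
#   export GOOGLE_CLOUD_KEYFILE_JSON="$HOME/.config/gcloud/legacy_credentials/<your email>/adc.json"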
terraform {
backend "gcs" {
bucket = "buildbot_cluster_terraform_backend"
prefix = "terraform/state"
}
}
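# Note: run `terraform init` once after checking out this directory so
# Terraform can initialize the GCS backend before any plan/apply.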
# configure Google Cloud project
provider "google" {
project = var.gcp_config.project
region = var.gcp_config.region
}
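# For reference, this file assumes var.gcp_config provides at least the
# attributes used below (project, region, zone_a, gcr_prefix). A hypothetical
# definition in a separate variables file could look like:
#
#   variable "gcp_config" {
#     type = object({
#       project    = string
#       region     = string
#       zone_a     = string
#       gcr_prefix = string
#     })
#   }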
# create a network for the cluster, required for Kubernetes on Windows
# FIXME: rename to "buildbot-vpc-network", causes destruction of the cluster!
resource "google_compute_network" "vpc_network" {
name = "vpc-network"
}
# Create the cluster running all Kubernetes services
resource "google_container_cluster" "primary" {
name = "buildbot-cluster"
# maybe use a regional cluster for higher availability, as we depend on this cluster
location = var.gcp_config.zone_a
# configure local network, required for Kubernetes on Windows
network = google_compute_network.vpc_network.name
# enable alias IP addresses, required for Kubernetes for Windows
ip_allocation_policy {}
# use newer Kubernetes version, otherwise Windows node pools can't be created
min_master_version = "1.16"
# one node is enough (at the moment)
initial_node_count = 1
node_config {
# FIXME(kuhnel): turn this into a private cluster, without external IP
# We need at least 2 vCPUs to run all Kubernetes services
machine_type = "e2-medium"
# use preemptible, as this saves costs
preemptible = true
}
}
resource "null_resource" "update_cluster" {
# Add NVIDIA driver daemonset.
depends_on = [google_container_cluster.primary]
# Update the kubectl context for the cluster and apply NVIDIA's driver daemonset.
provisioner "local-exec" {
command = <<EOT
gcloud container clusters get-credentials ${google_container_cluster.primary.name} --zone ${var.gcp_config.zone_a}
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml
EOT
}
}
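# Note: the local-exec provisioners in this file assume that gcloud and kubectl
# are installed and authenticated on the machine running `terraform apply`.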
# Create machines for mlir-nvidia
# Note: The buildbot mlir-nvidia is deployed using a Kubernetes file. See
# the README.md for details on GPUs.
resource "google_container_node_pool" "nvidia_16core_pool_nodes" {
name = "nvidia-16core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 2
}
node_config {
# use preemptible, as this saves costs
preemptible = true
# FIXME upgrade to "n1-custom-24-32768"
machine_type = "n1-highcpu-16"
disk_size_gb = 100
# FIXME: test if SSDs are actually faster than HDDs for our use case
disk_type = "pd-ssd"
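# attach one NVIDIA Tesla T4 GPU to each node; the driver itself is installed
# by the daemonset applied in the update_cluster resource above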
guest_accelerator {
type = "nvidia-tesla-t4"
count = 1
}
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "nvidia-16core-pool"
}
}
}
resource "null_resource" "deployment-mlir-nvidia" {
# Requires the NVIDIA driver daemonset added by the update_cluster resource.
depends_on = [null_resource.update_cluster]
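# re-run the provisioner below whenever the deployment YAML changes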
triggers = {
t4_contents = filemd5("${path.module}/deployment-mlir-nvidia-production.yaml")
}
# Deploy mlir-nvidia with kubectl. We use this workaround because the
# Terraform Kubernetes provider does not support GPUs on Google Cloud:
# https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149
provisioner "local-exec" {
command = "kubectl apply -f ${path.module}/deployment-mlir-nvidia-production.yaml"
}
}
# node pool for windows machines
resource "google_container_node_pool" "windows_32core_pool_nodes" {
name = "windows-32core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 1
}
node_config {
# use preemptible, as this saves costs
preemptible = true
machine_type = "e2-highcpu-32"
# Windows deployments tend to require more disk space, so we use 300 GB here.
disk_size_gb = 300
# FIXME: test if SSDs are actually faster than HDDs for our use case
disk_type = "pd-ssd"
# Configure Windows image. As Windows is picky about the combination of
# host and container OS versions, this must be compatible with the version
# in your container. Recommendation: use LTSC for long-term stability.
# For details see
# https://docs.microsoft.com/en-us/virtualization/windowscontainers/deploy-containers/version-compatibility
# https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster-windows#choose_your_windows_server_node_image
image_type = "WINDOWS_LTSC"
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "win-32core-pool"
}
}
}
# Deployment for the buildbot windows10_vs2019 running on Windows rather than
# Linux.
# Note: Deploying this takes significantly longer (~35 min) than on Linux
# as the images tend to be larger (~18GB) and IO performance is lower.
resource "kubernetes_deployment" "windows10_vs2019" {
metadata {
name = "windows10-vs2019"
labels = {
app = "windows10_vs2019"
}
}
spec {
# create one instance of this container
replicas = 1
selector {
match_labels = {
app = "windows10_vs2019"
}
}
strategy {
rolling_update {
# do not deploy more replicas, as the buildbot server
# can't handle multiple workers with the same credentials
max_surge = 0
# Allow 0 replicas during updates.
max_unavailable = 1
}
type = "RollingUpdate"
}
template {
metadata {
labels = {
app = "windows10_vs2019"
}
}
spec {
container {
image = "${var.gcp_config.gcr_prefix}/buildbot-windows10-vs2019:17"
name = "windows10-vs2019"
# reserve "<number of cores> - 1" for this container; Kubernetes itself
# needs just under 1 core for its management tools
resources {
limits {
cpu = "31"
memory = "20Gi"
}
requests {
cpu = "31"
memory = "20Gi"
}
}
# mount the secrets into a folder
volume_mount {
mount_path = "c:\\volumes\\secrets"
name = "buildbot-token"
}
volume_mount {
mount_path = "c:\\volumes\\sccache"
name = "sccache-vol"
}
volume_mount {
mount_path = "c:\\volumes\\buildbot"
name = "buildbot-vol"
}
}
# select which node pool to deploy to
node_selector = {
pool = "win-32core-pool"
}
# restart in case of any crashes
restart_policy = "Always"
# select the secret to be mounted
volume {
name = "buildbot-token"
secret {
optional = false
secret_name = "password-windows10-vs2019"
}
}
volume {
name = "sccache-vol"
empty_dir {}
}
volume {
name = "buildbot-vol"
empty_dir {}
}
# Windows nodes from the node pool are marked with the taint
# "node.kubernetes.io/os=windows". So we need to "tolerate" this to
# deploy to such nodes.
toleration {
effect = "NoSchedule"
key = "node.kubernetes.io/os"
operator = "Equal"
value = "windows"
}
}
}
}
}
resource "google_container_node_pool" "linux_16_core_pool" {
name = "linux-16-core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 2
}
node_config {
# use preemptible, as this saves costs
preemptible = true
# custom machine type: 16 cores, 32 GB of RAM, as TSan needs more memory
machine_type = "n2d-custom-16-32768"
disk_size_gb = 100
disk_type = "pd-ssd"
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "linux-16-core-pool"
}
}
}
resource "kubernetes_deployment" "clangd-ubuntu-clang" {
metadata {
name = "clangd-ubuntu-clang"
labels = {
app = "clangd-ubuntu-clang"
}
}
spec {
# create one instance of this container
replicas = 1
selector {
match_labels = {
app = "clangd-ubuntu-clang"
}
}
strategy {
rolling_update {
# do not deploy more replicas, as the buildbot server
# can't handle multiple workers with the same credentials
max_surge = 0
# Allow 0 replicas during updates.
max_unavailable = 1
}
type = "RollingUpdate"
}
template {
metadata {
labels = {
app = "clangd-ubuntu-clang"
}
}
spec {
container {
image = "${var.gcp_config.gcr_prefix}/buildbot-clangd-ubuntu-clang:3"
name = "buildbot-clangd-ubuntu-clang"
# reserve "<number of cores> - 1" for this container; Kubernetes itself
# needs just under 1 core for its management tools
resources {
limits {
cpu = "15"
memory = "28G"
}
requests {
cpu = "15"
memory = "28G"
}
}
# mount the secrets into a folder
volume_mount {
mount_path = "/vol/secrets"
name = "buildbot-token"
}
volume_mount {
mount_path = "/vol/ccache"
name = "ccache-vol"
}
volume_mount {
mount_path = "/vol/worker"
name = "worker-vol"
}
env {
# connect to production environment, running at port 9990
# staging would be at 9994
name = "BUILDBOT_PORT"
value = "9990"
}
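# To run this worker against staging instead, change the value above to "9994".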
}
# select which node pool to deploy to
node_selector = {
pool = "linux-16-core-pool"
}
# restart in case of any crashes
restart_policy = "Always"
# select the secret to be mounted
volume {
name = "buildbot-token"
secret {
optional = false
secret_name = "password-clangd-ubuntu-clang"
}
}
volume {
name = "ccache-vol"
empty_dir {}
}
volume {
name = "worker-vol"
empty_dir {}
}
}
}
}
}