# Configure the Google Cloud Storage backend to keep the Terraform state. The
# state is shared across all users and contains all previously configured
# resources. Accessing GCS requires that the environment variable
# `GOOGLE_CLOUD_KEYFILE_JSON` points to your credential file, e.g.
# ~/.config/gcloud/legacy_credentials/<your email>/adc.json
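# For example, assuming gcloud stored your credentials in the default location:
#   export GOOGLE_CLOUD_KEYFILE_JSON="$HOME/.config/gcloud/legacy_credentials/<your email>/adc.json"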
terraform {
backend "gcs" {
bucket = "buildbot_cluster_terraform_backend"
prefix = "terraform/state"
}
}
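# Note: run `terraform init` once after checking out this directory so
# Terraform can initialize the GCS backend before any plan/apply.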
# configure Google Cloud project
provider "google" {
project = var.gcp_config.project
region = var.gcp_config.region
}
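# For reference, this file assumes var.gcp_config provides at least the
# attributes used below (project, region, zone_a, gcr_prefix). A hypothetical
# definition in a separate variables file could look like:
#
#   variable "gcp_config" {
#     type = object({
#       project    = string
#       region     = string
#       zone_a     = string
#       gcr_prefix = string
#     })
#   }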
# create a network for the cluster, required for Kubernetes on Windows
# FIXME: rename to "buildbot-vpc-network", causes destruction of the cluster!
resource "google_compute_network" "vpc_network" {
name = "vpc-network"
}
# Create the cluster running all Kubernetes services
resource "google_container_cluster" "primary" {
name = "buildbot-cluster"
# maybe use a regional cluster for higher availability, as we depend on this cluster
location = var.gcp_config.zone_a
# configure local network, required for Kubernetes on Windows
network = google_compute_network.vpc_network.name
# enable alias IP addresses, required for Kubernetes for Windows
ip_allocation_policy {}
# use newer Kubernetes version, otherwise Windows node pools can't be created
min_master_version = "1.16"
# one node is enough (at the moment)
initial_node_count = 1
node_config {
# FIXME(kuhnel): turn this into a private cluster, without external IP
# We need at least 2 vCPUs to run all Kubernetes services
machine_type = "e2-medium"
# use preemptible, as this saves costs
preemptible = true
}
}
resource "null_resource" "update_cluster" {
# Add NVIDIA driver daemonset.
depends_on = [google_container_cluster.primary]
# Update the kubectl context for the cluster and apply NVIDIA's driver daemonset.
provisioner "local-exec" {
command = <<EOT
gcloud container clusters get-credentials ${google_container_cluster.primary.name} --zone ${var.gcp_config.zone_a}
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml
EOT
}
}
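# Note: the local-exec provisioners in this file assume that gcloud and kubectl
# are installed and authenticated on the machine running `terraform apply`.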
# Create machines for mlir-nvidia
# Note: The buildbot mlir-nvidia is deployed using a Kubernetes file. See
# the README.md for details on GPUs.
resource "google_container_node_pool" "nvidia_16core_pool_nodes" {
name = "nvidia-16core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 2
}
node_config {
# use preemptible, as this saves costs
preemptible = true
# FIXME upgrade to "n1-custom-24-32768"
machine_type = "n1-highcpu-16"
disk_size_gb = 100
# FIXME: test if SSDs are actually faster than HDDs for our use case
disk_type = "pd-ssd"
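# attach one NVIDIA Tesla T4 GPU to each node; the driver itself is installed
# by the daemonset applied in the update_cluster resource above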
guest_accelerator {
type = "nvidia-tesla-t4"
count = 1
}
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "nvidia-16core-pool"
}
}
}
resource "null_resource" "deployment-mlir-nvidia" {
# Requires the NVIDIA driver daemonset added by the update_cluster resource.
depends_on = [null_resource.update_cluster]
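# re-run the provisioner below whenever the deployment YAML changes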
triggers = {
t4_contents = filemd5("${path.module}/deployment-mlir-nvidia-production.yaml")
}
# Deploy mlir-nvidia with kubectl. We use this workaround because the
# Terraform Kubernetes provider does not support GPUs on Google Cloud:
# https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149
provisioner "local-exec" {
command = "kubectl apply -f ${path.module}/deployment-mlir-nvidia-production.yaml"
}
}
# node pool for windows machines
resource "google_container_node_pool" "windows_32core_pool_nodes" {
name = "windows-32core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 1
}
node_config {
# use preemptible, as this saves costs
preemptible = true
machine_type = "e2-highcpu-32"
# Windows deployments tend to require more disk space, so we use 300 GB here.
disk_size_gb = 300
# FIXME: test if SSDs are actually faster than HDDs for our use case
disk_type = "pd-ssd"
# Configure Windows image. As Windows is picky about the combination of
# host and container OS versions, this must be compatible with the version
# in your container. Recommendation: use LTSC for long-term stability.
# For details see
# https://docs.microsoft.com/en-us/virtualization/windowscontainers/deploy-containers/version-compatibility
# https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster-windows#choose_your_windows_server_node_image
image_type = "WINDOWS_LTSC"
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "win-32core-pool"
}
}
}
# Deployment for the buildbot windows10_vs2019 running on Windows rather than
# Linux.
# Note: Deploying this takes significantly longer (~35 min) than on Linux
# as the images tend to be larger (~18GB) and IO performance is lower.
resource "kubernetes_deployment" "windows10_vs2019" {
metadata {
name = "windows10-vs2019"
labels = {
app = "windows10_vs2019"
}
}
spec {
# create one instance of this container
replicas = 1
selector {
match_labels = {
app = "windows10_vs2019"
}
}
strategy {
rolling_update {
# do not deploy more replicas, as the buildbot server
# can't handle multiple workers with the same credentials
max_surge = 0
# Allow 0 replicas during updates.
max_unavailable = 1
}
type = "RollingUpdate"
}
template {
metadata {
labels = {
app = "windows10_vs2019"
}
}
spec {
container {
image = "${var.gcp_config.gcr_prefix}/buildbot-windows10-vs2019:17"
name = "windows10-vs2019"
# reserve "<number of cores> - 1" for this container; Kubernetes itself
# needs just under 1 core for its management tools
resources {
limits {
cpu = "31"
memory = "20Gi"
}
requests {
cpu = "31"
memory = "20Gi"
}
}
# mount the secrets into a folder
volume_mount {
mount_path = "c:\\volumes\\secrets"
name = "buildbot-token"
}
volume_mount {
mount_path = "c:\\volumes\\sccache"
name = "sccache-vol"
}
volume_mount {
mount_path = "c:\\volumes\\buildbot"
name = "buildbot-vol"
}
}
# select which node pool to deploy to
node_selector = {
pool = "win-32core-pool"
}
# restart in case of any crashes
restart_policy = "Always"
# select the secret to be mounted
volume {
name = "buildbot-token"
secret {
optional = false
secret_name = "password-windows10-vs2019"
}
}
volume {
name = "sccache-vol"
empty_dir {}
}
volume {
name = "buildbot-vol"
empty_dir {}
}
# Windows nodes from the node pool are marked with the taint
# "node.kubernetes.io/os=windows". So we need to "tolerate" this to
# deploy to such nodes.
toleration {
effect = "NoSchedule"
key = "node.kubernetes.io/os"
operator = "Equal"
value = "windows"
}
}
}
}
}
resource "google_container_node_pool" "linux_16_core_pool" {
name = "linux-16-core-pool"
# specify a zone here (e.g. "-a") to avoid a redundant deployment
location = var.gcp_config.zone_a
cluster = google_container_cluster.primary.name
# use autoscaling to only create a machine when there is a deployment
autoscaling {
min_node_count = 0
max_node_count = 2
}
node_config {
# use preemptible, as this saves costs
preemptible = true
# custom machine type: 16 cores, 32 GB of RAM, as TSan needs more memory
machine_type = "n2d-custom-16-32768"
disk_size_gb = 100
disk_type = "pd-ssd"
# set the permissions required for the deployment later
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
# add a label to all machines of this type, so we can select them
# during deployment
labels = {
pool = "linux-16-core-pool"
}
}
}
resource "kubernetes_deployment" "clangd-ubuntu-clang" {
metadata {
name = "clangd-ubuntu-clang"
labels = {
app = "clangd-ubuntu-clang"
}
}
spec {
# create one instance of this container
replicas = 1
selector {
match_labels = {
app = "clangd-ubuntu-clang"
}
}
strategy {
rolling_update {
# do not deploy more replicas, as the buildbot server
# can't handle multiple workers with the same credentials
max_surge = 0
# Allow 0 replicas during updates.
max_unavailable = 1
}
type = "RollingUpdate"
}
template {
metadata {
labels = {
app = "clangd-ubuntu-clang"
}
}
spec {
container {
image = "${var.gcp_config.gcr_prefix}/buildbot-clangd-ubuntu-clang:3"
name = "buildbot-clangd-ubuntu-clang"
# reserve "<number of cores> - 1" for this container; Kubernetes itself
# needs just under 1 core for its management tools
resources {
limits {
cpu = "15"
memory = "28G"
}
requests {
cpu = "15"
memory = "28G"
}
}
# mount the secrets into a folder
volume_mount {
mount_path = "/vol/secrets"
name = "buildbot-token"
}
volume_mount {
mount_path = "/vol/ccache"
name = "ccache-vol"
}
volume_mount {
mount_path = "/vol/worker"
name = "worker-vol"
}
env {
# connect to production environment, running at port 9990
# staging would be at 9994
name = "BUILDBOT_PORT"
value = "9990"
}
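# To run this worker against staging instead, change the value above to "9994".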
}
# select which node pool to deploy to
node_selector = {
pool = "linux-16-core-pool"
}
# restart in case of any crashes
restart_policy = "Always"
# select the secret to be mounted
volume {
name = "buildbot-token"
secret {
optional = false
secret_name = "password-clangd-ubuntu-clang"
}
}
volume {
name = "ccache-vol"
empty_dir {}
}
volume {
name = "worker-vol"
empty_dir {}
}
}
}
}
}