diff --git a/infra/gcp/terraform/k8s-infra-prow-build/iam.tf b/infra/gcp/terraform/k8s-infra-prow-build/iam.tf index 748a6d4efda..cb1401831b3 100644 --- a/infra/gcp/terraform/k8s-infra-prow-build/iam.tf +++ b/infra/gcp/terraform/k8s-infra-prow-build/iam.tf @@ -32,6 +32,7 @@ module "iam" { "roles/secretmanager.secretAccessor" = [ "serviceAccount:kubernetes-external-secrets@k8s-infra-prow-build.iam.gserviceaccount.com", "principal://iam.googleapis.com/projects/${module.project.project_number}/locations/global/workloadIdentityPools/${module.project.project_id}.svc.id.goog/subject/ns/external-secrets/sa/external-secrets", + "principal://iam.googleapis.com/projects/180382678033/locations/global/workloadIdentityPools/k8s-infra-prow-build-trusted.svc.id.goog/subject/ns/external-secrets/sa/external-secrets", ] } } diff --git a/infra/gcp/terraform/k8s-infra-prow-build/main.tf b/infra/gcp/terraform/k8s-infra-prow-build/main.tf index ff00ac41d00..ad2db1b1ce5 100644 --- a/infra/gcp/terraform/k8s-infra-prow-build/main.tf +++ b/infra/gcp/terraform/k8s-infra-prow-build/main.tf @@ -60,14 +60,6 @@ resource "google_project_iam_member" "k8s_infra_prow_viewers" { member = "group:k8s-infra-prow-viewers@kubernetes.io" } -// Allow prow-deployer service account in k8s-infra-prow-build-trusted to deploy -// to the cluster defined in here -resource "google_project_iam_member" "prow_deployer_for_prow_build" { - project = module.project.project_id - role = "roles/container.admin" - member = "serviceAccount:prow-deployer@k8s-infra-prow-build-trusted.iam.gserviceaccount.com" -} - module "prow_build_cluster" { source = "../modules/gke-cluster" project_name = module.project.project_id @@ -86,18 +78,30 @@ module "prow_build_nodepool_c4_highmem_8_localssd" { cluster_name = module.prow_build_cluster.cluster.name location = module.prow_build_cluster.cluster.location node_locations = [ + "us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f", ] - name = "pool6" - initial_count = 1 - 
min_count = 1 - max_count = 80 - machine_type = "c4-highmem-8" - disk_size_gb = 500 - disk_type = "hyperdisk-balanced" - service_account = module.prow_build_cluster.cluster_node_sa.email + name = "pool6" + initial_count = 1 + min_count = 1 + max_count = 250 # total across all zones + machine_type = "c4-highmem-8-lssd" + disk_size_gb = 100 + disk_type = "hyperdisk-balanced" + enable_nested_virtualization = true + service_account = module.prow_build_cluster.cluster_node_sa.email + // This taint exists to bias workloads onto the C4D nodepool first; if we can't secure a C4D node + // then we schedule onto a C4 node. C4D performs better than C4, but it is capacity-constrained at times. + // Also, nested virt doesn't work on C4D or C4A. + taints = [ + { + key = "spare" + value = "true" + effect = "PREFER_NO_SCHEDULE" + } + ] } module "prow_build_nodepool_c4d_highmem_8_localssd" { @@ -113,53 +117,13 @@ module "prow_build_nodepool_c4d_highmem_8_localssd" { name = "pool7" initial_count = 1 min_count = 10 - max_count = 80 - machine_type = "c4d-highmem-8-lssd" # has 2 local ssd disks attached + max_count = 250 # total across all zones + machine_type = "c4d-highmem-8-lssd" # has 1 local ssd disk attached disk_size_gb = 100 disk_type = "hyperdisk-balanced" service_account = module.prow_build_cluster.cluster_node_sa.email } - -module "sig_node_node_pool_1_n4_highmem_8" { - - source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gke-nodepool?ref=v39.0.0&depth=1" - project_id = module.project.project_id - name = "sig-node-pool1" - location = module.prow_build_cluster.cluster.location - cluster_name = module.prow_build_cluster.cluster.name - - service_account = { - email = module.prow_build_cluster.cluster_node_sa.email - oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } - - nodepool_config = { - autoscaling = { - max_node_count = 10 - min_node_count = 1 # 1 per zone - } - management = { - auto_repair = true - auto_upgrade = true - } - } - - 
node_config = { - machine_type = "n4-highmem-8" - disk_type = "hyperdisk-balanced" - image_type = "COS_CONTAINERD" - gvnic = true - workload_metadata_config_mode = "GKE_METADATA" - shielded_instance_config = { - enable_secure_boot = true - } - } - - - taints = { dedicated = { value = "sig-node", effect = "NO_SCHEDULE" } } -} - module "prow_build_nodepool_c4a_highmem_8_localssd" { source = "../modules/gke-nodepool" project_name = module.project.project_id @@ -169,11 +133,12 @@ module "prow_build_nodepool_c4a_highmem_8_localssd" { "us-central1-a", "us-central1-b", "us-central1-c", + "us-central1-f", ] name = "pool7-arm64" initial_count = 1 - min_count = 1 - max_count = 10 + min_count = 3 + max_count = 100 # total across all zones machine_type = "c4a-highmem-8-lssd" # has 2 local ssd disks attached disk_size_gb = 100 disk_type = "hyperdisk-balanced" diff --git a/infra/gcp/terraform/k8s-infra-prow-build/peering.tf b/infra/gcp/terraform/k8s-infra-prow-build/peering.tf index e094ce52e52..f0db7399914 100644 --- a/infra/gcp/terraform/k8s-infra-prow-build/peering.tf +++ b/infra/gcp/terraform/k8s-infra-prow-build/peering.tf @@ -15,9 +15,11 @@ limitations under the License. 
*/ resource "google_vmwareengine_network_peering" "gvce_peering" { - name = "peer-with-gcve-project" - peer_network = "projects/k8s-infra-prow-build/global/networks/default" - project = module.project.project_id - peer_network_type = "STANDARD" - vmware_engine_network = "projects/broadcom-451918/locations/global/vmwareEngineNetworks/k8s-gcp-gcve-network" + name = "peer-with-gcve-project" + peer_network = "projects/k8s-infra-prow-build/global/networks/default" + project = module.project.project_id + peer_network_type = "STANDARD" + vmware_engine_network = "projects/broadcom-451918/locations/global/vmwareEngineNetworks/k8s-gcp-gcve-network" + export_custom_routes_with_public_ip = true + import_custom_routes_with_public_ip = true } diff --git a/infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf b/infra/gcp/terraform/k8s-infra-prow-build/provider.tf similarity index 94% rename from infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf rename to infra/gcp/terraform/k8s-infra-prow-build/provider.tf index cac23ac9146..95cf53d43f9 100644 --- a/infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf +++ b/infra/gcp/terraform/k8s-infra-prow-build/provider.tf @@ -30,11 +30,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.31.0" + version = "~> 7.7.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.31.0" + version = "~> 7.7.0" } } } diff --git a/infra/gcp/terraform/modules/gke-cluster/versions.tf b/infra/gcp/terraform/modules/gke-cluster/versions.tf index ca25a20458f..cd723090bc3 100644 --- a/infra/gcp/terraform/modules/gke-cluster/versions.tf +++ b/infra/gcp/terraform/modules/gke-cluster/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.31.0" + version = ">=6.31.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.31.0" + version = ">=6.31.0" } } } diff --git a/infra/gcp/terraform/modules/gke-nodepool/main.tf 
b/infra/gcp/terraform/modules/gke-nodepool/main.tf index daee9c2ab8b..3894b6a117a 100644 --- a/infra/gcp/terraform/modules/gke-nodepool/main.tf +++ b/infra/gcp/terraform/modules/gke-nodepool/main.tf @@ -31,11 +31,17 @@ resource "google_container_node_pool" "node_pool" { auto_upgrade = true } + upgrade_settings { + max_unavailable = 0 + max_surge = 10 + } + // Autoscale the cluster as needed. Note if using a regional cluster these values will be multiplied by 3 initial_node_count = var.initial_count autoscaling { - min_node_count = var.min_count - max_node_count = var.max_count + total_min_node_count = var.min_count + total_max_node_count = var.max_count + location_policy = "ANY" } node_locations = var.node_locations @@ -49,6 +55,9 @@ resource "google_container_node_pool" "node_pool" { service_account = var.service_account oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + kubelet_config { + single_process_oom_kill = true # https://github.com/kubernetes-sigs/prow/issues/210 + } dynamic "ephemeral_storage_config" { for_each = var.ephemeral_local_ssd_count > 0 ? 
[var.ephemeral_local_ssd_count] : [] @@ -57,6 +66,11 @@ resource "google_container_node_pool" "node_pool" { } } + advanced_machine_features { + enable_nested_virtualization = var.enable_nested_virtualization + threads_per_core = 0 + } + // Needed for workload identity workload_metadata_config { mode = "GKE_METADATA" @@ -72,6 +86,7 @@ resource "google_container_node_pool" "node_pool" { value = taint.value.value } } + } // If we need to destroy the node pool, create the new one before destroying diff --git a/infra/gcp/terraform/modules/gke-nodepool/variables.tf b/infra/gcp/terraform/modules/gke-nodepool/variables.tf index 1062e3c2265..2428160aef8 100644 --- a/infra/gcp/terraform/modules/gke-nodepool/variables.tf +++ b/infra/gcp/terraform/modules/gke-nodepool/variables.tf @@ -107,3 +107,9 @@ variable "service_account" { description = "The email address of the GCP Service Account to be associated with nodes in this node_pool" type = string } + +variable "enable_nested_virtualization" { + description = "Whether to enable nested virtualization on the node pool's VMs" + type = bool + default = false +} diff --git a/infra/gcp/terraform/modules/gke-nodepool/versions.tf b/infra/gcp/terraform/modules/gke-nodepool/versions.tf index ca25a20458f..cd723090bc3 100644 --- a/infra/gcp/terraform/modules/gke-nodepool/versions.tf +++ b/infra/gcp/terraform/modules/gke-nodepool/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.31.0" + version = ">=6.31.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.31.0" + version = ">=6.31.0" } } } diff --git a/infra/gcp/terraform/modules/gke-project/versions.tf b/infra/gcp/terraform/modules/gke-project/versions.tf index ca25a20458f..cd723090bc3 100644 --- a/infra/gcp/terraform/modules/gke-project/versions.tf +++ b/infra/gcp/terraform/modules/gke-project/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - 
version = "~> 6.31.0" + version = ">=6.31.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.31.0" + version = ">=6.31.0" } } } diff --git a/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf b/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf index 7cff530704e..48ddb73fde9 100644 --- a/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf +++ b/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf @@ -17,11 +17,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 6.31.0" + version = ">=6.31.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 6.31.0" + version = ">=6.31.0" } } }