diff --git a/community/modules/internal/slurm-gcp/instance_template/README.md b/community/modules/internal/slurm-gcp/instance_template/README.md index 6baf8180e7..bc9f37630e 100644 --- a/community/modules/internal/slurm-gcp/instance_template/README.md +++ b/community/modules/internal/slurm-gcp/instance_template/README.md @@ -29,7 +29,7 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
nat_ip = string
network_tier = string
}))
list(object({| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
list(object({| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
source = optional(string)
disk_name = optional(string)
device_name = string
disk_type = optional(string)
disk_size_gb = optional(number)
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
list(object({| `[]` | no | | [advanced\_machine\_features](#input\_advanced\_machine\_features) | See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#nested_advanced_machine_features |
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
object({| n/a | yes | | [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
enable_nested_virtualization = optional(bool)
threads_per_core = optional(number)
turbo_mode = optional(string)
visible_core_count = optional(number)
performance_monitoring_unit = optional(string)
enable_uefi_networking = optional(bool)
})
list(object({| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name |
nat_ip = string
network_tier = string
}))
list(object({| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name |
disk_name = string
device_name = string
auto_delete = bool
boot = bool
disk_size_gb = number
disk_type = string
disk_labels = map(string)
}))
list(object({| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
source = optional(string)
disk_name = optional(string)
device_name = string
auto_delete = bool
boot = bool
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = map(string)
}))
list(object({| `[]` | no | | [advanced\_machine\_features](#input\_advanced\_machine\_features) | See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#nested_advanced_machine_features |
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
object({| n/a | yes | | [alias\_ip\_range](#input\_alias\_ip\_range) | An array of alias IP ranges for this network interface. Can only be specified for network interfaces on subnet-mode networks.
enable_nested_virtualization = optional(bool)
threads_per_core = optional(number)
turbo_mode = optional(string)
visible_core_count = optional(number)
performance_monitoring_unit = optional(string)
enable_uefi_networking = optional(bool)
})
object({| `null` | no | diff --git a/community/modules/internal/slurm-gcp/internal_instance_template/main.tf b/community/modules/internal/slurm-gcp/internal_instance_template/main.tf index 8395998201..45c9ad8545 100644 --- a/community/modules/internal/slurm-gcp/internal_instance_template/main.tf +++ b/community/modules/internal/slurm-gcp/internal_instance_template/main.tf @@ -95,7 +95,7 @@ resource "google_compute_instance_template" "tpl" { source = lookup(disk.value, "source", null) source_image = lookup(disk.value, "source_image", null) type = lookup(disk.value, "disk_type", null) == "local-ssd" ? "SCRATCH" : "PERSISTENT" - labels = lookup(disk.value, "disk_type", null) == "local-ssd" ? null : lookup(disk.value, "disk_labels", null) + labels = (lookup(disk.value, "source", null) != null || lookup(disk.value, "disk_type", null) == "local-ssd") ? null : lookup(disk.value, "disk_labels", null) dynamic "disk_encryption_key" { for_each = compact([var.disk_encryption_key == null ? null : 1]) diff --git a/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf b/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf index 26cb523d84..299027ce7a 100644 --- a/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf +++ b/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf @@ -161,12 +161,13 @@ variable "auto_delete" { variable "additional_disks" { description = "List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name" type = list(object({ - disk_name = string + source = optional(string) + disk_name = optional(string) device_name = string auto_delete = bool boot = bool - disk_size_gb = number - disk_type = string + disk_size_gb = optional(number) + disk_type = optional(string) disk_labels = map(string) })) default = [] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b8c36d61d7..98887df354 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -264,6 +264,7 @@ limitations under the License. | Name | Type | |------|------| +| [google_compute_disk.controller_disk](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_disk) | resource | | [google_compute_instance_from_template.controller](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource | | [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | @@ -292,6 +293,7 @@ limitations under the License. | [controller\_project\_id](#input\_controller\_project\_id) | Optionally. Provision controller and config bucket in the different project | `string` | `null` | no | | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
ip_cidr_range = string
subnetwork_range_name = string
})
object({|
type = string
size = number
})
{| no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index d671583214..9640a321cc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -32,6 +32,14 @@ locals { } ] + state_disk = var.controller_state_disk != null ? [{ + source = google_compute_disk.controller_disk[0].name + device_name = google_compute_disk.controller_disk[0].name + disk_labels = null + auto_delete = false + boot = false + }] : [] + synth_def_sa_email = "${data.google_project.controller_project.number}-compute@developer.gserviceaccount.com" service_account = { @@ -54,6 +62,15 @@ data "google_project" "controller_project" { project_id = var.controller_project_id } +resource "google_compute_disk" "controller_disk" { + count = var.controller_state_disk != null ? 1 : 0 + + name = "${local.slurm_cluster_name}-controller-save" + type = var.controller_state_disk.type + size = var.controller_state_disk.size + zone = var.zone +} + # INSTANCE TEMPLATE module "slurm_controller_template" { source = "../../internal/slurm-gcp/instance_template" @@ -68,7 +85,7 @@ module "slurm_controller_template" { disk_labels = merge(var.disk_labels, local.labels) disk_size_gb = var.disk_size_gb disk_type = var.disk_type - additional_disks = local.additional_disks + additional_disks = concat(local.additional_disks, local.state_disk) bandwidth_tier = var.bandwidth_tier slurm_bucket_path = module.slurm_files.slurm_bucket_path diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8b60cbfc45..0905e899df 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -72,6 +72,7 @@ No modules. | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
"size": 50,
"type": "pd-ssd"
}
list(object({| `[]` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
filename = string
content = string
}))
object({|
device_name = string
})
{| no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
"device_name": null
}