From 13537b522b7e2a46065e29829b8247710814a7dc Mon Sep 17 00:00:00 2001 From: Alyssa Date: Mon, 10 Feb 2025 20:51:33 +0000 Subject: [PATCH] Add controller state save disk --- .../slurm-gcp/instance_template/README.md | 2 +- .../slurm-gcp/instance_template/main.tf | 1 + .../slurm-gcp/instance_template/variables.tf | 1 + .../internal_instance_template/README.md | 2 +- .../internal_instance_template/main.tf | 10 ++-- .../internal_instance_template/variables.tf | 1 + .../schedmd-slurm-gcp-v6-controller/README.md | 2 + .../controller.tf | 22 ++++++++- .../modules/slurm_files/README.md | 1 + .../modules/slurm_files/main.tf | 17 +++---- .../modules/slurm_files/scripts/setup.py | 46 +++++++++++++++++++ .../modules/slurm_files/variables.tf | 16 +++++++ .../slurm_files.tf | 4 ++ .../variables.tf | 20 ++++++++ 14 files changed, 129 insertions(+), 16 deletions(-) diff --git a/community/modules/internal/slurm-gcp/instance_template/README.md b/community/modules/internal/slurm-gcp/instance_template/README.md index 6baf8180e7..eeacf45d86 100644 --- a/community/modules/internal/slurm-gcp/instance_template/README.md +++ b/community/modules/internal/slurm-gcp/instance_template/README.md @@ -29,7 +29,7 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
source = optional(string)
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
| `[]` | no | | [advanced\_machine\_features](#input\_advanced\_machine\_features) | See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#nested_advanced_machine_features |
object({
enable_nested_virtualization = optional(bool)
threads_per_core = optional(number)
turbo_mode = optional(string)
visible_core_count = optional(number)
performance_monitoring_unit = optional(string)
enable_uefi_networking = optional(bool)
})
| n/a | yes | | [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `virtio_enabled` setting will only enable VirtioNet and will not enable TIER\_1.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | diff --git a/community/modules/internal/slurm-gcp/instance_template/main.tf b/community/modules/internal/slurm-gcp/instance_template/main.tf index 87a0f4ba56..cbb8056748 100644 --- a/community/modules/internal/slurm-gcp/instance_template/main.tf +++ b/community/modules/internal/slurm-gcp/instance_template/main.tf @@ -22,6 +22,7 @@ locals { disk_name = disk.disk_name device_name = disk.device_name auto_delete = disk.auto_delete + source = disk.source boot = disk.boot disk_size_gb = disk.disk_size_gb disk_type = disk.disk_type diff --git a/community/modules/internal/slurm-gcp/instance_template/variables.tf b/community/modules/internal/slurm-gcp/instance_template/variables.tf index 360c5f3c71..52fc05eef9 100644 --- a/community/modules/internal/slurm-gcp/instance_template/variables.tf +++ b/community/modules/internal/slurm-gcp/instance_template/variables.tf @@ -321,6 +321,7 @@ variable "disk_auto_delete" { variable "additional_disks" { type = list(object({ + source = optional(string) disk_name = string device_name = string disk_type = string diff --git a/community/modules/internal/slurm-gcp/internal_instance_template/README.md b/community/modules/internal/slurm-gcp/internal_instance_template/README.md index 1d07b23e7c..1a39a0b385 100644 --- a/community/modules/internal/slurm-gcp/internal_instance_template/README.md +++ b/community/modules/internal/slurm-gcp/internal_instance_template/README.md @@ -30,7 +30,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name |
list(object({
disk_name = string
device_name = string
auto_delete = bool
boot = bool
disk_size_gb = number
disk_type = string
disk_labels = map(string)
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name |
list(object({
source = optional(string)
disk_name = string
device_name = string
auto_delete = bool
boot = bool
disk_size_gb = number
disk_type = string
disk_labels = map(string)
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
| `[]` | no | | [advanced\_machine\_features](#input\_advanced\_machine\_features) | See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#nested_advanced_machine_features |
object({
enable_nested_virtualization = optional(bool)
threads_per_core = optional(number)
turbo_mode = optional(string)
visible_core_count = optional(number)
performance_monitoring_unit = optional(string)
enable_uefi_networking = optional(bool)
})
| n/a | yes | | [alias\_ip\_range](#input\_alias\_ip\_range) | An array of alias IP ranges for this network interface. Can only be specified for network interfaces on subnet-mode networks.
ip\_cidr\_range: The IP CIDR range represented by this alias IP range. This IP CIDR range must belong to the specified subnetwork and cannot contain IP addresses reserved by system or used by other network interfaces. At the time of writing only a netmask (e.g. /24) may be supplied, with a CIDR format resulting in an API error.
subnetwork\_range\_name: The subnetwork secondary range name specifying the secondary range from which to allocate the IP CIDR range for this alias IP range. If left unspecified, the primary range of the subnetwork will be used. |
object({
ip_cidr_range = string
subnetwork_range_name = string
})
| `null` | no | diff --git a/community/modules/internal/slurm-gcp/internal_instance_template/main.tf b/community/modules/internal/slurm-gcp/internal_instance_template/main.tf index 8395998201..1c1233bcf9 100644 --- a/community/modules/internal/slurm-gcp/internal_instance_template/main.tf +++ b/community/modules/internal/slurm-gcp/internal_instance_template/main.tf @@ -87,15 +87,15 @@ resource "google_compute_instance_template" "tpl" { auto_delete = lookup(disk.value, "auto_delete", null) boot = lookup(disk.value, "boot", null) device_name = lookup(disk.value, "device_name", null) - disk_name = lookup(disk.value, "disk_name", null) - disk_size_gb = lookup(disk.value, "disk_size_gb", lookup(disk.value, "disk_type", null) == "local-ssd" ? "375" : null) - disk_type = lookup(disk.value, "disk_type", null) + disk_name = lookup(disk.value, "source", null) != null ? null : lookup(disk.value, "disk_name", null) + disk_size_gb = lookup(disk.value, "source", null) != null ? null : lookup(disk.value, "disk_size_gb", lookup(disk.value, "disk_type", null) == "local-ssd" ? "375" : null) + disk_type = lookup(disk.value, "source", null) != null ? null : lookup(disk.value, "disk_type", null) interface = lookup(disk.value, "interface", lookup(disk.value, "disk_type", null) == "local-ssd" ? "NVME" : null) mode = lookup(disk.value, "mode", null) source = lookup(disk.value, "source", null) - source_image = lookup(disk.value, "source_image", null) + source_image = lookup(disk.value, "source", null) != null ? null : lookup(disk.value, "source_image", null) type = lookup(disk.value, "disk_type", null) == "local-ssd" ? "SCRATCH" : "PERSISTENT" - labels = lookup(disk.value, "disk_type", null) == "local-ssd" ? null : lookup(disk.value, "disk_labels", null) + labels = lookup(disk.value, "source", null) != null ? null : lookup(disk.value, "disk_type", null) == "local-ssd" ? 
null : lookup(disk.value, "disk_labels", null) dynamic "disk_encryption_key" { for_each = compact([var.disk_encryption_key == null ? null : 1]) diff --git a/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf b/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf index 26cb523d84..7b15bb1a5f 100644 --- a/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf +++ b/community/modules/internal/slurm-gcp/internal_instance_template/variables.tf @@ -161,6 +161,7 @@ variable "auto_delete" { variable "additional_disks" { description = "List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name" type = list(object({ + source = optional(string) disk_name = string device_name = string auto_delete = bool diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 341a7605f0..c7a75fe43d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -264,6 +264,7 @@ limitations under the License. | Name | Type | |------|------| +| [google_compute_disk.controller_disk](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_disk) | resource | | [google_compute_instance_from_template.controller](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource | | [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | @@ -291,6 +292,7 @@ limitations under the License. 
| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_state\_disk](#input\_controller\_state\_disk) | A disk that will be attached to the controller instance template to save the state of Slurm. The disk is created and used by default.
To disable this feature, set this variable to null.

NOTE: This will not save the contents of /opt/apps and /home. To preserve those, they must be saved externally. |
object({
device_name = string
type = string
size = number
})
|
{
"device_name": "controller-save-state",
"size": 50,
"type": "pd-ssd"
}
| no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 6d4d1b0b07..7437624814 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -32,6 +32,17 @@ locals { } ] + state_disk = var.controller_state_disk != null ? [{ + disk_name = google_compute_disk.controller_disk[0].name + disk_size_gb = google_compute_disk.controller_disk[0].size + disk_type = google_compute_disk.controller_disk[0].type + source = google_compute_disk.controller_disk[0].name + device_name = google_compute_disk.controller_disk[0].name + disk_labels = null + auto_delete = false + boot = false + }] : [] + synth_def_sa_email = "${data.google_project.this.number}-compute@developer.gserviceaccount.com" service_account = { @@ -48,6 +59,15 @@ locals { ) } +resource "google_compute_disk" "controller_disk" { + count = var.controller_state_disk != null ? 
1 : 0 + + name = "${local.slurm_cluster_name}-${var.controller_state_disk.device_name}" + type = var.controller_state_disk.type + size = var.controller_state_disk.size + zone = var.zone +} + # INSTANCE TEMPLATE module "slurm_controller_template" { source = "../../internal/slurm-gcp/instance_template" @@ -62,7 +82,7 @@ module "slurm_controller_template" { disk_labels = merge(var.disk_labels, local.labels) disk_size_gb = var.disk_size_gb disk_type = var.disk_type - additional_disks = local.additional_disks + additional_disks = concat(local.additional_disks, local.state_disk) bandwidth_tier = var.bandwidth_tier slurm_bucket_path = module.slurm_files.slurm_bucket_path diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8b60cbfc45..a39b29add1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -72,6 +72,7 @@ No modules. | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [controller\_startup\_scripts](#input\_controller\_startup\_scripts) | List of scripts to be ran on controller VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_state\_disk](#input\_controller\_state\_disk) | A disk that will be attached to the controller instance template to save the state of Slurm. The disk is created and used by default.
To disable this feature, set this variable to null.

NOTE: This will not save the contents of /opt/apps and /home. To preserve those, they must be saved externally. |
object({
device_name = string
})
|
{
"device_name": "controller-save-state"
}
| no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 7783258299..7b9fe27bc1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -43,14 +43,15 @@ locals { tp = "${local.bucket_dir}/" # prefix to trim from the bucket path to get a "file name" config = { - enable_bigquery_load = var.enable_bigquery_load - cloudsql_secret = var.cloudsql_secret - cluster_id = random_uuid.cluster_id.result - project = var.project_id - slurm_cluster_name = var.slurm_cluster_name - bucket_path = local.bucket_path - enable_debug_logging = var.enable_debug_logging - extra_logging_flags = var.extra_logging_flags + enable_bigquery_load = var.enable_bigquery_load + cloudsql_secret = var.cloudsql_secret + cluster_id = random_uuid.cluster_id.result + project = var.project_id + slurm_cluster_name = var.slurm_cluster_name + bucket_path = local.bucket_path + enable_debug_logging = var.enable_debug_logging + extra_logging_flags = var.extra_logging_flags + controller_state_disk = var.controller_state_disk # storage disable_default_mounts = var.disable_default_mounts diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 0637fa569b..d431b2b5e0 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -176,6 
+176,47 @@ def run_custom_scripts(): log.exception(f"script {script} encountered an exception") raise e +def mount_save_state_disk(): + disk_name = f"/dev/disk/by-id/google-{lookup().cfg.controller_state_disk.device_name}" + mount_point = util.slurmdirs.state + fs_type = "xfs" + + rdevice = util.run(f"realpath {disk_name}").stdout.strip() + file_output = util.run(f"file -s {rdevice}").stdout.strip() + if "filesystem" not in file_output: + util.run(f"mkfs -t {fs_type} -q {rdevice}") + + fstab_entry = f"{disk_name} {mount_point} {fs_type}" + with open("/etc/fstab", "r") as f: + fstab = f.readlines() + if fstab_entry not in fstab: + with open("/etc/fstab", "a") as f: + f.write(f"{fstab_entry} defaults 0 0\n") + + util.run(f"systemctl daemon-reload") + + os.makedirs(mount_point, exist_ok=True) + util.run(f"mount {mount_point}") + + util.chown_slurm(mount_point) + +def mount_munge_key_disk(): + state_disk_dir = "/var/spool/slurm/munge" + mount_point = dirs.munge + + os.makedirs(state_disk_dir, exist_ok=True) + + util.run(f"mount --bind {state_disk_dir} {mount_point}") + + fstab_entry = f"{state_disk_dir} {mount_point}" + with open("/etc/fstab", "r") as f: + fstab = f.readlines() + if fstab_entry not in fstab: + with open("/etc/fstab", "a") as f: + f.write(f"{fstab_entry} none bind 0 0\n") + + util.run(f"systemctl daemon-reload") + def setup_jwt_key(): jwt_key = Path(slurmdirs.state / "jwt_hs256.key") @@ -329,6 +370,11 @@ def setup_controller(): util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) install_custom_scripts() conf.gen_controller_configs(lookup()) + + if lookup().cfg.controller_state_disk != None: + mount_save_state_disk() + mount_munge_key_disk() + setup_jwt_key() setup_munge_key() setup_sudoers() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 31423c2211..1e8c697776 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -58,6 +58,22 @@ variable "slurm_cluster_name" { } } +variable "controller_state_disk" { + description = < 0 ? [local.daos_install_mount_script, local.ghpc_startup_controller] : [local.ghpc_startup_controller] + controller_state_disk = { + device_name : "${local.slurm_cluster_name}-${var.controller_state_disk.device_name}" + } ghpc_startup_login = { filename = "ghpc_startup.sh" content = var.login_startup_script @@ -154,6 +157,7 @@ module "slurm_files" { compute_startup_scripts_timeout = var.compute_startup_scripts_timeout login_startup_scripts = local.login_startup_scripts login_startup_scripts_timeout = var.login_startup_scripts_timeout + controller_state_disk = local.controller_state_disk enable_debug_logging = var.enable_debug_logging extra_logging_flags = var.extra_logging_flags diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 9984101efc..41f2d18583 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -387,6 +387,26 @@ EOD # SLURM # ######### +variable "controller_state_disk" { + description = <