Commit 4819be9

Add support for compute-cluster
1 parent d29a715 commit 4819be9

9 files changed: +277 −23 lines changed
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Copyright (c) 2023 Oracle Corporation and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl

worker_compute_clusters = {
  "shared" = {
    placement_ad = 1
  }
}

worker_pools = {
  oke-bm-rdma = {
    description      = "Self-managed nodes in a Compute Cluster with RDMA networking"
    mode             = "compute-cluster"
    compute_cluster  = "shared"
    placement_ad     = "1"
    instance_ids     = ["1", "2", "3"]
    shape            = "BM.HPC2.36"
    boot_volume_size = 50
  },

  oke-bm-gpu-rdma = {
    description      = "Self-managed GPU nodes in a Compute Cluster with RDMA networking"
    mode             = "compute-cluster"
    compute_cluster  = "shared"
    placement_ad     = "1"
    instance_ids     = ["1", "2"]
    shape            = "BM.GPU4.8"
    image_id         = "ocid1.image..."
    image_type       = "custom"
    boot_volume_size = 50
  }
}
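A rough sketch of what this example should produce, inferred from the module code added later in this commit (not verified plan output):

# oci_core_compute_cluster.shared["shared"]                      -> 1 compute cluster in AD 1
# oci_core_instance.compute_cluster_workers["oke-bm-rdma###1"]   -> 3 x BM.HPC2.36 (ids 1-3)
# oci_core_instance.compute_cluster_workers["oke-bm-gpu-rdma###1"] -> 2 x BM.GPU4.8 (ids 1-2)
# Each instance is keyed "pool###instance_id" and named e.g. "oke-bm-rdma-1".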

module-workers.tf

Lines changed: 4 additions & 1 deletion
@@ -38,6 +38,9 @@ module "workers" {
   cluster_type       = var.cluster_type
   kubernetes_version = var.kubernetes_version

+  # Compute clusters
+  compute_clusters = var.worker_compute_clusters
+
   # Worker pools
   worker_pool_mode = var.worker_pool_mode
   worker_pool_size = var.worker_pool_size
@@ -103,4 +106,4 @@ output "worker_pool_ids" {
 output "worker_pool_ips" {
   description = "Created worker instance private IPs by pool for available modes ('node-pool', 'instance')."
   value       = local.worker_count_expected > 0 ? try(one(module.workers[*].worker_pool_ips), null) : null
-}
+}

modules/workers/computecluster.tf

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
# Copyright (c) 2022, 2025 Oracle Corporation and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl

# Create the shared compute clusters defined in worker_compute_clusters

resource "oci_core_compute_cluster" "shared" {
  # Create an OCI Compute Cluster resource for each entry of the compute_clusters map.
  for_each       = var.compute_clusters
  compartment_id = lookup(each.value, "compartment_id", var.compartment_id)
  display_name   = each.key
  defined_tags = merge(
    var.defined_tags,
    lookup(each.value, "defined_tags", {})
  )
  freeform_tags = merge(
    var.freeform_tags,
    lookup(each.value, "freeform_tags", {})
  )

  availability_domain = lookup(var.ad_numbers_to_names, lookup(each.value, "placement_ad", 1))

  lifecycle {
    ignore_changes = [
      display_name, defined_tags, freeform_tags,
    ]
  }
}

# Dynamic resource block for Compute Cluster groups defined in worker_pools
resource "oci_core_compute_cluster" "workers" {
  # Create an OCI Compute Cluster resource for each enabled entry of the worker_pools map with that mode.
  for_each            = { for k, v in local.enabled_compute_clusters : k => v if length(lookup(v, "instance_ids", [])) > 0 && lookup(v, "compute_cluster", null) == null }
  compartment_id      = each.value.compartment_id
  display_name        = each.key
  defined_tags        = each.value.defined_tags
  freeform_tags       = each.value.freeform_tags
  availability_domain = lookup(each.value, "placement_ad", null) != null ? lookup(var.ad_numbers_to_names, lookup(each.value, "placement_ad")) : element(each.value.availability_domains, 0)

  lifecycle {
    ignore_changes = [
      display_name, defined_tags, freeform_tags,
    ]
  }
}

resource "oci_core_instance" "compute_cluster_workers" {
  for_each = local.compute_cluster_instance_map

  availability_domain = (lookup(oci_core_compute_cluster.shared, lookup(each.value, "compute_cluster", ""), null) != null ?
    oci_core_compute_cluster.shared[lookup(each.value, "compute_cluster", "")].availability_domain :
    lookup(each.value, "placement_ad", null) != null ? lookup(var.ad_numbers_to_names, lookup(each.value, "placement_ad")) : element(each.value.availability_domains, 0)
  )
  fault_domain         = try(each.value.placement_fds[0], null)
  compartment_id       = each.value.compartment_id
  display_name         = format("%s-%s", element(split("###", each.key), 0), element(split("###", each.key), 1))
  preserve_boot_volume = false
  shape                = each.value.shape

  defined_tags            = each.value.defined_tags
  freeform_tags           = each.value.freeform_tags
  extended_metadata       = each.value.extended_metadata
  capacity_reservation_id = each.value.capacity_reservation_id
  compute_cluster_id = (lookup(oci_core_compute_cluster.shared, lookup(each.value, "compute_cluster", ""), null) != null ?
    oci_core_compute_cluster.shared[lookup(each.value, "compute_cluster", "")].id :
    (lookup(oci_core_compute_cluster.workers, element(split("###", each.key), 0), null) != null ?
      oci_core_compute_cluster.workers[element(split("###", each.key), 0)].id :
      lookup(each.value, "compute_cluster", "")
    )
  )

  dynamic "platform_config" {
    for_each = each.value.platform_config != null ? [1] : []
    content {
      type = lookup(
        # Attempt lookup against data source for the associated 'type' of configured worker shape
        lookup(local.platform_config_by_shape, each.value.shape, {}), "type",
        # Fall back to 'type' on pool with custom platform_config, or INTEL_VM default
        lookup(each.value.platform_config, "type", "INTEL_VM")
      )
      # Remaining parameters as configured, validated by instance/instance config resource
      are_virtual_instructions_enabled               = lookup(each.value.platform_config, "are_virtual_instructions_enabled", null)
      is_access_control_service_enabled              = lookup(each.value.platform_config, "is_access_control_service_enabled", null)
      is_input_output_memory_management_unit_enabled = lookup(each.value.platform_config, "is_input_output_memory_management_unit_enabled", null)
      is_measured_boot_enabled                       = lookup(each.value.platform_config, "is_measured_boot_enabled", null)
      is_memory_encryption_enabled                   = lookup(each.value.platform_config, "is_memory_encryption_enabled", null)
      is_secure_boot_enabled                         = lookup(each.value.platform_config, "is_secure_boot_enabled", null)
      is_symmetric_multi_threading_enabled           = lookup(each.value.platform_config, "is_symmetric_multi_threading_enabled", null)
      is_trusted_platform_module_enabled             = lookup(each.value.platform_config, "is_trusted_platform_module_enabled", null)
      numa_nodes_per_socket                          = lookup(each.value.platform_config, "numa_nodes_per_socket", null)
      percentage_of_cores_enabled                    = lookup(each.value.platform_config, "percentage_of_cores_enabled", null)
    }
  }

  agent_config {
    are_all_plugins_disabled = each.value.agent_config.are_all_plugins_disabled
    is_management_disabled   = each.value.agent_config.is_management_disabled
    is_monitoring_disabled   = each.value.agent_config.is_monitoring_disabled
    dynamic "plugins_config" {
      for_each = merge(
        {
          "Compute HPC RDMA Authentication" : "ENABLED",
          "Compute HPC RDMA Auto-Configuration" : "ENABLED"
        },
        each.value.agent_config.plugins_config
      )
      content {
        name          = plugins_config.key
        desired_state = plugins_config.value
      }
    }
  }

  create_vnic_details {
    assign_private_dns_record = var.assign_dns
    assign_public_ip          = each.value.assign_public_ip
    nsg_ids                   = each.value.nsg_ids
    subnet_id                 = each.value.subnet_id
    defined_tags              = each.value.defined_tags
    freeform_tags             = each.value.freeform_tags
  }

  instance_options {
    are_legacy_imds_endpoints_disabled = false
  }

  metadata = merge(
    {
      apiserver_host           = var.apiserver_private_host
      cluster_ca_cert          = var.cluster_ca_cert
      oke-k8version            = var.kubernetes_version
      oke-kubeproxy-proxy-mode = var.kubeproxy_mode
      oke-tenancy-id           = var.tenancy_id
      oke-initial-node-labels  = join(",", [for k, v in each.value.node_labels : format("%v=%v", k, v)])
      secondary_vnics          = jsonencode(lookup(each.value, "secondary_vnics", {}))
      ssh_authorized_keys      = var.ssh_public_key
      user_data                = lookup(lookup(data.cloudinit_config.workers, element(split("###", each.key), 0), {}), "rendered", "")
    },

    # Add labels required for NPN CNI.
    var.cni_type == "npn" ? {
      oke-native-pod-networking = true
      oke-max-pods              = var.max_pods_per_node
      pod-subnets               = coalesce(var.pod_subnet_id, var.worker_subnet_id, "none")
      pod-nsgids                = join(",", each.value.pod_nsg_ids)
    } : {},

    # Only provide cluster DNS service address if set explicitly; determined automatically in practice.
    coalesce(var.cluster_dns, "none") == "none" ? {} : { kubedns_svc_ip = var.cluster_dns },

    # Extra user-defined fields merged last
    var.node_metadata,                       # global
    lookup(each.value, "node_metadata", {}), # pool-specific
  )

  source_details {
    boot_volume_size_in_gbs = each.value.boot_volume_size
    boot_volume_vpus_per_gb = each.value.boot_volume_vpus_per_gb
    source_id               = each.value.image_id
    source_type             = "image"
  }

  lifecycle {
    precondition {
      condition     = coalesce(each.value.image_id, "none") != "none"
      error_message = <<-EOT
      Missing image_id; check provided value if image_type is 'custom', or image_os/image_os_version if image_type is 'oke' or 'platform'.
        pool: ${element(split("###", each.key), 0)}
        image_type: ${coalesce(each.value.image_type, "none")}
        image_id: ${coalesce(each.value.image_id, "none")}
      EOT
    }

    ignore_changes = [
      agent_config, # TODO Not updateable; remove when supported
      defined_tags, freeform_tags, display_name,
      metadata["cluster_ca_cert"], metadata["user_data"],
      create_vnic_details[0].defined_tags,
      create_vnic_details[0].freeform_tags,
    ]
  }
}
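For orientation, a minimal sketch of how compute_cluster_id resolves for a worker pool; the first pool is taken from the example tfvars above, the second ("oke-bm-standalone") is illustrative:

worker_pools = {
  # compute_cluster matches a key of var.compute_clusters:
  # instances attach to oci_core_compute_cluster.shared["shared"].
  oke-bm-rdma = {
    mode            = "compute-cluster"
    compute_cluster = "shared"
    instance_ids    = ["1", "2"]
  }

  # No compute_cluster set: a dedicated cluster is created per pool,
  # i.e. oci_core_compute_cluster.workers["oke-bm-standalone"], and instances attach to it.
  oke-bm-standalone = {
    mode         = "compute-cluster"
    instance_ids = ["1"]
  }
}

If compute_cluster is set but matches neither case, the value is passed through to compute_cluster_id as-is, presumably to reference an existing compute cluster by OCID.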

modules/workers/instance.tf

Lines changed: 7 additions & 3 deletions
@@ -83,11 +83,15 @@ resource "oci_core_instance" "workers" {
       secondary_vnics     = jsonencode(lookup(each.value, "secondary_vnics", {}))
       ssh_authorized_keys = var.ssh_public_key
       user_data           = lookup(lookup(data.cloudinit_config.workers, lookup(each.value, "key", ""), {}), "rendered", "")
-      oke-native-pod-networking = var.cni_type == "npn" ? true : false
+    },
+
+    # Add labels required for NPN CNI.
+    var.cni_type == "npn" ? {
+      oke-native-pod-networking = true
       oke-max-pods              = var.max_pods_per_node
       pod-subnets               = coalesce(var.pod_subnet_id, var.worker_subnet_id, "none")
-      pod-nsgids                = var.cni_type == "npn" ? join(",", each.value.pod_nsg_ids) : null
-    },
+      pod-nsgids                = join(",", each.value.pod_nsg_ids)
+    } : {},

     # Only provide cluster DNS service address if set explicitly; determined automatically in practice.
     coalesce(var.cluster_dns, "none") == "none" ? {} : { kubedns_svc_ip = var.cluster_dns },
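In rough terms (a reading of the diff, not verified plan output), this refactor changes what the metadata merge() contributes per CNI:

# var.cni_type == "npn": the merge adds
#   oke-native-pod-networking = true
#   oke-max-pods              = var.max_pods_per_node
#   pod-subnets               = coalesce(var.pod_subnet_id, var.worker_subnet_id, "none")
#   pod-nsgids                = join(",", each.value.pod_nsg_ids)
#
# any other CNI: the merge adds {} (nothing), whereas previously the keys were
# always present, with oke-native-pod-networking = false and pod-nsgids = null.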

modules/workers/instanceconfig.tf

Lines changed: 7 additions & 3 deletions
@@ -61,11 +61,15 @@ resource "oci_core_instance_configuration" "workers" {
       secondary_vnics     = jsonencode(lookup(each.value, "secondary_vnics", {}))
       ssh_authorized_keys = var.ssh_public_key
       user_data           = lookup(lookup(data.cloudinit_config.workers, each.key, {}), "rendered", "")
-      oke-native-pod-networking = var.cni_type == "npn" ? true : false
+    },
+
+    # Add labels required for NPN CNI.
+    var.cni_type == "npn" ? {
+      oke-native-pod-networking = true
       oke-max-pods              = var.max_pods_per_node
       pod-subnets               = coalesce(var.pod_subnet_id, var.worker_subnet_id, "none")
-      pod-nsgids                = var.cni_type == "npn" ? join(",", each.value.pod_nsg_ids) : null
-    },
+      pod-nsgids                = join(",", each.value.pod_nsg_ids)
+    } : {},

     # Only provide cluster DNS service address if set explicitly; determined automatically in practice.
     coalesce(var.cluster_dns, "none") == "none" ? {} : { kubedns_svc_ip = var.cluster_dns },

modules/workers/locals.tf

Lines changed: 25 additions & 15 deletions
@@ -104,20 +104,20 @@ locals {
(Whitespace-only re-indentation of the image_id expression; the removed and added lines are otherwise identical, so the block is shown once.)

    # Use provided image_id for 'custom' type, or first match for all shape + OS criteria
    image_id = (
      pool.image_type == "custom" ?
      pool.image_id :
      element(split("###", element(reverse(sort([for entry in tolist(setintersection([
        pool.image_type == "oke" ?
        setintersection(
          lookup(var.image_ids, "oke", null),
          lookup(var.image_ids, trimprefix(lower(pool.kubernetes_version), "v"), null)
        ) :
        lookup(var.image_ids, "platform", null),
        lookup(var.image_ids, pool.image_type, null),
        length(regexall("GPU", pool.shape)) > 0 ? var.image_ids.gpu : var.image_ids.nongpu,
        length(regexall("A[12]\\.", pool.shape)) > 0 ? var.image_ids.aarch64 : var.image_ids.x86_64,
        lookup(var.image_ids, format("%v %v", pool.os, split(".", pool.os_version)[0]), null),
      ]...)): "${var.indexed_images[entry].sort_key}###${entry}"])), 0)), 1)
    )

    # Standard tags as defined if enabled for use
@@ -224,6 +224,16 @@ locals {
     for k, v in local.enabled_worker_pools : k => v if lookup(v, "mode", "") == "cluster-network"
   }

+  # Enabled worker_pool map entries for compute clusters
+  enabled_compute_clusters = {
+    for k, v in local.enabled_worker_pools : k => v if lookup(v, "mode", "") == "compute-cluster"
+  }
+
+  # Map of worker nodes enabled for compute clusters: { "pool_id###worker_id" => pool_values }
+  compute_cluster_instance_ids_map = { for k, v in local.enabled_compute_clusters : k => toset(lookup(v, "instance_ids", [])) }
+  compute_cluster_instance_ids     = toset(concat(flatten([for k, v in local.compute_cluster_instance_ids_map : [for id in v : format("%s###%s", k, id)]])))
+  compute_cluster_instance_map     = { for id in local.compute_cluster_instance_ids : id => lookup(local.enabled_compute_clusters, element(split("###", id), 0), {}) }
+
   # Sanitized worker_pools output; some conditionally-used defaults would be misleading
   worker_pools_final = {
     for pool_name, pool in local.enabled_worker_pools : pool_name => { for a, b in pool : a => b
@@ -270,4 +280,4 @@ locals {
 
   # Yields {<pool name> = {<instance id> = <instance ip>}} for modes: 'node-pool', 'instance'
   worker_pool_ips = merge(local.worker_instance_ips, local.worker_nodepool_ips)
-}
+}
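To make the "###" keying concrete, a worked example hand-derived from the expressions above, assuming the two pools from the example tfvars in this commit:

# compute_cluster_instance_ids_map = { oke-bm-rdma = ["1", "2", "3"], oke-bm-gpu-rdma = ["1", "2"] }
# compute_cluster_instance_ids     = ["oke-bm-rdma###1", "oke-bm-rdma###2", "oke-bm-rdma###3",
#                                     "oke-bm-gpu-rdma###1", "oke-bm-gpu-rdma###2"]
# compute_cluster_instance_map     = { "oke-bm-rdma###1" = <oke-bm-rdma pool attributes>, ... }
# Each key drives one oci_core_instance.compute_cluster_workers resource.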

modules/workers/variables.tf

Lines changed: 10 additions & 0 deletions
@@ -318,3 +318,13 @@ variable "agent_config" {
     plugins_config = map(string),
   })
 }
+
+#
+# Workers: compute-cluster
+#
+
+variable "compute_clusters" {
+  default     = {}
+  description = "Map of compute clusters to create, shared by nodes across multiple worker pools with mode 'compute-cluster'."
+  type        = map(any)
+}
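A hedged sketch of the per-cluster attributes the resources in modules/workers/computecluster.tf actually read from this map; the "shared" key and commented values are illustrative:

compute_clusters = {
  shared = {
    placement_ad = 1                 # AD number (defaults to 1 when omitted)
    # compartment_id = "ocid1..."    # optional; defaults to var.compartment_id
    # defined_tags   = {}            # optional; merged over var.defined_tags
    # freeform_tags  = {}            # optional; merged over var.freeform_tags
  }
}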

modules/workers/versions.tf

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ terraform {
 
     oci = {
       source  = "oracle/oci"
-      version = ">= 4.119.0"
+      version = ">= 6.37.0"
     }
   }
 }

variables-workers.tf

Lines changed: 10 additions & 0 deletions
@@ -55,6 +55,16 @@ variable "worker_pool_size" {
   type = number
 }
 
+#
+# Workers: Compute clusters
+#
+
+variable "worker_compute_clusters" {
+  default     = {}
+  description = "Map of compute clusters to create, shared by nodes across multiple worker pools with mode 'compute-cluster'."
+  type        = map(any)
+}
+
 #
 # Workers: network
 #
