
Commit 28a1977

fix: try to get the nodes 3 times before failure (#132)
Signed-off-by: matttrach <[email protected]>
1 parent f17a4da commit 28a1977

File tree

  • examples/downstream/modules/downstream/main.tf

1 file changed: +106 −11 lines

examples/downstream/modules/downstream/main.tf

Lines changed: 106 additions & 11 deletions
@@ -27,14 +27,25 @@ locals {
   ami_id = var.ami_id
   ami_ssh_user = var.ami_ssh_user
   node_count = var.node_count
-  # if the IPs aren't found, then this should fail
-  node_ips = { for i in range(local.node_count) : tostring(i) => data.aws_instances.rke2_instance_nodes.public_ips[i] }
-  node_id = "${local.cluster_name}-nodes"
-  node_wait_time = "${tostring(local.node_count * 60)}s" # 60 seconds per node
-  ami_admin_group = (var.ami_admin_group != "" ? var.ami_admin_group : "tty")
-  runner_ip = (var.direct_node_access != null ? var.direct_node_access.runner_ip : "10.1.1.1") # the IP running Terraform
-  ssh_access_key = (var.direct_node_access != null ? var.direct_node_access.ssh_access_key : "fake123abc")
-  ssh_access_user = (var.direct_node_access != null ? var.direct_node_access.ssh_access_user : "fake")
+  nodes = (length(data.aws_instances.rke2_nodes.ids) > 0 ? data.aws_instances.rke2_nodes :
+    (length(data.aws_instances.rke2_nodes_again.ids) > 0 ? data.aws_instances.rke2_nodes_again :
+      data.aws_instances.rke2_nodes_again_again)
+  )
+  node_ids = coalescelist(
+    data.aws_instances.rke2_nodes.ids,
+    data.aws_instances.rke2_nodes_again.ids,
+    data.aws_instances.rke2_nodes_again_again.ids,
+  )
+  # if the nodes aren't found, then this should fail
+  # tflint-ignore: terraform_unused_declarations
+  fail_nodes_not_found = (length(local.node_ids) == 0 ? one([local.node_count, "nodes_not_found"]) : false)
+  node_ips = { for i in range(local.node_count) : tostring(i) => local.nodes.public_ips[i] }
+  node_id = "${local.cluster_name}-nodes"
+  node_wait_time = "${tostring(local.node_count * 60 + 60)}s" # 60 seconds per node + 60 seconds buffer
+  ami_admin_group = (var.ami_admin_group != "" ? var.ami_admin_group : "tty")
+  runner_ip = (var.direct_node_access != null ? var.direct_node_access.runner_ip : "10.1.1.1") # the IP running Terraform
+  ssh_access_key = (var.direct_node_access != null ? var.direct_node_access.ssh_access_key : "fake123abc")
+  ssh_access_user = (var.direct_node_access != null ? var.direct_node_access.ssh_access_user : "fake")
   # rke2 info
   rke2_version = var.rke2_version
 }
@@ -212,7 +223,7 @@ resource "time_sleep" "wait_for_nodes" {
   create_duration = local.node_wait_time
 }
 
-data "aws_instances" "rke2_instance_nodes" {
+data "aws_instances" "rke2_nodes" {
   depends_on = [
     aws_security_group.downstream_cluster,
     aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
@@ -228,6 +239,76 @@ data "aws_instances" "rke2_instance_nodes" {
     values = [local.node_id]
   }
 }
+resource "time_sleep" "wait_for_nodes_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+  ]
+  create_duration = local.node_wait_time
+}
+data "aws_instances" "rke2_nodes_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+  ]
+  filter {
+    name   = "tag:NodeId"
+    values = [local.node_id]
+  }
+}
+
+resource "time_sleep" "wait_for_nodes_again_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+  ]
+  create_duration = local.node_wait_time
+}
+
+data "aws_instances" "rke2_nodes_again_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+  ]
+  filter {
+    name   = "tag:NodeId"
+    values = [local.node_id]
+  }
+}
 
 # this allows the load balancer to accept connections initiated by the downstream cluster's public ip addresses
 # this weird in-flight grab of the nodes and manipulating the security groups is not good,
@@ -243,7 +324,11 @@ resource "aws_vpc_security_group_ingress_rule" "downstream_public_ingress_loadba
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
     time_sleep.wait_for_nodes,
-    data.aws_instances.rke2_instance_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
   ]
   for_each          = local.node_ips
   security_group_id = local.load_balancer_security_group_id
@@ -261,7 +346,11 @@ resource "aws_vpc_security_group_ingress_rule" "downstream_public_ingress_runner
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
     time_sleep.wait_for_nodes,
-    data.aws_instances.rke2_instance_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
   ]
   security_group_id = aws_security_group.downstream_cluster.id
   ip_protocol       = "tcp"
@@ -279,6 +368,12 @@ resource "rancher2_cluster_sync" "sync" {
     aws_vpc_security_group_egress_rule.downstream_egress_project_link,
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
     rancher2_cluster_v2.rke2_cluster,
   ]
   cluster_id = rancher2_cluster_v2.rke2_cluster.cluster_v1_id
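
Note: the new fail_nodes_not_found local leans on a Terraform quirk: one() only accepts a list with zero or one elements, so evaluating it against the two-element list [local.node_count, "nodes_not_found"] raises a plan-time error whose message surfaces the "nodes_not_found" marker. A minimal, standalone sketch of the trick (hypothetical names, not part of this commit):

locals {
  found_ids = [] # stand-in for the coalesced list of instance ids

  # tflint-ignore: terraform_unused_declarations
  fail_if_empty = (length(local.found_ids) == 0 ? one([0, "not_found"]) : false)
}

When found_ids is non-empty the conditional short-circuits to false and the local is simply never read (hence the tflint ignore); when it is empty, the run aborts here instead of failing later while indexing into missing public_ips.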
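The retry itself works around Terraform having no native retry for data sources: each attempt is a separate aws_instances data source gated behind a time_sleep that depends on the previous attempt, and coalescelist() picks the first attempt that returned any ids. A stripped-down, two-attempt sketch of the shape (hypothetical names and tag values, not part of this commit):

data "aws_instances" "attempt_1" {
  filter {
    name   = "tag:NodeId"
    values = ["example-nodes"]
  }
}

# only starts counting after attempt_1 has been read
resource "time_sleep" "wait_before_attempt_2" {
  depends_on      = [data.aws_instances.attempt_1]
  create_duration = "60s"
}

data "aws_instances" "attempt_2" {
  depends_on = [time_sleep.wait_before_attempt_2]
  filter {
    name   = "tag:NodeId"
    values = ["example-nodes"]
  }
}

locals {
  # first non-empty result wins; coalescelist() itself errors if every list is empty
  node_ids = coalescelist(
    data.aws_instances.attempt_1.ids,
    data.aws_instances.attempt_2.ids,
  )
}

Each wait in the commit uses the updated node_wait_time, so with node_count = 3 a single attempt waits 3 * 60 + 60 = 240 seconds, and the three chained attempts can wait up to 12 minutes before the lookup is declared failed.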
