From b0af08654fc58dd1fca04b0b5a071854fbf66fe7 Mon Sep 17 00:00:00 2001
From: matttrach
Date: Wed, 22 Oct 2025 00:38:36 -0500
Subject: [PATCH] fix: try to get the nodes 3 times before failure

The aws_instances lookup can come back empty while the EC2 instances
are still registering, which made the node_ips index fail on the first
and only attempt. Retry the lookup up to three times, waiting
node_wait_time between attempts, and fail with a "nodes_not_found"
marker only when all three lookups return nothing. Also pad the wait
time with a 60 second buffer on top of the 60 seconds per node.

Signed-off-by: matttrach
---
 .../downstream/modules/downstream/main.tf | 117 ++++++++++++++++--
 1 file changed, 106 insertions(+), 11 deletions(-)

diff --git a/examples/downstream/modules/downstream/main.tf b/examples/downstream/modules/downstream/main.tf
index 39bfade..67a8be8 100644
--- a/examples/downstream/modules/downstream/main.tf
+++ b/examples/downstream/modules/downstream/main.tf
@@ -27,14 +27,25 @@ locals {
   ami_id          = var.ami_id
   ami_ssh_user    = var.ami_ssh_user
   node_count      = var.node_count
-  # if the IPs aren't found, then this should fail
-  node_ips        = { for i in range(local.node_count) : tostring(i) => data.aws_instances.rke2_instance_nodes.public_ips[i] }
-  node_id         = "${local.cluster_name}-nodes"
-  node_wait_time  = "${tostring(local.node_count * 60)}s" # 60 seconds per node
-  ami_admin_group = (var.ami_admin_group != "" ? var.ami_admin_group : "tty")
-  runner_ip       = (var.direct_node_access != null ? var.direct_node_access.runner_ip : "10.1.1.1") # the IP running Terraform
-  ssh_access_key  = (var.direct_node_access != null ? var.direct_node_access.ssh_access_key : "fake123abc")
-  ssh_access_user = (var.direct_node_access != null ? var.direct_node_access.ssh_access_user : "fake")
+  nodes = (length(data.aws_instances.rke2_nodes.ids) > 0 ? data.aws_instances.rke2_nodes :
+    (length(data.aws_instances.rke2_nodes_again.ids) > 0 ? data.aws_instances.rke2_nodes_again :
+    data.aws_instances.rke2_nodes_again_again)
+  )
+  node_ids = coalescelist(
+    data.aws_instances.rke2_nodes.ids,
+    data.aws_instances.rke2_nodes_again.ids,
+    data.aws_instances.rke2_nodes_again_again.ids,
+  )
+  # if the nodes aren't found, then this should fail
+  # tflint-ignore: terraform_unused_declarations
+  fail_nodes_not_found = (length(local.node_ids) == 0 ? one([local.node_count, "nodes_not_found"]) : false)
+  node_ips        = { for i in range(local.node_count) : tostring(i) => local.nodes.public_ips[i] }
+  node_id         = "${local.cluster_name}-nodes"
+  node_wait_time  = "${tostring(local.node_count * 60 + 60)}s" # 60 seconds per node + 60 seconds buffer
+  ami_admin_group = (var.ami_admin_group != "" ? var.ami_admin_group : "tty")
+  runner_ip       = (var.direct_node_access != null ? var.direct_node_access.runner_ip : "10.1.1.1") # the IP running Terraform
+  ssh_access_key  = (var.direct_node_access != null ? var.direct_node_access.ssh_access_key : "fake123abc")
+  ssh_access_user = (var.direct_node_access != null ? var.direct_node_access.ssh_access_user : "fake")
   # rke2 info
   rke2_version = var.rke2_version
 }
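Note: the fallback and the fail-fast guard above are the heart of the change.
A minimal standalone sketch of the same two tricks follows (the variable, its
default, and the instance id are hypothetical, not part of this patch):
coalescelist() returns its first non-empty list argument, and one() only
accepts collections with zero or one elements, so handing it a two-element
list forces a plan-time error whose output points at the "nodes_not_found"
marker.

    variable "found_ids" {
      type    = list(string)
      default = [] # empty simulates "no instances found yet"; with this
                   # default the plan fails on purpose, set a non-empty
                   # list to let it pass
    }

    locals {
      node_count = 3
      first_try  = []
      second_try = ["i-0abc123"] # hypothetical instance id
      third_try  = []
      # coalescelist returns the first non-empty list: ["i-0abc123"]
      node_ids = coalescelist(local.first_try, local.second_try, local.third_try)
      # one() errors on the two-element list when found_ids is empty
      # tflint-ignore: terraform_unused_declarations
      fail_nodes_not_found = (length(var.found_ids) == 0 ? one([local.node_count, "nodes_not_found"]) : false)
    }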
@@ -212,7 +223,7 @@ resource "time_sleep" "wait_for_nodes" {
   create_duration = local.node_wait_time
 }
 
-data "aws_instances" "rke2_instance_nodes" {
+data "aws_instances" "rke2_nodes" {
   depends_on = [
     aws_security_group.downstream_cluster,
     aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
@@ -228,6 +239,76 @@ data "aws_instances" "rke2_instance_nodes" {
     values = [local.node_id]
   }
 }
+resource "time_sleep" "wait_for_nodes_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+  ]
+  create_duration = local.node_wait_time
+}
+data "aws_instances" "rke2_nodes_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+  ]
+  filter {
+    name   = "tag:NodeId"
+    values = [local.node_id]
+  }
+}
+
+resource "time_sleep" "wait_for_nodes_again_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+  ]
+  create_duration = local.node_wait_time
+}
+
+data "aws_instances" "rke2_nodes_again_again" {
+  depends_on = [
+    aws_security_group.downstream_cluster,
+    aws_vpc_security_group_ingress_rule.downstream_ingress_rancher,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv4,
+    aws_vpc_security_group_egress_rule.downstream_egress_ipv6,
+    aws_vpc_security_group_egress_rule.downstream_egress_project_link,
+    rancher2_machine_config_v2.all_in_one,
+    terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+  ]
+  filter {
+    name   = "tag:NodeId"
+    values = [local.node_id]
+  }
+}
 
 # this allows the load balancer to accept connections initiated by the downstream cluster's public ip addresses
 # this weird in-flight grab of the nodes and manipulating the security groups is not good,
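Note: the three lookups above are serialized purely through depends_on: each
time_sleep starts only after the previous read has completed, so a slowly
registering cluster gets two more chances, a full node_wait_time apart.
Stripped to its shape (resource names, the duration, and the tag value here
are hypothetical; time_sleep comes from the hashicorp/time provider):

    data "aws_instances" "first_try" {
      filter {
        name   = "tag:NodeId"
        values = ["example-nodes"]
      }
    }

    # the pause begins only after the first lookup has been read
    resource "time_sleep" "between_tries" {
      depends_on      = [data.aws_instances.first_try]
      create_duration = "120s"
    }

    # the same filter, read again after the pause
    data "aws_instances" "second_try" {
      depends_on = [time_sleep.between_tries]
      filter {
        name   = "tag:NodeId"
        values = ["example-nodes"]
      }
    }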
@@ -243,7 +324,11 @@ resource "aws_vpc_security_group_ingress_rule" "downstream_public_ingress_loadba
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
     time_sleep.wait_for_nodes,
-    data.aws_instances.rke2_instance_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
   ]
   for_each          = local.node_ips
   security_group_id = local.load_balancer_security_group_id
@@ -261,7 +346,11 @@ resource "aws_vpc_security_group_ingress_rule" "downstream_public_ingress_runner
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
     time_sleep.wait_for_nodes,
-    data.aws_instances.rke2_instance_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
   ]
   security_group_id = aws_security_group.downstream_cluster.id
   ip_protocol       = "tcp"
@@ -279,6 +368,12 @@ resource "rancher2_cluster_sync" "sync" {
     aws_vpc_security_group_egress_rule.downstream_egress_project_link,
     rancher2_machine_config_v2.all_in_one,
     terraform_data.patch_machine_configs,
+    time_sleep.wait_for_nodes,
+    data.aws_instances.rke2_nodes,
+    time_sleep.wait_for_nodes_again,
+    data.aws_instances.rke2_nodes_again,
+    time_sleep.wait_for_nodes_again_again,
+    data.aws_instances.rke2_nodes_again_again,
     rancher2_cluster_v2.rke2_cluster,
   ]
   cluster_id = rancher2_cluster_v2.rke2_cluster.cluster_v1_id
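Note on timing: with the new formula a three-node cluster waits "240s"
between attempts (3 * 60 + 60), and since each of wait_for_nodes,
wait_for_nodes_again, and wait_for_nodes_again_again sleeps for
node_wait_time, the worst case before fail_nodes_not_found trips is three
full waits, 720 seconds. The arithmetic is easy to check in terraform
console:

    > "${tostring(3 * 60 + 60)}s"
    "240s"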