From 67370de095fe010ce27a246ab14d18516cfa0af9 Mon Sep 17 00:00:00 2001
From: matttrach <matt.trachier@suse.com>
Date: Tue, 21 Oct 2025 14:01:05 -0500
Subject: [PATCH] fix: create a slow mode for testing

Signed-off-by: matttrach <matt.trachier@suse.com>
---
 .github/workflows/manual.yaml  | 48 ------------------
 .github/workflows/release.yaml | 14 ++++--
 modules/deploy/create.sh.tpl   |  4 +-
 modules/deploy/destroy.sh.tpl  |  4 +-
 run_tests.sh                   | 89 ++++++++++++++++++++++++++++++----
 5 files changed, 95 insertions(+), 64 deletions(-)
 delete mode 100644 .github/workflows/manual.yaml

diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml
deleted file mode 100644
index 0be99fe..0000000
--- a/.github/workflows/manual.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: manual
-
-on: workflow_dispatch
-
-env:
-  AWS_REGION: us-west-2
-  AWS_ROLE: arn:aws:iam::270074865685:role/terraform-module-ci-test
-  GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
-  ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory
-
-permissions: write-all
-
-jobs:
-  test_TestOneBasic:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v5
-        with:
-          token: ${{secrets.GITHUB_TOKEN}}
-          fetch-depth: 0
-      - id: aws-creds
-        uses: aws-actions/configure-aws-credentials@v5
-        with:
-          role-to-assume: ${{env.AWS_ROLE}}
-          role-session-name: ${{github.run_id}}
-          aws-region: ${{env.AWS_REGION}}
-          role-duration-seconds: 7200 # 2 hours
-          output-credentials: true
-      - name: install-nix
-        run: |
-          curl -L https://nixos.org/nix/install | sh
-          source /home/runner/.nix-profile/etc/profile.d/nix.sh
-          nix --version
-          which nix
-      - name: run_tests
-        shell: '/home/runner/.nix-profile/bin/nix develop --ignore-environment --extra-experimental-features nix-command --extra-experimental-features flakes --keep HOME --keep SSH_AUTH_SOCK --keep IDENTIFIER --keep GITHUB_TOKEN --keep GITHUB_OWNER --keep ZONE --keep AWS_ROLE --keep AWS_REGION --keep AWS_DEFAULT_REGION --keep AWS_ACCESS_KEY_ID --keep AWS_SECRET_ACCESS_KEY --keep AWS_SESSION_TOKEN --keep UPDATECLI_GPGTOKEN --keep UPDATECLI_GITHUB_TOKEN --keep UPDATECLI_GITHUB_ACTOR --keep GPG_SIGNING_KEY --keep NIX_SSL_CERT_FILE --keep NIX_ENV_LOADED --keep TERM --command bash -e {0}'
-        env:
-          AWS_ACCESS_KEY_ID: ${{ steps.aws-creds.outputs.aws-access-key-id }}
-          AWS_SECRET_ACCESS_KEY: ${{ steps.aws-creds.outputs.aws-secret-access-key }}
-          AWS_SESSION_TOKEN: ${{ steps.aws-creds.outputs.aws-session-token }}
-          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
-          GITHUB_OWNER: rancher
-          IDENTIFIER: ${{github.run_id}}
-          ZONE: ${{secrets.ZONE}}
-          ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory
-          RANCHER_INSECURE: false
-        run: |
-          ./run_tests.sh -t TestOneBasic
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 050d8e4..f7d1603 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -73,6 +73,13 @@ jobs:
               repo: "${{ github.event.repository.name }}",
               body: "Please make sure e2e tests pass before merging this PR! \n ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
             })
+
+  test:
+    needs:
+      - release
+    if: needs.release.outputs.release_pr
+    runs-on: ubuntu-latest
+    steps:
       - uses: actions/checkout@v5
         with:
           token: ${{secrets.GITHUB_TOKEN}}
@@ -83,7 +90,7 @@ jobs:
           role-to-assume: ${{env.AWS_ROLE}}
           role-session-name: ${{github.run_id}}
           aws-region: ${{env.AWS_REGION}}
-          role-duration-seconds: 14400 # 4 hours
+          role-duration-seconds: 28800 # 8 hours
           output-credentials: true
       - name: install-nix
         run: |
@@ -106,12 +113,12 @@ jobs:
           ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory
           RANCHER_INSECURE: false
         run: |
-          # should take around 4 hours
-          ./run_tests.sh
+          ./run_tests.sh -s
 
   cleanup:
     needs:
       - release
+      - test
     if: always() && needs.release.outputs.release_pr
     runs-on: ubuntu-latest
     steps:
@@ -147,6 +154,7 @@ jobs:
   report:
     needs:
       - release
+      - test
       - cleanup
     if: success() && needs.release.outputs.release_pr #Ensure the test jobs succeeded, and that a release PR was created.
     runs-on: ubuntu-latest
diff --git a/modules/deploy/create.sh.tpl b/modules/deploy/create.sh.tpl
index 713109a..ebb3a82 100644
--- a/modules/deploy/create.sh.tpl
+++ b/modules/deploy/create.sh.tpl
@@ -18,7 +18,7 @@ E1=0
 while [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; do
   A=0
   while [ $E -gt 0 ] && [ $A -lt $MAX ]; do
-    timeout -k 1m ${timeout} terraform apply -var-file="${deploy_path}/inputs.tfvars" -auto-approve -state="${deploy_path}/tfstate"
+    timeout -k 1m ${timeout} terraform apply -var-file="${deploy_path}/inputs.tfvars" -no-color -auto-approve -state="${deploy_path}/tfstate"
     E=$?
     if [ $E -eq 124 ]; then echo "Apply timed out after ${timeout}"; fi
     A=$((A+1))
@@ -27,7 +27,7 @@ while [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; do
   if [ $E -gt 0 ] && [ $ATTEMPTS != $((MAX-1)) ]; then
     A1=0
     while [ $E1 -gt 0 ] && [ $A1 -lt $MAX ]; do
-      timeout -k 1m ${timeout} terraform destroy -var-file="${deploy_path}/inputs.tfvars" -auto-approve -state="${deploy_path}/tfstate"
+      timeout -k 1m ${timeout} terraform destroy -var-file="${deploy_path}/inputs.tfvars" -no-color -auto-approve -state="${deploy_path}/tfstate"
       E1=$?
       if [ $E1 -eq 124 ]; then echo "Apply timed out after ${timeout}"; fi
       A1=$((A1+1))
diff --git a/modules/deploy/destroy.sh.tpl b/modules/deploy/destroy.sh.tpl
index e7296cf..c3ff2fd 100644
--- a/modules/deploy/destroy.sh.tpl
+++ b/modules/deploy/destroy.sh.tpl
@@ -8,8 +8,8 @@ whoami
 TF_CLI_ARGS_init=""
 TF_CLI_ARGS_apply=""
 if [ -z "${skip_destroy}" ]; then
-  timeout -k 1m ${timeout} terraform init -upgrade -reconfigure
-  timeout -k 1m ${timeout} terraform destroy -var-file="${deploy_path}/inputs.tfvars" -auto-approve -state="${deploy_path}/tfstate" || true
+  timeout -k 1m ${timeout} terraform init -upgrade -reconfigure -no-color
+  timeout -k 1m ${timeout} terraform destroy -var-file="${deploy_path}/inputs.tfvars" -no-color -auto-approve -state="${deploy_path}/tfstate" || true
 else
   echo "Not destroying deployed module, it will no longer be managed here."
 fi
diff --git a/run_tests.sh b/run_tests.sh
index 003ef07..4cde16d 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -4,23 +4,64 @@ rerun_failed=false
 specific_test=""
 specific_package=""
 cleanup_id=""
+slow_mode=false
 
-while getopts ":r:t:p:c:" opt; do
+while getopts ":rst:p:c:" opt; do
   case $opt in
     r) rerun_failed=true ;;
     t) specific_test="$OPTARG" ;;
     p) specific_package="$OPTARG" ;;
     c) cleanup_id="$OPTARG" ;;
+    s) slow_mode=true ;;
     \?) cat <<EOT >&2 && exit 1 ;;
 Invalid option -$OPTARG, valid options are
   -r to re-run failed tests
-  -t to specify a specific test (eg. TestBase)
-  -p to specify a specific test package (eg. base)
+  -s to run tests in slow mode (one at a time to avoid AWS rate limiting)
   -c to run clean up only with the given id (eg. abc123)
+  -t to specify a specific test (eg. TestBase)
+  -p to specify a specific test package (eg. one)
+Only one of -c, -t, or -p can be used at a time.
 EOT
   esac
 done
 
+# Echo the effective options up front so the log shows how this run was configured.
+if [ "$slow_mode" = true ]; then
+  echo "Running in slow mode: tests will be run one at a time to avoid AWS rate limiting."
+else
+  echo "Running in normal mode: tests will be run in parallel."
+fi
+if [ "$rerun_failed" = true ]; then
+  echo "Rerun failed tests is enabled."
+else
+  echo "Rerun failed tests is disabled."
+fi
+if [ -n "$specific_test" ]; then
+  echo "Specific test to run: $specific_test"
+else
+  echo "No specific test to run."
+fi
+if [ -n "$specific_package" ]; then
+  echo "Specific package to run: $specific_package"
+else
+  echo "No specific package to run."
+fi
+# Cleanup-only mode is mentioned only when -c was supplied; otherwise nothing is printed.
+[ -z "$cleanup_id" ] || echo "Cleanup only mode enabled with id: $cleanup_id"
+# -c, -t and -p are mutually exclusive, so count how many of them were supplied.
+exclusive_count=0
+for exclusive_value in "$cleanup_id" "$specific_test" "$specific_package"; do
+  if [ -n "$exclusive_value" ]; then
+    exclusive_count=$((exclusive_count + 1))
+  fi
+done
+# Any combination of two or more is ambiguous: report it once and stop.
+if [ "$exclusive_count" -gt 1 ]; then
+  echo "Error: Only one of -c, -t, or -p can be used at a time." >&2
+  exit 1
+fi
+
+
 # shellcheck disable=SC2143
 if [ -n "$cleanup_id" ]; then
   export IDENTIFIER="$cleanup_id"
@@ -30,6 +71,7 @@ REPO_ROOT="$(git rev-parse --show-toplevel)"
 
 run_tests() {
   local rerun=$1
+  local slow_mode=$2
   REPO_ROOT="$(git rev-parse --show-toplevel)"
   cd "$REPO_ROOT" || exit 1
 
@@ -85,8 +127,37 @@ EOF
   else
     package_pattern="..."
   fi
-  # We need both -p and -parallel, as -p sets the number of packages to test in parallel, and -parallel sets the number of tests to run in parallel.
-  # By setting both to 1, we ensure that tests are run sequentially, which can help avoid AWS rate limiting issues. I does increase the runtime significantly though.
+
+  # go test has two separate concurrency flags: -p sets the number of packages to test in parallel,
+  #  and -parallel sets the number of tests to run in parallel.
+  # In slow mode both are set to 1 so tests run sequentially, which can help avoid AWS rate limiting issues.
+  # It does increase the runtime significantly though; outside slow mode neither flag is passed.
+  local parallel_packages=""
+  local parallel_tests=""
+  if [ "$slow_mode" = true ]; then
+    echo "Running in slow mode..."
+    parallel_packages="-p=1"
+    parallel_tests="-parallel=1"
+  fi
+
+  CMD=$(cat <<EOT
+gotestsum \
+  --format=standard-verbose \
+  --jsonfile "/tmp/${IDENTIFIER}_test.log" \
+  --post-run-command "sh /tmp/${IDENTIFIER}_test-processor" \
+  --packages "$REPO_ROOT/$TEST_DIR/$package_pattern" \
+  -- \
+  -count=1 \
+  -timeout=300m \
+  -failfast \
+  $parallel_packages \
+  $parallel_tests \
+  $rerun_flag \
+  $specific_test_flag
+EOT
+)
+  echo "Running command: $CMD"
+
   # shellcheck disable=SC2086
   gotestsum \
     --format=standard-verbose \
@@ -95,10 +166,10 @@ EOF
     --packages "$REPO_ROOT/$TEST_DIR/$package_pattern" \
     -- \
     -count=1 \
-    -p=1 \
-    -parallel=1 \
     -timeout=300m \
     -failfast \
+    $parallel_packages \
+    $parallel_tests \
     $rerun_flag \
     $specific_test_flag
 
@@ -136,13 +207,13 @@ if [ -z "$cleanup_id" ]; then
   echo "terraform configs valid..."
 
   # Run tests initially
-  run_tests false
+  run_tests false "$slow_mode"
   sleep 60
 
   # Check if we need to rerun failed tests
   if [ "$rerun_failed" = true ] && [ -f "/tmp/${IDENTIFIER}_failed_tests.txt" ]; then
     echo "Rerunning failed tests..."
-    run_tests true
+    run_tests true "$slow_mode"
     sleep 60
   fi
 fi