Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 56 additions & 41 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,65 @@ env:
SHA: ${{ github.event.pull_request.head.sha || github.sha }}

jobs:
conditional-skip:
uses: ./.github/workflows/reusable-conditional-skip.yml

test:
name: test
needs: [ conditional-skip ]
if: needs.conditional-skip.outputs.skip-ci != 'true'
runs-on: ubuntu-latest
cloud-acceptance:
name: cloud-acceptance
runs-on: ubuntu-24.04
steps:
- uses: benc-uk/workflow-dispatch@25b02cc069be46d637e8fe2f1e8484008e9e9609 # v1.2.3
name: test
name: cloud
with:
workflow: test.yml
workflow: cloud.yml
repo: hashicorp/consul-k8s-workflows
ref: main
# ref: main
# TODO: change ref to main after testing
ref: abhishek/eks-cleanup
token: ${{ secrets.ELEVATED_GITHUB_TOKEN }}
inputs: '{ "context":"${{ env.CONTEXT }}-${{ github.event.pull_request.number }}", "actor":"${{ github.actor }}", "repository":"${{ github.repository }}", "branch":"${{ env.BRANCH }}", "sha":"${{ env.SHA }}", "json_params":"{\"gotestsum-version\":\"1.12.3\", \"terraform-version\":\"latest\", \"test-ce\":true, \"test-type\":\"short\", \"dual-stack\":true}" }'
inputs: '{ "context":"${{ env.CONTEXT }}-${{ github.event.pull_request.number }}", "repository":"${{ github.repository }}", "branch":"${{ env.BRANCH }}", "sha":"${{ github.sha }}"}'

pass-required-checks-on-skip:
needs: [ conditional-skip ]
if: needs.conditional-skip.outputs.skip-ci == 'true'
runs-on: ubuntu-latest
strategy:
matrix:
include:
# The required checks that should be "passed" when the CI is skipped
- check-name: Unit test helm templates
- check-name: Unit test helm gen
- check-name: Unit test enterprise control plane
- check-name: Unit test control plane
- check-name: Unit test cli
- check-name: Unit test acceptance
- check-name: acceptance
- check-name: acceptance-cni
- check-name: acceptance-tproxy
steps:
- name: Update final status
uses: docker://ghcr.io/curtbushko/commit-status-action:e1d661c757934ab35c74210b4b70c44099ec747a
env:
INPUT_TOKEN: ${{ secrets.ELEVATED_GITHUB_TOKEN }}
INPUT_REPOSITORY: ${{ github.repository }}
INPUT_CONTEXT: ${{ matrix.check-name }}
INPUT_STATE: success
INPUT_DESCRIPTION: "Skipped due to conditional-skip check"
INPUT_SHA: ${{ env.SHA }}
INPUT_DETAILS_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
INPUT_OWNER: "hashicorp"
# conditional-skip:
# uses: ./.github/workflows/reusable-conditional-skip.yml

# test:
# name: test
# needs: [ conditional-skip ]
# if: needs.conditional-skip.outputs.skip-ci != 'true'
# runs-on: ubuntu-latest
# steps:
# - uses: benc-uk/workflow-dispatch@25b02cc069be46d637e8fe2f1e8484008e9e9609 # v1.2.3
# name: test
# with:
# workflow: test.yml
# repo: hashicorp/consul-k8s-workflows
# ref: main
# token: ${{ secrets.ELEVATED_GITHUB_TOKEN }}
# inputs: '{ "context":"${{ env.CONTEXT }}-${{ github.event.pull_request.number }}", "actor":"${{ github.actor }}", "repository":"${{ github.repository }}", "branch":"${{ env.BRANCH }}", "sha":"${{ env.SHA }}", "json_params":"{\"gotestsum-version\":\"1.12.3\", \"terraform-version\":\"latest\", \"test-ce\":true, \"test-type\":\"short\", \"dual-stack\":true}" }'

# pass-required-checks-on-skip:
# needs: [ conditional-skip ]
# if: needs.conditional-skip.outputs.skip-ci == 'true'
# runs-on: ubuntu-latest
# strategy:
# matrix:
# include:
# # The required checks that should be "passed" when the CI is skipped
# - check-name: Unit test helm templates
# - check-name: Unit test helm gen
# - check-name: Unit test enterprise control plane
# - check-name: Unit test control plane
# - check-name: Unit test cli
# - check-name: Unit test acceptance
# - check-name: acceptance
# - check-name: acceptance-cni
# - check-name: acceptance-tproxy
# steps:
# - name: Update final status
# uses: docker://ghcr.io/curtbushko/commit-status-action:e1d661c757934ab35c74210b4b70c44099ec747a
# env:
# INPUT_TOKEN: ${{ secrets.ELEVATED_GITHUB_TOKEN }}
# INPUT_REPOSITORY: ${{ github.repository }}
# INPUT_CONTEXT: ${{ matrix.check-name }}
# INPUT_STATE: success
# INPUT_DESCRIPTION: "Skipped due to conditional-skip check"
# INPUT_SHA: ${{ env.SHA }}
# INPUT_DETAILS_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
# INPUT_OWNER: "hashicorp"
32 changes: 16 additions & 16 deletions acceptance/ci-inputs/aks_acceptance_test_packages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

# The cloud package is not included in this test suite because it is triggered from a repository other than consul-k8s and requires HCP credentials.
- {runner: 0, test-packages: "connect"}
- {runner: 1, test-packages: "peering"}
- {runner: 2, test-packages: "snapshot-agent"}
- {runner: 3, test-packages: "wan-federation"}
- {runner: 4, test-packages: "consul-dns"}
- {runner: 5, test-packages: "example"}
- {runner: 6, test-packages: "partitions"}
- {runner: 7, test-packages: "metrics"}
- {runner: 8, test-packages: "sync"}
- {runner: 9, test-packages: "basic"}
- {runner: 10, test-packages: "cli"}
- {runner: 11, test-packages: "config-entries"}
- {runner: 12, test-packages: "api-gateway"}
- {runner: 13, test-packages: "ingress-gateway"}
- {runner: 14, test-packages: "terminating-gateway"}
- {runner: 15, test-packages: "vault"}
- {runner: 16, test-packages: "server"}
# - {runner: 1, test-packages: "peering"}
# - {runner: 2, test-packages: "snapshot-agent"}
# - {runner: 3, test-packages: "wan-federation"}
# - {runner: 4, test-packages: "consul-dns"}
# - {runner: 5, test-packages: "example"}
# - {runner: 6, test-packages: "partitions"}
# - {runner: 7, test-packages: "metrics"}
# - {runner: 8, test-packages: "sync"}
# - {runner: 9, test-packages: "basic"}
# - {runner: 10, test-packages: "cli"}
# - {runner: 11, test-packages: "config-entries"}
# - {runner: 12, test-packages: "api-gateway"}
# - {runner: 13, test-packages: "ingress-gateway"}
# - {runner: 14, test-packages: "terminating-gateway"}
# - {runner: 15, test-packages: "vault"}
# - {runner: 16, test-packages: "server"}
32 changes: 16 additions & 16 deletions acceptance/ci-inputs/eks_acceptance_test_packages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

# The cloud package is not included in this test suite because it is triggered from a repository other than consul-k8s and requires HCP credentials.
- {runner: 0, test-packages: "connect"}
- {runner: 1, test-packages: "peering"}
- {runner: 2, test-packages: "snapshot-agent"}
- {runner: 3, test-packages: "wan-federation"}
- {runner: 4, test-packages: "consul-dns"}
- {runner: 5, test-packages: "example"}
- {runner: 6, test-packages: "partitions"}
- {runner: 7, test-packages: "metrics"}
- {runner: 8, test-packages: "sync"}
- {runner: 9, test-packages: "basic"}
- {runner: 10, test-packages: "cli"}
- {runner: 11, test-packages: "config-entries"}
- {runner: 12, test-packages: "api-gateway"}
- {runner: 13, test-packages: "ingress-gateway"}
- {runner: 14, test-packages: "terminating-gateway"}
- {runner: 15, test-packages: "vault"}
- {runner: 16, test-packages: "server"}
# - {runner: 1, test-packages: "peering"}
# - {runner: 2, test-packages: "snapshot-agent"}
# - {runner: 3, test-packages: "wan-federation"}
# - {runner: 4, test-packages: "consul-dns"}
# - {runner: 5, test-packages: "example"}
# - {runner: 6, test-packages: "partitions"}
# - {runner: 7, test-packages: "metrics"}
# - {runner: 8, test-packages: "sync"}
# - {runner: 9, test-packages: "basic"}
# - {runner: 10, test-packages: "cli"}
# - {runner: 11, test-packages: "config-entries"}
# - {runner: 12, test-packages: "api-gateway"}
# - {runner: 13, test-packages: "ingress-gateway"}
# - {runner: 14, test-packages: "terminating-gateway"}
# - {runner: 15, test-packages: "vault"}
# - {runner: 16, test-packages: "server"}
32 changes: 16 additions & 16 deletions acceptance/ci-inputs/gke_acceptance_test_packages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

# The cloud package is not included in this test suite because it is triggered from a repository other than consul-k8s and requires HCP credentials.
- {runner: 0, test-packages: "connect"}
- {runner: 1, test-packages: "peering"}
- {runner: 2, test-packages: "snapshot-agent"}
- {runner: 3, test-packages: "wan-federation"}
- {runner: 4, test-packages: "consul-dns"}
- {runner: 5, test-packages: "example"}
- {runner: 6, test-packages: "partitions"}
- {runner: 7, test-packages: "metrics"}
- {runner: 8, test-packages: "sync"}
- {runner: 9, test-packages: "basic"}
- {runner: 10, test-packages: "cli"}
- {runner: 11, test-packages: "config-entries"}
- {runner: 12, test-packages: "api-gateway"}
- {runner: 13, test-packages: "ingress-gateway"}
- {runner: 14, test-packages: "terminating-gateway"}
- {runner: 15, test-packages: "vault"}
- {runner: 16, test-packages: "server"}
# - {runner: 1, test-packages: "peering"}
# - {runner: 2, test-packages: "snapshot-agent"}
# - {runner: 3, test-packages: "wan-federation"}
# - {runner: 4, test-packages: "consul-dns"}
# - {runner: 5, test-packages: "example"}
# - {runner: 6, test-packages: "partitions"}
# - {runner: 7, test-packages: "metrics"}
# - {runner: 8, test-packages: "sync"}
# - {runner: 9, test-packages: "basic"}
# - {runner: 10, test-packages: "cli"}
# - {runner: 11, test-packages: "config-entries"}
# - {runner: 12, test-packages: "api-gateway"}
# - {runner: 13, test-packages: "ingress-gateway"}
# - {runner: 14, test-packages: "terminating-gateway"}
# - {runner: 15, test-packages: "vault"}
# - {runner: 16, test-packages: "server"}
151 changes: 146 additions & 5 deletions acceptance/framework/consul/helm_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,20 +157,146 @@ func (h *HelmCluster) Create(t *testing.T) {
chartName = h.ChartPath
}
// Retry the install in case previous tests have not finished cleaning up.
retry.RunWith(&retry.Counter{Wait: 2 * time.Second, Count: 30}, t, func(r *retry.R) {
err := helm.UpgradeE(r, h.helmOptions, chartName, h.releaseName)
require.NoError(r, err)
})
// Each attempt can take up to 15m (helm --timeout); keep the retry count low
// so a genuinely broken install fails fast instead of looping for hours.
const maxInstallAttempts = 3
installStart := time.Now()
logger.Logf(t, "[helm-install] release=%s namespace=%s chart=%s starting helm upgrade --install (max %d attempts, --timeout 15m each)",
h.releaseName, h.helmOptions.KubectlOptions.Namespace, chartName, maxInstallAttempts)

var lastErr error
for installAttempt := 1; installAttempt <= maxInstallAttempts; installAttempt++ {
attemptStart := time.Now()
logger.Logf(t, "[helm-install] release=%s attempt %d/%d starting at %s (elapsed since first attempt: %s)",
h.releaseName, installAttempt, maxInstallAttempts, attemptStart.Format(time.RFC3339), time.Since(installStart))

lastErr = helm.UpgradeE(t, h.helmOptions, chartName, h.releaseName)
if lastErr == nil {
logger.Logf(t, "[helm-install] release=%s attempt %d/%d SUCCEEDED after %s (total elapsed: %s)",
h.releaseName, installAttempt, maxInstallAttempts, time.Since(attemptStart), time.Since(installStart))
break
}

logger.Logf(t, "[helm-install] release=%s attempt %d/%d FAILED after %s: %v",
h.releaseName, installAttempt, maxInstallAttempts, time.Since(attemptStart), lastErr)

// Always dump cluster status on a failed attempt so we can see why helm timed out.
h.dumpClusterStatus(t, fmt.Sprintf("after failed install attempt %d/%d", installAttempt, maxInstallAttempts))

if installAttempt < maxInstallAttempts {
logger.Logf(t, "[helm-install] release=%s sleeping 5s before retry", h.releaseName)
time.Sleep(5 * time.Second)
}
}
if lastErr != nil {
logger.Logf(t, "[helm-install] release=%s exhausted all %d attempts after %s; failing test",
h.releaseName, maxInstallAttempts, time.Since(installStart))
require.NoErrorf(t, lastErr, "helm upgrade --install failed after %d attempts (total elapsed %s)",
maxInstallAttempts, time.Since(installStart))
}

logger.Logf(t, "[helm-install] release=%s waiting for all pods (selector release=%s) to be ready",
h.releaseName, h.releaseName)
waitStart := time.Now()
k8s.WaitForAllPodsToBeReady(t, h.kubernetesClient, h.helmOptions.KubectlOptions.Namespace, fmt.Sprintf("release=%s", h.releaseName))
logger.Logf(t, "[helm-install] release=%s all pods ready after %s (total Create() elapsed: %s)",
h.releaseName, time.Since(waitStart), time.Since(installStart))
}

// dumpClusterStatus logs a snapshot of nodes, pods, and warning events for the
// helm install namespace so that helm timeouts in CI logs are diagnosable.
//
// The reason argument is a free-form label that is echoed in the BEGIN/END
// marker lines so that consecutive dumps can be told apart in the log.
//
// This is best-effort diagnostic output: a failed API call is logged and the
// remaining sections are still attempted; nothing here fails the test. Each
// API call gets its own deadline so that an unresponsive API server cannot
// stall the caller indefinitely.
func (h *HelmCluster) dumpClusterStatus(t *testing.T, reason string) {
	t.Helper()

	// Upper bound for each individual List call below.
	const listTimeout = 30 * time.Second

	ns := h.helmOptions.KubectlOptions.Namespace
	logger.Logf(t, "[cluster-status] === BEGIN cluster status (%s) namespace=%s release=%s ===", reason, ns, h.releaseName)

	// Nodes: readiness, allocatable resources, and instance type.
	nodeCtx, cancelNodes := context.WithTimeout(context.Background(), listTimeout)
	nodes, err := h.kubernetesClient.CoreV1().Nodes().List(nodeCtx, metav1.ListOptions{})
	cancelNodes()
	if err != nil {
		logger.Logf(t, "[cluster-status] failed to list nodes: %v", err)
	} else {
		for _, n := range nodes.Items {
			// Stays "Unknown" when the node reports no Ready condition at all.
			ready := "Unknown"
			for _, c := range n.Status.Conditions {
				if c.Type == corev1.NodeReady {
					ready = string(c.Status)
					break
				}
			}
			logger.Logf(t, "[cluster-status] node=%s ready=%s allocatable.cpu=%s allocatable.memory=%s instance-type=%s",
				n.Name, ready,
				n.Status.Allocatable.Cpu().String(),
				n.Status.Allocatable.Memory().String(),
				n.Labels["node.kubernetes.io/instance-type"])
		}
	}

	// Pods in this release's namespace: phase, readiness, restarts, node, reason/message.
	podCtx, cancelPods := context.WithTimeout(context.Background(), listTimeout)
	pods, err := h.kubernetesClient.CoreV1().Pods(ns).List(podCtx, metav1.ListOptions{})
	cancelPods()
	if err != nil {
		logger.Logf(t, "[cluster-status] failed to list pods in namespace %s: %v", ns, err)
	} else {
		logger.Logf(t, "[cluster-status] %d pod(s) in namespace %s", len(pods.Items), ns)
		for _, p := range pods.Items {
			// ready/total and restarts cover the regular (non-init) containers only.
			ready, total := 0, len(p.Spec.Containers)
			restarts := int32(0)
			var waitingReasons []string

			// Init containers gate pod startup, so a waiting init container is
			// reported too; the "init:" prefix distinguishes it from a regular one.
			for _, cs := range p.Status.InitContainerStatuses {
				if cs.State.Waiting != nil {
					waitingReasons = append(waitingReasons, fmt.Sprintf("init:%s=%s:%s", cs.Name, cs.State.Waiting.Reason, cs.State.Waiting.Message))
				}
			}
			for _, cs := range p.Status.ContainerStatuses {
				if cs.Ready {
					ready++
				}
				restarts += cs.RestartCount
				if cs.State.Waiting != nil {
					waitingReasons = append(waitingReasons, fmt.Sprintf("%s=%s:%s", cs.Name, cs.State.Waiting.Reason, cs.State.Waiting.Message))
				}
			}

			// An empty NodeName means the pod has not been assigned to a node.
			node := p.Spec.NodeName
			if node == "" {
				node = "<unscheduled>"
			}
			logger.Logf(t, "[cluster-status] pod=%s phase=%s ready=%d/%d restarts=%d node=%s reason=%q",
				p.Name, p.Status.Phase, ready, total, restarts, node, p.Status.Reason)
			if p.Status.Message != "" {
				logger.Logf(t, "[cluster-status] message: %s", p.Status.Message)
			}
			for _, w := range waitingReasons {
				logger.Logf(t, "[cluster-status] waiting: %s", w)
			}
			// Print Pod conditions to surface "Unschedulable" with reason like "Insufficient cpu".
			// Conditions that are True, or that carry no message, are omitted as noise.
			for _, c := range p.Status.Conditions {
				if c.Status != corev1.ConditionTrue && c.Message != "" {
					logger.Logf(t, "[cluster-status] condition %s=%s reason=%s msg=%s", c.Type, c.Status, c.Reason, c.Message)
				}
			}
		}
	}

	// Warning events: very helpful for FailedScheduling / FailedMount / ImagePullBackOff etc.
	eventCtx, cancelEvents := context.WithTimeout(context.Background(), listTimeout)
	events, err := h.kubernetesClient.CoreV1().Events(ns).List(eventCtx, metav1.ListOptions{})
	cancelEvents()
	if err != nil {
		logger.Logf(t, "[cluster-status] failed to list events in namespace %s: %v", ns, err)
	} else {
		warnCount := 0
		for _, e := range events.Items {
			if e.Type != corev1.EventTypeWarning {
				continue
			}
			warnCount++
			logger.Logf(t, "[cluster-status] event WARN obj=%s/%s reason=%s count=%d msg=%s",
				e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Count, e.Message)
		}
		logger.Logf(t, "[cluster-status] %d warning event(s) in namespace %s", warnCount, ns)
	}

	logger.Logf(t, "[cluster-status] === END cluster status (%s) ===", reason)
}

func (h *HelmCluster) Destroy(t *testing.T) {
t.Helper()

k8s.WritePodsDebugInfoIfFailed(t, h.helmOptions.KubectlOptions, h.debugDirectory, "release="+h.releaseName)


// Clean up any stuck gateway resources, note that we swallow all errors from
// here down since the terratest helm installation may actually already be
// deleted at this point, in which case these operations will fail on non-existent
Expand Down Expand Up @@ -204,8 +330,23 @@ func (h *HelmCluster) Destroy(t *testing.T) {
}
}

deleteAttempt := 0
deleteStart := time.Now()
logger.Logf(t, "[helm-delete] release=%s namespace=%s starting helm delete (max 30 attempts)",
h.releaseName, h.helmOptions.KubectlOptions.Namespace)
retry.RunWith(&retry.Counter{Wait: 2 * time.Second, Count: 30}, t, func(r *retry.R) {
deleteAttempt++
attemptStart := time.Now()
logger.Logf(t, "[helm-delete] release=%s attempt %d/30 starting (elapsed: %s)",
h.releaseName, deleteAttempt, time.Since(deleteStart))
err := helm.DeleteE(r, h.helmOptions, h.releaseName, false)
if err != nil {
logger.Logf(t, "[helm-delete] release=%s attempt %d/30 FAILED after %s: %v",
h.releaseName, deleteAttempt, time.Since(attemptStart), err)
} else {
logger.Logf(t, "[helm-delete] release=%s attempt %d/30 SUCCEEDED after %s",
h.releaseName, deleteAttempt, time.Since(attemptStart))
}
require.NoError(r, err)
})

Expand Down
Loading
Loading