fix: adds upgrade testing & graceful termination workaround (envoyproxy#1248)

mathetake · web-flow · commit efafe9c7a9e7 · 2025-09-30T20:34:41.000-04:00
**Description** This adds end-to-end upgrade testing that covers two scenarios: a simple rolling upgrade and a control-plane upgrade, both performed while continuously making requests and verifying that no requests are dropped. What we found is that, as reported in envoyproxy#1241, there's a slight gap between when Envoy stops receiving requests and when extproc terminates, hence users might experience requests being dropped during an upgrade. The fundamental fix is to run extproc as a sidecar container in the k8s API sense, but that is only available by default from k8s v1.33 onward. So, this adds a common workaround: sleep before the context cancellation. The workaround is verified to work in the newly added e2e upgrade tests. After this is merged, it will be necessary to add k8s-version detection, enable the sidecar by default automatically, and backport the fix to v0.3. After that, we can enable the control-plane upgrade variant of the test case, which is currently disabled in this commit. **Related Issues/PRs (if applicable)** Closes envoyproxy#1241 Closes envoyproxy#1060 --------- Signed-off-by: Takeshi Yoneda <t.y.mathetake@gmail.com>
diff --git a/.codespell.skip b/.codespell.skip
@@ -9,6 +9,8 @@
 go.mod
 go.sum
 ./tests/e2e/logs
+./tests/e2e-inference-extension/logs
+./tests/e2e-upgrade/logs
 *_for_tests.yaml
 ./tests/extproc/testdata/server.*
 ./tests/internal/testopenai/cassettes/*.yaml
diff --git a/.github/workflows/build_and_test.yaml b/.github/workflows/build_and_test.yaml
@@ -242,6 +242,31 @@ jobs:
           TEST_GEMINI_API_KEY: ${{ secrets.ENVOY_AI_GATEWAY_GEMINI_API_KEY }}
         run: make test-e2e
 
+  test_e2e_upgrade:
+    needs: changes
+    if: ${{ needs.changes.outputs.code == 'true' }}
+    name: E2E Test for Upgrades
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          cache: false
+          go-version-file: go.mod
+      - uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/go-build
+            ~/.cache/golangci-lint
+            ~/go/pkg/mod
+            ~/go/bin
+          key: e2e-test-${{ hashFiles('**/go.mod', '**/go.sum', '**/Makefile') }}
+      - uses: docker/setup-buildx-action@v3
+      - run: make test-e2e-upgrade
+        env:
+          # We only need to test the upgrade from the latest stable version of EG.
+          EG_VERSION: v1.5.0
+
   test_e2e_inference_extension:
     needs: changes
     if: ${{ needs.changes.outputs.code == 'true' }}
@@ -312,6 +337,7 @@ jobs:
       - test_controller
       - test_extproc
       - test_e2e
+      - test_e2e_upgrade
       - test_e2e_inference_extension
     # We need this to run always to force-fail (and not skip) if any needed
     # job has failed. Otherwise, a skipped job will not fail the workflow.
diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,9 @@ out/
 # This is the placeholder for the access log file during extproc tests.
 ACCESS_LOG_PATH
 
-tests/e2e/logs
+tests/e2e/logs/
+tests/e2e-inference-extension/logs/
+tests/e2e-upgrade/logs/
 
 # Files and directories to ignore in the site directory
 # dependencies
diff --git a/Makefile b/Makefile
@@ -182,6 +182,12 @@ test-e2e-inference-extension: build-e2e ## Run the end-to-end tests with a local
 	@echo "Run E2E tests for inference extension"
 	@go test -v ./tests/e2e-inference-extension/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
 
+# This runs the end-to-end upgrade tests for the controller and extproc with a local kind cluster.
+.PHONY: test-e2e-upgrade
+test-e2e-upgrade: build-e2e
+	@echo "Run E2E upgrade tests"
+	@go test -v ./tests/e2e-upgrade/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
+
 ##@ Common
 
 # This builds a binary for the given command under the internal/cmd directory.
diff --git a/cmd/extproc/main.go b/cmd/extproc/main.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"os/signal"
 	"syscall"
+	"time"
 
 	"github.com/envoyproxy/ai-gateway/cmd/extproc/mainlib"
 )
@@ -21,6 +22,20 @@ func main() {
 	signal.Notify(signalsChan, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signalsChan
+		log.Printf("signal received, shutting down...")
+		// Give some time for graceful shutdown. Right after SIGTERM is issued for this pod,
+		// Envoy's health check endpoint starts returning 503, but there's a gap between
+		// the time Envoy receives the SIGTERM and the time traffic to Envoy actually stops,
+		// since propagating the readiness info to the load balancer takes some time.
+		// We need to keep the extproc alive until after Envoy stops receiving traffic.
+		// https://gateway.envoyproxy.io/docs/tasks/operations/graceful-shutdown/
+		//
+		// This is a workaround for older k8s versions that don't support the sidecar feature.
+		// This can be removed once the minimum supported k8s version is 1.33 or later.
+		//
+		// 15s should be enough to propagate the readiness info to the load balancer for most cases.
+		time.Sleep(15 * time.Second)
+		log.Printf("shutting down the server now")
 		cancel()
 	}()
 	if err := mainlib.Main(ctx, os.Args[1:], os.Stderr); err != nil {
diff --git a/tests/e2e-inference-extension/e2e_inference_extension_test.go b/tests/e2e-inference-extension/e2e_inference_extension_test.go
@@ -12,5 +12,5 @@ import (
 )
 
 func TestMain(m *testing.M) {
-	e2elib.TestMain(m, nil, true)
+	e2elib.TestMain(m, e2elib.AIGatewayHelmOption{}, true, false)
 }
diff --git a/tests/e2e-upgrade/testdata/manifest.yaml b/tests/e2e-upgrade/testdata/manifest.yaml
@@ -0,0 +1,132 @@
+# Copyright Envoy AI Gateway Authors
+# SPDX-License-Identifier: Apache-2.0
+# The full text of the Apache license is available in the LICENSE file at
+# the root of the repo.
+
+apiVersion: gateway.networking.k8s.io/v1
+kind: GatewayClass
+metadata:
+  name: upgrade-test
+spec:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: upgrade-test
+  namespace: default
+spec:
+  gatewayClassName: upgrade-test
+  listeners:
+    - name: http
+      protocol: HTTP
+      port: 80
+  infrastructure:
+    parametersRef:
+      group: gateway.envoyproxy.io
+      kind: EnvoyProxy
+      name: upgrade-test
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyProxy
+metadata:
+  name: upgrade-test
+  namespace: default
+spec:
+  provider:
+    type: Kubernetes
+    kubernetes:
+      envoyDeployment:
+        container:
+          # Clear the default memory/cpu requirements for local tests.
+          resources: {}
+---
+apiVersion: aigateway.envoyproxy.io/v1alpha1
+kind: AIGatewayRoute
+metadata:
+  name: upgrade-test
+  namespace: default
+spec:
+  parentRefs:
+    - name: upgrade-test
+      kind: Gateway
+      group: gateway.networking.k8s.io
+  rules:
+    - matches:
+        - headers:
+            - type: Exact
+              name: x-ai-eg-model
+              value: some-cool-model
+      backendRefs:
+        - name: upgrade-test-cool-model-backend
+      timeouts:
+        request: 120s
+---
+apiVersion: aigateway.envoyproxy.io/v1alpha1
+kind: AIServiceBackend
+metadata:
+  name: upgrade-test-cool-model-backend
+  namespace: default
+spec:
+  schema:
+    name: OpenAI
+  backendRef:
+    name: testupstream
+    kind: Backend
+    group: gateway.envoyproxy.io
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: Backend
+metadata:
+  name: testupstream
+  namespace: default
+spec:
+  endpoints:
+    - fqdn:
+        hostname: testupstream.default.svc.cluster.local
+        port: 80
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: testupstream
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: testupstream
+  template:
+    metadata:
+      labels:
+        app: testupstream
+    spec:
+      containers:
+        - name: testupstream
+          image: docker.io/envoyproxy/ai-gateway-testupstream:latest
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 8080
+          env:
+            - name: TESTUPSTREAM_ID
+              value: whatever
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8080
+            initialDelaySeconds: 1
+            periodSeconds: 1
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: testupstream
+  namespace: default
+spec:
+  selector:
+    app: testupstream
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 8080
+  type: ClusterIP
diff --git a/tests/e2e-upgrade/upgrade_test.go b/tests/e2e-upgrade/upgrade_test.go
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
diff --git a/tests/internal/e2elib/e2elib.go b/tests/internal/e2elib/e2elib.go

Original file line number	Diff line number	Diff line change
`@@ -12,5 +12,5 @@ import (`
`12`	`12`	`)`
`13`	`13`
`14`	`14`	`func TestMain(m *testing.M) {`
`15`		`- e2elib.TestMain(m, nil, true)`
	`15`	`+ e2elib.TestMain(m, e2elib.AIGatewayHelmOption{}, true, false)`
`16`	`16`	`}`