
Commit 54868c2

Keep queue-proxy admin server on HTTP for PreStop hooks (#16163)
The queue-proxy admin server now always serves HTTP on port 8022, even when system-internal-tls is enabled. This simplifies the PreStop hook configuration and fixes graceful shutdown issues.

Changes:
- Queue-proxy admin server always uses HTTP; only the main server uses TLS
- PreStop hooks always use the HTTP scheme (removed dynamic configuration)
- Updated tests to reflect that the admin server is always HTTP

Why this approach:
- PreStop hooks are called by the kubelet locally within the pod (localhost)
- No network traffic leaves the pod, so TLS isn't needed for security
- Simpler implementation with no dynamic scheme configuration
- Prevents TLS handshake errors during pod shutdown

This fixes the issue where pods would receive HTTP 502 errors during scale-down operations when system-internal-tls was enabled. The error occurred because the PreStop hook tried to connect over HTTP to a TLS-enabled admin server, causing an immediate SIGTERM and dropped requests.
1 parent 3dd1e41 commit 54868c2
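For context, a minimal sketch of what the always-HTTP PreStop hook described in the commit message might look like when built with the Kubernetes Go API types. Only the port (8022) and the fixed HTTP scheme come from the commit message; the drain path "/wait-for-drain" and the helper name buildPreStopHook are assumptions for illustration, not code from this commit.

// Hypothetical sketch (not from this commit): the PreStop hook the revision
// pod might carry after this change. The scheme is always HTTP because the
// kubelet invokes the hook on localhost inside the pod.
package sketch

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func buildPreStopHook() *corev1.Lifecycle {
	return &corev1.Lifecycle{
		PreStop: &corev1.LifecycleHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Scheme: corev1.URISchemeHTTP, // always HTTP, even with system-internal-tls
				Port:   intstr.FromInt(8022), // queue-proxy admin port from the commit message
				Path:   "/wait-for-drain",    // assumed drain endpoint path
			},
		},
	}
}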

2 files changed: 122 additions, 5 deletions


pkg/queue/sharedmain/main.go

Lines changed: 2 additions & 5 deletions
@@ -255,7 +255,7 @@ func Main(opts ...Option) error {
 	}
 
 	if env.Observability.Runtime.ProfilingEnabled() {
-		logger.Info("Rutime profiling enabled")
+		logger.Info("Runtime profiling enabled")
 		pprof := runtime.NewProfilingServer()
 		pprof.SetEnabled(true)
 		httpServers["profile"] = pprof.Server
@@ -267,16 +267,13 @@ func Main(opts ...Option) error {
 
 	if tlsEnabled {
 		tlsServers["main"] = mainServer(":"+env.QueueServingTLSPort, mainHandler)
-		tlsServers["admin"] = adminServer(":"+strconv.Itoa(networking.QueueAdminPort), adminHandler)
+		// Keep admin server on HTTP even with TLS enabled since it's only accessed locally by kubelet
 
 		certWatcher, err = certificate.NewCertWatcher(certPath, keyPath, 1*time.Minute, logger)
 		if err != nil {
 			logger.Fatal("failed to create certWatcher", zap.Error(err))
 		}
 		defer certWatcher.Stop()
-
-		// Drop admin http server since the admin TLS server is listening on the same port
-		delete(httpServers, "admin")
 	}
 
 	logger.Info("Starting queue-proxy")
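To make the shape of the main.go change clearer, here is a minimal sketch (not the actual Knative code) of the resulting server split: the admin server always stays in the plain-HTTP map, and only the main request path moves to TLS. The splitServers helper, handler parameters, and the port constants (8012/8112/8022 as the conventional queue-proxy defaults) are illustrative assumptions.

// Illustrative sketch only: how the HTTP/TLS server maps end up divided after
// this commit. The admin server never moves into the TLS map.
package sketch

import "net/http"

func splitServers(tlsEnabled bool, mainHandler, adminHandler http.Handler) (httpServers, tlsServers map[string]*http.Server) {
	httpServers = map[string]*http.Server{
		// Admin server: always plain HTTP on the queue admin port; it is only
		// reached by the kubelet from inside the pod, so TLS buys nothing and
		// previously caused handshake failures in the PreStop hook.
		"admin": {Addr: ":8022", Handler: adminHandler},
	}
	tlsServers = map[string]*http.Server{}

	if tlsEnabled {
		// Only the user-facing request path terminates TLS.
		tlsServers["main"] = &http.Server{Addr: ":8112", Handler: mainHandler}
	} else {
		httpServers["main"] = &http.Server{Addr: ":8012", Handler: mainHandler}
	}
	return httpServers, tlsServers
}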

test/e2e/systeminternaltls/system_internal_tls_test.go

Lines changed: 120 additions & 0 deletions
@@ -21,10 +21,13 @@ package systeminternaltls
 
 import (
 	"context"
+	"fmt"
+	"net/http"
 	"net/url"
 	"os"
 	"strings"
 	"testing"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
@@ -253,3 +256,120 @@ func matchTLSLog(line string) bool {
 func matchCertReloadLog(line string) bool {
 	return strings.Contains(line, certificate.CertReloadMessage)
 }
+
+// TestGracefulShutdownWithTLS tests that PreStop hooks work correctly with system-internal-tls enabled.
+// This is a regression test for https://github.com/knative/serving/issues/16162
+// where PreStop hooks would fail with TLS handshake errors, causing HTTP 502 errors during scale-down.
+func TestGracefulShutdownWithTLS(t *testing.T) {
+	if !test.ServingFlags.EnableAlphaFeatures {
+		t.Skip("Alpha features not enabled")
+	}
+
+	if !strings.Contains(test.ServingFlags.IngressClass, "kourier") &&
+		!strings.Contains(test.ServingFlags.IngressClass, "contour") {
+		t.Skip("Skip this test for non-kourier/contour ingress.")
+	}
+
+	// Not running in parallel on purpose - we're testing pod deletion.
+	clients := test.Setup(t)
+
+	names := test.ResourceNames{
+		Service: test.ObjectNameForTest(t),
+		Image:   test.Autoscale,
+	}
+	test.EnsureTearDown(t, clients, &names)
+
+	// Create a service with a reasonable timeout
+	const revisionTimeout = 5 * time.Minute
+	objects, err := v1test.CreateServiceReady(t, clients, &names,
+		rtesting.WithRevisionTimeoutSeconds(int64(revisionTimeout.Seconds())))
+	if err != nil {
+		t.Fatal("Failed to create a service:", err)
+	}
+	routeURL := objects.Route.Status.URL.URL()
+
+	// Verify the service is working
+	if _, err = pkgTest.CheckEndpointState(
+		context.Background(),
+		clients.KubeClient,
+		t.Logf,
+		routeURL,
+		spoof.IsStatusOK,
+		"RouteServes",
+		test.ServingFlags.ResolvableDomain,
+		test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS),
+	); err != nil {
+		t.Fatalf("The endpoint for Route %s at %s didn't serve correctly: %v", names.Route, routeURL, err)
+	}
+
+	// Get the pod
+	pods, err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).List(context.Background(), v1.ListOptions{
+		LabelSelector: "serving.knative.dev/revision=" + objects.Revision.Name,
+	})
+	if err != nil || len(pods.Items) == 0 {
+		t.Fatal("No pods or error:", err)
+	}
+	t.Logf("Saw %d pods", len(pods.Items))
+
+	// Prepare a long-running request (12+ seconds)
+	// NOTE: 12s + 6s must be less than drainSleepDuration and TERMINATION_DRAIN_DURATION_SECONDS.
+	u, _ := url.Parse(routeURL.String())
+	q := u.Query()
+	q.Set("sleep", "12001")
+	u.RawQuery = q.Encode()
+
+	httpClient, err := pkgTest.NewSpoofingClient(context.Background(), clients.KubeClient, t.Logf, u.Hostname(), test.ServingFlags.ResolvableDomain, test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS))
+	if err != nil {
+		t.Fatal("Error creating spoofing client:", err)
+	}
+
+	// Start multiple long-running requests
+	ctx := context.Background()
+	numRequests := 6
+	requestErrors := make(chan error, numRequests)
+
+	for i := range numRequests {
+		// Request number starts at 1
+		reqNum := i + 1
+
+		t.Logf("Starting request %d", reqNum)
+		go func() {
+			req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
+			if err != nil {
+				requestErrors <- fmt.Errorf("request %d: failed to create HTTP request: %w", reqNum, err)
+				return
+			}
+
+			res, err := httpClient.Do(req)
+			t.Logf("Request %d completed", reqNum)
+			if err != nil {
+				requestErrors <- fmt.Errorf("request %d: request failed: %w", reqNum, err)
+				return
+			}
+			if res.StatusCode != http.StatusOK {
+				requestErrors <- fmt.Errorf("request %d: status = %v, want StatusOK (this could indicate PreStop hook failure)", reqNum, res.StatusCode)
+				return
+			}
+			requestErrors <- nil
+		}()
+		time.Sleep(time.Second)
+	}
+
+	// Immediately delete the pod while requests are in flight
+	// This triggers the PreStop hook which must use HTTP (not TLS) to drain connections
+	podToDelete := pods.Items[0].Name
+	t.Logf("Deleting pod %q while requests are in flight", podToDelete)
+	if err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).Delete(context.Background(), podToDelete, v1.DeleteOptions{}); err != nil {
+		t.Fatal("Failed to delete pod:", err)
+	}
+
+	// Wait for all requests to complete and check for errors
+	t.Log("Waiting for all requests to complete...")
+	for i := range numRequests {
+		if err := <-requestErrors; err != nil {
+			t.Errorf("Request %d: %v", i+1, err)
+		}
+	}
+
+	t.Log("All requests completed successfully - PreStop hook worked correctly with TLS enabled")
+}
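The test's timing comment (12 seconds of sleep plus 6 seconds of request ramp-up must stay under the drain window) can be checked with simple arithmetic. The sketch below is illustrative only; the 30-second drain window is an assumed value, not a constant read from the Knative codebase.

// Illustrative budget check, assuming a 30s drain window.
package sketch

import "time"

func drainBudgetHolds() bool {
	requestSleep := 12001 * time.Millisecond // per-request sleep the test asks for via ?sleep=12001
	rampUp := 6 * time.Second                // six requests started one second apart
	drainWindow := 30 * time.Second          // assumed drainSleepDuration / TERMINATION_DRAIN_DURATION_SECONDS

	// Every in-flight request must finish before the drain window closes;
	// otherwise SIGTERM would cut it off and the test would see non-200 responses.
	return requestSleep+rampUp < drainWindow
}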
