
Commit 54868c2

Keep queue-proxy admin server on HTTP for PreStop hooks (#16163)
The queue-proxy admin server now always serves HTTP on port 8022, even when system-internal-tls is enabled. This simplifies the PreStop hook configuration and fixes graceful shutdown issues.

Changes:
- Queue-proxy admin server always uses HTTP; only the main server uses TLS
- PreStop hooks always use the HTTP scheme (removed dynamic configuration)
- Updated tests to reflect that the admin server is always HTTP

Why this approach:
- PreStop hooks are called by the kubelet locally within the pod (localhost)
- No network traffic leaves the pod, so TLS isn't needed for security
- Simpler implementation with no dynamic scheme configuration
- Prevents TLS handshake errors during pod shutdown

This fixes the issue where pods would receive HTTP 502 errors during scale-down operations when system-internal-tls was enabled. The error occurred because the PreStop hook tried to connect over HTTP to a TLS-enabled admin server, causing an immediate SIGTERM and dropped requests.
1 parent 3dd1e41 commit 54868c2
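For context, a minimal sketch of what the always-HTTP PreStop hook described in the commit message might look like when built with the Kubernetes Go API types. Only the port (8022) and the fixed HTTP scheme come from the commit message; the drain path "/wait-for-drain" and the helper name buildPreStopHook are assumptions for illustration, not code from this commit.

// Hypothetical sketch (not from this commit): the PreStop hook the revision
// pod might carry after this change. The scheme is always HTTP because the
// kubelet invokes the hook on localhost inside the pod.
package sketch

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func buildPreStopHook() *corev1.Lifecycle {
	return &corev1.Lifecycle{
		PreStop: &corev1.LifecycleHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Scheme: corev1.URISchemeHTTP, // always HTTP, even with system-internal-tls
				Port:   intstr.FromInt(8022), // queue-proxy admin port from the commit message
				Path:   "/wait-for-drain",    // assumed drain endpoint path
			},
		},
	}
}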

2 files changed: 122 additions, 5 deletions


pkg/queue/sharedmain/main.go

Lines changed: 2 additions & 5 deletions
@@ -255,7 +255,7 @@ func Main(opts ...Option) error {
 	}
 
 	if env.Observability.Runtime.ProfilingEnabled() {
-		logger.Info("Rutime profiling enabled")
+		logger.Info("Runtime profiling enabled")
 		pprof := runtime.NewProfilingServer()
 		pprof.SetEnabled(true)
 		httpServers["profile"] = pprof.Server
@@ -267,16 +267,13 @@ func Main(opts ...Option) error {
 
 	if tlsEnabled {
 		tlsServers["main"] = mainServer(":"+env.QueueServingTLSPort, mainHandler)
-		tlsServers["admin"] = adminServer(":"+strconv.Itoa(networking.QueueAdminPort), adminHandler)
+		// Keep admin server on HTTP even with TLS enabled since it's only accessed locally by kubelet
 
 		certWatcher, err = certificate.NewCertWatcher(certPath, keyPath, 1*time.Minute, logger)
 		if err != nil {
 			logger.Fatal("failed to create certWatcher", zap.Error(err))
 		}
 		defer certWatcher.Stop()
-
-		// Drop admin http server since the admin TLS server is listening on the same port
-		delete(httpServers, "admin")
 	}
 
 	logger.Info("Starting queue-proxy")
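To make the shape of the main.go change clearer, here is a minimal sketch (not the actual Knative code) of the resulting server split: the admin server always stays in the plain-HTTP map, and only the main request path moves to TLS. The splitServers helper, handler parameters, and the port constants (8012/8112/8022 as the conventional queue-proxy defaults) are illustrative assumptions.

// Illustrative sketch only: how the HTTP/TLS server maps end up divided after
// this commit. The admin server never moves into the TLS map.
package sketch

import "net/http"

func splitServers(tlsEnabled bool, mainHandler, adminHandler http.Handler) (httpServers, tlsServers map[string]*http.Server) {
	httpServers = map[string]*http.Server{
		// Admin server: always plain HTTP on the queue admin port; it is only
		// reached by the kubelet from inside the pod, so TLS buys nothing and
		// previously caused handshake failures in the PreStop hook.
		"admin": {Addr: ":8022", Handler: adminHandler},
	}
	tlsServers = map[string]*http.Server{}

	if tlsEnabled {
		// Only the user-facing request path terminates TLS.
		tlsServers["main"] = &http.Server{Addr: ":8112", Handler: mainHandler}
	} else {
		httpServers["main"] = &http.Server{Addr: ":8012", Handler: mainHandler}
	}
	return httpServers, tlsServers
}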

test/e2e/systeminternaltls/system_internal_tls_test.go

Lines changed: 120 additions & 0 deletions
@@ -21,10 +21,13 @@ package systeminternaltls
 
 import (
 	"context"
+	"fmt"
+	"net/http"
 	"net/url"
 	"os"
 	"strings"
 	"testing"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
@@ -253,3 +256,120 @@ func matchTLSLog(line string) bool {
 func matchCertReloadLog(line string) bool {
 	return strings.Contains(line, certificate.CertReloadMessage)
 }
+
+// TestGracefulShutdownWithTLS tests that PreStop hooks work correctly with system-internal-tls enabled.
+// This is a regression test for https://github.com/knative/serving/issues/16162
+// where PreStop hooks would fail with TLS handshake errors, causing HTTP 502 errors during scale-down.
+func TestGracefulShutdownWithTLS(t *testing.T) {
+	if !test.ServingFlags.EnableAlphaFeatures {
+		t.Skip("Alpha features not enabled")
+	}
+
+	if !strings.Contains(test.ServingFlags.IngressClass, "kourier") &&
+		!strings.Contains(test.ServingFlags.IngressClass, "contour") {
+		t.Skip("Skip this test for non-kourier/contour ingress.")
+	}
+
+	// Not running in parallel on purpose - we're testing pod deletion.
+	clients := test.Setup(t)
+
+	names := test.ResourceNames{
+		Service: test.ObjectNameForTest(t),
+		Image:   test.Autoscale,
+	}
+	test.EnsureTearDown(t, clients, &names)
+
+	// Create a service with a reasonable timeout
+	const revisionTimeout = 5 * time.Minute
+	objects, err := v1test.CreateServiceReady(t, clients, &names,
+		rtesting.WithRevisionTimeoutSeconds(int64(revisionTimeout.Seconds())))
+	if err != nil {
+		t.Fatal("Failed to create a service:", err)
+	}
+	routeURL := objects.Route.Status.URL.URL()
+
+	// Verify the service is working
+	if _, err = pkgTest.CheckEndpointState(
+		context.Background(),
+		clients.KubeClient,
+		t.Logf,
+		routeURL,
+		spoof.IsStatusOK,
+		"RouteServes",
+		test.ServingFlags.ResolvableDomain,
+		test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS),
+	); err != nil {
+		t.Fatalf("The endpoint for Route %s at %s didn't serve correctly: %v", names.Route, routeURL, err)
+	}
+
+	// Get the pod
+	pods, err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).List(context.Background(), v1.ListOptions{
+		LabelSelector: "serving.knative.dev/revision=" + objects.Revision.Name,
+	})
+	if err != nil || len(pods.Items) == 0 {
+		t.Fatal("No pods or error:", err)
+	}
+	t.Logf("Saw %d pods", len(pods.Items))
+
+	// Prepare a long-running request (12+ seconds)
+	// NOTE: 12s + 6s must be less than drainSleepDuration and TERMINATION_DRAIN_DURATION_SECONDS.
+	u, _ := url.Parse(routeURL.String())
+	q := u.Query()
+	q.Set("sleep", "12001")
+	u.RawQuery = q.Encode()
+
+	httpClient, err := pkgTest.NewSpoofingClient(context.Background(), clients.KubeClient, t.Logf, u.Hostname(), test.ServingFlags.ResolvableDomain, test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS))
+	if err != nil {
+		t.Fatal("Error creating spoofing client:", err)
+	}
+
+	// Start multiple long-running requests
+	ctx := context.Background()
+	numRequests := 6
+	requestErrors := make(chan error, numRequests)
+
+	for i := range numRequests {
+		// Request number starts at 1
+		reqNum := i + 1
+
+		t.Logf("Starting request %d", reqNum)
+		go func() {
+			req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
+			if err != nil {
+				requestErrors <- fmt.Errorf("request %d: failed to create HTTP request: %w", reqNum, err)
+				return
+			}
+
+			res, err := httpClient.Do(req)
+			t.Logf("Request %d completed", reqNum)
+			if err != nil {
+				requestErrors <- fmt.Errorf("request %d: request failed: %w", reqNum, err)
+				return
+			}
+			if res.StatusCode != http.StatusOK {
+				requestErrors <- fmt.Errorf("request %d: status = %v, want StatusOK (this could indicate PreStop hook failure)", reqNum, res.StatusCode)
+				return
+			}
+			requestErrors <- nil
+		}()
+		time.Sleep(time.Second)
+	}
+
+	// Immediately delete the pod while requests are in flight
+	// This triggers the PreStop hook which must use HTTP (not TLS) to drain connections
+	podToDelete := pods.Items[0].Name
+	t.Logf("Deleting pod %q while requests are in flight", podToDelete)
+	if err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).Delete(context.Background(), podToDelete, v1.DeleteOptions{}); err != nil {
+		t.Fatal("Failed to delete pod:", err)
+	}
+
+	// Wait for all requests to complete and check for errors
+	t.Log("Waiting for all requests to complete...")
+	for i := range numRequests {
+		if err := <-requestErrors; err != nil {
+			t.Errorf("Request %d: %v", i+1, err)
+		}
+	}
+
+	t.Log("All requests completed successfully - PreStop hook worked correctly with TLS enabled")
+}
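The test's timing comment (12 seconds of sleep plus 6 seconds of request ramp-up must stay under the drain window) can be checked with simple arithmetic. The sketch below is illustrative only; the 30-second drain window is an assumed value, not a constant read from the Knative codebase.

// Illustrative budget check, assuming a 30s drain window.
package sketch

import "time"

func drainBudgetHolds() bool {
	requestSleep := 12001 * time.Millisecond // per-request sleep the test asks for via ?sleep=12001
	rampUp := 6 * time.Second                // six requests started one second apart
	drainWindow := 30 * time.Second          // assumed drainSleepDuration / TERMINATION_DRAIN_DURATION_SECONDS

	// Every in-flight request must finish before the drain window closes;
	// otherwise SIGTERM would cut it off and the test would see non-200 responses.
	return requestSleep+rampUp < drainWindow
}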
