From cb2fc6aac50f2f2f57ee5b10c56d4ee8fd2ad2bf Mon Sep 17 00:00:00 2001 From: jukie <10012479+jukie@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:27:11 -0700 Subject: [PATCH 1/3] wait for cache sync before ready Signed-off-by: jukie <10012479+jukie@users.noreply.github.com> --- internal/provider/kubernetes/kubernetes.go | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/internal/provider/kubernetes/kubernetes.go b/internal/provider/kubernetes/kubernetes.go index 0163fb2d496..d3d1873de47 100644 --- a/internal/provider/kubernetes/kubernetes.go +++ b/internal/provider/kubernetes/kubernetes.go @@ -8,6 +8,7 @@ package kubernetes import ( "context" "fmt" + "net/http" "time" appsv1 "k8s.io/api/apps/v1" @@ -65,6 +66,28 @@ var ( webhookTLSPort = 9443 ) +// cacheReadyCheck returns a healthz.Checker that verifies the manager's cache has synced. +// This ensures the control plane has populated its cache with all resources from the API server +// before reporting ready. This prevents serving inconsistent xDS configuration to Envoy proxies +// when running multiple control plane replicas during periods of resource churn. +func cacheReadyCheck(mgr manager.Manager) healthz.Checker { + return func(req *http.Request) error { + // Use a short timeout to avoid blocking the health check indefinitely. + // The readiness probe will retry periodically until the cache syncs. + // + // TODO: For v1.7.0 Make configurable via API and align with helm container readiness probe timeout. + ctx, cancel := context.WithTimeout(req.Context(), 1*time.Second) + defer cancel() + + // WaitForCacheSync returns true if the cache has synced, false if the context is cancelled. + if !mgr.GetCache().WaitForCacheSync(ctx) { + return fmt.Errorf("cache not synced yet") + } + + return nil + } +} + // New creates a new Provider from the provided EnvoyGateway. 
func New(ctx context.Context, restCfg *rest.Config, svrCfg *ec.Server, resources *message.ProviderResources) (*Provider, error) { // TODO: Decide which mgr opts should be exposed through envoygateway.provider.kubernetes API. @@ -199,7 +222,10 @@ func New(ctx context.Context, restCfg *rest.Config, svrCfg *ec.Server, resources } // Add ready check health probes. - if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + // Use a custom readiness check that waits for the cache to sync before reporting ready. + // This ensures the control plane has a consistent view of all resources before serving + // xDS configuration to proxies, preventing inconsistent state when multiple replicas exist. + if err := mgr.AddReadyzCheck("cache-sync", cacheReadyCheck(mgr)); err != nil { return nil, fmt.Errorf("unable to set up ready check: %w", err) } From e21c5453ca672d5d8e3d9425398d6f9db2be5a14 Mon Sep 17 00:00:00 2001 From: jukie <10012479+jukie@users.noreply.github.com> Date: Tue, 4 Nov 2025 15:26:33 -0700 Subject: [PATCH 2/3] leader election and warmup Signed-off-by: jukie <10012479+jukie@users.noreply.github.com> --- internal/provider/kubernetes/kubernetes.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/internal/provider/kubernetes/kubernetes.go b/internal/provider/kubernetes/kubernetes.go index d3d1873de47..d54378f6716 100644 --- a/internal/provider/kubernetes/kubernetes.go +++ b/internal/provider/kubernetes/kubernetes.go @@ -96,8 +96,10 @@ func New(ctx context.Context, restCfg *rest.Config, svrCfg *ec.Server, resources Scheme: envoygateway.GetScheme(), Logger: svrCfg.Logger.Logger, HealthProbeBindAddress: healthProbeBindAddress, + LeaderElection: false, LeaderElectionID: "5b9825d2.gateway.envoyproxy.io", LeaderElectionNamespace: svrCfg.ControllerNamespace, + Controller: config.Controller{NeedLeaderElection: ptr.To(false)}, } log.SetLogger(mgrOpts.Logger) @@ -107,6 +109,9 @@ func New(ctx context.Context, restCfg *rest.Config, svrCfg 
*ec.Server, resources if !ptr.Deref(svrCfg.EnvoyGateway.Provider.Kubernetes.LeaderElection.Disable, false) { mgrOpts.LeaderElection = true + mgrOpts.Controller.NeedLeaderElection = ptr.To(true) + mgrOpts.Controller.EnableWarmup = ptr.To(true) + if svrCfg.EnvoyGateway.Provider.Kubernetes.LeaderElection.LeaseDuration != nil { ld, err := time.ParseDuration(string(*svrCfg.EnvoyGateway.Provider.Kubernetes.LeaderElection.LeaseDuration)) if err != nil { @@ -130,7 +135,6 @@ func New(ctx context.Context, restCfg *rest.Config, svrCfg *ec.Server, resources } mgrOpts.RenewDeadline = ptr.To(rd) } - mgrOpts.Controller = config.Controller{NeedLeaderElection: ptr.To(false)} } if svrCfg.EnvoyGateway.Provider.Kubernetes.CacheSyncPeriod != nil { From 242815e25792bcf7e3077dcb9b9b1393032064b3 Mon Sep 17 00:00:00 2001 From: jukie <10012479+jukie@users.noreply.github.com> Date: Tue, 4 Nov 2025 15:41:34 -0700 Subject: [PATCH 3/3] release note Signed-off-by: jukie <10012479+jukie@users.noreply.github.com> --- release-notes/current.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/release-notes/current.yaml b/release-notes/current.yaml index 32c964f90bc..019964fa0a1 100644 --- a/release-notes/current.yaml +++ b/release-notes/current.yaml @@ -15,6 +15,7 @@ bug fixes: | - Fixed Listener port limit typo 65353 -> 65535. - Fixed issue where reloading invalid envoy gateway configuration. - Fixed missing JWT provider configuration when JWT authentication is configured on multiple HTTP listeners sharing the same port. + - Fixed a controller configuration issue that was leading to inconsistent xDS state. # Enhancements that improve performance. performance improvements: |