From fa8dbd5762fdfd50b13ae8c3e9d15196b1b264f5 Mon Sep 17 00:00:00 2001
From: Sascha Schwarze <schwarzs@de.ibm.com>
Date: Fri, 19 Jul 2024 14:25:17 +0200
Subject: [PATCH] Add feature flag treat-pod-as-always-schedulable

The feature flag allows to declare that Pods in the system will eventually all get scheduled and Revisions should therefore not be marked unschedulable

Signed-off-by: Sascha Schwarze <schwarzs@de.ibm.com>
---
 config/core/configmaps/features.yaml          | 11 +++++-
 pkg/apis/config/features.go                   |  3 ++
 pkg/apis/config/features_test.go              | 29 +++++++++++++++
 .../revision/reconcile_resources.go           | 12 ++++---
 pkg/reconciler/revision/table_test.go         | 35 ++++++++++++++++++-
 5 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/config/core/configmaps/features.yaml b/config/core/configmaps/features.yaml
index e88b945ddc9d..6061613d7be8 100644
--- a/config/core/configmaps/features.yaml
+++ b/config/core/configmaps/features.yaml
@@ -22,7 +22,7 @@ metadata:
     app.kubernetes.io/component: controller
     app.kubernetes.io/version: devel
   annotations:
-    knative.dev/example-checksum: "9553535b"
+    knative.dev/example-checksum: "beb98480"
 data:
   _example: |-
     ################################
@@ -251,3 +251,12 @@ data:
 
     # Default queue proxy resource requests and limits to good values for most cases if set.
     queueproxy.resource-defaults: "disabled"
+
+    # treat-pod-as-always-schedulable can be used to define that Pods in the system will always be
+    # scheduled, and a Revision should not be marked unschedulable.
+    # Setting this to `enabled` makes sense if you have cluster-autoscaling set up for you cluster
+    # where unschedulable Pods trigger the addition of a new Node and are therefore a short and
+    # transient state.
+    #
+    # See https://github.com/knative/serving/issues/14862
+    treat-pod-as-always-schedulable: "disabled"
diff --git a/pkg/apis/config/features.go b/pkg/apis/config/features.go
index d2d1ab22f7d6..46054e39063b 100644
--- a/pkg/apis/config/features.go
+++ b/pkg/apis/config/features.go
@@ -113,6 +113,7 @@ func defaultFeaturesConfig() *Features {
 		SecurePodDefaults:                Disabled,
 		TagHeaderBasedRouting:            Disabled,
 		AutoDetectHTTP2:                  Disabled,
+		TreatPodAsAlwaysSchedulable:      Disabled,
 	}
 }
 
@@ -129,6 +130,7 @@ func NewFeaturesConfigFromMap(data map[string]string) (*Features, error) {
 		asFlag("queueproxy.resource-defaults", &nc.QueueProxyResourceDefaults),
 		asSecurePodDefaultsFlag("secure-pod-defaults", &nc.SecurePodDefaults),
 		asFlag("tag-header-based-routing", &nc.TagHeaderBasedRouting),
+		asFlag("treat-pod-as-always-schedulable", &nc.TreatPodAsAlwaysSchedulable),
 		asFlag(FeatureContainerSpecAddCapabilities, &nc.ContainerSpecAddCapabilities),
 		asFlag(FeaturePodSpecAffinity, &nc.PodSpecAffinity),
 		asFlag(FeaturePodSpecDNSConfig, &nc.PodSpecDNSConfig),
@@ -198,6 +200,7 @@ type Features struct {
 	SecurePodDefaults                Flag
 	TagHeaderBasedRouting            Flag
 	AutoDetectHTTP2                  Flag
+	TreatPodAsAlwaysSchedulable      Flag
 }
 
 // asFlag parses the value at key as a Flag into the target, if it exists.
diff --git a/pkg/apis/config/features_test.go b/pkg/apis/config/features_test.go
index 49135294b19a..dc2b44786994 100644
--- a/pkg/apis/config/features_test.go
+++ b/pkg/apis/config/features_test.go
@@ -79,6 +79,7 @@ func TestFeaturesConfiguration(t *testing.T) {
 			SecurePodDefaults:                Enabled,
 			QueueProxyResourceDefaults:       Enabled,
 			TagHeaderBasedRouting:            Enabled,
+			TreatPodAsAlwaysSchedulable:      Enabled,
 		}),
 		data: map[string]string{
 			"multi-container":                              "Enabled",
@@ -101,6 +102,7 @@ func TestFeaturesConfiguration(t *testing.T) {
 			"secure-pod-defaults":                          "Enabled",
 			"queueproxy.resource-defaults":                 "Enabled",
 			"tag-header-based-routing":                     "Enabled",
+			"treat-pod-as-always-schedulable":              "Enabled",
 		},
 	}, {
 		name:    "multi-container Allowed",
@@ -759,6 +761,33 @@ func TestFeaturesConfiguration(t *testing.T) {
 		data: map[string]string{
 			"tag-header-based-routing": "Restricted",
 		},
+	}, {
+		name:    "treat-pod-as-always-schedulable Allowed",
+		wantErr: false,
+		wantFeatures: defaultWith(&Features{
+			TreatPodAsAlwaysSchedulable: Allowed,
+		}),
+		data: map[string]string{
+			"treat-pod-as-always-schedulable": "Allowed",
+		},
+	}, {
+		name:    "treat-pod-as-always-schedulable Enabled",
+		wantErr: false,
+		wantFeatures: defaultWith(&Features{
+			TreatPodAsAlwaysSchedulable: Enabled,
+		}),
+		data: map[string]string{
+			"treat-pod-as-always-schedulable": "Enabled",
+		},
+	}, {
+		name:    "treat-pod-as-always-schedulable Disabled",
+		wantErr: false,
+		wantFeatures: defaultWith(&Features{
+			TreatPodAsAlwaysSchedulable: Disabled,
+		}),
+		data: map[string]string{
+			"treat-pod-as-always-schedulable": "Disabled",
+		},
 	}}
 
 	for _, tt := range configTests {
diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go
index cdb90c553794..83a761e3844f 100644
--- a/pkg/reconciler/revision/reconcile_resources.go
+++ b/pkg/reconciler/revision/reconcile_resources.go
@@ -36,6 +36,7 @@ import (
 	"knative.dev/pkg/kmp"
 	"knative.dev/pkg/logging"
 	"knative.dev/pkg/logging/logkey"
+	apicfg "knative.dev/serving/pkg/apis/config"
 	v1 "knative.dev/serving/pkg/apis/serving/v1"
 	"knative.dev/serving/pkg/networking"
 	"knative.dev/serving/pkg/reconciler/revision/config"
@@ -90,10 +91,13 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 
 			// Update the revision status if pod cannot be scheduled (possibly resource constraints)
 			// If pod cannot be scheduled then we expect the container status to be empty.
-			for _, cond := range pod.Status.Conditions {
-				if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
-					rev.Status.MarkResourcesAvailableFalse(cond.Reason, cond.Message)
-					break
+			treatPodAsAlwaysSchedulable := config.FromContext(ctx).Features.TreatPodAsAlwaysSchedulable
+			if treatPodAsAlwaysSchedulable != apicfg.Enabled {
+				for _, cond := range pod.Status.Conditions {
+					if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
+						rev.Status.MarkResourcesAvailableFalse(cond.Reason, cond.Message)
+						break
+					}
 				}
 			}
 
diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go
index 016828a5b081..45557feb79be 100644
--- a/pkg/reconciler/revision/table_test.go
+++ b/pkg/reconciler/revision/table_test.go
@@ -616,6 +616,32 @@ func TestReconcile(t *testing.T) {
 			Object: pa("foo", "pod-schedule-error", WithReachabilityUnreachable),
 		}},
 		Key: "foo/pod-schedule-error",
+	}, {
+		Name: "surface no pod schedule errors if treat-pod-as-always-schedulable is enabled",
+		// Test the propagation of the scheduling errors of Pod into the
+		// revision is not happening when treat-pod-as-always-schedulable
+		// is enabled.
+		Objects: []runtime.Object{
+			Revision("foo", "pod-no-schedule-error",
+				WithLogURL,
+				MarkActivating("Deploying", ""),
+				WithRoutingState(v1.RoutingStateActive, fc),
+				withDefaultContainerStatuses(),
+				MarkDeploying("Deploying"),
+				WithRevisionObservedGeneration(1),
+				MarkContainerHealthyUnknown("Deploying"),
+			),
+			pa("foo", "pod-no-schedule-error", WithReachabilityReachable), // PA can't be ready, since no traffic.
+			pod(t, "foo", "pod-no-schedule-error", WithUnschedulableContainer("Insufficient energy", "Unschedulable")),
+			deploy(t, "foo", "pod-no-schedule-error"),
+			image("foo", "pod-no-schedule-error"),
+		},
+		Ctx: defaultconfig.ToContext(context.Background(), &defaultconfig.Config{
+			Features: &defaultconfig.Features{
+				TreatPodAsAlwaysSchedulable: defaultconfig.Enabled,
+			},
+		}),
+		Key: "foo/pod-no-schedule-error",
 	}, {
 		Name: "ready steady state",
 		// Test the transition that Reconcile makes when Endpoints become ready on the
@@ -813,11 +839,18 @@ func TestReconcile(t *testing.T) {
 			resolver:            &nopResolver{},
 		}
 
+		cfg := reconcilerTestConfig()
+
+		apiCfgOverride := defaultconfig.FromContext(ctx)
+		if apiCfgOverride != nil {
+			cfg.Config.Features = apiCfgOverride.Features
+		}
+
 		return revisionreconciler.NewReconciler(ctx, logging.FromContext(ctx), servingclient.Get(ctx),
 			listers.GetRevisionLister(), controller.GetEventRecorder(ctx), r,
 			controller.Options{
 				ConfigStore: &testConfigStore{
-					config: reconcilerTestConfig(),
+					config: cfg,
 				},
 			})
 	}))