Commit 7e8cf0b

Merge pull request #8614 from shaikenov/shaikenov-scaledown-unprocessed-node-tracking
feat(nodeScaleDownTime): add a new metric to track unprocessed nodes during scaleDown
2 parents 70caf22 + e86caa9 commit 7e8cf0b

6 files changed: +187 -0 lines changed

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -354,6 +354,8 @@ type AutoscalingOptions struct {
 	CapacitybufferControllerEnabled bool
 	// CapacitybufferPodInjectionEnabled tells if CA should inject fake pods for capacity buffers that are ready for provisioning
 	CapacitybufferPodInjectionEnabled bool
+	// MaxNodeSkipEvalTimeTrackerEnabled is used to enable/disable the tracking of the maximum evaluation time of a node being skipped during ScaleDown.
+	MaxNodeSkipEvalTimeTrackerEnabled bool
 }

 // KubeClientOptions specify options for kube client
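For embedders that build AutoscalingOptions programmatically rather than via flags, setting the new field is the equivalent of the --max-node-skip-eval-time-tracker-enabled flag added in flags.go below. A minimal sketch (not part of the commit), assuming it is compiled inside the cluster-autoscaler module so the config import resolves:

package main

import (
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

func main() {
	// Equivalent of passing --max-node-skip-eval-time-tracker-enabled=true.
	opts := config.AutoscalingOptions{
		MaxNodeSkipEvalTimeTrackerEnabled: true,
	}
	fmt.Println("max node skip eval time tracking enabled:", opts.MaxNodeSkipEvalTimeTrackerEnabled)
}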

cluster-autoscaler/config/flags/flags.go

Lines changed: 2 additions & 0 deletions
@@ -232,6 +232,7 @@ var (
 	nodeDeletionCandidateTTL          = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluster-Autoscaler deployment becomes inactive")
 	capacitybufferControllerEnabled   = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
 	capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
+	maxNodeSkipEvalTimeTrackerEnabled = flag.Bool("max-node-skip-eval-time-tracker-enabled", false, "Whether to enable the tracking of the maximum time of a node being skipped during ScaleDown")

 	// Deprecated flags
 	ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")

@@ -422,6 +423,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		NodeDeletionCandidateTTL:          *nodeDeletionCandidateTTL,
 		CapacitybufferControllerEnabled:   *capacitybufferControllerEnabled,
 		CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
+		MaxNodeSkipEvalTimeTrackerEnabled: *maxNodeSkipEvalTimeTrackerEnabled,
 	}
 }

cluster-autoscaler/core/scaledown/nodeevaltracker/ (new file)

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeevaltracker
+
+import (
+	"time"
+
+	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+)
+
+// MaxNodeSkipEvalTime tracks the longest evaluation delay of a node skipped during ScaleDown.
+type MaxNodeSkipEvalTime struct {
+	// lastEvalTime is the time of the previous currentlyUnneededNodeNames parsing.
+	lastEvalTime time.Time
+	// nodeNamesWithTimeStamps maps node names to the time of their last successful evaluation.
+	nodeNamesWithTimeStamps map[string]time.Time
+}
+
+// NewMaxNodeSkipEvalTime returns a MaxNodeSkipEvalTime with lastEvalTime set to currentTime.
+func NewMaxNodeSkipEvalTime(currentTime time.Time) *MaxNodeSkipEvalTime {
+	return &MaxNodeSkipEvalTime{lastEvalTime: currentTime}
+}
+
+// get retrieves the time of the last evaluation of a node.
+func (l *MaxNodeSkipEvalTime) get(nodeName string) time.Time {
+	if _, ok := l.nodeNamesWithTimeStamps[nodeName]; ok {
+		return l.nodeNamesWithTimeStamps[nodeName]
+	}
+	return l.lastEvalTime
+}
+
+// getMin returns the minimum time in nodeNamesWithTimeStamps, or the time of the last evaluation if the map is empty.
+func (l *MaxNodeSkipEvalTime) getMin() time.Time {
+	minimumTime := l.lastEvalTime
+	for _, val := range l.nodeNamesWithTimeStamps {
+		if minimumTime.After(val) {
+			minimumTime = val
+		}
+	}
+	return minimumTime
+}
+
+// Update returns the longest evaluation delay among the nodes in nodeNames
+// and replaces nodeNamesWithTimeStamps with entries for nodeNames.
+func (l *MaxNodeSkipEvalTime) Update(nodeNames []string, currentTime time.Time) time.Duration {
+	newNodes := make(map[string]time.Time)
+	for _, nodeName := range nodeNames {
+		newNodes[nodeName] = l.get(nodeName)
+	}
+	l.nodeNamesWithTimeStamps = newNodes
+	l.lastEvalTime = currentTime
+	minimumTime := l.getMin()
+	longestDuration := currentTime.Sub(minimumTime)
+	metrics.ObserveMaxNodeSkipEvalDurationSeconds(longestDuration)
+	return longestDuration
+}
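The tracker keeps, for every node that is still being skipped, the timestamp of its last successful evaluation, so the reported duration keeps growing while a node stays skipped and drops back to zero once all candidates are processed again. A minimal sketch of that behavior (not part of the commit), assuming it is compiled inside the cluster-autoscaler module so the import resolves:

package main

import (
	"fmt"
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/nodeevaltracker"
)

func main() {
	start := time.Now()
	tracker := nodeevaltracker.NewMaxNodeSkipEvalTime(start)

	// Iteration 1: n1 and n2 are skipped for the first time; both inherit the
	// previous evaluation time, so the maximum delay is 1s.
	fmt.Println(tracker.Update([]string{"n1", "n2"}, start.Add(1*time.Second))) // 1s

	// Iteration 2: n2 is still skipped and keeps its original timestamp, so the delay grows to 2s.
	fmt.Println(tracker.Update([]string{"n2"}, start.Add(2*time.Second))) // 2s

	// Iteration 3: nothing is skipped; the tracker resets and reports 0s.
	fmt.Println(tracker.Update(nil, start.Add(3*time.Second))) // 0s
}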
cluster-autoscaler/core/scaledown/nodeevaltracker/ (new test file)

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeevaltracker
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMaxNodeSkipEvalTime(t *testing.T) {
+	type testCase struct {
+		name                       string
+		unprocessedNodes           [][]string
+		wantMaxSkipEvalTimeSeconds []int
+	}
+	start := time.Now()
+	testCases := []testCase{
+		{
+			name:                       "Only one node is skipped in one iteration",
+			unprocessedNodes:           [][]string{{}, {"n1"}, {}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{0, 1, 0, 0},
+		},
+		{
+			name:                       "No nodes are skipped in the first iteration",
+			unprocessedNodes:           [][]string{{}, {"n1", "n2"}, {"n2", "n3"}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 0},
+		},
+		{
+			name:                       "Some nodes are skipped in the first iteration",
+			unprocessedNodes:           [][]string{{"n1", "n2"}, {"n1", "n2"}, {"n2", "n3"}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{1, 2, 3, 0},
+		},
+		{
+			name:                       "Overlapping node sets are skipped in different iterations",
+			unprocessedNodes:           [][]string{{}, {"n1", "n2"}, {"n1"}, {"n2"}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 1, 0},
+		},
+		{
+			name:                       "Disjoint node sets are skipped in each iteration",
+			unprocessedNodes:           [][]string{{"n1"}, {"n2"}, {"n3"}, {"n4"}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{1, 1, 1, 1, 0},
+		},
+		{
+			name:                       "None of the nodes are skipped in any iteration",
+			unprocessedNodes:           [][]string{{}, {}, {}},
+			wantMaxSkipEvalTimeSeconds: []int{0, 0, 0},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			timestamp := start
+			maxNodeSkipEvalTime := NewMaxNodeSkipEvalTime(start)
+			for i := 0; i < len(tc.unprocessedNodes); i++ {
+				timestamp = timestamp.Add(1 * time.Second)
+				assert.Equal(t, time.Duration(tc.wantMaxSkipEvalTimeSeconds[i])*time.Second, maxNodeSkipEvalTime.Update(tc.unprocessedNodes[i], timestamp))
+				assert.Equal(t, len(tc.unprocessedNodes[i]), len(maxNodeSkipEvalTime.nodeNamesWithTimeStamps))
+			}
+		})
+	}
+}

cluster-autoscaler/core/scaledown/planner/planner.go

Lines changed: 21 additions & 0 deletions
@@ -26,6 +26,7 @@ import (
 	ca_context "k8s.io/autoscaler/cluster-autoscaler/context"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
+	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/nodeevaltracker"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unneeded"

@@ -76,6 +77,7 @@ type Planner struct {
 	cc                    controllerReplicasCalculator
 	scaleDownSetProcessor nodes.ScaleDownSetProcessor
 	scaleDownContext      *nodes.ScaleDownContext
+	maxNodeSkipEvalTime   *nodeevaltracker.MaxNodeSkipEvalTime
 }

 // New creates a new Planner object.

@@ -91,6 +93,11 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
 		unneededNodes.LoadFromExistingTaints(autoscalingCtx.ListerRegistry, time.Now(), autoscalingCtx.AutoscalingOptions.NodeDeletionCandidateTTL)
 	}

+	var maxNodeSkipEvalTime *nodeevaltracker.MaxNodeSkipEvalTime
+	if autoscalingCtx.AutoscalingOptions.MaxNodeSkipEvalTimeTrackerEnabled {
+		maxNodeSkipEvalTime = nodeevaltracker.NewMaxNodeSkipEvalTime(time.Now())
+	}
+
 	return &Planner{
 		autoscalingCtx:   autoscalingCtx,
 		unremovableNodes: unremovable.NewNodes(),

@@ -104,6 +111,7 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
 		scaleDownSetProcessor: processors.ScaleDownSetProcessor,
 		scaleDownContext:      nodes.NewDefaultScaleDownContext(),
 		minUpdateInterval:     minUpdateInterval,
+		maxNodeSkipEvalTime:   maxNodeSkipEvalTime,
 	}
 }

@@ -277,13 +285,16 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 	}
 	p.nodeUtilizationMap = utilizationMap
 	timer := time.NewTimer(p.autoscalingCtx.ScaleDownSimulationTimeout)
+	var skippedNodes []string

 	for i, node := range currentlyUnneededNodeNames {
 		if timedOut(timer) {
+			skippedNodes = currentlyUnneededNodeNames[i:]
 			klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
 			break
 		}
 		if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
+			skippedNodes = currentlyUnneededNodeNames[i:]
 			klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more. Total atomic scale down nodes: %d", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)
 			break
 		}

@@ -306,6 +317,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
 			p.unremovableNodes.AddTimeout(unremovable, unremovableTimeout)
 		}
 	}
+	p.handleUnprocessedNodes(skippedNodes)
 	p.unneededNodes.Update(removableList, p.latestUpdate)
 	if unremovableCount > 0 {
 		klog.V(1).Infof("%v nodes found to be unremovable in simulation, will re-check them at %v", unremovableCount, unremovableTimeout)

@@ -372,6 +384,15 @@ func (p *Planner) unneededNodesLimit() int {
 	return limit
 }

+// handleUnprocessedNodes tracks the longest time that a node has been skipped during ScaleDown.
+func (p *Planner) handleUnprocessedNodes(unprocessedNodeNames []string) {
+	// If p.maxNodeSkipEvalTime is nil (the flag is disabled), do nothing.
+	if p.maxNodeSkipEvalTime == nil {
+		return
+	}
+	p.maxNodeSkipEvalTime.Update(unprocessedNodeNames, time.Now())
+}
+
 // getKnownOwnerRef returns ownerRef that is known by CA and CA knows the logic of how this controller recreates pods.
 func getKnownOwnerRef(ownerRefs []metav1.OwnerReference) *metav1.OwnerReference {
 	for _, ownerRef := range ownerRefs {
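The bookkeeping added to categorizeNodes is small: whenever the simulation loop bails out early (timeout or the unneeded-nodes limit), the tail of candidate names that was never evaluated is remembered and handed to the tracker. A minimal, self-contained sketch of that pattern using hypothetical stand-ins, not the commit's code:

package main

import "fmt"

func main() {
	currentlyUnneededNodeNames := []string{"n1", "n2", "n3", "n4"}
	var skippedNodes []string
	limit := 2 // hypothetical stand-in for unneededNodesLimit() or the simulation timeout

	for i := range currentlyUnneededNodeNames {
		if i >= limit {
			// Everything from index i onward was not evaluated in this iteration.
			skippedNodes = currentlyUnneededNodeNames[i:]
			break
		}
		// ... per-node scale-down simulation would happen here ...
	}
	fmt.Println("skipped:", skippedNodes) // handed to handleUnprocessedNodes in the real code
}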

cluster-autoscaler/metrics/metrics.go

Lines changed: 15 additions & 0 deletions
@@ -425,6 +425,14 @@ var (
 			Buckets: k8smetrics.ExponentialBuckets(1, 2, 6), // 1, 2, 4, ..., 32
 		}, []string{"instance_type", "cpu_count", "namespace_count"},
 	)
+
+	maxNodeSkipEvalDurationSeconds = k8smetrics.NewGauge(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "max_node_skip_eval_duration_seconds",
+			Help:      "Maximum evaluation time of a node being skipped during ScaleDown.",
+		},
+	)
 )

 // RegisterAll registers all metrics.

@@ -461,6 +469,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(nodeTaintsCount)
 	legacyregistry.MustRegister(inconsistentInstancesMigsCount)
 	legacyregistry.MustRegister(binpackingHeterogeneity)
+	legacyregistry.MustRegister(maxNodeSkipEvalDurationSeconds)

 	if emitPerNodeGroupMetrics {
 		legacyregistry.MustRegister(nodesGroupMinNodes)

@@ -748,3 +757,9 @@ func UpdateInconsistentInstancesMigsCount(migCount int) {
 func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount string, pegCount int) {
 	binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
 }
+
+// ObserveMaxNodeSkipEvalDurationSeconds records the longest time during which a node was skipped during ScaleDown.
+// If a node is skipped multiple times consecutively, only the earliest timestamp is kept.
+func ObserveMaxNodeSkipEvalDurationSeconds(duration time.Duration) {
+	maxNodeSkipEvalDurationSeconds.Set(duration.Seconds())
+}
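The gauge lives under the cluster-autoscaler metrics namespace, so (assuming caNamespace resolves to cluster_autoscaler, as for the other metrics in this package) it is exposed as cluster_autoscaler_max_node_skip_eval_duration_seconds once RegisterAll has run. A minimal sketch (not part of the commit), assuming it is compiled inside the cluster-autoscaler module, showing how a skip duration maps onto the gauge value:

package main

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/metrics"
)

func main() {
	// A node that has been waiting 90s for its scale-down evaluation surfaces
	// as a gauge value of 90 (seconds) on the next scrape.
	metrics.ObserveMaxNodeSkipEvalDurationSeconds(90 * time.Second)
}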
