Merged
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -354,6 +354,8 @@ type AutoscalingOptions struct {
CapacitybufferControllerEnabled bool
// CapacitybufferPodInjectionEnabled tells if CA should inject fake pods for capacity buffers that are ready for provisioning
CapacitybufferPodInjectionEnabled bool
// MaxNodeSkipEvalTimeTrackerEnabled is used to enable/disable tracking of the maximum evaluation time of a node being skipped during ScaleDown.
MaxNodeSkipEvalTimeTrackerEnabled bool
}

// KubeClientOptions specify options for kube client
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/flags/flags.go
@@ -232,6 +232,7 @@ var (
nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluster-Autoscaler deployment becomes inactive")
capacitybufferControllerEnabled = flag.Bool("capacity-buffer-controller-enabled", false, "Whether to enable the default controller for capacity buffers or not")
capacitybufferPodInjectionEnabled = flag.Bool("capacity-buffer-pod-injection-enabled", false, "Whether to enable pod list processor that processes ready capacity buffers and injects fake pods accordingly")
maxNodeSkipEvalTimeTrackerEnabled = flag.Bool("max-node-skip-eval-time-tracker-enabled", false, "Whether to enable tracking of the maximum time for which a node has been skipped during ScaleDown")

// Deprecated flags
ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")
@@ -422,6 +423,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
NodeDeletionCandidateTTL: *nodeDeletionCandidateTTL,
CapacitybufferControllerEnabled: *capacitybufferControllerEnabled,
CapacitybufferPodInjectionEnabled: *capacitybufferPodInjectionEnabled,
MaxNodeSkipEvalTimeTrackerEnabled: *maxNodeSkipEvalTimeTrackerEnabled,
}
}
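
The new flag defaults to false; a hedged sketch of enabling it on the Cluster Autoscaler command line (all other flags omitted):

cluster-autoscaler --max-node-skip-eval-time-tracker-enabled=true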

@@ -0,0 +1,70 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodeevaltracker

import (
"time"

"k8s.io/autoscaler/cluster-autoscaler/metrics"
)

// MaxNodeSkipEvalTime tracks the longest time for which a node has been skipped during ScaleDown evaluation
type MaxNodeSkipEvalTime struct {
// lastEvalTime is the time of the previous pass over currentlyUnneededNodeNames
lastEvalTime time.Time
// nodeNamesWithTimeStamps maps node names to the time of their last successful evaluation
nodeNamesWithTimeStamps map[string]time.Time
}

// NewMaxNodeSkipEvalTime returns a MaxNodeSkipEvalTime with lastEvalTime set to currentTime
func NewMaxNodeSkipEvalTime(currentTime time.Time) *MaxNodeSkipEvalTime {
return &MaxNodeSkipEvalTime{lastEvalTime: currentTime}
}

// get returns the time of the last successful evaluation of a node, or
// lastEvalTime if the node has no recorded entry.
func (l *MaxNodeSkipEvalTime) get(nodeName string) time.Time {
if t, ok := l.nodeNamesWithTimeStamps[nodeName]; ok {
return t
}
return l.lastEvalTime
}

// getMin returns the minimum timestamp in nodeNamesWithTimeStamps, or lastEvalTime if the map is empty
func (l *MaxNodeSkipEvalTime) getMin() time.Time {
minimumTime := l.lastEvalTime
for _, val := range l.nodeNamesWithTimeStamps {
if minimumTime.After(val) {
minimumTime = val
}
}
return minimumTime
}

// Update records nodeNames as skipped at currentTime, preserving the existing timestamp for
// nodes that were already tracked, and returns the longest time for which any tracked node
// has gone unevaluated. The result is also exported as a gauge metric.
func (l *MaxNodeSkipEvalTime) Update(nodeNames []string, currentTime time.Time) time.Duration {
newNodes := make(map[string]time.Time)
for _, nodeName := range nodeNames {
// Nodes that were already skipped keep their old timestamp; newly skipped
// nodes are stamped with the previous evaluation time.
newNodes[nodeName] = l.get(nodeName)
}
l.nodeNamesWithTimeStamps = newNodes
l.lastEvalTime = currentTime
// The longest skip duration is measured from the oldest tracked timestamp.
minimumTime := l.getMin()
longestDuration := currentTime.Sub(minimumTime)
metrics.ObserveMaxNodeSkipEvalDurationSeconds(longestDuration)
return longestDuration
}
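
A minimal usage sketch of the tracker's semantics, assuming t0 := time.Now() and illustrative node names:

tracker := NewMaxNodeSkipEvalTime(t0)
// First loop at t0+1s: n1 and n2 were skipped. Neither has an entry yet, so
// both are stamped with lastEvalTime (t0) and the call returns 1s.
_ = tracker.Update([]string{"n1", "n2"}, t0.Add(1*time.Second))
// Second loop at t0+2s: n1 was evaluated again, n2 is still skipped. n2 keeps
// its t0 stamp, so the returned duration grows.
d := tracker.Update([]string{"n2"}, t0.Add(2*time.Second)) // d == 2 * time.Second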
@@ -0,0 +1,77 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodeevaltracker

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
)

func TestMaxNodeSkipEvalTime(t *testing.T) {
type testCase struct {
name string
unprocessedNodes [][]string
wantMaxSkipEvalTimeSeconds []int
}
start := time.Now()
testCases := []testCase{
{
name: "Only one node is skipped in one iteration",
unprocessedNodes: [][]string{{}, {"n1"}, {}, {}},
wantMaxSkipEvalTimeSeconds: []int{0, 1, 0, 0},
},
{
name: "No nodes are skipped in the first iteration",
unprocessedNodes: [][]string{{}, {"n1", "n2"}, {"n2", "n3"}, {}},
wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 0},
},
{
name: "Some nodes are skipped in the first iteration",
unprocessedNodes: [][]string{{"n1", "n2"}, {"n1", "n2"}, {"n2", "n3"}, {}},
wantMaxSkipEvalTimeSeconds: []int{1, 2, 3, 0},
},
{
name: "Overlapping node sets are skipped in different iteration",
unprocessedNodes: [][]string{{}, {"n1", "n2"}, {"n1"}, {"n2"}, {}},
wantMaxSkipEvalTimeSeconds: []int{0, 1, 2, 1, 0},
},
{
name: "Disjoint node sets are skipped in each iteration",
unprocessedNodes: [][]string{{"n1"}, {"n2"}, {"n3"}, {"n4"}, {}},
wantMaxSkipEvalTimeSeconds: []int{1, 1, 1, 1, 0},
},
{
name: "None of the nodes are skipped in each iteration",
unprocessedNodes: [][]string{{}, {}, {}},
wantMaxSkipEvalTimeSeconds: []int{0, 0, 0},
},
}
for _, tc := range testCases {
tc := tc // capture the range variable for parallel subtests (needed before Go 1.22)
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
timestamp := start
maxNodeSkipEvalTime := NewMaxNodeSkipEvalTime(start)
for i := 0; i < len(tc.unprocessedNodes); i++ {
timestamp = timestamp.Add(1 * time.Second)
assert.Equal(t, time.Duration(tc.wantMaxSkipEvalTimeSeconds[i])*time.Second, maxNodeSkipEvalTime.Update(tc.unprocessedNodes[i], timestamp))
assert.Equal(t, len(tc.unprocessedNodes[i]), len(maxNodeSkipEvalTime.nodeNamesWithTimeStamps))
}
})
}
}
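
Tracing the "Some nodes are skipped in the first iteration" case: the tracker starts at start; in iteration 1 (start+1s) n1 and n2 are stamped with the previous evaluation time, start, so the result is 1s; in iteration 2 (start+2s) both keep the start stamp, giving 2s; in iteration 3 (start+3s) n2 still holds start while n3 is stamped with start+2s, so the minimum stays at start and the result is 3s; in iteration 4 the skipped set is empty and the result drops to 0s.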
21 changes: 21 additions & 0 deletions cluster-autoscaler/core/scaledown/planner/planner.go
@@ -26,6 +26,7 @@ import (
ca_context "k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/nodeevaltracker"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/pdb"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unneeded"
@@ -76,6 +77,7 @@ type Planner struct {
cc controllerReplicasCalculator
scaleDownSetProcessor nodes.ScaleDownSetProcessor
scaleDownContext *nodes.ScaleDownContext
maxNodeSkipEvalTime *nodeevaltracker.MaxNodeSkipEvalTime
}

// New creates a new Planner object.
@@ -91,6 +93,11 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
unneededNodes.LoadFromExistingTaints(autoscalingCtx.ListerRegistry, time.Now(), autoscalingCtx.AutoscalingOptions.NodeDeletionCandidateTTL)
}

var maxNodeSkipEvalTime *nodeevaltracker.MaxNodeSkipEvalTime
if autoscalingCtx.AutoscalingOptions.MaxNodeSkipEvalTimeTrackerEnabled {
maxNodeSkipEvalTime = nodeevaltracker.NewMaxNodeSkipEvalTime(time.Now())
}

return &Planner{
autoscalingCtx: autoscalingCtx,
unremovableNodes: unremovable.NewNodes(),
@@ -104,6 +111,7 @@ func New(autoscalingCtx *ca_context.AutoscalingContext, processors *processors.A
scaleDownSetProcessor: processors.ScaleDownSetProcessor,
scaleDownContext: nodes.NewDefaultScaleDownContext(),
minUpdateInterval: minUpdateInterval,
maxNodeSkipEvalTime: maxNodeSkipEvalTime,
}
}

@@ -277,13 +285,16 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
}
p.nodeUtilizationMap = utilizationMap
timer := time.NewTimer(p.autoscalingCtx.ScaleDownSimulationTimeout)
var skippedNodes []string

for i, node := range currentlyUnneededNodeNames {
if timedOut(timer) {
skippedNodes = currentlyUnneededNodeNames[i:]
klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
break
}
if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
skippedNodes = currentlyUnneededNodeNames[i:]
klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more. Total atomic scale down nodes: %d", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)
break
}
@@ -306,6 +317,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
p.unremovableNodes.AddTimeout(unremovable, unremovableTimeout)
}
}
p.handleUnprocessedNodes(skippedNodes)
p.unneededNodes.Update(removableList, p.latestUpdate)
if unremovableCount > 0 {
klog.V(1).Infof("%v nodes found to be unremovable in simulation, will re-check them at %v", unremovableCount, unremovableTimeout)
@@ -372,6 +384,15 @@ func (p *Planner) unneededNodesLimit() int {
return limit
}

// handleUnprocessedNodes tracks the longest time for which a node has been skipped during ScaleDown.
func (p *Planner) handleUnprocessedNodes(unprocessedNodeNames []string) {
// If p.maxNodeSkipEvalTime is nil, the tracker is disabled by flag; do nothing.
if p.maxNodeSkipEvalTime == nil {
return
}
p.maxNodeSkipEvalTime.Update(unprocessedNodeNames, time.Now())
}

// getKnownOwnerRef returns ownerRef that is known by CA and CA knows the logic of how this controller recreates pods.
func getKnownOwnerRef(ownerRefs []metav1.OwnerReference) *metav1.OwnerReference {
for _, ownerRef := range ownerRefs {
15 changes: 15 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
@@ -425,6 +425,14 @@ var (
Buckets: k8smetrics.ExponentialBuckets(1, 2, 6), // 1, 2, 4, ..., 32
}, []string{"instance_type", "cpu_count", "namespace_count"},
)

maxNodeSkipEvalDurationSeconds = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "max_node_skip_eval_duration_seconds",
Help: "Maximum time for which a node has been skipped during ScaleDown evaluation.",
},
)
)

// RegisterAll registers all metrics.
Expand Down Expand Up @@ -461,6 +469,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodeTaintsCount)
legacyregistry.MustRegister(inconsistentInstancesMigsCount)
legacyregistry.MustRegister(binpackingHeterogeneity)
legacyregistry.MustRegister(maxNodeSkipEvalDurationSeconds)

if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
@@ -748,3 +757,9 @@ func UpdateInconsistentInstancesMigsCount(migCount int) {
func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount string, pegCount int) {
binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
}

// ObserveMaxNodeSkipEvalDurationSeconds records the longest time for which a node was skipped during ScaleDown.
// If a node is skipped in consecutive iterations, only its earliest timestamp is kept.
func ObserveMaxNodeSkipEvalDurationSeconds(duration time.Duration) {
maxNodeSkipEvalDurationSeconds.Set(duration.Seconds())
}
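
Assuming caNamespace resolves to the usual cluster_autoscaler prefix, the gauge should surface as cluster_autoscaler_max_node_skip_eval_duration_seconds; a hypothetical PromQL alert expression (the 30-minute threshold is illustrative):

cluster_autoscaler_max_node_skip_eval_duration_seconds > 1800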