Implement cache validation #8140

Status: Open · wants to merge 3 commits into `master`
4 changes: 4 additions & 0 deletions cluster-autoscaler/FAQ.md
@@ -378,6 +378,10 @@ For example, for a node label of `foo=bar`, you would tag the ASG with:
}
```

When a node group is scaled down to 0, Cluster Autoscaler also removes any cached
node information for that group. The template will be regenerated once new nodes
are created.
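
To make that lifecycle concrete, here is a minimal, self-contained sketch of the invalidation rule this change adds. All names here (`cacheItem`, `invalidateScaledToZero`, and the plain maps standing in for the cloud provider) are hypothetical simplifications; the real logic lives in `MixedTemplateNodeInfoProvider.Process` and consults `NodeGroup.TargetSize()` and `NodeGroup.Nodes()`, as shown in the diff below.

```go
package main

import (
	"fmt"
	"time"
)

// cacheItem is a simplified stand-in for the processor's cached template entry.
type cacheItem struct {
	added time.Time
}

// invalidateScaledToZero mirrors the new behavior: a cached entry is dropped
// only when the group's target size is 0 AND the group currently has no nodes.
// targetSize and liveNodes are hypothetical stand-ins for TargetSize()/Nodes().
func invalidateScaledToZero(cache map[string]cacheItem, targetSize map[string]int, liveNodes map[string]int) {
	for id := range cache {
		if targetSize[id] == 0 && liveNodes[id] == 0 {
			delete(cache, id)
		}
	}
}

func main() {
	cache := map[string]cacheItem{"ng1": {added: time.Now()}}

	// ng1 still has a node (e.g. draining): the cached template must survive.
	invalidateScaledToZero(cache, map[string]int{"ng1": 0}, map[string]int{"ng1": 1})
	fmt.Println(len(cache)) // 1

	// ng1 scaled to zero and empty: the entry is removed and the template
	// will be regenerated once new nodes appear.
	invalidateScaledToZero(cache, map[string]int{"ng1": 0}, map[string]int{"ng1": 0})
	fmt.Println(len(cache)) // 0
}
```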

### How can I prevent Cluster Autoscaler from scaling down a particular node?

From CA 1.0, node will be excluded from scale-down if it has the
@@ -117,6 +117,27 @@ func (p *MixedTemplateNodeInfoProvider) Process(ctx *context.AutoscalingContext,
			p.nodeInfoCache[id] = cacheItem{NodeInfo: nodeInfoCopy, added: time.Now()}
		}
	}

	// Invalidate cache entries for node groups that were scaled down to zero and have no nodes
	for _, nodeGroup := range ctx.CloudProvider.NodeGroups() {
		size, err := nodeGroup.TargetSize()
		if err != nil {
			if instances, errN := nodeGroup.Nodes(); errN == nil {
				size = len(instances)
			} else {
				continue
			}
		}
		// We should only invalidate if both target size is 0 and there are no nodes
		if size == 0 && p.nodeInfoCache != nil {
			// Check if there are any nodes in this group
			instances, err := nodeGroup.Nodes()
			if err == nil && len(instances) == 0 {
				delete(p.nodeInfoCache, nodeGroup.Id())
			}
		}
	}

	for _, nodeGroup := range ctx.CloudProvider.NodeGroups() {
		id := nodeGroup.Id()
		seenGroups[id] = true
@@ -306,6 +306,51 @@ func TestGetNodeInfosCacheExpired(t *testing.T) {

}

func TestCacheEntryRemovedWhenGroupScaledToZero(t *testing.T) {
	now := time.Now()
	n1 := BuildTestNode("n1", 1000, 1000)
	SetNodeReadyState(n1, true, now.Add(-2*time.Minute))

	provider := testprovider.NewTestAutoprovisioningCloudProvider(nil, nil, nil, nil, nil, nil)
	provider.AddNodeGroup("ng1", 0, 10, 1)
	provider.AddNode("ng1", n1)

	podLister := kube_util.NewTestPodLister([]*apiv1.Pod{})
	registry := kube_util.NewListerRegistry(nil, nil, podLister, nil, nil, nil, nil, nil, nil)

	snapshot := testsnapshot.NewTestSnapshotOrDie(t)
	err := snapshot.SetClusterState([]*apiv1.Node{n1}, nil, drasnapshot.Snapshot{})
	assert.NoError(t, err)

	ctx := context.AutoscalingContext{
		CloudProvider:   provider,
		ClusterSnapshot: snapshot,
		AutoscalingKubeClients: context.AutoscalingKubeClients{
			ListerRegistry: registry,
		},
	}

	niProcessor := NewMixedTemplateNodeInfoProvider(&cacheTtl, false)
	_, err = niProcessor.Process(&ctx, []*apiv1.Node{n1}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
	assert.NoError(t, err)
	_, found := niProcessor.nodeInfoCache["ng1"]
	assert.True(t, found)

	// scale node group to zero
	provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(0)
	provider.DeleteNode(n1)

	snapshot = testsnapshot.NewTestSnapshotOrDie(t)
	err = snapshot.SetClusterState([]*apiv1.Node{}, nil, drasnapshot.Snapshot{})
	assert.NoError(t, err)
	ctx.ClusterSnapshot = snapshot

	_, err = niProcessor.Process(&ctx, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
	assert.NoError(t, err)
	_, found = niProcessor.nodeInfoCache["ng1"]
	assert.False(t, found)
}

func assertEqualNodeCapacities(t *testing.T, expected, actual *apiv1.Node) {
	t.Helper()
	assert.NotEqual(t, actual.Status, nil, "")