Skip to content

Commit 8891f87

Browse files
committed
feat: Added Disruption control for Sandbox
feat: added PDB to Sandbox spec updated rbac generated file nit nit
1 parent 83e8460 commit 8891f87

File tree

7 files changed

+234
-3
lines changed

7 files changed

+234
-3
lines changed

api/v1alpha1/sandbox_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ type PersistentVolumeClaimTemplate struct {
9090
Spec corev1.PersistentVolumeClaimSpec `json:"spec" protobuf:"bytes,3,opt,name=spec"`
9191
}
9292

93+
// ResilienceLevel defines the desired level of resilience for a Sandbox.
// The zero value ("") requests no disruption protection: the controller only
// creates a PodDisruptionBudget and sets the safe-to-evict annotation when the
// level is ResilienceLevelHigh.
94+
type ResilienceLevel string
95+
96+
const (
97+
// ResilienceLevelHigh indicates the Sandbox should be protected from voluntary disruptions.
98+
ResilienceLevelHigh ResilienceLevel = "High"
99+
)
100+
93101
// SandboxSpec defines the desired state of Sandbox
94102
type SandboxSpec struct {
95103
// The following markers will use OpenAPI v3 schema to validate the value
@@ -117,6 +125,11 @@ type SandboxSpec struct {
117125
// +kubebuilder:validation:Maximum=1
118126
// +optional
119127
Replicas *int32 `json:"replicas,omitempty"`
128+
// Resilience defines the desired level of resilience for the Sandbox Pod.
129+
// When set to "High", a PodDisruptionBudget is created to prevent voluntary
130+
// disruptions and an annotation is added to prevent cluster-autoscaler evictions.
131+
// +optional
132+
Resilience ResilienceLevel `json:"resilience,omitempty"`
120133
}
121134

122135
// SandboxStatus defines the observed state of Sandbox.

codegen.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@
1717
package agentsandbox
1818

1919
// Generate CRDs and RBAC rules
20-
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen object crd:maxDescLen=0 paths="./api/..." output:crd:dir=k8s/crds output:rbac:dir=k8s rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml
20+
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml crd:maxDescLen=0 paths="./..." output:crd:dir=k8s/crds output:rbac:dir=k8s
2121
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen object crd:maxDescLen=0 paths="./extensions/..." output:crd:dir=k8s/crds output:rbac:dir=k8s rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml

controllers/sandbox_controller.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ import (
2323
"time"
2424

2525
corev1 "k8s.io/api/core/v1"
26+
policyv1 "k8s.io/api/policy/v1"
2627
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2728
"k8s.io/apimachinery/pkg/api/meta"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/runtime"
3031
"k8s.io/apimachinery/pkg/types"
32+
"k8s.io/apimachinery/pkg/util/intstr"
3133
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3234
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
3335
ctrl "sigs.k8s.io/controller-runtime"
@@ -42,6 +44,8 @@ import (
4244

4345
const (
4446
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
47+
// safeToEvictAnnotation is the cluster-autoscaler annotation key; setting it to "false" on a Pod tells the autoscaler not to evict that Pod.
48+
safeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
4549
)
4650

4751
var (
@@ -66,6 +70,7 @@ type SandboxReconciler struct {
6670
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
6771
//+kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
6872
//+kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
73+
//+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete
6974

7075
// Reconcile is part of the main kubernetes reconciliation loop which aims to
7176
// move the current state of the cluster closer to the desired state.
@@ -131,6 +136,10 @@ func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox
131136
err := r.reconcilePVCs(ctx, sandbox)
132137
allErrors = errors.Join(allErrors, err)
133138

139+
// Reconcile PDB
140+
err = r.reconcilePDB(ctx, sandbox, nameHash)
141+
allErrors = errors.Join(allErrors, err)
142+
134143
// Reconcile Pod
135144
pod, err := r.reconcilePod(ctx, sandbox, nameHash)
136145
allErrors = errors.Join(allErrors, err)
@@ -332,6 +341,10 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
332341
annotations[k] = v
333342
}
334343

344+
if sandbox.Spec.Resilience == sandboxv1alpha1.ResilienceLevelHigh {
345+
annotations[safeToEvictAnnotation] = "false"
346+
}
347+
335348
mutatedSpec := sandbox.Spec.PodTemplate.Spec.DeepCopy()
336349

337350
for _, pvcTemplate := range sandbox.Spec.VolumeClaimTemplates {
@@ -365,6 +378,59 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
365378
return pod, nil
366379
}
367380

381+
// reconcilePDB makes the PodDisruptionBudget for the Sandbox match
// sandbox.Spec.Resilience.
//
// The PDB uses the Sandbox's name and namespace. When resilience is "High"
// and no PDB with that name exists, one is created with minAvailable=1, a
// selector on sandboxLabel=nameHash, and the Sandbox set as its controller
// owner. For any other resilience value, an existing PDB with that name is
// deleted. A PDB that already exists in the "High" case is left untouched:
// its spec is not compared with the desired one or updated.
//
// Returns the error from the failing Get, SetControllerReference, Create or
// Delete call, or nil.
func (r *SandboxReconciler) reconcilePDB(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox, nameHash string) error {
382+
log := log.FromContext(ctx)
383+
pdb := &policyv1.PodDisruptionBudget{}
384+
pdbName := types.NamespacedName{Name: sandbox.Name, Namespace: sandbox.Namespace}
385+
386+
// If resilience is not "High", ensure the PDB is deleted.
387+
if sandbox.Spec.Resilience != sandboxv1alpha1.ResilienceLevelHigh {
388+
if err := r.Get(ctx, pdbName, pdb); err != nil {
389+
if k8serrors.IsNotFound(err) {
390+
return nil // PDB doesn't exist, which is the desired state.
391+
}
392+
return err
393+
}
394+
log.Info("Deleting PDB as resilience level is not High", "PDB.Name", pdb.Name)
395+
// NOTE(review): the PDB is looked up by name only and its owner references
// are not checked before deletion — confirm no unrelated PDB can share the
// Sandbox's name. The Delete result is returned as-is, so a PDB removed
// between Get and Delete presumably surfaces as an error; verify.
return r.Delete(ctx, pdb)
396+
}
397+
398+
// If resilience is "High", ensure the PDB exists.
399+
if err := r.Get(ctx, pdbName, pdb); err != nil {
400+
if !k8serrors.IsNotFound(err) {
401+
// The error is both logged here and returned (wrapped) to the caller.
log.Error(err, "Failed to get PDB")
402+
return fmt.Errorf("PDB Get Failed: %w", err)
403+
}
404+
405+
// PDB does not exist, so create it.
406+
log.Info("Creating a new PodDisruptionBudget", "PDB.Namespace", sandbox.Namespace, "PDB.Name", sandbox.Name)
407+
// NOTE(review): newer apimachinery releases appear to deprecate
// intstr.FromInt in favour of intstr.FromInt32 — confirm against the
// vendored version.
minAvailable := intstr.FromInt(1) // For a single-pod Sandbox, minAvailable=1 is appropriate
408+
newPDB := &policyv1.PodDisruptionBudget{
409+
ObjectMeta: metav1.ObjectMeta{
410+
Name: sandbox.Name,
411+
Namespace: sandbox.Namespace,
412+
},
413+
Spec: policyv1.PodDisruptionBudgetSpec{
414+
MinAvailable: &minAvailable,
415+
Selector: &metav1.LabelSelector{
416+
MatchLabels: map[string]string{
417+
// Selects pods carrying the sandbox name-hash label.
sandboxLabel: nameHash,
418+
},
419+
},
420+
},
421+
}
422+
423+
// Make the Sandbox the controller owner of the new PDB.
if err := ctrl.SetControllerReference(sandbox, newPDB, r.Scheme); err != nil {
424+
return fmt.Errorf("SetControllerReference for PDB failed: %w", err)
425+
}
426+
427+
return r.Create(ctx, newPDB)
428+
}
429+
430+
// The PDB already exists; it is only logged, not reconciled against the desired spec.
log.Info("Found PDB", "PDB.Name", pdb.Name)
431+
return nil
432+
}
433+
368434
func (r *SandboxReconciler) reconcilePVCs(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
369435
log := log.FromContext(ctx)
370436
for _, pvcTemplate := range sandbox.Spec.VolumeClaimTemplates {

controllers/sandbox_controller_test.go

Lines changed: 139 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ import (
2222
"github.com/google/go-cmp/cmp/cmpopts"
2323
"github.com/stretchr/testify/require"
2424
corev1 "k8s.io/api/core/v1"
25+
policyv1 "k8s.io/api/policy/v1"
2526
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2627
"k8s.io/apimachinery/pkg/api/resource"
2728
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2829
"k8s.io/apimachinery/pkg/runtime"
2930
"k8s.io/apimachinery/pkg/types"
31+
"k8s.io/apimachinery/pkg/util/intstr"
3032
"k8s.io/utils/ptr"
3133
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3234
ctrl "sigs.k8s.io/controller-runtime"
@@ -405,6 +407,88 @@ func TestReconcile(t *testing.T) {
405407
},
406408
},
407409
},
410+
{
411+
name: "sandbox with high resilience creates PDB and adds pod annotation",
412+
sandboxSpec: sandboxv1alpha1.SandboxSpec{
413+
Resilience: sandboxv1alpha1.ResilienceLevelHigh,
414+
PodTemplate: sandboxv1alpha1.PodTemplate{
415+
Spec: corev1.PodSpec{
416+
Containers: []corev1.Container{{Name: "test-container"}},
417+
},
418+
},
419+
},
420+
// Verify Sandbox status
421+
wantStatus: sandboxv1alpha1.SandboxStatus{
422+
Service: sandboxName,
423+
ServiceFQDN: "sandbox-name.sandbox-ns.svc.cluster.local",
424+
Replicas: 1,
425+
LabelSelector: "agents.x-k8s.io/sandbox-name-hash=ab179450",
426+
Conditions: []metav1.Condition{
427+
{
428+
Type: "Ready",
429+
Status: "False",
430+
ObservedGeneration: 1,
431+
Reason: "DependenciesNotReady",
432+
Message: "Pod exists with phase: ; Service Exists",
433+
},
434+
},
435+
},
436+
wantObjs: []client.Object{
437+
// Verify Pod has the new annotation
438+
&corev1.Pod{
439+
ObjectMeta: metav1.ObjectMeta{
440+
Name: sandboxName,
441+
Namespace: sandboxNs,
442+
ResourceVersion: "1",
443+
Labels: map[string]string{
444+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
445+
},
446+
Annotations: map[string]string{
447+
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
448+
},
449+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
450+
},
451+
Spec: corev1.PodSpec{
452+
Containers: []corev1.Container{{Name: "test-container"}},
453+
},
454+
},
455+
// Verify Service
456+
&corev1.Service{
457+
ObjectMeta: metav1.ObjectMeta{
458+
Name: sandboxName,
459+
Namespace: sandboxNs,
460+
ResourceVersion: "1",
461+
Labels: map[string]string{
462+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
463+
},
464+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
465+
},
466+
Spec: corev1.ServiceSpec{
467+
Selector: map[string]string{
468+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
469+
},
470+
ClusterIP: "None",
471+
},
472+
},
473+
// Verify the new PDB
474+
&policyv1.PodDisruptionBudget{
475+
ObjectMeta: metav1.ObjectMeta{
476+
Name: sandboxName,
477+
Namespace: sandboxNs,
478+
ResourceVersion: "1",
479+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
480+
},
481+
Spec: policyv1.PodDisruptionBudgetSpec{
482+
MinAvailable: ptr.To(intstr.FromInt(1)),
483+
Selector: &metav1.LabelSelector{
484+
MatchLabels: map[string]string{
485+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
486+
},
487+
},
488+
},
489+
},
490+
},
491+
},
408492
}
409493

410494
for _, tc := range testCases {
@@ -595,15 +679,68 @@ func TestReconcilePod(t *testing.T) {
595679
require.Equal(t, tc.wantPod, pod)
596680
// Validate the Pod from the "cluster" (fake client)
597681
if tc.wantPod != nil {
682+
// If we expect a pod, verify it exists and matches.
598683
livePod := &corev1.Pod{}
599-
err = r.Get(t.Context(), types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, livePod)
684+
err = r.Get(t.Context(), types.NamespacedName{Name: tc.wantPod.Name, Namespace: tc.wantPod.Namespace}, livePod)
600685
require.NoError(t, err)
601686
require.Equal(t, tc.wantPod, livePod)
602687
} else {
688+
// If we don't expect a pod (it was deleted), verify it's gone.
603689
livePod := &corev1.Pod{}
604-
err = r.Get(t.Context(), types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}, livePod)
690+
err = r.Get(t.Context(), types.NamespacedName{Name: tc.sandbox.Name, Namespace: tc.sandbox.Namespace}, livePod)
605691
require.True(t, k8serrors.IsNotFound(err))
606692
}
607693
})
608694
}
609695
}
696+
697+
// TestReconcile_ResilienceCleanup simulates removing the resilience setting from an existing Sandbox and verifies that the controller deletes the now-unneeded PDB.
698+
func TestReconcile_ResilienceCleanup(t *testing.T) {
699+
sandboxName := "sandbox-name"
700+
sandboxNs := "sandbox-ns"
701+
702+
// Initial Sandbox with High Resilience
703+
sb := &sandboxv1alpha1.Sandbox{
704+
ObjectMeta: metav1.ObjectMeta{
705+
Name: sandboxName,
706+
Namespace: sandboxNs,
707+
Generation: 1,
708+
},
709+
Spec: sandboxv1alpha1.SandboxSpec{
710+
Resilience: sandboxv1alpha1.ResilienceLevelHigh,
711+
PodTemplate: sandboxv1alpha1.PodTemplate{
712+
Spec: corev1.PodSpec{
713+
Containers: []corev1.Container{{Name: "test-container"}},
714+
},
715+
},
716+
},
717+
}
718+
719+
r := SandboxReconciler{
720+
Client: newFakeClient(sb),
721+
Scheme: Scheme,
722+
}
723+
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}}
724+
725+
_, err := r.Reconcile(t.Context(), req)
726+
require.NoError(t, err)
727+
728+
// Verify PDB was created
729+
pdb := &policyv1.PodDisruptionBudget{}
730+
require.NoError(t, r.Get(t.Context(), req.NamespacedName, pdb), "PDB should exist after first reconcile")
731+
732+
// Update Sandbox to remove resilience
733+
liveSandbox := &sandboxv1alpha1.Sandbox{}
734+
require.NoError(t, r.Get(t.Context(), req.NamespacedName, liveSandbox))
735+
liveSandbox.Spec.Resilience = "" // Remove resilience
736+
liveSandbox.Generation = 2
737+
require.NoError(t, r.Update(t.Context(), liveSandbox))
738+
739+
// Re-run reconcile
740+
_, err = r.Reconcile(t.Context(), req)
741+
require.NoError(t, err)
742+
743+
// Verify PDB was deleted
744+
err = r.Get(t.Context(), req.NamespacedName, pdb)
745+
require.True(t, k8serrors.IsNotFound(err), "PDB should be deleted after resilience is removed")
746+
}

examples/sandbox.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ metadata:
44
name: sandbox-example
55
namespace: sandbox-ns
66
spec:
7+
resilience: High
78
podTemplate:
89
metadata:
910
labels:

k8s/crds/agents.x-k8s.io_sandboxes.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3819,6 +3819,8 @@ spec:
38193819
maximum: 1
38203820
minimum: 0
38213821
type: integer
3822+
resilience:
3823+
type: string
38223824
shutdownTime:
38233825
format: date-time
38243826
type: string

k8s/rbac.generated.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,15 @@ rules:
3838
- get
3939
- patch
4040
- update
41+
- apiGroups:
42+
- policy
43+
resources:
44+
- poddisruptionbudgets
45+
verbs:
46+
- create
47+
- delete
48+
- get
49+
- list
50+
- patch
51+
- update
52+
- watch

0 commit comments

Comments
 (0)