Skip to content

Commit 1f03d2c

Browse files
committed
add logic to retry if update fails due to conflict
Signed-off-by: Rahul Sharma <[email protected]>
1 parent c5d513d commit 1f03d2c

File tree

5 files changed

+207
-86
lines changed

5 files changed

+207
-86
lines changed

internal/conditions/clusterpolicy.go

Lines changed: 43 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
2525
"k8s.io/apimachinery/pkg/types"
2626
"sigs.k8s.io/controller-runtime/pkg/client"
2727
"sigs.k8s.io/controller-runtime/pkg/log"
28+
"k8s.io/client-go/util/retry"
2829

2930
nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
3031
)
@@ -57,45 +58,47 @@ func (u *clusterPolicyUpdater) SetConditionsError(ctx context.Context, cr any, r
5758

5859
func (u *clusterPolicyUpdater) setConditions(ctx context.Context, cr *nvidiav1.ClusterPolicy, statusType, reason, message string) error {
5960
reqLogger := log.FromContext(ctx)
60-
// Fetch latest instance and update state to avoid version mismatch
61-
instance := &nvidiav1.ClusterPolicy{}
62-
err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance)
63-
if err != nil {
64-
reqLogger.Error(err, "Failed to get ClusterPolicy instance for status update", "name", cr.Name)
65-
return err
66-
}
67-
68-
switch statusType {
69-
case Ready:
70-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
71-
Type: Ready,
72-
Status: metav1.ConditionTrue,
73-
Reason: reason,
74-
Message: message,
75-
})
76-
77-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
78-
Type: Error,
79-
Status: metav1.ConditionFalse,
80-
Reason: Ready,
81-
})
82-
case Error:
83-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
84-
Type: Ready,
85-
Status: metav1.ConditionFalse,
86-
Reason: Error,
87-
})
8861

89-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
90-
Type: Error,
91-
Status: metav1.ConditionTrue,
92-
Reason: reason,
93-
Message: message,
94-
})
95-
default:
96-
reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType)
97-
return fmt.Errorf("unknown status type provided: %s", statusType)
98-
}
99-
100-
return u.client.Status().Update(ctx, instance)
62+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
63+
instance := &nvidiav1.ClusterPolicy{}
64+
err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance)
65+
if err != nil {
66+
reqLogger.Error(err, "Failed to get ClusterPolicy instance for status update", "name", cr.Name)
67+
return err
68+
}
69+
switch statusType {
70+
case Ready:
71+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
72+
Type: Ready,
73+
Status: metav1.ConditionTrue,
74+
Reason: reason,
75+
Message: message,
76+
})
77+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
78+
Type: Error,
79+
Status: metav1.ConditionFalse,
80+
Reason: Ready,
81+
})
82+
case Error:
83+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
84+
Type: Ready,
85+
Status: metav1.ConditionFalse,
86+
Reason: Error,
87+
})
88+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
89+
Type: Error,
90+
Status: metav1.ConditionTrue,
91+
Reason: reason,
92+
Message: message,
93+
})
94+
default:
95+
reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType)
96+
return fmt.Errorf("unknown status type provided: %s", statusType)
97+
}
98+
err = u.client.Status().Update(ctx, instance)
99+
if err != nil {
100+
reqLogger.Error(err, "Failed to update ClusterPolicy status", "name", cr.Name)
101+
}
102+
return err
103+
})
101104
}

internal/conditions/nvidiadriver.go

Lines changed: 54 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
2525
"k8s.io/apimachinery/pkg/types"
2626
"sigs.k8s.io/controller-runtime/pkg/client"
2727
"sigs.k8s.io/controller-runtime/pkg/log"
28+
"k8s.io/client-go/util/retry"
2829

2930
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
3031
)
@@ -63,53 +64,60 @@ func (u *nvDriverUpdater) SetConditionsError(ctx context.Context, cr any, reason
6364

6465
func (u *nvDriverUpdater) setConditions(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, statusType, reason, message string) error {
6566
reqLogger := log.FromContext(ctx)
66-
// Fetch latest instance and update state to avoid version mismatch
67-
instance := &nvidiav1alpha1.NVIDIADriver{}
68-
err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance)
69-
if err != nil {
70-
reqLogger.Error(err, "Failed to get NVIDIADriver instance for status update", "name", cr.Name)
71-
return err
72-
}
7367

74-
switch statusType {
75-
case Ready:
76-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
77-
Type: Ready,
78-
Status: metav1.ConditionTrue,
79-
Reason: reason,
80-
Message: message,
81-
})
82-
83-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
84-
Type: Error,
85-
Status: metav1.ConditionFalse,
86-
Reason: Ready,
87-
})
88-
case Error:
89-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
90-
Type: Ready,
91-
Status: metav1.ConditionFalse,
92-
Reason: Error,
93-
})
94-
95-
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
96-
Type: Error,
97-
Status: metav1.ConditionTrue,
98-
Reason: reason,
99-
Message: message,
100-
})
101-
102-
// Ensure status.state is not empty when updating the CR status.
103-
// The caller should set the state appropriately in the CR
104-
// depending on the error condition.
105-
instance.Status.State = cr.Status.State
106-
if instance.Status.State == "" {
107-
instance.Status.State = nvidiav1alpha1.NotReady
68+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
69+
// Fetch latest instance and update state to avoid version mismatch
70+
instance := &nvidiav1alpha1.NVIDIADriver{}
71+
err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance)
72+
if err != nil {
73+
reqLogger.Error(err, "Failed to get NVIDIADriver instance for status update", "name", cr.Name)
74+
return err
10875
}
109-
default:
110-
reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType)
111-
return fmt.Errorf("unknown status type provided: %s", statusType)
112-
}
11376

114-
return u.client.Status().Update(ctx, instance)
77+
switch statusType {
78+
case Ready:
79+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
80+
Type: Ready,
81+
Status: metav1.ConditionTrue,
82+
Reason: reason,
83+
Message: message,
84+
})
85+
86+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
87+
Type: Error,
88+
Status: metav1.ConditionFalse,
89+
Reason: Ready,
90+
})
91+
case Error:
92+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
93+
Type: Ready,
94+
Status: metav1.ConditionFalse,
95+
Reason: Error,
96+
})
97+
98+
meta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
99+
Type: Error,
100+
Status: metav1.ConditionTrue,
101+
Reason: reason,
102+
Message: message,
103+
})
104+
105+
// Ensure status.state is not empty when updating the CR status.
106+
// The caller should set the state appropriately in the CR
107+
// depending on the error condition.
108+
instance.Status.State = cr.Status.State
109+
if instance.Status.State == "" {
110+
instance.Status.State = nvidiav1alpha1.NotReady
111+
}
112+
default:
113+
reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType)
114+
return fmt.Errorf("unknown status type provided: %s", statusType)
115+
}
116+
117+
err = u.client.Status().Update(ctx, instance)
118+
if err != nil {
119+
reqLogger.Error(err, "Failed to update NVIDIADriver status", "name", instance.Name)
120+
}
121+
return err
122+
})
115123
}

vendor/k8s.io/client-go/util/retry/OWNERS

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/k8s.io/client-go/util/retry/util.go

Lines changed: 105 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/modules.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -989,6 +989,7 @@ k8s.io/client-go/util/flowcontrol
989989
k8s.io/client-go/util/homedir
990990
k8s.io/client-go/util/jsonpath
991991
k8s.io/client-go/util/keyutil
992+
k8s.io/client-go/util/retry
992993
k8s.io/client-go/util/workqueue
993994
# k8s.io/component-base v0.34.1
994995
## explicit; go 1.24.0

0 commit comments

Comments
 (0)