diff --git a/internal/conditions/clusterpolicy.go b/internal/conditions/clusterpolicy.go index 09d0607ab..4101d7849 100644 --- a/internal/conditions/clusterpolicy.go +++ b/internal/conditions/clusterpolicy.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -55,14 +56,12 @@ func (u *clusterPolicyUpdater) SetConditionsError(ctx context.Context, cr any, r return u.setConditions(ctx, clusterPolicyCr, Error, reason, message) } -func (u *clusterPolicyUpdater) setConditions(ctx context.Context, cr *nvidiav1.ClusterPolicy, statusType, reason, message string) error { - reqLogger := log.FromContext(ctx) +// updateConditions updates the conditions of the ClusterPolicy CR +func (u *clusterPolicyUpdater) updateConditions(ctx context.Context, cr *nvidiav1.ClusterPolicy, statusType, reason, message string) error { // Fetch latest instance and update state to avoid version mismatch instance := &nvidiav1.ClusterPolicy{} - err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance) - if err != nil { - reqLogger.Error(err, "Failed to get ClusterPolicy instance for status update", "name", cr.Name) - return err + if err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil { + return fmt.Errorf("failed to get ClusterPolicy instance for status update: %w", err) } switch statusType { @@ -93,9 +92,23 @@ func (u *clusterPolicyUpdater) setConditions(ctx context.Context, cr *nvidiav1.C Message: message, }) default: - reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType) return fmt.Errorf("unknown status type provided: %s", statusType) } return u.client.Status().Update(ctx, instance) } + +// setConditions updates the conditions of the ClusterPolicy CR +// with retry on conflict to handle version mismatches +func (u *clusterPolicyUpdater) setConditions(ctx context.Context, cr *nvidiav1.ClusterPolicy, statusType, reason, message string) error { + reqLogger := log.FromContext(ctx) + + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + return u.updateConditions(ctx, cr, statusType, reason, message) + }) + + if err != nil { + reqLogger.Error(err, "Failed to update ClusterPolicy status after retries", "name", cr.Name) + } + return err +} diff --git a/internal/conditions/nvidiadriver.go b/internal/conditions/nvidiadriver.go index 32c88a565..bf0140832 100644 --- a/internal/conditions/nvidiadriver.go +++ b/internal/conditions/nvidiadriver.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -61,14 +62,12 @@ func (u *nvDriverUpdater) SetConditionsError(ctx context.Context, cr any, reason return u.setConditions(ctx, nvDriverCr, Error, reason, message) } -func (u *nvDriverUpdater) setConditions(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, statusType, reason, message string) error { - reqLogger := log.FromContext(ctx) +// updateConditions updates the conditions of the NVIDIADriver CR +func (u *nvDriverUpdater) updateConditions(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, statusType, reason, message string) error { // Fetch latest instance and update state to avoid version mismatch instance := &nvidiav1alpha1.NVIDIADriver{} - err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance) - if err != nil { - reqLogger.Error(err, "Failed to get NVIDIADriver instance for status update", "name", cr.Name) - return err + if err := u.client.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil { + return fmt.Errorf("failed to get NVIDIADriver instance for status update: %w", err) } switch statusType { @@ -107,9 +106,23 @@ func (u *nvDriverUpdater) setConditions(ctx context.Context, cr *nvidiav1alpha1. instance.Status.State = nvidiav1alpha1.NotReady } default: - reqLogger.Error(nil, "Unknown status type provided", "statusType", statusType) return fmt.Errorf("unknown status type provided: %s", statusType) } return u.client.Status().Update(ctx, instance) } + +// setConditions updates the conditions of the NVIDIADriver CR +// with retry on conflict to handle version mismatches +func (u *nvDriverUpdater) setConditions(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, statusType, reason, message string) error { + reqLogger := log.FromContext(ctx) + + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + return u.updateConditions(ctx, cr, statusType, reason, message) + }) + + if err != nil { + reqLogger.Error(err, "Failed to update NVIDIADriver status after retries", "name", cr.Name) + } + return err +} diff --git a/vendor/k8s.io/client-go/util/retry/OWNERS b/vendor/k8s.io/client-go/util/retry/OWNERS new file mode 100644 index 000000000..75736b5aa --- /dev/null +++ b/vendor/k8s.io/client-go/util/retry/OWNERS @@ -0,0 +1,4 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +reviewers: + - caesarxuchao diff --git a/vendor/k8s.io/client-go/util/retry/util.go b/vendor/k8s.io/client-go/util/retry/util.go new file mode 100644 index 000000000..57d3cd49c --- /dev/null +++ b/vendor/k8s.io/client-go/util/retry/util.go @@ -0,0 +1,105 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package retry + +import ( + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/wait" +) + +// DefaultRetry is the recommended retry for a conflict where multiple clients +// are making changes to the same resource. +var DefaultRetry = wait.Backoff{ + Steps: 5, + Duration: 10 * time.Millisecond, + Factor: 1.0, + Jitter: 0.1, +} + +// DefaultBackoff is the recommended backoff for a conflict where a client +// may be attempting to make an unrelated modification to a resource under +// active management by one or more controllers. +var DefaultBackoff = wait.Backoff{ + Steps: 4, + Duration: 10 * time.Millisecond, + Factor: 5.0, + Jitter: 0.1, +} + +// OnError allows the caller to retry fn in case the error returned by fn is retriable +// according to the provided function. backoff defines the maximum retries and the wait +// interval between two retries. +func OnError(backoff wait.Backoff, retriable func(error) bool, fn func() error) error { + var lastErr error + err := wait.ExponentialBackoff(backoff, func() (bool, error) { + err := fn() + switch { + case err == nil: + return true, nil + case retriable(err): + lastErr = err + return false, nil + default: + return false, err + } + }) + if wait.Interrupted(err) { + err = lastErr + } + return err +} + +// RetryOnConflict is used to make an update to a resource when you have to worry about +// conflicts caused by other code making unrelated updates to the resource at the same +// time. fn should fetch the resource to be modified, make appropriate changes to it, try +// to update it, and return (unmodified) the error from the update function. On a +// successful update, RetryOnConflict will return nil. If the update function returns a +// "Conflict" error, RetryOnConflict will wait some amount of time as described by +// backoff, and then try again. On a non-"Conflict" error, or if it retries too many times +// and gives up, RetryOnConflict will return an error to the caller. +// +// err := retry.RetryOnConflict(retry.DefaultRetry, func() error { +// // Fetch the resource here; you need to refetch it on every try, since +// // if you got a conflict on the last update attempt then you need to get +// // the current version before making your own changes. +// pod, err := c.Pods("mynamespace").Get(name, metav1.GetOptions{}) +// if err != nil { +// return err +// } +// +// // Make whatever updates to the resource are needed +// pod.Status.Phase = v1.PodFailed +// +// // Try to update +// _, err = c.Pods("mynamespace").UpdateStatus(pod) +// // You have to return err itself here (not wrapped inside another error) +// // so that RetryOnConflict can identify it correctly. +// return err +// }) +// if err != nil { +// // May be conflict if max retries were hit, or may be something unrelated +// // like permissions or a network error +// return err +// } +// ... +// +// TODO: Make Backoff an interface? +func RetryOnConflict(backoff wait.Backoff, fn func() error) error { + return OnError(backoff, errors.IsConflict, fn) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index e4eb69252..23521d5a6 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -989,6 +989,7 @@ k8s.io/client-go/util/flowcontrol k8s.io/client-go/util/homedir k8s.io/client-go/util/jsonpath k8s.io/client-go/util/keyutil +k8s.io/client-go/util/retry k8s.io/client-go/util/workqueue # k8s.io/component-base v0.34.1 ## explicit; go 1.24.0