Commit 976884d

Merge pull request #11450 from fabriziopandini/retry-etcd-errors-in-kcp
🌱 Retry in case of etcd errors in KCP
2 parents: 48d23cd + 091473f, commit 976884d

File tree: 2 files changed, +235 −35 lines


controlplane/kubeadm/internal/workload_cluster_conditions.go

Lines changed: 40 additions & 3 deletions
@@ -47,8 +47,38 @@ import (
 // This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
 // the condition to Unknown state without returning any error.
 func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
+	shouldRetry := func() bool {
+		// if CP is scaling up or down.
+		if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
+			return true
+		}
+		// if CP machines are provisioning or deleting.
+		for _, m := range controlPlane.Machines {
+			if m.Status.NodeRef == nil {
+				return true
+			}
+			if !m.DeletionTimestamp.IsZero() {
+				return true
+			}
+		}
+		return false
+	}
+
 	if controlPlane.IsEtcdManaged() {
-		w.updateManagedEtcdConditions(ctx, controlPlane)
+		// Update etcd conditions.
+		// In case of well known temporary errors + control plane scaling up/down or rolling out, retry a few times.
+		// Note: this is required because there isn't a watch mechanism on etcd.
+		maxRetry := 3
+		for i := range maxRetry {
+			retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
+			// if we should retry and there is a retry left, wait a bit.
+			if !retryableError || !shouldRetry() {
+				break
+			}
+			if i < maxRetry-1 {
+				time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
+			}
+		}
 		return
 	}
 	w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -64,7 +94,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
 	// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
 }
 
-func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
+func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
 	// NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd
 	// as ultimate source of truth for the list of members and for their health.
 	controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -88,7 +118,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 			Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
 			Message: "Failed to get Nodes hosting the etcd cluster",
 		})
-		return
+		return retryableError
 	}
 
 	// Update conditions for etcd members on the nodes.
@@ -154,6 +184,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 		if err != nil {
 			// Note. even if we fail reading the member list from one node/etcd members we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
 			// (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).
+
+			// While scaling up/down or rolling out new CP machines this error might happen.
+			retryableError = true
 			continue
 		}
 
@@ -176,6 +209,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 				Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
 				Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)),
 			})
+
+			// While scaling up/down or rolling out new CP machines this error might happen because we are reading the list from different nodes at different time.
+			retryableError = true
 			continue
 		}
 
@@ -277,6 +313,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 		trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
 		note: "etcd member",
 	})
+	return retryableError
 }
 
 func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
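
The change above is, at its core, a bounded retry with a short, linearly growing sleep between attempts, gated by whether the control plane is still changing shape. Below is a minimal, self-contained Go sketch of that pattern for illustration only; checkEtcd and shouldRetry here are hypothetical stand-ins for updateManagedEtcdConditions and the closure added in this PR, not code from the repository.

package main

import (
	"fmt"
	"time"
)

// checkEtcd stands in for updateManagedEtcdConditions (hypothetical):
// it reports whether the attempt hit an error that is worth retrying.
func checkEtcd(attempt int) (retryableError bool) {
	// Pretend the first attempt hits a transient etcd error.
	return attempt == 0
}

// shouldRetry stands in for the closure added in this PR (hypothetical):
// the real predicate returns true only while the control plane is scaling
// up/down or machines are provisioning/deleting.
func shouldRetry() bool {
	return true
}

func main() {
	maxRetry := 3
	for i := 0; i < maxRetry; i++ {
		retryableError := checkEtcd(i)
		// Stop as soon as the check succeeds or retrying cannot help.
		if !retryableError || !shouldRetry() {
			break
		}
		// Linear backoff: 250ms after the first attempt, 500ms after the second.
		if i < maxRetry-1 {
			time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
		}
	}
	fmt.Println("etcd conditions updated, possibly after retries")
}

Worth noting as a design choice in the PR itself: because shouldRetry only returns true while replicas do not match machines or machines are provisioning/deleting, steady-state reconciles never pay the extra sleep.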
