Skip to content

Commit 781d1e4

Browse files
authored
Merge pull request #11451 from fabriziopandini/refine-v1beta2-kcp-available-condition2
🌱 Refine v1beta2 KCP available condition
2 parents 976884d + cf4145a commit 781d1e4

File tree

2 files changed

+622
-98
lines changed

2 files changed

+622
-98
lines changed

controlplane/kubeadm/internal/controllers/status.go

+101-21
Original file line numberDiff line numberDiff line change
@@ -527,13 +527,10 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
527527

528528
// Determine control plane availability looking at machines conditions, which at this stage are
529529
// already surfacing status from etcd member and all control plane pods hosted on every machine.
530-
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
531-
// etcd members might not match with machines, e.g. while provisioning a new machine.
532-
etcdQuorum := (len(etcdMembers) / 2.0) + 1
533530
k8sControlPlaneHealthy := 0
534531
k8sControlPlaneNotHealthy := 0
535-
etcdMembersHealthy := 0
536-
etcdMembersNotHealthy := 0
532+
k8sControlPlaneNotHealthyButNotReportedYet := 0
533+
537534
for _, machine := range machines {
538535
// if external etcd, only look at the status of the K8s control plane components on this machine.
539536
if !etcdIsManaged {
@@ -546,6 +543,8 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
546543
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
547544
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
548545
k8sControlPlaneNotHealthy++
546+
} else {
547+
k8sControlPlaneNotHealthyButNotReportedYet++
549548
}
550549
continue
551550
}
@@ -556,14 +555,6 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
556555
// - API server on one machine only connect to the local etcd member
557556
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
558557
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.
559-
560-
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
561-
etcdMembersHealthy++
562-
} else if shouldSurfaceWhenAvailableTrue(machine,
563-
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
564-
etcdMembersNotHealthy++
565-
}
566-
567558
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
568559
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
569560
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
@@ -577,30 +568,115 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
577568
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
578569
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
579570
k8sControlPlaneNotHealthy++
571+
} else {
572+
k8sControlPlaneNotHealthyButNotReportedYet++
580573
}
581574
}
582575

576+
// Determine etcd members availability by using etcd members as a source of truth because
577+
// etcd members might not match with machines, e.g. while provisioning a new machine.
578+
// Also in this case, we leverage info on machines to determine member health.
579+
votingEtcdMembers := 0
580+
learnerEtcdMembers := 0
581+
etcdMembersHealthy := 0
582+
etcdMembersNotHealthy := 0
583+
etcdMembersNotHealthyButNotReportedYet := 0
584+
585+
if etcdIsManaged {
586+
// Maps machines to members
587+
memberToMachineMap := map[string]*clusterv1.Machine{}
588+
provisioningMachines := []*clusterv1.Machine{}
589+
for _, machine := range machines {
590+
if machine.Status.NodeRef == nil {
591+
provisioningMachines = append(provisioningMachines, machine)
592+
continue
593+
}
594+
for _, member := range etcdMembers {
595+
if machine.Status.NodeRef.Name == member.Name {
596+
memberToMachineMap[member.Name] = machine
597+
break
598+
}
599+
}
600+
}
601+
602+
for _, etcdMember := range etcdMembers {
603+
// Note: We consider etcd members without a name yet as learners, because this prevents them from impacting quorum (this is
604+
// a temporary state that usually goes away very quickly).
605+
if etcdMember.IsLearner || etcdMember.Name == "" {
606+
learnerEtcdMembers++
607+
} else {
608+
votingEtcdMembers++
609+
}
610+
611+
// In case the etcd member does not have yet a name it is not possible to find a corresponding machine,
612+
// but we consider the node being healthy because this is a transient state that usually goes away quickly.
613+
if etcdMember.Name == "" {
614+
etcdMembersHealthy++
615+
continue
616+
}
617+
618+
// Look for the corresponding machine.
619+
machine := memberToMachineMap[etcdMember.Name]
620+
if machine == nil {
621+
// If there is only one provisioning machine (a machine yet without the node name), considering that KCP
622+
// only creates one machine at a time, we can assume this is the machine hosting the etcd member without a match.
623+
if len(provisioningMachines) == 1 {
624+
machine = provisioningMachines[0]
625+
provisioningMachines = nil
626+
} else {
627+
// In case we cannot match an etcd member with a machine, we consider this an issue (it should
628+
// never happen with KCP).
629+
etcdMembersNotHealthy++
630+
continue
631+
}
632+
}
633+
634+
// Otherwise read the status of the etcd member from the EtcdMemberHealthy condition.
635+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
636+
etcdMembersHealthy++
637+
} else if shouldSurfaceWhenAvailableTrue(machine,
638+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
639+
etcdMembersNotHealthy++
640+
} else {
641+
etcdMembersNotHealthyButNotReportedYet++
642+
}
643+
}
644+
}
645+
etcdQuorum := (votingEtcdMembers / 2.0) + 1
646+
647+
// If the control plane and etcd (if managed) are available, set the condition to true, taking care of surfacing partial unavailability if any.
583648
if kcp.DeletionTimestamp.IsZero() &&
584649
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
585650
k8sControlPlaneHealthy >= 1 &&
586651
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
587652
messages := []string{}
588653

589654
if etcdIsManaged && etcdMembersNotHealthy > 0 {
590-
switch len(etcdMembers) - etcdMembersNotHealthy {
655+
etcdLearnersMsg := ""
656+
if learnerEtcdMembers > 0 {
657+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
658+
}
659+
660+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
661+
// Accordingly for this message NotHealthyButNotReportedYet sums up to Healthy.
662+
etcdMembersHealthyAndNotHealthyButNotReportedYet := etcdMembersHealthy + etcdMembersNotHealthyButNotReportedYet
663+
switch etcdMembersHealthyAndNotHealthyButNotReportedYet {
591664
case 1:
592-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
665+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy member required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
593666
default:
594-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", len(etcdMembers)-etcdMembersNotHealthy, len(etcdMembers), etcdQuorum))
667+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy member required for etcd quorum", etcdMembersHealthyAndNotHealthyButNotReportedYet, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
595668
}
596669
}
597670

598671
if k8sControlPlaneNotHealthy > 0 {
599-
switch len(machines) - k8sControlPlaneNotHealthy {
672+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
673+
// Accordingly for this message NotHealthyButNotReportedYet sums up to Healthy.
674+
k8sControlPlaneHealthyAndNotHealthyButNotReportedYet := k8sControlPlaneHealthy + k8sControlPlaneNotHealthyButNotReportedYet
675+
switch k8sControlPlaneHealthyAndNotHealthyButNotReportedYet {
600676
case 1:
601677
messages = append(messages, fmt.Sprintf("* 1 of %d Machines has healthy control plane components, at least 1 required", len(machines)))
602678
default:
603-
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", len(machines)-k8sControlPlaneNotHealthy, len(machines)))
679+
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", k8sControlPlaneHealthyAndNotHealthyButNotReportedYet, len(machines)))
604680
}
605681
}
606682

@@ -623,13 +699,17 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
623699
}
624700

625701
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
702+
etcdLearnersMsg := ""
703+
if learnerEtcdMembers > 0 {
704+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
705+
}
626706
switch etcdMembersHealthy {
627707
case 0:
628-
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
708+
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member,%s at least %d healthy member required for etcd quorum", etcdLearnersMsg, etcdQuorum))
629709
case 1:
630-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
710+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy member required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
631711
default:
632-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdQuorum))
712+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy member required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
633713
}
634714
}
635715

0 commit comments

Comments
 (0)