Skip to content

Commit 29afd3e

Browse files
Refine v1beta2 KCP available condition
1 parent 48d23cd commit 29afd3e

File tree

2 files changed

+556
-99
lines changed

2 files changed

+556
-99
lines changed

controlplane/kubeadm/internal/controllers/status.go

+99-21
Original file line numberDiff line numberDiff line change
@@ -527,13 +527,10 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
527527

528528
// Determine control plane availability looking at machines conditions, which at this stage are
529529
// already surfacing status from etcd member and all control plane pods hosted on every machine.
530-
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
531-
// etcd members might not match with machines, e.g. while provisioning a new machine.
532-
etcdQuorum := (len(etcdMembers) / 2.0) + 1
533530
k8sControlPlaneHealthy := 0
534531
k8sControlPlaneNotHealthy := 0
535-
etcdMembersHealthy := 0
536-
etcdMembersNotHealthy := 0
532+
k8sControlPlaneNotHealthyButNotReportedYet := 0
533+
537534
for _, machine := range machines {
538535
// if external etcd, only look at the status of the K8s control plane components on this machine.
539536
if !etcdIsManaged {
@@ -546,6 +543,8 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
546543
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
547544
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
548545
k8sControlPlaneNotHealthy++
546+
} else {
547+
k8sControlPlaneNotHealthyButNotReportedYet++
549548
}
550549
continue
551550
}
@@ -556,14 +555,6 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
556555
// - API server on one machine only connect to the local etcd member
557556
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
558557
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.
559-
560-
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
561-
etcdMembersHealthy++
562-
} else if shouldSurfaceWhenAvailableTrue(machine,
563-
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
564-
etcdMembersNotHealthy++
565-
}
566-
567558
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
568559
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
569560
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
@@ -577,30 +568,113 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
577568
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
578569
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
579570
k8sControlPlaneNotHealthy++
571+
} else {
572+
k8sControlPlaneNotHealthyButNotReportedYet++
573+
}
574+
}
575+
576+
// Map etcd members to machines (matching the machine's node name with the member name).
577+
memberToMachineMap := map[string]*clusterv1.Machine{}
578+
provisioningMachines := []*clusterv1.Machine{}
579+
for _, machine := range machines {
580+
if machine.Status.NodeRef == nil {
581+
provisioningMachines = append(provisioningMachines, machine)
582+
continue
583+
}
584+
for _, member := range etcdMembers {
585+
if machine.Status.NodeRef.Name == member.Name {
586+
memberToMachineMap[member.Name] = machine
587+
break
588+
}
589+
}
590+
}
591+
592+
// Determine etcd members availability by using etcd members as a source of truth because
593+
// etcd members might not match with machines, e.g. while provisioning a new machine.
594+
// Also in this case, we leverage info on machines to determine member health.
595+
votingEtcdMembers := 0
596+
learnerEtcdMembers := 0
597+
etcdMembersHealthy := 0
598+
etcdMembersNotHealthy := 0
599+
etcdMembersNotHealthyButNotReportedYet := 0
600+
601+
for _, etcdMember := range etcdMembers {
602+
// Note: We consider etcd members without a name yet as learners, because this prevents them from impacting quorum (this is
603+
// a temporary state that usually goes away very quickly).
604+
if etcdMember.IsLearner || etcdMember.Name == "" {
605+
learnerEtcdMembers++
606+
} else {
607+
votingEtcdMembers++
608+
}
609+
610+
// In case the etcd member does not have a name yet, it is not possible to find a corresponding machine,
611+
// but we consider the member healthy because this is a transient state that usually goes away quickly.
612+
if etcdMember.Name == "" {
613+
etcdMembersHealthy++
614+
continue
615+
}
616+
617+
// Look for the corresponding machine.
618+
machine := memberToMachineMap[etcdMember.Name]
619+
if machine == nil {
620+
// If there is only one provisioning machine (a machine without a node name yet), considering that KCP
621+
// only creates one machine at a time, we can assume this is the machine hosting the etcd member without a match.
622+
if len(provisioningMachines) == 1 {
623+
machine = provisioningMachines[0]
624+
provisioningMachines = nil
625+
} else {
626+
// In case we cannot match an etcd member with a machine, we consider this an issue (it should
627+
// never happen with KCP).
628+
etcdMembersNotHealthy++
629+
continue
630+
}
631+
}
632+
633+
// Otherwise read the status of the etcd member from the EtcdMemberHealthy condition.
634+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
635+
etcdMembersHealthy++
636+
} else if shouldSurfaceWhenAvailableTrue(machine,
637+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
638+
etcdMembersNotHealthy++
639+
} else {
640+
etcdMembersNotHealthyButNotReportedYet++
580641
}
581642
}
643+
etcdQuorum := (votingEtcdMembers / 2.0) + 1
582644

645+
// If the control plane and etcd (if managed) are available, set the condition to true, taking care of surfacing partial unavailability if any.
583646
if kcp.DeletionTimestamp.IsZero() &&
584647
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
585648
k8sControlPlaneHealthy >= 1 &&
586649
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
587650
messages := []string{}
588651

589652
if etcdIsManaged && etcdMembersNotHealthy > 0 {
590-
switch len(etcdMembers) - etcdMembersNotHealthy {
653+
etcdLearnersMsg := ""
654+
if learnerEtcdMembers > 0 {
655+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
656+
}
657+
658+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
659+
// Accordingly, for this message NotHealthyButNotReportedYet is added to the Healthy count.
660+
etcdMembersHealthyAndNotHealthyButNotReportedYet := etcdMembersHealthy + etcdMembersNotHealthyButNotReportedYet
661+
switch etcdMembersHealthyAndNotHealthyButNotReportedYet {
591662
case 1:
592-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
663+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy member required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
593664
default:
594-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", len(etcdMembers)-etcdMembersNotHealthy, len(etcdMembers), etcdQuorum))
665+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy member required for etcd quorum", etcdMembersHealthyAndNotHealthyButNotReportedYet, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
595666
}
596667
}
597668

598669
if k8sControlPlaneNotHealthy > 0 {
599-
switch len(machines) - k8sControlPlaneNotHealthy {
670+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
671+
// Accordingly, for this message NotHealthyButNotReportedYet is added to the Healthy count.
672+
k8sControlPlaneHealthyAndNotHealthyButNotReportedYet := k8sControlPlaneHealthy + k8sControlPlaneNotHealthyButNotReportedYet
673+
switch k8sControlPlaneHealthyAndNotHealthyButNotReportedYet {
600674
case 1:
601675
messages = append(messages, fmt.Sprintf("* 1 of %d Machines has healthy control plane components, at least 1 required", len(machines)))
602676
default:
603-
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", len(machines)-k8sControlPlaneNotHealthy, len(machines)))
677+
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", k8sControlPlaneHealthyAndNotHealthyButNotReportedYet, len(machines)))
604678
}
605679
}
606680

@@ -623,13 +697,17 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
623697
}
624698

625699
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
700+
etcdLearnersMsg := ""
701+
if learnerEtcdMembers > 0 {
702+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
703+
}
626704
switch etcdMembersHealthy {
627705
case 0:
628-
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
706+
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member,%s at least %d healthy member required for etcd quorum", etcdLearnersMsg, etcdQuorum))
629707
case 1:
630-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
708+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy member required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
631709
default:
632-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdQuorum))
710+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy member required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
633711
}
634712
}
635713

0 commit comments

Comments
 (0)