Skip to content

Commit 287ceb6

Browse files
Refine v1beta2 KCP available condition
1 parent 48d23cd commit 287ceb6

File tree

2 files changed

+552
-99
lines changed

2 files changed

+552
-99
lines changed

controlplane/kubeadm/internal/controllers/status.go

+97-21
Original file line numberDiff line numberDiff line change
@@ -527,13 +527,10 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
527527

528528
// Determine control plane availability looking at machines conditions, which at this stage are
529529
// already surfacing status from etcd member and all control plane pods hosted on every machine.
530-
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
531-
// etcd members might not match with machines, e.g. while provisioning a new machine.
532-
etcdQuorum := (len(etcdMembers) / 2.0) + 1
533530
k8sControlPlaneHealthy := 0
534531
k8sControlPlaneNotHealthy := 0
535-
etcdMembersHealthy := 0
536-
etcdMembersNotHealthy := 0
532+
k8sControlPlaneNotHealthyButNotReportedYet := 0
533+
537534
for _, machine := range machines {
538535
// if external etcd, only look at the status of the K8s control plane components on this machine.
539536
if !etcdIsManaged {
@@ -546,6 +543,8 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
546543
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
547544
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
548545
k8sControlPlaneNotHealthy++
546+
} else {
547+
k8sControlPlaneNotHealthyButNotReportedYet++
549548
}
550549
continue
551550
}
@@ -556,14 +555,6 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
556555
// - API server on one machine only connect to the local etcd member
557556
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
558557
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.
559-
560-
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
561-
etcdMembersHealthy++
562-
} else if shouldSurfaceWhenAvailableTrue(machine,
563-
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
564-
etcdMembersNotHealthy++
565-
}
566-
567558
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
568559
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
569560
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
@@ -577,30 +568,111 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
577568
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
578569
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
579570
k8sControlPlaneNotHealthy++
571+
} else {
572+
k8sControlPlaneNotHealthyButNotReportedYet++
580573
}
581574
}
582575

576+
// Map etcd members to machines, and collect the machines that do not have a NodeRef yet.
577+
memberToMachineMap := map[string]*clusterv1.Machine{}
578+
provisioningMachines := []*clusterv1.Machine{}
579+
for _, machine := range machines {
580+
if machine.Status.NodeRef == nil {
581+
provisioningMachines = append(provisioningMachines, machine)
582+
continue
583+
}
584+
for _, member := range etcdMembers {
585+
if machine.Status.NodeRef.Name == member.Name {
586+
memberToMachineMap[member.Name] = machine
587+
break
588+
}
589+
}
590+
}
591+
592+
// Determine etcd members availability by using etcd members as a source of truth because
593+
// etcd members might not match with machines, e.g. while provisioning a new machine.
594+
// Also in this case, we leverage info on machines to determine member health.
595+
votingEtcdMembers := 0
596+
learnerEtcdMembers := 0
597+
etcdMembersHealthy := 0
598+
etcdMembersNotHealthy := 0
599+
etcdMembersNotHealthyButNotReportedYet := 0
600+
601+
for _, etcdMember := range etcdMembers {
602+
if etcdMember.IsLearner || etcdMember.Name == "" {
603+
learnerEtcdMembers++
604+
} else {
605+
votingEtcdMembers++
606+
}
607+
608+
// In case the etcd member does not yet have a name, it is not possible to find a corresponding machine,
609+
// but we consider the member to be healthy because this is a transient state that usually goes away quickly.
610+
if etcdMember.Name == "" {
611+
etcdMembersHealthy++
612+
continue
613+
}
614+
615+
// Look for the corresponding machine.
616+
machine := memberToMachineMap[etcdMember.Name]
617+
if machine == nil {
618+
// If there is only one provisioning machine (a machine that does not have a node name yet), considering that KCP
619+
// only creates one machine at a time, we can assume this is the machine hosting the etcd member without a match.
620+
if len(provisioningMachines) == 1 {
621+
machine = provisioningMachines[0]
622+
provisioningMachines = nil
623+
} else {
624+
// In case we cannot match an etcd member with a machine, we consider this an issue (it should
625+
// never happen with KCP).
626+
etcdMembersNotHealthy++
627+
continue
628+
}
629+
}
630+
631+
// Otherwise read the status of the etcd member from the EtcdMemberHealthy condition.
632+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
633+
etcdMembersHealthy++
634+
} else if shouldSurfaceWhenAvailableTrue(machine,
635+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
636+
etcdMembersNotHealthy++
637+
} else {
638+
etcdMembersNotHealthyButNotReportedYet++
639+
}
640+
}
641+
etcdQuorum := (votingEtcdMembers / 2.0) + 1
642+
643+
// If the control plane and etcd (if managed) are available, set the condition to true, taking care of surfacing partial unavailability, if any.
583644
if kcp.DeletionTimestamp.IsZero() &&
584645
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
585646
k8sControlPlaneHealthy >= 1 &&
586647
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
587648
messages := []string{}
588649

589650
if etcdIsManaged && etcdMembersNotHealthy > 0 {
590-
switch len(etcdMembers) - etcdMembersNotHealthy {
651+
etcdLearnersMsg := ""
652+
if learnerEtcdMembers > 0 {
653+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
654+
}
655+
656+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
657+
// Accordingly, for this message, NotHealthyButNotReportedYet is counted together with Healthy.
658+
etcdMembersHealthyAndNotHealthyButNotReportedYet := etcdMembersHealthy + etcdMembersNotHealthyButNotReportedYet
659+
switch etcdMembersHealthyAndNotHealthyButNotReportedYet {
591660
case 1:
592-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
661+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
593662
default:
594-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", len(etcdMembers)-etcdMembersNotHealthy, len(etcdMembers), etcdQuorum))
663+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy required for etcd quorum", etcdMembersHealthyAndNotHealthyButNotReportedYet, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
595664
}
596665
}
597666

598667
if k8sControlPlaneNotHealthy > 0 {
599-
switch len(machines) - k8sControlPlaneNotHealthy {
668+
// Note: When Available is true, we surface failures only after they have existed for 10s, to avoid flakes;
669+
// Accordingly, for this message, NotHealthyButNotReportedYet is counted together with Healthy.
670+
k8sControlPlaneHealthyAndNotHealthyButNotReportedYet := k8sControlPlaneHealthy + k8sControlPlaneNotHealthyButNotReportedYet
671+
switch k8sControlPlaneHealthyAndNotHealthyButNotReportedYet {
600672
case 1:
601673
messages = append(messages, fmt.Sprintf("* 1 of %d Machines has healthy control plane components, at least 1 required", len(machines)))
602674
default:
603-
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", len(machines)-k8sControlPlaneNotHealthy, len(machines)))
675+
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", k8sControlPlaneHealthyAndNotHealthyButNotReportedYet, len(machines)))
604676
}
605677
}
606678

@@ -623,13 +695,17 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
623695
}
624696

625697
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
698+
etcdLearnersMsg := ""
699+
if learnerEtcdMembers > 0 {
700+
etcdLearnersMsg = fmt.Sprintf(" %d learner etcd member,", learnerEtcdMembers)
701+
}
626702
switch etcdMembersHealthy {
627703
case 0:
628-
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
704+
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member,%s at least %d healthy required for etcd quorum", etcdLearnersMsg, etcdQuorum))
629705
case 1:
630-
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
706+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy,%s at least %d healthy required for etcd quorum", len(etcdMembers), etcdLearnersMsg, etcdQuorum))
631707
default:
632-
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdQuorum))
708+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy,%s at least %d healthy required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdLearnersMsg, etcdQuorum))
633709
}
634710
}
635711

0 commit comments

Comments
 (0)