Commit cde70e3
amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG

A hypothetical CPU bug seems to make invalidation of global PTEs with
INVLPG unreliable in PCID mode.  The workaround is applied on all CPUs
with small cores, since we know neither the scope of the issue nor the
right fix.

Reviewed by:	alc (previous version)
Discussed with:	emaste, markj
Tested by:	karels
PR:		261169, 266145
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
Differential revision:	https://reviews.freebsd.org/D37770

Parent: 45ac775
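For reference, INVPCID_CTXGLOB is the INVPCID "invalidate all contexts,
including globals" operation (type 2), so the workaround trades a
single-page flush for a full TLB flush that also drops global entries.
Below is a minimal sketch of the interface the diffs rely on,
paraphrased from sys/amd64/include/cpufunc.h and the Intel SDM; treat
the exact field layout and comments as best-effort, not authoritative.

/*
 * Sketch of the INVPCID interface used by this commit.
 */
struct invpcid_descr {
        uint64_t        pcid:12;        /* PCID, used by types 0 and 1 */
        uint64_t        pad:52;         /* reserved, must be zero */
        uint64_t        addr;           /* linear address, type 0 only */
} __packed;

#define INVPCID_ADDR    0       /* one address in one PCID */
#define INVPCID_CTX     1       /* one PCID, non-global entries */
#define INVPCID_CTXGLOB 2       /* everything, including globals */
#define INVPCID_ALLCTX  3       /* all PCIDs, non-global entries */

static __inline void
invpcid(struct invpcid_descr *d, int type)
{
        __asm __volatile("invpcid (%0),%1"
            : : "r" (d), "r" ((u_long)type) : "memory");
}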

File tree

5 files changed (+67, -13 lines)

sys/amd64/amd64/initcpu.c (+5)
@@ -324,6 +324,11 @@ initializecpu(void)
                if ((r[0] & CPUID_HYBRID_CORE_MASK) ==
                    CPUID_HYBRID_SMALL_CORE) {
                        PCPU_SET(small_core, 1);
+                       if (pmap_pcid_enabled &&
+                           pmap_pcid_invlpg_workaround_uena) {
+                               PCPU_SET(pcid_invlpg_workaround, 1);
+                               pmap_pcid_invlpg_workaround = 1;
+                       }
                }
        }
 }
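The small-core test above keys off CPUID leaf 0x1A, which reports the
core type on hybrid parts.  Here is a self-contained user-level sketch
of the same probe, assuming the Intel-documented encoding behind
CPUID_HYBRID_CORE_MASK/CPUID_HYBRID_SMALL_CORE (core type in
EAX[31:24], 0x20 for Atom/E-cores, 0x40 for Core/P-cores); the process
must be pinned to the CPU of interest for the answer to mean anything.

#include <stdint.h>
#include <stdio.h>
#include <cpuid.h>              /* GCC/Clang CPUID helpers */

int
main(void)
{
        uint32_t eax, ebx, ecx, edx;

        /* Leaf 0x1A, sub-leaf 0: hybrid core type in EAX[31:24]. */
        if (__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx) == 0) {
                printf("CPUID leaf 0x1A not supported\n");
                return (1);
        }
        switch (eax >> 24) {
        case 0x20:
                printf("small (Atom/E) core\n");
                break;
        case 0x40:
                printf("big (Core/P) core\n");
                break;
        default:
                printf("unknown core type %#x\n", eax >> 24);
                break;
        }
        return (0);
}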

sys/amd64/amd64/mp_machdep.c (+11, -5)
@@ -861,7 +861,7 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
        (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-       invlpg(smp_tlb_addr1);
+       pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1);
        if (smp_tlb_pmap == PCPU_GET(curpmap) &&
            smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
            PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
@@ -931,10 +931,16 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
 #endif /* COUNT_IPIS */
 
        addr = smp_tlb_addr1;
-       do {
-               invlpg(addr);
-               addr += PAGE_SIZE;
-       } while (addr < smp_tlb_addr2);
+       if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+               struct invpcid_descr d = { 0 };
+
+               invpcid(&d, INVPCID_CTXGLOB);
+       } else {
+               do {
+                       invlpg(addr);
+                       addr += PAGE_SIZE;
+               } while (addr < smp_tlb_addr2);
+       }
        if (smp_tlb_pmap == PCPU_GET(curpmap) &&
            smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
            PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
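Note that only kernel_pmap takes the INVPCID_CTXGLOB path: global
(PG_G) PTEs exist only for kernel mappings, and those are exactly the
entries the suspected erratum leaves stale, so user pmaps keep the
cheap per-page INVLPG loop.  A hypothetical helper (not part of the
commit) expressing that decision:

/*
 * Hypothetical predicate, not in the commit: the big-hammer flush is
 * needed only when flushing kernel mappings (the only ones with PG_G)
 * on a core where the workaround was activated.
 */
static bool
need_ctxglob_flush(pmap_t pmap)
{
        return (pmap == kernel_pmap &&
            PCPU_GET(pcid_invlpg_workaround) != 0);
}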

sys/amd64/amd64/pmap.c (+29, -7)
@@ -529,6 +529,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
+int pmap_pcid_invlpg_workaround = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
+    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pmap_pcid_invlpg_workaround, 0,
+    "Enable small core PCID/INVLPG workaround");
+int pmap_pcid_invlpg_workaround_uena = 1;
 
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -2560,6 +2566,9 @@ pmap_init(void)
                    VM_PAGE_TO_PHYS(m);
                }
        }
+
+       TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
+           &pmap_pcid_invlpg_workaround_uena);
 }
 
 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
@@ -2791,7 +2800,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 
        if ((newpde & PG_PS) == 0)
                /* Demotion: flush a specific 2MB page mapping. */
-               invlpg(va);
+               pmap_invlpg(pmap, va);
        else if ((newpde & PG_G) == 0)
                /*
                 * Promotion: flush every 4KB page mapping from the TLB
@@ -3130,7 +3139,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
        if (pmap == kernel_pmap) {
-               invlpg(va);
+               pmap_invlpg(kernel_pmap, va);
        } else if (pmap == PCPU_GET(curpmap)) {
                invlpg(va);
                pmap_invalidate_page_cb(pmap, va);
@@ -3221,8 +3230,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
        vm_offset_t addr;
 
        if (pmap == kernel_pmap) {
-               for (addr = sva; addr < eva; addr += PAGE_SIZE)
-                       invlpg(addr);
+               if (PCPU_GET(pcid_invlpg_workaround)) {
+                       struct invpcid_descr d = { 0 };
+
+                       invpcid(&d, INVPCID_CTXGLOB);
+               } else {
+                       for (addr = sva; addr < eva; addr += PAGE_SIZE)
+                               invlpg(addr);
+               }
        } else if (pmap == PCPU_GET(curpmap)) {
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
@@ -3760,7 +3775,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
        for (; spa < epa; spa += PAGE_SIZE) {
                sched_pin();
                pte_store(pte, spa | pte_bits);
-               invlpg(vaddr);
+               pmap_invlpg(kernel_pmap, vaddr);
                /* XXXKIB atomic inside flush_cache_range are excessive */
                pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
                sched_unpin();
@@ -7668,7 +7683,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i)
 
        va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
        pmap_kenter(va, pa);
-       invlpg(va);
+       pmap_invlpg(kernel_pmap, va);
        return ((void *)crashdumpmap);
 }
 
@@ -10371,7 +10386,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
                            page[i]->md.pat_mode, 0);
                        pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
                            cache_bits);
-                       invlpg(vaddr[i]);
+                       pmap_invlpg(kernel_pmap, vaddr[i]);
                }
        }
 }
@@ -10420,7 +10435,14 @@ pmap_quick_remove_page(vm_offset_t addr)
        if (addr != qframe)
                return;
        pte_store(vtopte(qframe), 0);
+
+       /*
+        * Since qframe is exclusively mapped by
+        * pmap_quick_enter_page() and that function doesn't set PG_G,
+        * we can use INVLPG here.
+        */
        invlpg(qframe);
+
        mtx_unlock_spin(&qframe_mtx);
 }

sys/amd64/include/pcpu.h (+2, -1)
@@ -100,7 +100,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
        u_int   pc_smp_tlb_op;          \
        uint64_t pc_ucr3_load_mask;     \
        u_int   pc_small_core;          \
-       char    __pad[2912]             /* pad to UMA_PCPU_ALLOC_SIZE */
+       u_int   pc_pcid_invlpg_workaround; \
+       char    __pad[2908]             /* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define PC_DBREG_CMD_NONE      0
 #define PC_DBREG_CMD_LOAD      1
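The pad shrinks by exactly sizeof(u_int) (2912 - 4 = 2908) so the
machine-dependent fields keep struct pcpu at UMA_PCPU_ALLOC_SIZE.  A
trivial compile-time restatement of that bookkeeping (illustrative
only; the kernel's own static assertions enforce the real invariant):

/* Illustrative only: a new field's size must come out of the pad. */
_Static_assert(2912 - sizeof(u_int) == 2908,
    "pc_pcid_invlpg_workaround must be paid for from __pad");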

sys/amd64/include/pmap.h (+20)
@@ -431,6 +431,8 @@ extern vm_offset_t virtual_end;
 extern vm_paddr_t dmaplimit;
 extern int pmap_pcid_enabled;
 extern int invpcid_works;
+extern int pmap_pcid_invlpg_workaround;
+extern int pmap_pcid_invlpg_workaround_uena;
 
 #define pmap_page_get_memattr(m)       ((vm_memattr_t)(m)->md.pat_mode)
 #define pmap_page_is_write_mapped(m)   (((m)->a.flags & PGA_WRITEABLE) != 0)
@@ -514,6 +516,24 @@ pmap_invalidate_cpu_mask(pmap_t pmap)
        return (&pmap->pm_active);
 }
 
+/*
+ * It seems that AlderLake+ small cores have some microarchitectural
+ * bug, which results in the INVLPG instruction failing to flush all
+ * global TLB entries when PCID is enabled.  Work around it for now,
+ * by doing global invalidation on small cores instead of INVLPG.
+ */
+static __inline void
+pmap_invlpg(pmap_t pmap, vm_offset_t va)
+{
+       if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+               struct invpcid_descr d = { 0 };
+
+               invpcid(&d, INVPCID_CTXGLOB);
+       } else {
+               invlpg(va);
+       }
+}
+
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
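After this change, the convention is that TLB flushes for kernel
addresses that may be mapped with PG_G go through pmap_invlpg() rather
than raw invlpg(), as the pmap.c hunks above illustrate; plain
invlpg() remains correct for mappings known not to be global (the
qframe case) and for user pmaps.  A hypothetical caller, just to show
the shape:

/*
 * Hypothetical example, not from the commit: update a kernel PTE and
 * flush it via pmap_invlpg(), which transparently degrades to
 * INVPCID_CTXGLOB on cores with the workaround active.
 */
static void
kmap_remap_page(pt_entry_t *pte, vm_paddr_t pa, vm_offset_t va)
{
        pte_store(pte, pa | X86_PG_RW | X86_PG_V);
        pmap_invlpg(kernel_pmap, va);
}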
