Commit 4bd1b6c

Author: Fox Snowpatch (committed)
1 parent ddf9a4c commit 4bd1b6c

12 files changed: +180, -30 lines

Documentation/mm/split_page_table_lock.rst (+5 -1)

@@ -16,9 +16,13 @@ There are helpers to lock/unlock a table and other accessor functions:
  - pte_offset_map_lock()
         maps PTE and takes PTE table lock, returns pointer to PTE with
         pointer to its PTE table lock, or returns NULL if no PTE table;
- - pte_offset_map_nolock()
+ - pte_offset_map_ro_nolock()
         maps PTE, returns pointer to PTE with pointer to its PTE table
         lock (not taken), or returns NULL if no PTE table;
+ - pte_offset_map_rw_nolock()
+        maps PTE, returns pointer to PTE with pointer to its PTE table
+        lock (not taken) and the value of its pmd entry, or returns NULL
+        if no PTE table;
  - pte_offset_map()
         maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
  - pte_unmap()
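To make the read-only contract concrete, here is a minimal sketch: the caller takes the returned ptl itself and only reads through the PTE pointer before unmapping. The helper is from this commit; the surrounding function is hypothetical.

/* Hypothetical read-only caller: take the returned ptl, never write. */
static bool pte_is_present(struct mm_struct *mm, pmd_t *pmd,
                           unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *pte;
        bool ret;

        pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
        if (!pte)
                return false;           /* no PTE table */
        spin_lock(ptl);
        ret = pte_present(ptep_get(pte));
        spin_unlock(ptl);
        pte_unmap(pte);
        return ret;
}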

arch/arm/mm/fault-armv.c (+8 -1)

@@ -94,6 +94,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
+       pmd_t pmdval;
        int ret;

        pgd = pgd_offset(vma->vm_mm, address);
@@ -112,16 +113,22 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
        if (pmd_none_or_clear_bad(pmd))
                return 0;

+again:
        /*
         * This is called while another page table is mapped, so we
         * must use the nested version. This also means we need to
         * open-code the spin-locking.
         */
-       pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
+       pte = pte_offset_map_rw_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl);
        if (!pte)
                return 0;

        do_pte_lock(ptl);
+       if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+               do_pte_unlock(ptl);
+               pte_unmap(pte);
+               goto again;
+       }

        ret = do_adjust_pte(vma, address, pfn, pte);
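The discipline above (map, take the ptl, revalidate the pmd, retry) is the general maywrite pattern. A condensed sketch, with a hypothetical helper name and a plain spin_lock() in place of the arch-specific do_pte_lock(); this is in effect what pte_offset_map_lock() already does internally:

/* Hypothetical helper: map a PTE and lock it, retrying if the PTE
 * table was freed or replaced between mapping and locking. */
static pte_t *map_and_lock_pte(struct mm_struct *mm, pmd_t *pmd,
                               unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmdval;
        pte_t *pte;

again:
        pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, ptlp);
        if (!pte)
                return NULL;            /* no PTE table */
        spin_lock(*ptlp);
        if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
                spin_unlock(*ptlp);
                pte_unmap(pte);
                goto again;             /* raced with a table change */
        }
        return pte;                     /* returns with *ptlp held */
}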

arch/powerpc/mm/pgtable.c (+1 -1)

@@ -392,7 +392,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
         */
        if (pmd_none(*pmd))
                return;
-       pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+       pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
        BUG_ON(!pte);
        assert_spin_locked(ptl);
        pte_unmap(pte);

include/linux/mm.h (+5 -2)

@@ -2986,8 +2986,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
        return pte;
 }

-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
-                       unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, pmd_t *pmdvalp,
+                       spinlock_t **ptlp);

 #define pte_unmap_unlock(pte, ptl)     do {            \
        spin_unlock(ptl);                               \
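The definitions behind these declarations are not in any hunk shown here (the commit touches 12 files but only 9 appear above); in the kernel they would live in mm/pgtable-generic.c alongside pte_offset_map_lock(). A plausible shape, modeled on the former pte_offset_map_nolock() and the documented semantics; treat it as a sketch, not the verbatim patch:

pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmdval;
        pte_t *pte;

        /* __pte_offset_map() records the pmd value it walked through */
        pte = __pte_offset_map(pmd, addr, &pmdval);
        if (likely(pte))
                *ptlp = pte_lockptr(mm, &pmdval);
        return pte;
}

pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, pmd_t *pmdvalp,
                        spinlock_t **ptlp)
{
        pte_t *pte;

        /* hand the observed pmd value back for a later pmd_same() check */
        VM_WARN_ON_ONCE(!pmdvalp);
        pte = __pte_offset_map(pmd, addr, pmdvalp);
        if (likely(pte))
                *ptlp = pte_lockptr(mm, pmdvalp);
        return pte;
}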

mm/filemap.c (+2 -2)

@@ -3231,8 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
        if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                return 0;

-       ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
-                                    &vmf->ptl);
+       ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+                                       &vmf->ptl);
        if (unlikely(!ptep))
                return VM_FAULT_NOPAGE;
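filemap_fault_recheck_pte_none() only ever compares the entry against none under the ptl, so the read-only variant suffices. A sketch of that peek-then-confirm pattern, as a hypothetical standalone function modeled on the caller above:

/* Hypothetical sketch: lockless peek first, take the ptl only to confirm. */
static bool pte_still_none(struct mm_struct *mm, pmd_t *pmd,
                           unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *ptep;
        bool none;

        ptep = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
        if (!ptep)
                return true;            /* no PTE table at all */
        none = pte_none(ptep_get_lockless(ptep));
        if (none) {
                /* confirm under the lock before trusting the peek */
                spin_lock(ptl);
                none = pte_none(ptep_get(ptep));
                spin_unlock(ptl);
        }
        pte_unmap(ptep);
        return none;
}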

mm/khugepaged.c (+36 -3)

@@ -1009,7 +1009,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
                };

                if (!pte++) {
-                       pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+                       /*
+                        * Here the ptl is only used to check pte_same() in
+                        * do_swap_page(), so the read-only version is enough.
+                        */
+                       pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
                        if (!pte) {
                                mmap_read_unlock(mm);
                                result = SCAN_PMD_NULL;
@@ -1598,14 +1602,17 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
                pml = pmd_lock(mm, pmd);

-       start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+       start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
        if (!start_pte)         /* mmap_lock + page lock should prevent this */
                goto abort;
        if (!pml)
                spin_lock(ptl);
        else if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

+       if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+               goto abort;
+
        /* step 2: clear page table and adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1651,6 +1658,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        /* step 4: remove empty page table */
        if (!pml) {
                pml = pmd_lock(mm, pmd);
+               /*
+                * We called pte_unmap() and released the ptl before acquiring
+                * the pml, which means we left the RCU critical section; the
+                * PTE page may have been freed in the meantime, so we must do
+                * a pmd_same() check before reacquiring the ptl.
+                */
+               if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+                       spin_unlock(pml);
+                       goto pmd_change;
+               }
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
        }
@@ -1682,6 +1699,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        pte_unmap_unlock(start_pte, ptl);
        if (pml && pml != ptl)
                spin_unlock(pml);
+pmd_change:
        if (notified)
                mmu_notifier_invalidate_range_end(&range);
 drop_folio:
@@ -1703,6 +1721,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                spinlock_t *pml;
                spinlock_t *ptl;
                bool skipped_uffd = false;
+               pte_t *pte;

                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1738,11 +1757,25 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                                        addr, addr + HPAGE_PMD_SIZE);
                mmu_notifier_invalidate_range_start(&range);

+               pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pgt_pmd, &ptl);
+               if (!pte) {
+                       mmu_notifier_invalidate_range_end(&range);
+                       continue;
+               }
+
                pml = pmd_lock(mm, pmd);
-               ptl = pte_lockptr(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

+               if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+                       pte_unmap_unlock(pte, ptl);
+                       if (ptl != pml)
+                               spin_unlock(pml);
+                       mmu_notifier_invalidate_range_end(&range);
+                       continue;
+               }
+               pte_unmap(pte);
+
                /*
                 * Huge page lock is still held, so normally the page table
                 * must remain empty; and we have already skipped anon_vma
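retract_page_tables() needs the PTE mapping only to keep the table alive (via RCU) across the lock acquisition; once pmd_same() has passed under both locks, the locks themselves pin the table and the mapping can be dropped early with pte_unmap(). A condensed, hypothetical form of that sequence, with the actual retraction elided:

/* Hypothetical condensation of the retract_page_tables() locking above. */
static void retract_one_table_sketch(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr)
{
        spinlock_t *pml, *ptl;
        pmd_t pgt_pmd;
        pte_t *pte;

        /* map to keep the PTE table alive while we take the locks */
        pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pgt_pmd, &ptl);
        if (!pte)
                return;
        pml = pmd_lock(mm, pmd);
        if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
                pte_unmap_unlock(pte, ptl);
                if (ptl != pml)
                        spin_unlock(pml);
                return;         /* table already retracted or replaced */
        }
        pte_unmap(pte);         /* locks held from here on pin the table */

        /* ... clear the pmd entry and free the PTE page here ... */

        if (ptl != pml)
                spin_unlock(ptl);
        spin_unlock(pml);
}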

mm/memory.c (+29 -3)

@@ -1083,6 +1083,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
+       pmd_t pmdval;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, max_nr, ret = 0;
@@ -1108,13 +1109,28 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                ret = -ENOMEM;
                goto out;
        }
-       src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+
+       /*
+        * Since we may free the PTE page in retract_page_tables() without
+        * holding the read lock of mmap_lock, we still need to do a
+        * pmd_same() check after taking the PTL.
+        */
+       src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &pmdval,
+                                          &src_ptl);
        if (!src_pte) {
                pte_unmap_unlock(dst_pte, dst_ptl);
                /* ret == 0 */
                goto out;
        }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+       if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(src_pmd)))) {
+               pte_unmap_unlock(src_pte, src_ptl);
+               pte_unmap_unlock(dst_pte, dst_ptl);
+               /* ret == 0 */
+               goto out;
+       }
+
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();
@@ -5499,14 +5515,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
+               pmd_t dummy_pmdval;
+
                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd by anon khugepaged, since that takes mmap_lock in write
                 * mode; but shmem or file collapse to THP could still morph
                 * it into a huge pmd: just retry later if so.
+                *
+                * Use the maywrite version to indicate that vmf->pte may be
+                * modified, but since we will use pte_same() to detect the
+                * change of the !pte_none() entry, there is no need to recheck
+                * the pmdval. Here we choose to pass a dummy variable instead
+                * of NULL, which helps new users think about why this place is
+                * special.
                 */
-               vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
-                                                vmf->address, &vmf->ptl);
+               vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
+                                                   vmf->address, &dummy_pmdval,
+                                                   &vmf->ptl);
                if (unlikely(!vmf->pte))
                        return 0;
                vmf->orig_pte = ptep_get_lockless(vmf->pte);
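Why the dummy pmdval is safe here: any later taker of vmf->ptl revalidates the entry against orig_pte with pte_same(), and clearing the table for collapse changes the observed !pte_none() entry, so the pmd value never needs rechecking. A sketch of that downstream check, as a hypothetical function modeled on the fault handlers' usual pattern:

/* Hypothetical revalidation after taking the ptl: if the entry changed
 * (including the whole table being cleared for collapse), bail out and
 * let the fault be retried. Returns with the ptl held on success. */
static bool pte_still_same(struct vm_fault *vmf)
{
        spin_lock(vmf->ptl);
        if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return false;   /* raced; caller returns and refaults */
        }
        return true;
}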

mm/mremap.c (+18 -2)

@@ -143,6 +143,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        spinlock_t *old_ptl, *new_ptl;
        bool force_flush = false;
        unsigned long len = old_end - old_addr;
+       pmd_t pmdval;
        int err = 0;

        /*
@@ -175,14 +176,29 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                err = -EAGAIN;
                goto out;
        }
-       new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+       /*
+        * Since we may free the PTE page in retract_page_tables() without
+        * holding the read lock of mmap_lock, we still need to do a
+        * pmd_same() check after taking the PTL.
+        */
+       new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &pmdval,
+                                          &new_ptl);
        if (!new_pte) {
                pte_unmap_unlock(old_pte, old_ptl);
                err = -EAGAIN;
                goto out;
        }
-       if (new_ptl != old_ptl)
+       if (new_ptl != old_ptl) {
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+               if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(new_pmd)))) {
+                       pte_unmap_unlock(new_pte, new_ptl);
+                       pte_unmap_unlock(old_pte, old_ptl);
+                       err = -EAGAIN;
+                       goto out;
+               }
+       }
+
        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();

(The recheck can live inside the new_ptl != old_ptl branch because, when the two tables share a lock, old_ptl was already held while new_pte was being mapped, so the new_pmd entry could not have changed underneath.)

mm/page_vma_mapped.c (+20 -4)

@@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
        return false;
 }

-static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
+                   spinlock_t **ptlp)
 {
        pte_t ptent;
+       pmd_t pmdval;

        if (pvmw->flags & PVMW_SYNC) {
                /* Use the stricter lookup */
@@ -25,17 +27,19 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
                return !!pvmw->pte;
        }

+again:
        /*
         * It is important to return the ptl corresponding to pte,
         * in case *pvmw->pmd changes underneath us; so we need to
         * return it even when choosing not to lock, in case caller
         * proceeds to loop over next ptes, and finds a match later.
         * Though, in most cases, page lock already protects this.
         */
-       pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
-                                         pvmw->address, ptlp);
+       pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+                                            pvmw->address, &pmdval, ptlp);
        if (!pvmw->pte)
                return false;
+       *pmdvalp = pmdval;

        ptent = ptep_get(pvmw->pte);

@@ -69,6 +73,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
        }
        pvmw->ptl = *ptlp;
        spin_lock(pvmw->ptl);
+
+       if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
+               spin_unlock(pvmw->ptl);
+               goto again;
+       }
+
        return true;
 }

@@ -278,7 +288,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                        step_forward(pvmw, PMD_SIZE);
                        continue;
                }
-               if (!map_pte(pvmw, &ptl)) {
+               if (!map_pte(pvmw, &pmde, &ptl)) {
                        if (!pvmw->pte)
                                goto restart;
                        goto next_pte;
@@ -307,6 +317,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                if (!pvmw->ptl) {
                        pvmw->ptl = ptl;
                        spin_lock(pvmw->ptl);
+                       if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
+                               pte_unmap_unlock(pvmw->pte, pvmw->ptl);
+                               pvmw->ptl = NULL;
+                               pvmw->pte = NULL;
+                               goto restart;
+                       }
                }
                goto this_pte;
        } while (pvmw->address < end);
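At the call sites nothing changes: page_vma_mapped_walk() still hands back each mapped entry with pvmw->ptl held, and the pmd_same() rechecks above stay internal to the walk. A hypothetical rmap-style caller for illustration (function name invented; DEFINE_FOLIO_VMA_WALK is the existing initializer from include/linux/rmap.h):

/* Hypothetical caller: count how many PTEs in @vma map @folio. */
static unsigned int count_folio_ptes(struct folio *folio,
                                     struct vm_area_struct *vma,
                                     unsigned long address)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        unsigned int nr = 0;

        while (page_vma_mapped_walk(&pvmw))
                nr++;   /* pvmw.pte is mapped and pvmw.ptl is held here */

        return nr;      /* the walk drops its own locks before returning false */
}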
