Skip to content

Commit 97c5daa

Browse files
author
Fox Snowpatch
committed
1 parent ddf9a4c commit 97c5daa

12 files changed

Lines changed: 143 additions & 31 deletions

File tree

Documentation/mm/split_page_table_lock.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@ There are helpers to lock/unlock a table and other accessor functions:
1616
- pte_offset_map_lock()
1717
maps PTE and takes PTE table lock, returns pointer to PTE with
1818
pointer to its PTE table lock, or returns NULL if no PTE table;
19-
- pte_offset_map_nolock()
19+
- pte_offset_map_readonly_nolock()
2020
maps PTE, returns pointer to PTE with pointer to its PTE table
2121
lock (not taken), or returns NULL if no PTE table;
22+
- pte_offset_map_maywrite_nolock()
23+
maps PTE, returns pointer to PTE with pointer to its PTE table
24+
lock (not taken) and the value of its pmd entry, or returns NULL
25+
if no PTE table;
2226
- pte_offset_map()
2327
maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
2428
- pte_unmap()

arch/arm/mm/fault-armv.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
9494
pud_t *pud;
9595
pmd_t *pmd;
9696
pte_t *pte;
97+
pmd_t pmdval;
9798
int ret;
9899

99100
pgd = pgd_offset(vma->vm_mm, address);
@@ -112,16 +113,22 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
112113
if (pmd_none_or_clear_bad(pmd))
113114
return 0;
114115

116+
again:
115117
/*
116118
* This is called while another page table is mapped, so we
117119
* must use the nested version. This also means we need to
118120
* open-code the spin-locking.
119121
*/
120-
pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
122+
pte = pte_offset_map_maywrite_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl);
121123
if (!pte)
122124
return 0;
123125

124126
do_pte_lock(ptl);
127+
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
128+
do_pte_unlock(ptl);
129+
pte_unmap(pte);
130+
goto again;
131+
}
125132

126133
ret = do_adjust_pte(vma, address, pfn, pte);
127134

arch/powerpc/mm/pgtable.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
392392
*/
393393
if (pmd_none(*pmd))
394394
return;
395-
pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
395+
pte = pte_offset_map_readonly_nolock(mm, pmd, addr, &ptl);
396396
BUG_ON(!pte);
397397
assert_spin_locked(ptl);
398398
pte_unmap(pte);

include/linux/mm.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2986,8 +2986,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
29862986
return pte;
29872987
}
29882988

2989-
pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
2990-
unsigned long addr, spinlock_t **ptlp);
2989+
pte_t *pte_offset_map_readonly_nolock(struct mm_struct *mm, pmd_t *pmd,
2990+
unsigned long addr, spinlock_t **ptlp);
2991+
pte_t *pte_offset_map_maywrite_nolock(struct mm_struct *mm, pmd_t *pmd,
2992+
unsigned long addr, pmd_t *pmdvalp,
2993+
spinlock_t **ptlp);
29912994

29922995
#define pte_unmap_unlock(pte, ptl) do { \
29932996
spin_unlock(ptl); \

mm/filemap.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3231,8 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
32313231
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
32323232
return 0;
32333233

3234-
ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
3235-
&vmf->ptl);
3234+
ptep = pte_offset_map_readonly_nolock(vma->vm_mm, vmf->pmd,
3235+
vmf->address, &vmf->ptl);
32363236
if (unlikely(!ptep))
32373237
return VM_FAULT_NOPAGE;
32383238

mm/khugepaged.c

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,7 +1009,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
10091009
};
10101010

10111011
if (!pte++) {
1012-
pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
1012+
/*
1013+
* Here the ptl is only used to check pte_same() in
1014+
* do_swap_page(), so readonly version is enough.
1015+
*/
1016+
pte = pte_offset_map_readonly_nolock(mm, pmd, address, &ptl);
10131017
if (!pte) {
10141018
mmap_read_unlock(mm);
10151019
result = SCAN_PMD_NULL;
@@ -1598,14 +1602,17 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
15981602
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
15991603
pml = pmd_lock(mm, pmd);
16001604

1601-
start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
1605+
start_pte = pte_offset_map_maywrite_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
16021606
if (!start_pte) /* mmap_lock + page lock should prevent this */
16031607
goto abort;
16041608
if (!pml)
16051609
spin_lock(ptl);
16061610
else if (ptl != pml)
16071611
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
16081612

1613+
if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
1614+
goto abort;
1615+
16091616
/* step 2: clear page table and adjust rmap */
16101617
for (i = 0, addr = haddr, pte = start_pte;
16111618
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1651,6 +1658,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
16511658
/* step 4: remove empty page table */
16521659
if (!pml) {
16531660
pml = pmd_lock(mm, pmd);
1661+
/*
1662+
* We called pte_unmap() and release the ptl before acquiring
1663+
* the pml, which means we left the RCU critical section, so the
1664+
* PTE page may have been freed, so we must do pmd_same() check
1665+
* before reacquiring the ptl.
1666+
*/
1667+
if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
1668+
spin_unlock(pml);
1669+
goto pmd_change;
1670+
}
16541671
if (ptl != pml)
16551672
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
16561673
}
@@ -1682,6 +1699,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
16821699
pte_unmap_unlock(start_pte, ptl);
16831700
if (pml && pml != ptl)
16841701
spin_unlock(pml);
1702+
pmd_change:
16851703
if (notified)
16861704
mmu_notifier_invalidate_range_end(&range);
16871705
drop_folio:
@@ -1703,6 +1721,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
17031721
spinlock_t *pml;
17041722
spinlock_t *ptl;
17051723
bool skipped_uffd = false;
1724+
pte_t *pte;
17061725

17071726
/*
17081727
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1738,11 +1757,25 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
17381757
addr, addr + HPAGE_PMD_SIZE);
17391758
mmu_notifier_invalidate_range_start(&range);
17401759

1760+
pte = pte_offset_map_maywrite_nolock(mm, pmd, addr, &pgt_pmd, &ptl);
1761+
if (!pte) {
1762+
mmu_notifier_invalidate_range_end(&range);
1763+
continue;
1764+
}
1765+
17411766
pml = pmd_lock(mm, pmd);
1742-
ptl = pte_lockptr(mm, pmd);
17431767
if (ptl != pml)
17441768
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
17451769

1770+
if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
1771+
pte_unmap_unlock(pte, ptl);
1772+
if (ptl != pml)
1773+
spin_unlock(pml);
1774+
mmu_notifier_invalidate_range_end(&range);
1775+
continue;
1776+
}
1777+
pte_unmap(pte);
1778+
17461779
/*
17471780
* Huge page lock is still held, so normally the page table
17481781
* must remain empty; and we have already skipped anon_vma

mm/memory.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,6 +1083,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
10831083
struct mm_struct *src_mm = src_vma->vm_mm;
10841084
pte_t *orig_src_pte, *orig_dst_pte;
10851085
pte_t *src_pte, *dst_pte;
1086+
pmd_t pmdval;
10861087
pte_t ptent;
10871088
spinlock_t *src_ptl, *dst_ptl;
10881089
int progress, max_nr, ret = 0;
@@ -1108,7 +1109,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
11081109
ret = -ENOMEM;
11091110
goto out;
11101111
}
1111-
src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
1112+
src_pte = pte_offset_map_maywrite_nolock(src_mm, src_pmd, addr, NULL,
1113+
&src_ptl);
11121114
if (!src_pte) {
11131115
pte_unmap_unlock(dst_pte, dst_ptl);
11141116
/* ret == 0 */
@@ -5504,9 +5506,14 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
55045506
* pmd by anon khugepaged, since that takes mmap_lock in write
55055507
* mode; but shmem or file collapse to THP could still morph
55065508
* it into a huge pmd: just retry later if so.
5509+
*
5510+
* Use the maywrite version to indicate that vmf->pte will be
5511+
* modified, but since we will use pte_same() to detect the
5512+
* change of the pte entry, there is no need to get pmdval.
55075513
*/
5508-
vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
5509-
vmf->address, &vmf->ptl);
5514+
vmf->pte = pte_offset_map_maywrite_nolock(vmf->vma->vm_mm,
5515+
vmf->pmd, vmf->address,
5516+
NULL, &vmf->ptl);
55105517
if (unlikely(!vmf->pte))
55115518
return 0;
55125519
vmf->orig_pte = ptep_get_lockless(vmf->pte);

mm/mremap.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,12 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
175175
err = -EAGAIN;
176176
goto out;
177177
}
178-
new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
178+
/*
179+
* Use the maywrite version to indicate that new_pte will be modified,
180+
* but since we hold the exclusive mmap_lock, there is no need to
181+
* recheck pmd_same() after acquiring the new_ptl.
182+
*/
183+
new_pte = pte_offset_map_maywrite_nolock(mm, new_pmd, new_addr, NULL, &new_ptl);
179184
if (!new_pte) {
180185
pte_unmap_unlock(old_pte, old_ptl);
181186
err = -EAGAIN;

mm/page_vma_mapped.c

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
1313
return false;
1414
}
1515

16-
static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
16+
static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
17+
spinlock_t **ptlp)
1718
{
1819
pte_t ptent;
20+
pmd_t pmdval;
1921

2022
if (pvmw->flags & PVMW_SYNC) {
2123
/* Use the stricter lookup */
@@ -25,17 +27,19 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
2527
return !!pvmw->pte;
2628
}
2729

30+
again:
2831
/*
2932
* It is important to return the ptl corresponding to pte,
3033
* in case *pvmw->pmd changes underneath us; so we need to
3134
* return it even when choosing not to lock, in case caller
3235
* proceeds to loop over next ptes, and finds a match later.
3336
* Though, in most cases, page lock already protects this.
3437
*/
35-
pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
36-
pvmw->address, ptlp);
38+
pvmw->pte = pte_offset_map_maywrite_nolock(pvmw->vma->vm_mm, pvmw->pmd,
39+
pvmw->address, &pmdval, ptlp);
3740
if (!pvmw->pte)
3841
return false;
42+
*pmdvalp = pmdval;
3943

4044
ptent = ptep_get(pvmw->pte);
4145

@@ -69,6 +73,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
6973
}
7074
pvmw->ptl = *ptlp;
7175
spin_lock(pvmw->ptl);
76+
77+
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
78+
spin_unlock(pvmw->ptl);
79+
goto again;
80+
}
81+
7282
return true;
7383
}
7484

@@ -278,7 +288,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
278288
step_forward(pvmw, PMD_SIZE);
279289
continue;
280290
}
281-
if (!map_pte(pvmw, &ptl)) {
291+
if (!map_pte(pvmw, &pmde, &ptl)) {
282292
if (!pvmw->pte)
283293
goto restart;
284294
goto next_pte;
@@ -307,6 +317,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
307317
if (!pvmw->ptl) {
308318
pvmw->ptl = ptl;
309319
spin_lock(pvmw->ptl);
320+
if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
321+
pte_unmap_unlock(pvmw->pte, pvmw->ptl);
322+
pvmw->ptl = NULL;
323+
pvmw->pte = NULL;
324+
goto restart;
325+
}
310326
}
311327
goto this_pte;
312328
} while (pvmw->address < end);

mm/pgtable-generic.c

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -305,15 +305,30 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
305305
return NULL;
306306
}
307307

308-
pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
309-
unsigned long addr, spinlock_t **ptlp)
308+
pte_t *pte_offset_map_readonly_nolock(struct mm_struct *mm, pmd_t *pmd,
309+
unsigned long addr, spinlock_t **ptlp)
310+
{
311+
pmd_t pmdval;
312+
pte_t *pte;
313+
314+
pte = __pte_offset_map(pmd, addr, &pmdval);
315+
if (likely(pte))
316+
*ptlp = pte_lockptr(mm, &pmdval);
317+
return pte;
318+
}
319+
320+
pte_t *pte_offset_map_maywrite_nolock(struct mm_struct *mm, pmd_t *pmd,
321+
unsigned long addr, pmd_t *pmdvalp,
322+
spinlock_t **ptlp)
310323
{
311324
pmd_t pmdval;
312325
pte_t *pte;
313326

314327
pte = __pte_offset_map(pmd, addr, &pmdval);
315328
if (likely(pte))
316329
*ptlp = pte_lockptr(mm, &pmdval);
330+
if (pmdvalp)
331+
*pmdvalp = pmdval;
317332
return pte;
318333
}
319334

@@ -347,14 +362,21 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
347362
* and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
348363
* afterwards.
349364
*
350-
* pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
351-
* but when successful, it also outputs a pointer to the spinlock in ptlp - as
352-
* pte_offset_map_lock() does, but in this case without locking it. This helps
353-
* the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
354-
* act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
355-
* pointer for the page table that it returns. In principle, the caller should
356-
* recheck *pmd once the lock is taken; in practice, no callsite needs that -
357-
* either the mmap_lock for write, or pte_same() check on contents, is enough.
365+
* pte_offset_map_readonly_nolock(mm, pmd, addr, ptlp), above, is like
366+
* pte_offset_map(); but when successful, it also outputs a pointer to the
367+
* spinlock in ptlp - as pte_offset_map_lock() does, but in this case without
368+
* locking it. This helps the caller to avoid a later pte_lockptr(mm, *pmd),
369+
* which might by that time act on a changed *pmd: pte_offset_map_readonly_nolock()
370+
* provides the correct spinlock pointer for the page table that it returns.
371+
* For readonly case, the caller does not need to recheck *pmd after the lock is
372+
* taken, because the RCU lock will ensure that the PTE page will not be freed.
373+
*
374+
* pte_offset_map_maywrite_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
375+
* pte_offset_map_readonly_nolock(); but when successful, it also outputs the
376+
* pmdval. For cases where pte or pmd entries may be modified, that is, maywrite
377+
* case, this can help the caller recheck *pmd once the lock is taken. In some
378+
* cases we can pass NULL to pmdvalp: either the mmap_lock for write, or
379+
* pte_same() check on contents, is also enough.
358380
*
359381
* Note that free_pgtables(), used after unmapping detached vmas, or when
360382
* exiting the whole mm, does not take page table lock before freeing a page

0 commit comments

Comments
 (0)