
Commit 2a4077f

lorenzo-stoakes authored and akpm00 committed
mm/mremap: refactor move_page_tables(), abstracting state
mm/mremap: refactor move_page_tables(), abstracting state

A lot of state is threaded throughout the page table moving logic within the mremap code, including boolean values which control behaviour specifically in regard to whether rmap locks need be held over the operation and whether the VMA belongs to a temporary stack being moved by move_arg_pages() (and consequently, relocate_vma_down()).

As we already transmit state throughout this operation, it is neater and more readable to maintain a small state object. We do so in the form of pagetable_move_control.

In addition, this allows us to update parameters within the state as we manipulate things, for instance with regard to the page table realignment logic. In future I want to add additional functionality to the page table logic, so this is an additional motivation for making it easier to do so.

This patch changes move_page_tables() to accept a pointer to a pagetable_move_control struct, and performs changes at this level only. Further page table logic will be updated in a subsequent patch.

We also take the opportunity to add significant comments describing the address realignment logic to make it abundantly clear what is going on in this code.

Link: https://lkml.kernel.org/r/e20180add9c8746184aa3f23a61fff69a06cdaa9.1741639347.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Cc: Harry Yoo <[email protected]>
Cc: Liam R. Howlett <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
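As an illustration of the interface change (a sketch drawn from the hunks below, with declarations and error handling elided), a caller that previously threaded loose parameters through move_page_tables() now builds a pagetable_move_control and passes that instead:

        /* Before: behaviour flags passed as loose boolean parameters. */
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, len,
                                     /* need_rmap_locks= */ false,
                                     /* for_stack= */ false);

        /* After: the same state carried in a single control object. */
        PAGETABLE_MOVE(pmc, vma, new_vma, old_addr, new_addr, len);
        moved_len = move_page_tables(&pmc);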
1 parent: b714ccb · commit: 2a4077f

3 files changed, +168 -52 lines

mm/internal.h (+39 -4)

@@ -24,6 +24,44 @@

 struct folio_batch;

+/*
+ * Maintains state across a page table move. The operation assumes both source
+ * and destination VMAs already exist and are specified by the user.
+ *
+ * Partial moves are permitted, but the old and new ranges must both reside
+ * within a VMA.
+ *
+ * mmap lock must be held in write and VMA write locks must be held on any VMA
+ * that is visible.
+ *
+ * Use the PAGETABLE_MOVE() macro to initialise this struct.
+ *
+ * NOTE: The page table move is affected by reading from [old_addr, old_end),
+ * and old_addr may be updated for better page table alignment, so len_in
+ * represents the length of the range being copied as specified by the user.
+ */
+struct pagetable_move_control {
+        struct vm_area_struct *old; /* Source VMA. */
+        struct vm_area_struct *new; /* Destination VMA. */
+        unsigned long old_addr; /* Address from which the move begins. */
+        unsigned long old_end; /* Exclusive address at which old range ends. */
+        unsigned long new_addr; /* Address to move page tables to. */
+        unsigned long len_in; /* Bytes to remap specified by user. */
+
+        bool need_rmap_locks; /* Do rmap locks need to be taken? */
+        bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+        struct pagetable_move_control name = { \
+                .old = old_, \
+                .new = new_, \
+                .old_addr = old_addr_, \
+                .old_end = (old_addr_) + (len_), \
+                .new_addr = new_addr_, \
+                .len_in = len_, \
+        }
+
 /*
  * The set of flags that only affect watermark checking and reclaim
  * behaviour. This is used by the MM to obey the caller constraints
@@ -1527,10 +1565,7 @@ extern struct list_lru shadow_nodes;
 } while (0)

 /* mremap.c */
-unsigned long move_page_tables(struct vm_area_struct *vma,
-        unsigned long old_addr, struct vm_area_struct *new_vma,
-        unsigned long new_addr, unsigned long len,
-        bool need_rmap_locks, bool for_stack);
+unsigned long move_page_tables(struct pagetable_move_control *pmc);

 #ifdef CONFIG_UNACCEPTED_MEMORY
 void accept_page(struct page *page);
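For reference, PAGETABLE_MOVE() uses designated initialisers, so a call such as PAGETABLE_MOVE(pmc, old_vma, new_vma, a, b, len) expands to roughly the following (a hand-written sketch of the expansion, not a preprocessor dump; old_vma, new_vma, a, b and len are placeholder names):

        struct pagetable_move_control pmc = {
                .old      = old_vma,
                .new      = new_vma,
                .old_addr = a,
                .old_end  = a + len,    /* exclusive end of the source range */
                .new_addr = b,
                .len_in   = len,
                /*
                 * need_rmap_locks and for_stack are not named, so they are
                 * zero-initialised to false; call sites set them explicitly
                 * where required, as the relocate_vma_down() hunk below does
                 * with pmc.for_stack.
                 */
        };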

mm/mmap.c (+3 -2)

@@ -1694,6 +1694,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
        VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
        struct vm_area_struct *next;
        struct mmu_gather tlb;
+       PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);

        BUG_ON(new_start > new_end);

@@ -1716,8 +1717,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
-       if (length != move_page_tables(vma, old_start,
-                                      vma, new_start, length, false, true))
+       pmc.for_stack = true;
+       if (length != move_page_tables(&pmc))
                return -ENOMEM;

        tlb_gather_mmu(&tlb, mm);

mm/mremap.c (+126 -46)

@@ -580,8 +580,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
  * the VMA that is created to span the source and destination of the move,
  * so we make an exception for it.
  */
-static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
-                           unsigned long mask, bool for_stack)
+static bool can_align_down(struct pagetable_move_control *pmc,
+                           struct vm_area_struct *vma, unsigned long addr_to_align,
+                           unsigned long mask)
 {
        unsigned long addr_masked = addr_to_align & mask;

@@ -590,11 +591,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
         * of the corresponding VMA, we can't align down or we will destroy part
         * of the current mapping.
         */
-       if (!for_stack && vma->vm_start != addr_to_align)
+       if (!pmc->for_stack && vma->vm_start != addr_to_align)
                return false;

        /* In the stack case we explicitly permit in-VMA alignment. */
-       if (for_stack && addr_masked >= vma->vm_start)
+       if (pmc->for_stack && addr_masked >= vma->vm_start)
                return true;

        /*
@@ -604,54 +605,131 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
        return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
 }

-/* Opportunistically realign to specified boundary for faster copy. */
-static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
-                             unsigned long *new_addr, struct vm_area_struct *new_vma,
-                             unsigned long mask, bool for_stack)
+/*
+ * Determine if we are in fact able to realign for efficiency to a higher page
+ * table boundary.
+ */
+static bool can_realign_addr(struct pagetable_move_control *pmc,
+                             unsigned long pagetable_mask)
 {
+       unsigned long align_mask = ~pagetable_mask;
+       unsigned long old_align = pmc->old_addr & align_mask;
+       unsigned long new_align = pmc->new_addr & align_mask;
+       unsigned long pagetable_size = align_mask + 1;
+       unsigned long old_align_next = pagetable_size - old_align;
+
+       /*
+        * We don't want to have to go hunting for VMAs from the end of the old
+        * VMA to the next page table boundary, also we want to make sure the
+        * operation is worthwhile.
+        *
+        * So ensure that we only perform this realignment if the end of the
+        * range being copied reaches or crosses the page table boundary.
+        *
+        * boundary                        boundary
+        *  .<- old_align ->                .
+        *  .  |----------------.-----------|
+        *  .  |          vma   .           |
+        *  .  |----------------.-----------|
+        *  .  <----------------.----------->
+        *  .          len_in               .
+        *  <------------------------------->
+        *  .        pagetable_size         .
+        *  .               <---------------->
+        *  .               old_align_next  .
+        */
+       if (pmc->len_in < old_align_next)
+               return false;
+
        /* Skip if the addresses are already aligned. */
-       if ((*old_addr & ~mask) == 0)
-               return;
+       if (old_align == 0)
+               return false;

        /* Only realign if the new and old addresses are mutually aligned. */
-       if ((*old_addr & ~mask) != (*new_addr & ~mask))
-               return;
+       if (old_align != new_align)
+               return false;

        /* Ensure realignment doesn't cause overlap with existing mappings. */
-       if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
-           !can_align_down(new_vma, *new_addr, mask, for_stack))
+       if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
+           !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
+               return false;
+
+       return true;
+}
+
+/*
+ * Opportunistically realign to specified boundary for faster copy.
+ *
+ * Consider an mremap() of a VMA with page table boundaries as below, and no
+ * preceding VMAs from the lower page table boundary to the start of the VMA,
+ * with the end of the range reaching or crossing the page table boundary.
+ *
+ *   boundary                        boundary
+ *      .  |----------------.-----------|
+ *      .  |          vma   .           |
+ *      .  |----------------.-----------|
+ *      .  pmc->old_addr    .     pmc->old_end
+ *      .  <---------------------------->
+ *      .        move these page tables
+ *
+ * If we proceed with moving page tables in this scenario, we will have a lot of
+ * work to do traversing old page tables and establishing new ones in the
+ * destination across multiple lower level page tables.
+ *
+ * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
+ * page table boundary, so we can simply copy a single page table entry for the
+ * aligned portion of the VMA instead:
+ *
+ *   boundary                        boundary
+ *      .  |----------------.-----------|
+ *      .  |          vma   .           |
+ *      .  |----------------.-----------|
+ * pmc->old_addr            .     pmc->old_end
+ * <------------------------------------------->
+ *      .        move these page tables
+ */
+static void try_realign_addr(struct pagetable_move_control *pmc,
+                             unsigned long pagetable_mask)
+{
+
+       if (!can_realign_addr(pmc, pagetable_mask))
                return;

-       *old_addr = *old_addr & mask;
-       *new_addr = *new_addr & mask;
+       /*
+        * Simply align to page table boundaries. Note that we do NOT update the
+        * pmc->old_end value, and since the move_page_tables() operation spans
+        * from [old_addr, old_end) (offsetting new_addr as it is performed),
+        * this simply changes the start of the copy, not the end.
+        */
+       pmc->old_addr &= pagetable_mask;
+       pmc->new_addr &= pagetable_mask;
 }

-unsigned long move_page_tables(struct vm_area_struct *vma,
-               unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len,
-               bool need_rmap_locks, bool for_stack)
+unsigned long move_page_tables(struct pagetable_move_control *pmc)
 {
        unsigned long extent, old_end;
        struct mmu_notifier_range range;
        pmd_t *old_pmd, *new_pmd;
        pud_t *old_pud, *new_pud;
+       unsigned long old_addr, new_addr;
+       struct vm_area_struct *vma = pmc->old;

-       if (!len)
+       if (!pmc->len_in)
                return 0;

-       old_end = old_addr + len;
-
        if (is_vm_hugetlb_page(vma))
-               return move_hugetlb_page_tables(vma, new_vma, old_addr,
-                                               new_addr, len);
+               return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
+                                               pmc->new_addr, pmc->len_in);

+       old_end = pmc->old_end;
        /*
         * If possible, realign addresses to PMD boundary for faster copy.
         * Only realign if the mremap copying hits a PMD boundary.
         */
-       if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
-               try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
-                                for_stack);
+       try_realign_addr(pmc, PMD_MASK);
+       /* These may have been changed. */
+       old_addr = pmc->old_addr;
+       new_addr = pmc->new_addr;

        flush_cache_range(vma, old_addr, old_end);
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
@@ -675,12 +753,11 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
                        if (extent == HPAGE_PUD_SIZE) {
                                move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
-                                              old_pud, new_pud, need_rmap_locks);
+                                              old_pud, new_pud, pmc->need_rmap_locks);
                                /* We ignore and continue on error? */
                                continue;
                        }
                } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-
                        if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
                                           old_pud, new_pud, true))
                                continue;
@@ -698,7 +775,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                           pmd_devmap(*old_pmd)) {
                        if (extent == HPAGE_PMD_SIZE &&
                            move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
-                                          old_pmd, new_pmd, need_rmap_locks))
+                                          old_pmd, new_pmd, pmc->need_rmap_locks))
                                continue;
                        split_huge_pmd(vma, old_pmd, old_addr);
                } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
@@ -713,10 +790,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                }
                if (pmd_none(*old_pmd))
                        continue;
-               if (pte_alloc(new_vma->vm_mm, new_pmd))
+               if (pte_alloc(pmc->new->vm_mm, new_pmd))
                        break;
                if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                             new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+                             pmc->new, new_pmd, new_addr, pmc->need_rmap_locks) < 0)
                        goto again;
        }

@@ -726,10 +803,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
         * Prevent negative return values when {old,new}_addr was realigned
         * but we broke out of the above loop for the first PMD itself.
         */
-       if (old_addr < old_end - len)
+       if (old_addr < old_end - pmc->len_in)
                return 0;

-       return len + old_addr - old_end;        /* how much done */
+       return pmc->len_in + old_addr - old_end;        /* how much done */
 }

 /* Set vrm->delta to the difference in VMA size specified by user. */
@@ -1040,37 +1117,40 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
        unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
        unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
        unsigned long moved_len;
-       bool need_rmap_locks;
-       struct vm_area_struct *vma;
+       struct vm_area_struct *vma = vrm->vma;
        struct vm_area_struct *new_vma;
        int err = 0;
+       PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);

-       new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
-                          &need_rmap_locks);
+       new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
+                          &pmc.need_rmap_locks);
        if (!new_vma) {
                vrm_uncharge(vrm);
                *new_vma_ptr = NULL;
                return -ENOMEM;
        }
-       vma = vrm->vma;
+       vrm->vma = vma;
+       pmc.old = vma;
+       pmc.new = new_vma;

-       moved_len = move_page_tables(vma, vrm->addr, new_vma,
-                                    vrm->new_addr, vrm->old_len,
-                                    need_rmap_locks, /* for_stack= */false);
+       moved_len = move_page_tables(&pmc);
        if (moved_len < vrm->old_len)
                err = -ENOMEM;
        else if (vma->vm_ops && vma->vm_ops->mremap)
                err = vma->vm_ops->mremap(new_vma);

        if (unlikely(err)) {
+               PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
+                              vrm->addr, moved_len);
+
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
-               move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
-                                moved_len, /* need_rmap_locks = */true,
-                                /* for_stack= */false);
+               pmc_revert.need_rmap_locks = true;
+               move_page_tables(&pmc_revert);
+
                vrm->vma = new_vma;
                vrm->old_len = vrm->new_len;
                vrm->addr = vrm->new_addr;
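To make the alignment arithmetic in can_realign_addr() concrete, here is a standalone userspace sketch (illustrative only: it uses a hypothetical 2 MiB PMD span, made-up addresses, and EXAMPLE_* names rather than kernel code, but mirrors the mask computations above):

        #include <stdbool.h>
        #include <stdio.h>

        /* Hypothetical page table geometry for the example: a 2 MiB PMD span. */
        #define EXAMPLE_PMD_SIZE (2UL * 1024 * 1024)
        #define EXAMPLE_PMD_MASK (~(EXAMPLE_PMD_SIZE - 1))

        int main(void)
        {
                /* Made-up source range: starts 8 KiB below a 2 MiB boundary. */
                unsigned long old_addr = 0x1fe000;
                unsigned long len_in = 0x10000;         /* 64 KiB being moved */

                unsigned long pagetable_mask = EXAMPLE_PMD_MASK;
                unsigned long align_mask = ~pagetable_mask;      /* low bits only */
                unsigned long old_align = old_addr & align_mask; /* offset into span */
                unsigned long pagetable_size = align_mask + 1;   /* == EXAMPLE_PMD_SIZE */
                unsigned long old_align_next = pagetable_size - old_align;

                /* Realignment only pays off if the copy reaches the next boundary. */
                bool worthwhile = len_in >= old_align_next;

                printf("offset into span: %#lx, distance to boundary: %#lx, realign: %s\n",
                       old_align, old_align_next, worthwhile ? "yes" : "no");
                return 0;
        }

With these numbers the range reaches the boundary after 8 KiB, so the 64 KiB move qualifies and old_addr/new_addr would be aligned down to the 2 MiB boundary, provided the remaining can_align_down() checks also pass.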
