
Commit f5fd4a2

Author: Fox Snowpatch (committed)
1 parent f85c105 commit f5fd4a2

38 files changed: +679 -859 lines

Documentation/virt/kvm/locking.rst (+41 -39)

@@ -135,8 +135,8 @@ We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
 For direct sp, we can easily avoid it since the spte of direct sp is fixed
 to gfn. For indirect sp, we disabled fast page fault for simplicity.
 
-A solution for indirect sp could be to pin the gfn, for example via
-kvm_vcpu_gfn_to_pfn_atomic, before the cmpxchg. After the pinning:
+A solution for indirect sp could be to pin the gfn before the cmpxchg. After
+the pinning:
 
 - We have held the refcount of pfn; that means the pfn can not be freed and
   be reused for another gfn.
@@ -147,49 +147,51 @@ Then, we can ensure the dirty bitmaps is correctly set for a gfn.
 
 2) Dirty bit tracking
 
-In the origin code, the spte can be fast updated (non-atomically) if the
+In the original code, the spte can be fast updated (non-atomically) if the
 spte is read-only and the Accessed bit has already been set since the
 Accessed bit and Dirty bit can not be lost.
 
 But it is not true after fast page fault since the spte can be marked
 writable between reading spte and updating spte. Like below case:
 
-+------------------------------------------------------------------------+
-| At the beginning::                                                      |
-|                                                                         |
-|          spte.W = 0                                                     |
-|          spte.Accessed = 1                                              |
-+------------------------------------+-----------------------------------+
-| CPU 0:                             | CPU 1:                            |
-+------------------------------------+-----------------------------------+
-| In mmu_spte_clear_track_bits()::   |                                   |
-|                                    |                                   |
-|  old_spte = *spte;                 |                                   |
-|                                    |                                   |
-|                                    |                                   |
-|  /* 'if' condition is satisfied. */|                                   |
-|  if (old_spte.Accessed == 1 &&     |                                   |
-|      old_spte.W == 0)              |                                   |
-|    spte = 0ull;                    |                                   |
-+------------------------------------+-----------------------------------+
-|                                    | on fast page fault path::         |
-|                                    |                                   |
-|                                    |    spte.W = 1                     |
-|                                    |                                   |
-|                                    | memory write on the spte::        |
-|                                    |                                   |
-|                                    |    spte.Dirty = 1                 |
-+------------------------------------+-----------------------------------+
-| ::                                 |                                   |
-|                                    |                                   |
-|  else                              |                                   |
-|    old_spte = xchg(spte, 0ull)     |                                   |
-|    if (old_spte.Accessed == 1)     |                                   |
-|     kvm_set_pfn_accessed(spte.pfn);|                                   |
-|    if (old_spte.Dirty == 1)        |                                   |
-|     kvm_set_pfn_dirty(spte.pfn);   |                                   |
-|     OOPS!!!                        |                                   |
-+------------------------------------+-----------------------------------+
++-------------------------------------------------------------------------+
+| At the beginning::                                                       |
+|                                                                          |
+|          spte.W = 0                                                      |
+|          spte.Accessed = 1                                               |
++-------------------------------------+-----------------------------------+
+| CPU 0:                              | CPU 1:                            |
++-------------------------------------+-----------------------------------+
+| In mmu_spte_update()::              |                                   |
+|                                     |                                   |
+|  old_spte = *spte;                  |                                   |
+|                                     |                                   |
+|                                     |                                   |
+|  /* 'if' condition is satisfied. */ |                                   |
+|  if (old_spte.Accessed == 1 &&      |                                   |
+|      old_spte.W == 0)               |                                   |
+|    spte = new_spte;                 |                                   |
++-------------------------------------+-----------------------------------+
+|                                     | on fast page fault path::         |
+|                                     |                                   |
+|                                     |    spte.W = 1                     |
+|                                     |                                   |
+|                                     | memory write on the spte::        |
+|                                     |                                   |
+|                                     |    spte.Dirty = 1                 |
++-------------------------------------+-----------------------------------+
+| ::                                  |                                   |
+|                                     |                                   |
+|  else                               |                                   |
+|    old_spte = xchg(spte, new_spte); |                                   |
+|    if (old_spte.Accessed &&         |                                   |
+|        !new_spte.Accessed)          |                                   |
+|      flush = true;                  |                                   |
+|    if (old_spte.Dirty &&            |                                   |
+|        !new_spte.Dirty)             |                                   |
+|      flush = true;                  |                                   |
+|     OOPS!!!                         |                                   |
++-------------------------------------+-----------------------------------+
 
 The Dirty bit is lost in this case.

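The race in the updated table is easier to see outside the MMU code: if the old spte value is read and the new value written as two separate steps, a Dirty bit that CPU 1 sets in between is silently dropped, while an atomic exchange hands the concurrently-set bit back to the updater. A minimal user-space sketch (illustration only; the bit name and layout below are invented, and this is not KVM code):

    /*
     * Illustration only -- not KVM code.  A deterministic model of the race
     * in the table above: "CPU 1" sets a (made-up) Dirty bit between CPU 0's
     * read and write of the spte.  A plain store loses the bit; the xchg
     * returns it so it can still be propagated.
     */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SPTE_DIRTY (1ull << 2)              /* hypothetical bit layout */

    static _Atomic uint64_t spte;

    static void cpu1_marks_dirty(void)
    {
            atomic_fetch_or(&spte, SPTE_DIRTY);
    }

    /* Lossy variant: read-then-store; @cpu1 runs in the window in between. */
    static uint64_t update_nonatomic(uint64_t new_spte, void (*cpu1)(void))
    {
            uint64_t old = atomic_load(&spte);
            cpu1();                             /* CPU 1 sets Dirty here     */
            atomic_store(&spte, new_spte);      /* Dirty is silently dropped */
            return old;                         /* and not reported either   */
    }

    int main(void)
    {
            uint64_t old = update_nonatomic(0, cpu1_marks_dirty);
            printf("plain update saw Dirty=%d (bit lost)\n", !!(old & SPTE_DIRTY));

            cpu1_marks_dirty();
            old = atomic_exchange(&spte, 0);    /* the xchg from the table */
            printf("xchg saw Dirty=%d\n", !!(old & SPTE_DIRTY));
            return 0;
    }
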
arch/arm64/include/asm/kvm_pgtable.h (+1 -3)

@@ -674,10 +674,8 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
  *
  * If there is a valid, leaf page-table entry used to translate @addr, then
  * set the access flag in that entry.
- *
- * Return: The old page-table entry prior to setting the flag, 0 on failure.
  */
-kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
 
 /**
  * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access

arch/arm64/kvm/guest.c (+6 -9)

@@ -1051,20 +1051,18 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
         }
 
         while (length > 0) {
-                kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+                struct page *page = __gfn_to_page(kvm, gfn, write);
                 void *maddr;
                 unsigned long num_tags;
-                struct page *page;
 
-                if (is_error_noslot_pfn(pfn)) {
+                if (!page) {
                         ret = -EFAULT;
                         goto out;
                 }
 
-                page = pfn_to_online_page(pfn);
-                if (!page) {
+                if (!pfn_to_online_page(page_to_pfn(page))) {
                         /* Reject ZONE_DEVICE memory */
-                        kvm_release_pfn_clean(pfn);
+                        kvm_release_page_unused(page);
                         ret = -EFAULT;
                         goto out;
                 }
@@ -1078,7 +1076,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
                         /* No tags in memory, so write zeros */
                         num_tags = MTE_GRANULES_PER_PAGE -
                                 clear_user(tags, MTE_GRANULES_PER_PAGE);
-                        kvm_release_pfn_clean(pfn);
+                        kvm_release_page_clean(page);
                 } else {
                         /*
                          * Only locking to serialise with a concurrent
@@ -1093,8 +1091,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
                         if (num_tags != MTE_GRANULES_PER_PAGE)
                                 mte_clear_page_tags(maddr);
                         set_page_mte_tagged(page);
-
-                        kvm_release_pfn_dirty(pfn);
+                        kvm_release_page_dirty(page);
                 }
 
                 if (num_tags != MTE_GRANULES_PER_PAGE) {

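For reference, the acquire/release pairing the hunk above switches to, stripped of the MTE-specific work. This is a sketch only: the wrapper function and its name are invented here, while the __gfn_to_page() and kvm_release_page_*() calls and their arguments are the ones visible in the diff.

    /* Sketch, not kernel source: the refcounted-page pattern from the hunk. */
    static int touch_one_gfn(struct kvm *kvm, gfn_t gfn, bool write)
    {
            struct page *page = __gfn_to_page(kvm, gfn, write);

            if (!page)                              /* no slot / failed fault */
                    return -EFAULT;

            /* ... access the page contents (e.g. via page_address()) ... */

            if (write)
                    kvm_release_page_dirty(page);   /* drop ref, mark dirty */
            else
                    kvm_release_page_clean(page);   /* drop ref, leave clean */
            return 0;
    }

kvm_release_page_unused() covers the bail-out case where the contents were never touched, which is what the ZONE_DEVICE rejection path above now uses.
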
arch/arm64/kvm/hyp/pgtable.c (+2 -5)

@@ -1245,19 +1245,16 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
                                 NULL, NULL, 0);
 }
 
-kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
 {
-        kvm_pte_t pte = 0;
         int ret;
 
         ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
-                                NULL, NULL,
+                                NULL, NULL,
                                 KVM_PGTABLE_WALK_HANDLE_FAULT |
                                 KVM_PGTABLE_WALK_SHARED);
         if (!ret)
                 dsb(ishst);
-
-        return pte;
 }
 
 struct stage2_age_data {

arch/arm64/kvm/mmu.c (+8 -13)

@@ -1439,6 +1439,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         long vma_pagesize, fault_granule;
         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
         struct kvm_pgtable *pgt;
+        struct page *page;
 
         if (fault_is_perm)
                 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1560,7 +1561,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
         /*
          * Read mmu_invalidate_seq so that KVM can detect if the results of
-         * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+         * vma_lookup() or __kvm_faultin_pfn() become stale prior to
          * acquiring kvm->mmu_lock.
          *
          * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
@@ -1569,8 +1570,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         mmu_seq = vcpu->kvm->mmu_invalidate_seq;
         mmap_read_unlock(current->mm);
 
-        pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
-                                   write_fault, &writable, NULL);
+        pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+                                &writable, &page);
         if (pfn == KVM_PFN_ERR_HWPOISON) {
                 kvm_send_hwpoison_signal(hva, vma_shift);
                 return 0;
@@ -1583,7 +1584,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * If the page was identified as device early by looking at
          * the VMA flags, vma_pagesize is already representing the
          * largest quantity we can map. If instead it was mapped
-         * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+         * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
          * and must not be upgraded.
          *
          * In both cases, we don't let transparent_hugepage_adjust()
@@ -1692,33 +1693,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         }
 
 out_unlock:
+        kvm_release_faultin_page(kvm, page, !!ret, writable);
         read_unlock(&kvm->mmu_lock);
 
         /* Mark the page dirty only if the fault is handled successfully */
-        if (writable && !ret) {
-                kvm_set_pfn_dirty(pfn);
+        if (writable && !ret)
                 mark_page_dirty_in_slot(kvm, memslot, gfn);
-        }
 
-        kvm_release_pfn_clean(pfn);
         return ret != -EAGAIN ? ret : 0;
 }
 
 /* Resolve the access fault by making the page young again. */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
-        kvm_pte_t pte;
         struct kvm_s2_mmu *mmu;
 
         trace_kvm_access_fault(fault_ipa);
 
         read_lock(&vcpu->kvm->mmu_lock);
         mmu = vcpu->arch.hw_mmu;
-        pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+        kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
         read_unlock(&vcpu->kvm->mmu_lock);
-
-        if (kvm_pte_valid(pte))
-                kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
 }
 
 /**

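The same conversion is repeated below for LoongArch, so the common shape is worth spelling out once. The sketch follows the arm64 flavour of the hunks; the wrapper name is invented, the stage-2 mapping and the mmu_invalidate_seq retry check are elided, and only the __kvm_faultin_pfn()/kvm_release_faultin_page() calls and their arguments are taken from the diff.

    /* Sketch, not kernel source: reference lifetime around a stage-2 fault. */
    static int faultin_and_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
                               gfn_t gfn, bool write_fault)
    {
            struct page *page;
            bool writable;
            kvm_pfn_t pfn;
            int ret;

            /* Takes the page reference outside mmu_lock; may sleep / trigger CoW. */
            pfn = __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0,
                                    &writable, &page);
            if (is_error_noslot_pfn(pfn))
                    return -EFAULT;

            read_lock(&vcpu->kvm->mmu_lock);
            ret = 0;        /* ... install the stage-2 mapping for pfn here ... */

            /*
             * Drop the reference while mmu_lock is still held; the helper
             * treats the page as unused if the fault failed and as dirty if
             * it was mapped writable, replacing the kvm_set_pfn_dirty() and
             * kvm_release_pfn_clean() calls that used to run after unlock.
             */
            kvm_release_faultin_page(vcpu->kvm, page, ret != 0, writable);
            read_unlock(&vcpu->kvm->mmu_lock);

            if (!ret && writable)
                    mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
            return ret;
    }
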
arch/loongarch/kvm/mmu.c (+12 -28)

@@ -552,12 +552,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
 {
         int ret = 0;
-        kvm_pfn_t pfn = 0;
         kvm_pte_t *ptep, changed, new;
         gfn_t gfn = gpa >> PAGE_SHIFT;
         struct kvm *kvm = vcpu->kvm;
         struct kvm_memory_slot *slot;
-        struct page *page;
 
         spin_lock(&kvm->mmu_lock);
 
@@ -570,8 +568,6 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
 
         /* Track access to pages marked old */
         new = kvm_pte_mkyoung(*ptep);
-        /* call kvm_set_pfn_accessed() after unlock */
-
         if (write && !kvm_pte_dirty(new)) {
                 if (!kvm_pte_write(new)) {
                         ret = -EFAULT;
@@ -595,26 +591,14 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
         }
 
         changed = new ^ (*ptep);
-        if (changed) {
+        if (changed)
                 kvm_set_pte(ptep, new);
-                pfn = kvm_pte_pfn(new);
-                page = kvm_pfn_to_refcounted_page(pfn);
-                if (page)
-                        get_page(page);
-        }
+
         spin_unlock(&kvm->mmu_lock);
 
-        if (changed) {
-                if (kvm_pte_young(changed))
-                        kvm_set_pfn_accessed(pfn);
+        if (kvm_pte_dirty(changed))
+                mark_page_dirty(kvm, gfn);
 
-                if (kvm_pte_dirty(changed)) {
-                        mark_page_dirty(kvm, gfn);
-                        kvm_set_pfn_dirty(pfn);
-                }
-                if (page)
-                        put_page(page);
-        }
         return ret;
 out:
         spin_unlock(&kvm->mmu_lock);
@@ -796,6 +780,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
         struct kvm *kvm = vcpu->kvm;
         struct kvm_memory_slot *memslot;
         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+        struct page *page;
 
         /* Try the fast path to handle old / clean pages */
         srcu_idx = srcu_read_lock(&kvm->srcu);
@@ -823,7 +808,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
         mmu_seq = kvm->mmu_invalidate_seq;
         /*
          * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
-         * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
+         * kvm_faultin_pfn() (which calls get_user_pages()), so that we don't
          * risk the page we get a reference to getting unmapped before we have a
          * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
          *
@@ -835,7 +820,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
         smp_rmb();
 
         /* Slow path - ask KVM core whether we can access this GPA */
-        pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
+        pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page);
         if (is_error_noslot_pfn(pfn)) {
                 err = -EFAULT;
                 goto out;
@@ -847,10 +832,10 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
                 /*
                  * This can happen when mappings are changed asynchronously, but
                  * also synchronously if a COW is triggered by
-                 * gfn_to_pfn_prot().
+                 * kvm_faultin_pfn().
                  */
                 spin_unlock(&kvm->mmu_lock);
-                kvm_release_pfn_clean(pfn);
+                kvm_release_page_unused(page);
                 if (retry_no > 100) {
                         retry_no = 0;
                         schedule();
@@ -915,14 +900,13 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
         else
                 ++kvm->stat.pages;
         kvm_set_pte(ptep, new_pte);
+
+        kvm_release_faultin_page(kvm, page, false, writeable);
         spin_unlock(&kvm->mmu_lock);
 
-        if (prot_bits & _PAGE_DIRTY) {
+        if (prot_bits & _PAGE_DIRTY)
                 mark_page_dirty_in_slot(kvm, memslot, gfn);
-                kvm_set_pfn_dirty(pfn);
-        }
 
-        kvm_release_pfn_clean(pfn);
 out:
         srcu_read_unlock(&kvm->srcu, srcu_idx);
         return err;
