Skip to content

Commit 954a209

Browse files
committed
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "ARM: - Correctly clean the BSS to the PoC before allowing EL2 to access it on nVHE/hVHE/protected configurations - Propagate ownership of debug registers in protected mode after the rework that landed in 6.14-rc1 - Stop pretending that we can run the protected mode without a GICv3 being present on the host - Fix a use-after-free situation that can occur if a vcpu fails to initialise the NV shadow S2 MMU contexts - Always evaluate the need to arm a background timer for fully emulated guest timers - Fix the emulation of EL1 timers in the absence of FEAT_ECV - Correctly handle the EL2 virtual timer, specially when HCR_EL2.E2H==0 s390: - move some of the guest page table (gmap) logic into KVM itself, inching towards the final goal of completely removing gmap from the non-kvm memory management code. As an initial set of cleanups, move some code from mm/gmap into kvm and start using __kvm_faultin_pfn() to fault-in pages as needed; but especially stop abusing page->index and page->lru to aid in the ptdesc conversion. 
x86: - Add missing check in the fix to defer starting the huge page recovery vhost_task - SRSO_USER_KERNEL_NO does not need SYNTHESIZED_F" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (31 commits) KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking KVM: remove kvm_arch_post_init_vm KVM: selftests: Fix spelling mistake "initally" -> "initially" kvm: x86: SRSO_USER_KERNEL_NO is not synthesized KVM: arm64: timer: Don't adjust the EL2 virtual timer offset KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV KVM: arm64: timer: Always evaluate the need for a soft timer KVM: arm64: Fix nested S2 MMU structures reallocation KVM: arm64: Fail protected mode init if no vgic hardware is present KVM: arm64: Flush/sync debug state in protected mode KVM: s390: selftests: Streamline uc_skey test to issue iske after sske KVM: s390: remove the last user of page->index KVM: s390: move PGSTE softbits KVM: s390: remove useless page->index usage KVM: s390: move gmap_shadow_pgt_lookup() into kvm KVM: s390: stop using lists to keep track of used dat tables KVM: s390: stop using page->index for non-shadow gmaps KVM: s390: move some gmap shadowing functions away from mm/gmap.c KVM: s390: get rid of gmap_translate() KVM: s390: get rid of gmap_fault() ...
2 parents 9946eaf + 43fb96a commit 954a209

File tree

31 files changed

+1093
-1007
lines changed

31 files changed

+1093
-1007
lines changed

Documentation/virt/kvm/api.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1419,7 +1419,7 @@ fetch) is injected in the guest.
14191419
S390:
14201420
^^^^^
14211421

1422-
Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
1422+
Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set.
14231423
Returns -EINVAL if called on a protected VM.
14241424

14251425
4.36 KVM_SET_TSS_ADDR

arch/arm64/kvm/arch_timer.c

+11-38
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx)
471471

472472
trace_kvm_timer_emulate(ctx, should_fire);
473473

474-
if (should_fire != ctx->irq.level) {
474+
if (should_fire != ctx->irq.level)
475475
kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
476-
return;
477-
}
478476

479477
kvm_timer_update_status(ctx, should_fire);
480478

@@ -761,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
761759
timer_irq(map->direct_ptimer),
762760
&arch_timer_irq_ops);
763761
WARN_ON_ONCE(ret);
764-
765-
/*
766-
* The virtual offset behaviour is "interesting", as it
767-
* always applies when HCR_EL2.E2H==0, but only when
768-
* accessed from EL1 when HCR_EL2.E2H==1. So make sure we
769-
* track E2H when putting the HV timer in "direct" mode.
770-
*/
771-
if (map->direct_vtimer == vcpu_hvtimer(vcpu)) {
772-
struct arch_timer_offset *offs = &map->direct_vtimer->offset;
773-
774-
if (vcpu_el2_e2h_is_set(vcpu))
775-
offs->vcpu_offset = NULL;
776-
else
777-
offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
778-
}
779762
}
780763
}
781764

@@ -976,31 +959,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
976959
* which allows trapping of the timer registers even with NV2.
977960
* Still, this is still worse than FEAT_NV on its own. Meh.
978961
*/
979-
if (!vcpu_el2_e2h_is_set(vcpu)) {
980-
if (cpus_have_final_cap(ARM64_HAS_ECV))
981-
return;
982-
983-
/*
984-
* A non-VHE guest hypervisor doesn't have any direct access
985-
* to its timers: the EL2 registers trap (and the HW is
986-
* fully emulated), while the EL0 registers access memory
987-
* despite the access being notionally direct. Boo.
988-
*
989-
* We update the hardware timer registers with the
990-
* latest value written by the guest to the VNCR page
991-
* and let the hardware take care of the rest.
992-
*/
993-
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL);
994-
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL);
995-
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL);
996-
write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL);
997-
} else {
962+
if (!cpus_have_final_cap(ARM64_HAS_ECV)) {
998963
/*
999964
* For a VHE guest hypervisor, the EL2 state is directly
1000-
* stored in the host EL1 timers, while the emulated EL0
965+
* stored in the host EL1 timers, while the emulated EL1
1001966
* state is stored in the VNCR page. The latter could have
1002967
* been updated behind our back, and we must reset the
1003968
* emulation of the timers.
969+
*
970+
* A non-VHE guest hypervisor doesn't have any direct access
971+
* to its timers: the EL2 registers trap despite being
972+
* notionally direct (we use the EL1 HW, as for VHE), while
973+
* the EL1 registers access memory.
974+
*
975+
* In both cases, process the emulated timers on each guest
976+
* exit. Boo.
1004977
*/
1005978
struct timer_map map;
1006979
get_timer_map(vcpu, &map);

arch/arm64/kvm/arm.c

+20
Original file line numberDiff line numberDiff line change
@@ -2290,6 +2290,19 @@ static int __init init_subsystems(void)
22902290
break;
22912291
case -ENODEV:
22922292
case -ENXIO:
2293+
/*
2294+
* No VGIC? No pKVM for you.
2295+
*
2296+
* Protected mode assumes that VGICv3 is present, so no point
2297+
* in trying to hobble along if vgic initialization fails.
2298+
*/
2299+
if (is_protected_kvm_enabled())
2300+
goto out;
2301+
2302+
/*
2303+
* Otherwise, userspace could choose to implement a GIC for its
2304+
* guest on non-cooperative hardware.
2305+
*/
22932306
vgic_present = false;
22942307
err = 0;
22952308
break;
@@ -2400,6 +2413,13 @@ static void kvm_hyp_init_symbols(void)
24002413
kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
24012414
kvm_nvhe_sym(__icache_flags) = __icache_flags;
24022415
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2416+
2417+
/*
2418+
* Flush entire BSS since part of its data containing init symbols is read
2419+
* while the MMU is off.
2420+
*/
2421+
kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
2422+
kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
24032423
}
24042424

24052425
static int __init kvm_hyp_init_protection(u32 hyp_va_bits)

arch/arm64/kvm/hyp/nvhe/hyp-main.c

+24
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
9191
*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
9292
}
9393

94+
static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
95+
{
96+
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
97+
98+
hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner;
99+
100+
if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
101+
hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state;
102+
else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
103+
hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state;
104+
}
105+
106+
static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu)
107+
{
108+
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
109+
110+
if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu))
111+
host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state;
112+
else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu))
113+
host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state;
114+
}
115+
94116
static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
95117
{
96118
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
97119

98120
fpsimd_sve_flush();
121+
flush_debug_state(hyp_vcpu);
99122

100123
hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
101124

@@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
123146
unsigned int i;
124147

125148
fpsimd_sve_sync(&hyp_vcpu->vcpu);
149+
sync_debug_state(hyp_vcpu);
126150

127151
host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
128152

arch/arm64/kvm/nested.c

+5-4
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
6767
if (!tmp)
6868
return -ENOMEM;
6969

70+
swap(kvm->arch.nested_mmus, tmp);
71+
7072
/*
7173
* If we went through a reallocation, adjust the MMU back-pointers in
7274
* the previously initialised kvm_pgtable structures.
7375
*/
7476
if (kvm->arch.nested_mmus != tmp)
7577
for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
76-
tmp[i].pgt->mmu = &tmp[i];
78+
kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
7779

7880
for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
79-
ret = init_nested_s2_mmu(kvm, &tmp[i]);
81+
ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
8082

8183
if (ret) {
8284
for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
83-
kvm_free_stage2_pgd(&tmp[i]);
85+
kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
8486

8587
return ret;
8688
}
8789

8890
kvm->arch.nested_mmus_size = num_mmus;
89-
kvm->arch.nested_mmus = tmp;
9091

9192
return 0;
9293
}

arch/arm64/kvm/sys_regs.c

+13-3
Original file line numberDiff line numberDiff line change
@@ -1452,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
14521452
return true;
14531453
}
14541454

1455+
static bool access_hv_timer(struct kvm_vcpu *vcpu,
1456+
struct sys_reg_params *p,
1457+
const struct sys_reg_desc *r)
1458+
{
1459+
if (!vcpu_el2_e2h_is_set(vcpu))
1460+
return undef_access(vcpu, p, r);
1461+
1462+
return access_arch_timer(vcpu, p, r);
1463+
}
1464+
14551465
static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
14561466
s64 new, s64 cur)
14571467
{
@@ -3103,9 +3113,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
31033113
EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0),
31043114
EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0),
31053115

3106-
{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer },
3107-
EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0),
3108-
EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0),
3116+
{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer },
3117+
EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0),
3118+
EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0),
31093119

31103120
{ SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
31113121

arch/s390/include/asm/gmap.h

+6-14
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
/**
2424
* struct gmap_struct - guest address space
2525
* @list: list head for the mm->context gmap list
26-
* @crst_list: list of all crst tables used in the guest address space
2726
* @mm: pointer to the parent mm_struct
2827
* @guest_to_host: radix tree with guest to host address translation
2928
* @host_to_guest: radix tree with pointer to segment table entries
@@ -35,7 +34,6 @@
3534
* @guest_handle: protected virtual machine handle for the ultravisor
3635
* @host_to_rmap: radix tree with gmap_rmap lists
3736
* @children: list of shadow gmap structures
38-
* @pt_list: list of all page tables used in the shadow guest address space
3937
* @shadow_lock: spinlock to protect the shadow gmap list
4038
* @parent: pointer to the parent gmap for shadow guest address spaces
4139
* @orig_asce: ASCE for which the shadow page table has been created
@@ -45,7 +43,6 @@
4543
*/
4644
struct gmap {
4745
struct list_head list;
48-
struct list_head crst_list;
4946
struct mm_struct *mm;
5047
struct radix_tree_root guest_to_host;
5148
struct radix_tree_root host_to_guest;
@@ -61,7 +58,6 @@ struct gmap {
6158
/* Additional data for shadow guest address spaces */
6259
struct radix_tree_root host_to_rmap;
6360
struct list_head children;
64-
struct list_head pt_list;
6561
spinlock_t shadow_lock;
6662
struct gmap *parent;
6763
unsigned long orig_asce;
@@ -106,23 +102,21 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
106102
void gmap_remove(struct gmap *gmap);
107103
struct gmap *gmap_get(struct gmap *gmap);
108104
void gmap_put(struct gmap *gmap);
105+
void gmap_free(struct gmap *gmap);
106+
struct gmap *gmap_alloc(unsigned long limit);
109107

110108
int gmap_map_segment(struct gmap *gmap, unsigned long from,
111109
unsigned long to, unsigned long len);
112110
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
113111
unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
114-
unsigned long gmap_translate(struct gmap *, unsigned long gaddr);
115112
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
116-
int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
117113
void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
118114
void __gmap_zap(struct gmap *, unsigned long gaddr);
119115
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
120116

121117
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
122118

123-
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
124-
int edat_level);
125-
int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
119+
void gmap_unshadow(struct gmap *sg);
126120
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
127121
int fake);
128122
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
@@ -131,24 +125,22 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
131125
int fake);
132126
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
133127
int fake);
134-
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
135-
unsigned long *pgt, int *dat_protection, int *fake);
136128
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
137129

138130
void gmap_register_pte_notifier(struct gmap_notifier *);
139131
void gmap_unregister_pte_notifier(struct gmap_notifier *);
140132

141-
int gmap_mprotect_notify(struct gmap *, unsigned long start,
142-
unsigned long len, int prot);
133+
int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits);
143134

144135
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
145136
unsigned long gaddr, unsigned long vmaddr);
146137
int s390_disable_cow_sharing(void);
147-
void s390_unlist_old_asce(struct gmap *gmap);
148138
int s390_replace_asce(struct gmap *gmap);
149139
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
150140
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
151141
unsigned long end, bool interruptible);
142+
int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
143+
unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
152144

153145
/**
154146
* s390_uv_destroy_range - Destroy a range of pages in the given mm.

arch/s390/include/asm/kvm_host.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#define KVM_S390_ESCA_CPU_SLOTS 248
3131
#define KVM_MAX_VCPUS 255
3232

33+
#define KVM_INTERNAL_MEM_SLOTS 1
34+
3335
/*
3436
* These seem to be used for allocating ->chip in the routing table, which we
3537
* don't use. 1 is as small as we can get to reduce the needed memory. If we
@@ -931,12 +933,14 @@ struct sie_page2 {
931933
u8 reserved928[0x1000 - 0x928]; /* 0x0928 */
932934
};
933935

936+
struct vsie_page;
937+
934938
struct kvm_s390_vsie {
935939
struct mutex mutex;
936940
struct radix_tree_root addr_to_page;
937941
int page_count;
938942
int next;
939-
struct page *pages[KVM_MAX_VCPUS];
943+
struct vsie_page *pages[KVM_MAX_VCPUS];
940944
};
941945

942946
struct kvm_s390_gisa_iam {

arch/s390/include/asm/pgtable.h

+18-3
Original file line numberDiff line numberDiff line change
@@ -420,9 +420,10 @@ void setup_protection_map(void);
420420
#define PGSTE_HC_BIT 0x0020000000000000UL
421421
#define PGSTE_GR_BIT 0x0004000000000000UL
422422
#define PGSTE_GC_BIT 0x0002000000000000UL
423-
#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */
424-
#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */
425-
#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */
423+
#define PGSTE_ST2_MASK 0x0000ffff00000000UL
424+
#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */
425+
#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */
426+
#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */
426427

427428
/* Guest Page State used for virtualization */
428429
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
@@ -2007,4 +2008,18 @@ extern void s390_reset_cmma(struct mm_struct *mm);
20072008
#define pmd_pgtable(pmd) \
20082009
((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
20092010

2011+
static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
2012+
{
2013+
unsigned long *pgstes, res;
2014+
2015+
pgstes = pgt + _PAGE_ENTRIES;
2016+
2017+
res = (pgstes[0] & PGSTE_ST2_MASK) << 16;
2018+
res |= pgstes[1] & PGSTE_ST2_MASK;
2019+
res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;
2020+
res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;
2021+
2022+
return res;
2023+
}
2024+
20102025
#endif /* _S390_PAGE_H */

arch/s390/include/asm/uv.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -628,12 +628,12 @@ static inline int is_prot_virt_host(void)
628628
}
629629

630630
int uv_pin_shared(unsigned long paddr);
631-
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
632-
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
633631
int uv_destroy_folio(struct folio *folio);
634632
int uv_destroy_pte(pte_t pte);
635633
int uv_convert_from_secure_pte(pte_t pte);
636-
int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
634+
int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
635+
int uv_convert_from_secure(unsigned long paddr);
636+
int uv_convert_from_secure_folio(struct folio *folio);
637637

638638
void setup_uv(void);
639639

0 commit comments

Comments
 (0)