diff --git a/buildroot-external/board/raspberrypi/patches/linux/0005-amdgpu.patch b/buildroot-external/board/raspberrypi/patches/linux/0005-amdgpu.patch new file mode 100644 index 00000000000..79a39c4719c --- /dev/null +++ b/buildroot-external/board/raspberrypi/patches/linux/0005-amdgpu.patch @@ -0,0 +1,5556 @@ +From 02eb7e12d6f41821af02d985a9457e282a1e200e Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Tue, 7 Nov 2023 21:30:44 +0100 +Subject: [PATCH 01/18] memory access fixes/workarounds for the pi5 + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 6 +++--- + drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 4 ++-- + drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 20 +++++++++++--------- + 7 files changed, 20 insertions(+), 18 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 24d007715a14ae..51981cead0dca9 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -1466,7 +1466,7 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev) + memset(&adev->wb.used, 0, sizeof(adev->wb.used)); + + /* clear wb memory */ +- memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); ++ memset_io((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); + } + + return 0; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +index 05ebb8216a55a5..d3f6fbddf91935 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +@@ -370,7 +370,7 @@ int amdgpu_gfx_kiq_init(struct amdgpu_device *adev, + return r; + } + +- memset(hpd, 0, hpd_size); ++ memset_io(hpd, 0, hpd_size); + + r = amdgpu_bo_reserve(kiq->eop_obj, true); + if (unlikely(r != 0)) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c +index 10df731998b22f..2627963b2c0d4a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c +@@ -58,7 +58,7 @@ int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev, + return r; + } + +- memset(sa_manager->cpu_ptr, 0, size); ++ memset_io(sa_manager->cpu_ptr, 0, size); + drm_suballoc_manager_init(&sa_manager->base, size, suballoc_align); + return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +index 1c8ac4cf08c5ac..85b9649da1ab6d 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +@@ -1120,7 +1120,7 @@ static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo, + if (abo->flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC) + caching = ttm_write_combined; + else +- caching = ttm_cached; ++ caching = ttm_uncached; + + /* allocate space for the uninitialized page entries */ + if (ttm_sg_tt_init(>t->ttm, bo, page_flags, caching)) { +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +index 4c7b53648a507a..d59dd73e21a098 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +@@ -1074,7 +1074,7 @@ static int amdgpu_ucode_init_single_fw(struct amdgpu_device *adev, + le32_to_cpu(header->ucode_array_offset_bytes); + } + +- memcpy(ucode->kaddr, ucode_addr, ucode->ucode_size); ++ memcpy_toio(ucode->kaddr, ucode_addr, 
ucode->ucode_size); + + return 0; + } +@@ -1098,7 +1098,7 @@ static int amdgpu_ucode_patch_jt(struct amdgpu_firmware_info *ucode, + src_addr = (uint8_t *)ucode->fw->data + + le32_to_cpu(comm_hdr->ucode_array_offset_bytes) + + (le32_to_cpu(header->jt_offset) * 4); +- memcpy(dst_addr, src_addr, le32_to_cpu(header->jt_size) * 4); ++ memcpy_toio(dst_addr, src_addr, le32_to_cpu(header->jt_size) * 4); + + return 0; + } +@@ -1117,7 +1117,7 @@ int amdgpu_ucode_create_bo(struct amdgpu_device *adev) + dev_err(adev->dev, "failed to create kernel buffer for firmware.fw_buf\n"); + return -ENOMEM; + } else if (amdgpu_sriov_vf(adev)) { +- memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size); ++ memset_io(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size); + } + } + return 0; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +index 65bb26215e867a..248d821bfdb137 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +@@ -1202,7 +1202,7 @@ int amdgpu_uvd_get_create_msg(struct amdgpu_ring *ring, uint32_t handle, + { + struct amdgpu_device *adev = ring->adev; + struct amdgpu_bo *bo = adev->uvd.ib_bo; +- uint32_t *msg; ++ volatile uint32_t *msg; + int i; + + msg = amdgpu_bo_kptr(bo); +@@ -1230,7 +1230,7 @@ int amdgpu_uvd_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle, + { + struct amdgpu_device *adev = ring->adev; + struct amdgpu_bo *bo = NULL; +- uint32_t *msg; ++ volatile uint32_t *msg; + int r, i; + + if (direct) { +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +index 9d741695ca07d6..721391dc959756 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +@@ -1319,7 +1319,7 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev) + return r; + } + +- memset(hpd, 0, mec_hpd_size); ++ memset_io(hpd, 0, mec_hpd_size); + + amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); + amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); +@@ -4391,7 +4391,7 @@ static int gfx_v8_0_deactivate_hqd(struct amdgpu_device *adev, u32 req) + return r; + } + +-static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, struct vi_mqd *mqd) ++static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, volatile struct vi_mqd *mqd) + { + struct amdgpu_device *adev = ring->adev; + +@@ -4407,7 +4407,7 @@ static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, struct vi_mqd *m + static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring) + { + struct amdgpu_device *adev = ring->adev; +- struct vi_mqd *mqd = ring->mqd_ptr; ++ volatile struct vi_mqd *mqd = ring->mqd_ptr; + uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; + uint32_t tmp; + +@@ -4418,11 +4418,13 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring) + mqd->compute_static_thread_mgmt_se2 = 0xffffffff; + mqd->compute_static_thread_mgmt_se3 = 0xffffffff; + mqd->compute_misc_reserved = 0x00000003; ++ + mqd->dynamic_cu_mask_addr_lo = lower_32_bits(ring->mqd_gpu_addr + + offsetof(struct vi_mqd_allocation, dynamic_cu_mask)); + mqd->dynamic_cu_mask_addr_hi = upper_32_bits(ring->mqd_gpu_addr + + offsetof(struct vi_mqd_allocation, dynamic_cu_mask)); + eop_base_addr = ring->eop_gpu_addr >> 8; ++ + mqd->cp_hqd_eop_base_addr_lo = eop_base_addr; + mqd->cp_hqd_eop_base_addr_hi = upper_32_bits(eop_base_addr); + +@@ -4598,7 +4600,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) + if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ + /* reset MQD to a clean status */ + 
if (adev->gfx.kiq[0].mqd_backup) +- memcpy(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct vi_mqd_allocation)); ++ memcpy_toio(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct vi_mqd_allocation)); + + /* reset ring buffer */ + ring->wptr = 0; +@@ -4609,7 +4611,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) + vi_srbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + } else { +- memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); ++ memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); + ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; + ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; + if (amdgpu_sriov_vf(adev) && adev->in_suspend) +@@ -4622,7 +4624,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) + mutex_unlock(&adev->srbm_mutex); + + if (adev->gfx.kiq[0].mqd_backup) +- memcpy(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct vi_mqd_allocation)); ++ memcpy_fromio(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct vi_mqd_allocation)); + } + + return 0; +@@ -4635,7 +4637,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) + int mqd_idx = ring - &adev->gfx.compute_ring[0]; + + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { +- memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); ++ memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); + ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; + ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; + mutex_lock(&adev->srbm_mutex); +@@ -4645,11 +4647,11 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) + mutex_unlock(&adev->srbm_mutex); + + if (adev->gfx.mec.mqd_backup[mqd_idx]) +- memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); ++ memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); + } else { + /* restore MQD to a clean status */ + if (adev->gfx.mec.mqd_backup[mqd_idx]) +- memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); ++ memcpy_toio(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); + /* reset ring buffer */ + ring->wptr = 0; + amdgpu_ring_clear_ring(ring); + +From 6a74b71563eee334c09e1a3b493632c2c2db1c39 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Thu, 23 Nov 2023 16:38:59 +0100 +Subject: [PATCH 02/18] some alignment trapping, still wip + +--- + arch/arm64/include/asm/exception.h | 1 + + arch/arm64/kernel/compat_alignment.c | 313 ++++++++++++++++++++++++++- + arch/arm64/mm/fault.c | 9 + + 3 files changed, 322 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h +index f296662590c7f8..10249b8be1b16c 100644 +--- a/arch/arm64/include/asm/exception.h ++++ b/arch/arm64/include/asm/exception.h +@@ -68,6 +68,7 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) + void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr); + void do_el0_cp15(unsigned long esr, struct pt_regs *regs); + int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs); ++int do_alignment_fixup(unsigned long addr, struct pt_regs *regs); + void do_el0_svc(struct pt_regs *regs); + void do_el0_svc_compat(struct pt_regs *regs); + void do_el0_fpac(struct pt_regs *regs, unsigned long esr); +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index b68e1d328d4cb9..9b921cc50b340e 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ 
b/arch/arm64/kernel/compat_alignment.c +@@ -318,7 +318,7 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + int thumb2_32b = 0; + + instrptr = instruction_pointer(regs); +- ++ printk("Alignment fixup\n"); + if (compat_thumb_mode(regs)) { + __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); + u16 tinstr, tinst2; +@@ -383,3 +383,314 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + + return 0; + } ++ ++// arm64# ++ ++/* ++ *Happens with The Long Dark ++ * ++ *[ 6012.660803] Faulting instruction: 0x3d800020 ++[ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 ++ */ ++ ++struct fixupDescription{ ++ void* addr; ++ // ++ u64 data1; ++ u64 data1_simd; ++ u64 data2; ++ u64 data2_simd; ++ ++ int Rs; // used for atomics (which don't get handled atomically) ++ ++ int simd; // wether or not this is a vector instruction ++ int load; // 1 is it's a load, 0 if it's a store ++ int pair; // 1 if it's a l/s pair instruction ++ int width; // width of the access in bits ++}; ++ ++static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) ++{ ++ __le32 instr = 0; ++ int fault; ++ ++ fault = get_user(instr, ip); ++ if (fault) ++ return fault; ++ ++ *inst = __le32_to_cpu(instr); ++ return 0; ++} ++ ++/*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ ++ uint8_t load = (instr >> 22) & 1; ++ uint8_t simd = (instr >> 26) & 1; ++ uint16_t imm7 = (instr >> 15) & 0x7f; ++ uint8_t Rt2 = (instr >> 10) & 0x1f; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ ++ int16_t imm = 0xffff & imm7; ++ printk("Variant: 0x%x Load: %x SIMD: %x IMM: 0x%x Rt: 0x%x Rt2: 0x%x Rn: 0x%x\n", ((instr >> 30) & 3),load, simd, imm, Rt, Rt2, Rn); ++ if(((instr >> 30) & 3) == 2){ ++ // 64bit ++ if(!load){ ++ if(!simd){ ++ // 64bit store ++ u64 val1, val2; ++ val1 = regs->regs[Rt]; ++ val2 = regs->regs[Rt2]; ++ u64 addr = regs->regs[Rn] + imm; ++ printk("STP 64bit storing 0x%llx 0x%llx at 0x%llx\n", val1, val2, addr); ++ // for the first reg. Byte by byte to avoid any alignment issues ++ for(int i = 0; i < 8; i++){ ++ uint8_t v = (val1 >> (i*8)) & 0xff; ++ put_user(v, (uint8_t __user *)addr); ++ addr++; ++ } ++ // second reg ++ for(int i = 0; i < 8; i++){ ++ uint8_t v = (val2 >> (i*8)) & 0xff; ++ put_user(v, (uint8_t __user *)addr); ++ addr++; ++ } ++ arm64_skip_faulting_instruction(regs, 4); ++ } ++ } ++ } ++ return 0; ++}*/ ++ ++int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ int r; ++ if(!desc->load){ ++ uint8_t* addr = desc->addr; ++ int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that ++ printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); ++ for(int i = 0; i < bcount; i++){ ++ if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) ++ return r; ++ desc->data1 >>= 8; ++ addr++; ++ } ++ ++ if(desc->pair){ ++ for(int i = 0; i < bcount; i++){ ++ if((r=put_user(desc->data2 & 0xff, (uint8_t __user *)addr))) ++ return r; ++ desc->data2 >>= 8; ++ addr++; ++ } ++ } ++ arm64_skip_faulting_instruction(regs, 4); ++ } else { ++ printk("Loading is currently not implemented (addr 0x%llx)\n", desc->addr); ++ return -1; ++ } ++ return 0; ++} ++ ++int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t size = (instr >> 30) & 3; ++ uint8_t load = (instr >> 22) & 1; // acquire semantics, has no effect here, since it's not atomic anymore ++ uint8_t Rs = (instr >> 16) & 0x1f; ++ uint8_t Rt2 = (instr >> 10) & 0x1f; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ ++ uint8_t o0 = (instr >> 15) & 1; // L, release semantics, has no effect here, since it's not atomic anymore ++ ++ if(Rt2 != 0x1f){ ++ return -1; ++ } ++ ++ switch(size){ ++ case 0: ++ desc->width = 8; ++ break; ++ case 1: ++ desc->width = 16; ++ break; ++ case 2: ++ desc->width = 32; ++ break; ++ case 3: ++ desc->width = 64; ++ break; ++ } ++ ++ desc->addr = (void*)regs->regs[Rn]; ++ desc->data1 = regs->regs[Rt]; ++ ++ // nearly everything from here on could be moved into another function if needed ++ u64 cmpmask = (1 << desc->width) - 1; ++ u64 cmpval = regs->regs[Rs] & cmpmask; ++ ++ u64 readval = 0; ++ int bcount = desc->width / 8; ++ u64 addr = desc->addr; ++ int r; ++ uint8_t tmp; ++ ++ printk("Atomic CAS not being done atomically at 0x%llx, size %d\n",desc->addr, desc->width); ++ ++ for(int i = 0; i < bcount; i++){ ++ if((r=get_user(tmp, (uint8_t __user *)addr))) ++ return r; ++ readval |= tmp; ++ readval <<= 8; // maybe this could be read directly into regs->regs[Rs] ++ addr++; ++ } ++ ++ if((readval & cmpmask) == cmpval){ ++ // swap ++ addr = (u64)desc->addr; ++ ++ for(int i = 0; i < bcount; i++){ ++ if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) ++ return r; ++ desc->data1 >>= 8; ++ addr++; ++ } ++ ++ regs->regs[Rs] = readval; ++ } ++ ++ arm64_skip_faulting_instruction(regs, 4); ++ ++ return 0; ++} ++ ++int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t op2; ++ uint8_t opc; ++ op2 = (instr >> 23) & 3; ++ opc = (instr >> 30) & 3; ++ ++ uint8_t load = (instr >> 22) & 1; ++ uint8_t simd = (instr >> 26) & 1; ++ uint16_t imm7 = (instr >> 15) & 0x7f; ++ uint8_t Rt2 = (instr >> 10) & 0x1f; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ ++ int16_t imm = 0xffff & imm7; ++ ++ desc->load = load; ++ desc->simd = simd; ++ ++ // opc controls the width ++ switch(opc){ ++ case 0: ++ desc->width = 32; ++ imm <<= 2; ++ break; ++ case 2: ++ desc->width = 64; ++ imm <<= 3; ++ break; ++ default: ++ return -1; ++ } ++ ++ // op2 controls the indexing ++ switch(op2){ ++ case 2: ++ // offset ++ desc->addr = (void*)(regs->regs[Rn] + imm); ++ break; ++ default: ++ return -1; ++ } ++ desc->data1 = regs->regs[Rt]; ++ desc->data2 = regs->regs[Rt2]; ++ ++ return do_ls_fixup(instr, regs, desc); ++ ++} ++ ++int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t size = (instr >> 30) & 3; ++ uint8_t simd = (instr >> 26) & 1; ++ uint8_t opc = (instr >> 22) & 3; ++ ++ switch(size){ ++ case 0: ++ 
desc->width = 8; ++ break; ++ case 1: ++ desc->width = 16; ++ break; ++ case 2: ++ desc->width = 32; ++ break; ++ case 3: ++ desc->width = 64; ++ break; ++ } ++ return 0; ++} ++ ++int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t op0; ++ uint8_t op1; ++ uint8_t op2; ++ uint8_t op3; ++ uint8_t op4; ++ ++ op0 = (instr >> 28) & 0xf; ++ op1 = (instr >> 26) & 1; ++ op2 = (instr >> 23) & 3; ++ op3 = (instr >> 16) & 0x3f; ++ op4 = (instr >> 10) & 3; ++ printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); ++ if((op0 & 3) == 2){ ++ desc->pair = 1; ++ return ls_pair_fixup(instr, regs, desc); ++ } ++ if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){ ++ // compare and swap ++ return ls_cas_fixup(instr, regs, desc); ++ } ++ if((op0 & 3) == 3 && (op2 & 3) == 3){ ++ //load/store unsigned immediate ++ desc->pair = 0; ++ ++ } ++ if((op0 & 3) == 2 && (op2 == 2)){ ++ // Load/store pair offset ++ //ldpstp_offset_fixup(instr, regs); ++ return ls_reg_unsigned_imm(instr, regs, desc); ++ } ++ return 0; ++} ++ ++int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ ++ unsigned long long instrptr; ++ u32 instr = 0; ++ ++ instrptr = instruction_pointer(regs); ++ printk("Alignment fixup\n"); ++ ++ if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){ ++ printk("Failed to get aarch64 instruction\n"); ++ return 1; ++ } ++ printk("Faulting instruction: 0x%lx\n", instr); ++ /** ++ * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0] ++ * ++ */ ++ ++ uint8_t op0; ++ struct fixupDescription desc = {0}; ++ ++ op0 = ((instr & 0x1E000000) >> 25); ++ if((op0 & 5) == 0x4){ ++ printk("Load/Store\n"); ++ return ls_fixup(instr, regs, &desc); ++ } else { ++ printk("Not handling instruction with op0 0x%x ",op0); ++ } ++ return -1; ++} +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index 8b281cf308b30f..23fd8c9c59cac0 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -697,6 +698,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + * We had some memory, but were unable to successfully fix up + * this page fault. 
+ */ ++ printk("Page fault bus error\n"); + arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); + } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { + unsigned int lsb; +@@ -749,9 +751,16 @@ static int __kprobes do_translation_fault(unsigned long far, + static int do_alignment_fault(unsigned long far, unsigned long esr, + struct pt_regs *regs) + { ++ //printk("Alignment fault: fixup enabled?: %d, user mode: %d pstate: 0x%llx\n", IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS), compat_user_mode(regs), regs->pstate); ++ trigger_all_cpu_backtrace(); + if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) && + compat_user_mode(regs)) + return do_compat_alignment_fixup(far, regs); ++ ++ if(user_mode(regs)){ ++ // aarch64 user mode ++ return do_alignment_fixup(far, regs); ++ } + do_bad_area(far, esr, regs); + return 0; + } + +From 745e287355b9e7921c206c5b65d1b0956b2b0f9a Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 2 Dec 2023 00:36:29 +0100 +Subject: [PATCH 03/18] added 32/64bit str + +--- + arch/arm64/kernel/compat_alignment.c | 129 ++++++++++++++++++++++++--- + arch/arm64/mm/fault.c | 4 +- + 2 files changed, 121 insertions(+), 12 deletions(-) + +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index 9b921cc50b340e..edeb59c22e5ae2 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ b/arch/arm64/kernel/compat_alignment.c +@@ -387,10 +387,23 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + // arm64# + + /* +- *Happens with The Long Dark ++ *Happens with The Long Dark (also with steam) + * + *[ 6012.660803] Faulting instruction: 0x3d800020 + [ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 ++ * ++ *[ 555.449651] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x1 op4 0x0 ++[ 555.449654] Faulting instruction: 0x3c810021 ++ * ++ * ++ *[ 555.449663] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x2 op4 0x0 ++[ 555.449666] Faulting instruction: 0x3c820020 ++ * ++ *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 ++[ 555.449677] Faulting instruction: 0x3c830021 ++ * ++ * ++ * + */ + + struct fixupDescription{ +@@ -407,6 +420,8 @@ struct fixupDescription{ + int load; // 1 is it's a load, 0 if it's a store + int pair; // 1 if it's a l/s pair instruction + int width; // width of the access in bits ++ int extendSign; ++ int extend_width; + }; + + static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) +@@ -466,7 +481,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + if(!desc->load){ + uint8_t* addr = desc->addr; + int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that +- printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); ++ //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); + for(int i = 0; i < bcount; i++){ + if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) + return r; +@@ -628,6 +643,84 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription + desc->width = 64; + break; + } ++ return 1; ++} ++ ++ ++u64 extend_reg(u64 reg, int type, int shift){ ++ ++ uint8_t is_signed = (type & 4) >> 2; ++ uint8_t input_width = type & 1; ++ ++ u64 tmp; ++ if(!is_signed){ ++ tmp = reg; ++ } else { ++ if(input_width == 0){ ++ // 32bit, needs to be extended to 64 ++ // I hope the compiler just does this kind of automatically with these types ++ int32_t stmpw = reg; ++ int64_t stmpdw = stmpw; ++ tmp = (u64)stmpdw; ++ } ++ } ++ ++ return tmp << shift; ++} ++ ++int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t size = (instr >> 30) & 3; ++ uint8_t simd = (instr >> 26) & 1; ++ uint8_t opc = (instr >> 22) & 3; ++ uint8_t option = (instr >> 13) & 5; ++ uint8_t Rm = (instr >> 16) & 0x1f; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ uint8_t S = (instr >> 12) & 1; ++ // size==0 seems to be a bit special ++ // opc&2 is sign, opc&1 is load (for most instructions anyways) ++ ++ uint8_t load = opc & 1; ++ uint8_t extend_sign = (opc & 2) >> 1; ++ desc->pair = 0; ++ ++ desc->simd = simd; ++ desc->width = 8 << size; ++ ++ // the simd instructions make this a bit weird ++ if(!simd){ ++ if(extend_sign){ ++ if(load){ ++ desc->extend_width = 32; ++ } else { ++ desc->extend_width = 64; ++ } ++ desc->load = 1; ++ } else { ++ desc->load = load; ++ } ++ ++ desc->extendSign = extend_sign; // needed for load, which isn't implemented yet ++ ++ ++ u64 addr = regs->regs[Rn]; ++ ++ int shift = 0; ++ if(S) shift = 2 << ((size & 1) & ((size >> 1) & 1)); ++ ++ u64 offset = extend_reg(regs->regs[Rm], option, S); ++ ++ addr += offset; ++ ++ desc->data1 = regs->regs[Rt]; ++ desc->addr = (void*)addr; ++ ++ return do_ls_fixup(instr, regs, desc); ++ ++ } else { ++ printk("Load/Store register offset decode doesn't support simd yet\n"); ++ return 1; ++ } + return 0; + } + +@@ -638,19 +731,21 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t op3; + uint8_t op4; + ++ int r = 1; ++ + op0 = (instr >> 28) & 0xf; + op1 = (instr >> 26) & 1; + op2 = (instr >> 23) & 3; + op3 = (instr >> 16) & 0x3f; + op4 = (instr >> 10) & 3; +- printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); ++ + if((op0 & 3) == 2){ + desc->pair = 1; +- return ls_pair_fixup(instr, regs, desc); ++ r = ls_pair_fixup(instr, regs, desc); + } + if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){ + // compare and swap +- return ls_cas_fixup(instr, regs, desc); ++ r = ls_cas_fixup(instr, regs, desc); + } + if((op0 & 3) == 3 && (op2 & 3) == 3){ + //load/store unsigned immediate +@@ -660,9 +755,16 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + if((op0 & 3) == 2 && (op2 == 2)){ + // Load/store pair offset + //ldpstp_offset_fixup(instr, regs); +- return ls_reg_unsigned_imm(instr, regs, desc); ++ //r = ls_reg_unsigned_imm(instr, regs, desc); + } +- return 0; ++ if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){ ++ // register offset load/store ++ r = lsr_offset_fixup(instr, regs, desc); ++ } 
++ if(r){ ++ printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); ++ } ++ return r; + } + + int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ +@@ -670,25 +772,30 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ + u32 instr = 0; + + instrptr = instruction_pointer(regs); +- printk("Alignment fixup\n"); ++ //printk("Alignment fixup\n"); + + if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){ + printk("Failed to get aarch64 instruction\n"); + return 1; + } +- printk("Faulting instruction: 0x%lx\n", instr); ++ + /** + * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0] + * + */ + + uint8_t op0; ++ int r; + struct fixupDescription desc = {0}; + + op0 = ((instr & 0x1E000000) >> 25); + if((op0 & 5) == 0x4){ +- printk("Load/Store\n"); +- return ls_fixup(instr, regs, &desc); ++ //printk("Load/Store\n"); ++ r = ls_fixup(instr, regs, &desc); ++ if(r){ ++ printk("Faulting instruction: 0x%lx\n", instr); ++ } ++ return r; + } else { + printk("Not handling instruction with op0 0x%x ",op0); + } +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index 23fd8c9c59cac0..59a53a92ecea64 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -759,7 +759,9 @@ static int do_alignment_fault(unsigned long far, unsigned long esr, + + if(user_mode(regs)){ + // aarch64 user mode +- return do_alignment_fixup(far, regs); ++ if(!do_alignment_fixup(far, regs)){ ++ return 0; ++ } + } + do_bad_area(far, esr, regs); + return 0; + +From 14f95152da6f49fef19f2cba846cf79553b30daa Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Wed, 6 Dec 2023 17:52:45 +0100 +Subject: [PATCH 04/18] some SIMD stuff for unity (not quite enough yet) + +--- + arch/arm64/kernel/compat_alignment.c | 79 ++++++++++++++++++++++++++-- + arch/arm64/mm/fault.c | 3 +- + 2 files changed, 76 insertions(+), 6 deletions(-) + +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index edeb59c22e5ae2..73055048d6377c 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ b/arch/arm64/kernel/compat_alignment.c +@@ -12,6 +12,8 @@ + #include + #include + ++#include ++ + /* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 + * +@@ -401,6 +403,9 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + * + *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 + [ 555.449677] Faulting instruction: 0x3c830021 ++ ++stur q1, [x1, #16] ++potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48] + * + * + * +@@ -408,7 +413,8 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + + struct fixupDescription{ + void* addr; +- // ++ ++ // datax_simd has to be located directly after datax in memory + u64 data1; + u64 data1_simd; + u64 data2; +@@ -476,24 +482,40 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins + return 0; + }*/ + ++// saves the contents of the simd register reg to dst ++void read_simd_reg(int reg, __uint128_t* dst){ ++ struct user_fpsimd_state st; ++ fpsimd_save_state(&st); ++ *dst = st.vregs[reg]; ++} ++ + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int r; ++ /*if(desc->width > 64){ ++ printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width); ++ return 1; ++ }*/ + if(!desc->load){ + uint8_t* addr = desc->addr; + int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that ++ + //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); ++ int addrIt = 0; + for(int i = 0; i < bcount; i++){ +- if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) ++ if((r=put_user( (*(((uint8_t*)(&desc->data1)) + addrIt) & 0xff), (uint8_t __user *)addr))) + return r; +- desc->data1 >>= 8; ++ //desc->data1 >>= 8; ++ addrIt++; + addr++; + } + ++ addrIt = 0; + if(desc->pair){ + for(int i = 0; i < bcount; i++){ +- if((r=put_user(desc->data2 & 0xff, (uint8_t __user *)addr))) ++ if((r=put_user((*(((uint8_t*)(&desc->data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))) + return r; +- desc->data2 >>= 8; ++ //desc->data2 >>= 8; ++ addrIt++; + addr++; + } + } +@@ -724,6 +746,43 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d + return 0; + } + ++int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t size = (instr >> 30) & 3; ++ uint8_t simd = (instr >> 26) & 1; ++ uint8_t opc = (instr >> 22) & 3; ++ uint16_t imm9 = (instr >> 12) & 0x1ff; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ ++ int16_t fullImm = 0; ++ // sign extend it ++ if(imm9 & 0x100){ ++ fullImm = 0xfe00 | imm9; ++ } else { ++ fullImm = imm9; ++ } ++ u64 addr = regs->regs[Rn]; ++ desc->addr = addr + fullImm; ++ desc->pair = 0; ++ ++ int load = opc & 1; ++ if(load){ ++ return 1; ++ } ++ if(simd){ ++ desc->simd = 1; ++ desc->width = 8 << (size | (opc << 1)); ++ // assuming store ++ __uint128_t tmp; ++ read_simd_reg(Rt, &tmp); ++ desc->data1 = tmp; ++ desc->data1_simd = *(((u64*)&tmp) + 1); ++ return do_ls_fixup(instr, regs, desc); ++ } ++ printk("SIMD: %d\n", simd); ++ return 1; ++} ++ + int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t op0; + uint8_t op1; +@@ -761,6 +820,16 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + // register offset load/store + r = lsr_offset_fixup(instr, regs, desc); + } ++ if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){ ++ // register load/store unscaled immediate ++ r = lsr_unscaled_immediate_fixup(instr, regs, desc); ++ printk("Likely SIMD stuff, which isn't being handled properly at all!\n"); ++ if(r){ ++ arm64_skip_faulting_instruction(regs, 4); ++ // skip anyways ++ } ++ //r = 0; ++ } + if(r){ + printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); + } +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index 59a53a92ecea64..aaf73ecbbabf3a 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -759,9 +759,10 @@ static int do_alignment_fault(unsigned long far, unsigned long esr, + + if(user_mode(regs)){ + // aarch64 user mode +- if(!do_alignment_fixup(far, regs)){ ++ if(do_alignment_fixup(far, regs) == 0){ + return 0; + } ++ printk("Unfixed alignment issue\n"); + } + do_bad_area(far, esr, regs); + return 0; + +From 4a53ae4784c5273377b01511b6ace416dda6474d Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Fri, 8 Dec 2023 22:09:12 +0100 +Subject: [PATCH 05/18] better simd fixup (still not entirely working) + +--- + arch/arm64/kernel/compat_alignment.c | 176 +++++++++++++++++---------- + 1 file changed, 109 insertions(+), 67 deletions(-) + +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index 73055048d6377c..f14650d19cd573 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ 
b/arch/arm64/kernel/compat_alignment.c +@@ -13,6 +13,8 @@ + #include + + #include ++#include ++#include + + /* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 +@@ -415,10 +417,13 @@ struct fixupDescription{ + void* addr; + + // datax_simd has to be located directly after datax in memory +- u64 data1; ++ /*u64 data1; + u64 data1_simd; + u64 data2; +- u64 data2_simd; ++ u64 data2_simd;*/ ++ ++ int reg1; ++ int reg2; + + int Rs; // used for atomics (which don't get handled atomically) + +@@ -483,14 +488,42 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins + }*/ + + // saves the contents of the simd register reg to dst +-void read_simd_reg(int reg, __uint128_t* dst){ +- struct user_fpsimd_state st; +- fpsimd_save_state(&st); +- *dst = st.vregs[reg]; ++void read_simd_reg(int reg, u64 dst[2]){ ++ struct user_fpsimd_state st = {0}; ++ //fpsimd_save_state(&st); ++ ++ if(!may_use_simd()){ ++ printk("may_use_simd returned false!\n"); ++ } ++ kernel_neon_begin(); ++ if(current->thread.sve_state){ ++ printk("SVE state is not NULL!\n"); ++ } ++ ++ dst[0] = *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])); ++ dst[1] = *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1); ++ ++ kernel_neon_end(); + } + + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int r; ++ u64 data1[2]; ++ u64 data2[2]; ++ ++ // the reg indices have to always be valid, even if the reg isn't being used ++ if(desc->simd){ ++ // At least currently, there aren't any simd instructions supported that use more than one data register ++ //__uint128_t tmp; ++ read_simd_reg(desc->reg1, data1); ++ //data1[0] = tmp; ++ //data1[1] = *(((u64*)&tmp) + 1); ++ printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); ++ } else { ++ data1[0] = regs->regs[desc->reg1]; ++ data2[0] = regs->regs[desc->reg2]; ++ } ++ + /*if(desc->width > 64){ + printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width); + return 1; +@@ -502,8 +535,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); + int addrIt = 0; + for(int i = 0; i < bcount; i++){ +- if((r=put_user( (*(((uint8_t*)(&desc->data1)) + addrIt) & 0xff), (uint8_t __user *)addr))) ++ if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); + return r; ++ } + //desc->data1 >>= 8; + addrIt++; + addr++; +@@ -512,8 +547,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + addrIt = 0; + if(desc->pair){ + for(int i = 0; i < bcount; i++){ +- if((r=put_user((*(((uint8_t*)(&desc->data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))) ++ if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); + return r; ++ } + //desc->data2 >>= 8; + addrIt++; + addr++; +@@ -521,7 +558,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + } + arm64_skip_faulting_instruction(regs, 4); + } else { +- printk("Loading is currently not implemented (addr 0x%llx)\n", desc->addr); ++ printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr); + return -1; + } + return 0; +@@ -557,7 +594,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct 
fixupDescription* desc) + } + + desc->addr = (void*)regs->regs[Rn]; +- desc->data1 = regs->regs[Rt]; ++ u64 data1 = regs->regs[Rt]; + + // nearly everything from here on could be moved into another function if needed + u64 cmpmask = (1 << desc->width) - 1; +@@ -569,7 +606,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) + int r; + uint8_t tmp; + +- printk("Atomic CAS not being done atomically at 0x%llx, size %d\n",desc->addr, desc->width); ++ printk("Atomic CAS not being done atomically at 0x%px, size %d\n",desc->addr, desc->width); + + for(int i = 0; i < bcount; i++){ + if((r=get_user(tmp, (uint8_t __user *)addr))) +@@ -584,9 +621,9 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) + addr = (u64)desc->addr; + + for(int i = 0; i < bcount; i++){ +- if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) ++ if((r=put_user(data1 & 0xff, (uint8_t __user *)addr))) + return r; +- desc->data1 >>= 8; ++ data1 >>= 8; + addr++; + } + +@@ -639,8 +676,10 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc + default: + return -1; + } +- desc->data1 = regs->regs[Rt]; +- desc->data2 = regs->regs[Rt2]; ++ //desc->data1 = regs->regs[Rt]; ++ //desc->data2 = regs->regs[Rt2]; ++ desc->reg1 = Rt; ++ desc->reg2 = Rt2; + + return do_ls_fixup(instr, regs, desc); + +@@ -650,22 +689,29 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; ++ uint16_t imm12 = (instr >> 10) & 0xfff; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; + +- switch(size){ +- case 0: +- desc->width = 8; +- break; +- case 1: +- desc->width = 16; +- break; +- case 2: +- desc->width = 32; +- break; +- case 3: +- desc->width = 64; +- break; ++ uint8_t load = opc & 1; ++ uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; ++ printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt); ++ // when in simd mode, opc&2 is a third size bit. 
Otherwise, it's there for sign extension ++ int width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); ++ desc->width = 8 << width_shift; ++ ++ if((size & 1) && simd && (opc & 2)){ ++ return 1; + } +- return 1; ++ ++ desc->reg1 = Rt; ++ desc->simd = simd; ++ desc->extendSign = extend_sign; ++ u64 addr = regs->regs[Rn]; ++ desc->addr = addr + (imm12 << width_shift); ++ printk("unsigned imm\n"); ++ ++ return do_ls_fixup(instr, regs, desc); + } + + +@@ -699,50 +745,52 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + uint8_t S = (instr >> 12) & 1; ++ int width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); + // size==0 seems to be a bit special + // opc&2 is sign, opc&1 is load (for most instructions anyways) + + uint8_t load = opc & 1; +- uint8_t extend_sign = (opc & 2) >> 1; ++ uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; + desc->pair = 0; + + desc->simd = simd; +- desc->width = 8 << size; ++ desc->width = 8 << width_shift; + + // the simd instructions make this a bit weird +- if(!simd){ +- if(extend_sign){ +- if(load){ +- desc->extend_width = 32; +- } else { +- desc->extend_width = 64; +- } +- desc->load = 1; ++ if(extend_sign){ ++ if(load){ ++ desc->extend_width = 32; + } else { +- desc->load = load; ++ desc->extend_width = 64; + } ++ desc->load = 1; ++ } else { ++ desc->load = load; ++ } + +- desc->extendSign = extend_sign; // needed for load, which isn't implemented yet +- +- +- u64 addr = regs->regs[Rn]; ++ desc->extendSign = extend_sign; // needed for load, which isn't implemented yet + ++ u64 offset = 0; ++ u64 addr = 0; ++ addr = regs->regs[Rn]; ++ if(simd){ ++ int shift = 0; ++ if(S) shift = width_shift; ++ offset = extend_reg(regs->regs[Rm], option, shift); ++ } else { + int shift = 0; + if(S) shift = 2 << ((size & 1) & ((size >> 1) & 1)); + +- u64 offset = extend_reg(regs->regs[Rm], option, S); +- +- addr += offset; ++ offset = extend_reg(regs->regs[Rm], option, shift); ++ } + +- desc->data1 = regs->regs[Rt]; +- desc->addr = (void*)addr; ++ addr += offset; + +- return do_ls_fixup(instr, regs, desc); ++ //desc->data1 = regs->regs[Rt]; ++ desc->reg1 = Rt; ++ desc->addr = (void*)addr; + +- } else { +- printk("Load/Store register offset decode doesn't support simd yet\n"); +- return 1; +- } ++ return do_ls_fixup(instr, regs, desc); + return 0; + } + +@@ -769,14 +817,15 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe + if(load){ + return 1; + } ++ desc->reg1 = Rt; + if(simd){ + desc->simd = 1; +- desc->width = 8 << (size | (opc << 1)); ++ desc->width = 8 << (size | ((opc & 2) << 1)); + // assuming store +- __uint128_t tmp; ++ /*__uint128_t tmp; + read_simd_reg(Rt, &tmp); + desc->data1 = tmp; +- desc->data1_simd = *(((u64*)&tmp) + 1); ++ desc->data1_simd = *(((u64*)&tmp) + 1);*/ + return do_ls_fixup(instr, regs, desc); + } + printk("SIMD: %d\n", simd); +@@ -811,10 +860,9 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + desc->pair = 0; + + } +- if((op0 & 3) == 2 && (op2 == 2)){ +- // Load/store pair offset +- //ldpstp_offset_fixup(instr, regs); +- //r = ls_reg_unsigned_imm(instr, regs, desc); ++ if((op0 & 3) == 3 && ((op2 & 2) == 2)){ ++ // register unsigned immediate ++ r = ls_reg_unsigned_imm(instr, regs, desc); + } + if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){ + // register offset load/store +@@ -823,12 +871,6 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct 
fixupDescription* desc){ + if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){ + // register load/store unscaled immediate + r = lsr_unscaled_immediate_fixup(instr, regs, desc); +- printk("Likely SIMD stuff, which isn't being handled properly at all!\n"); +- if(r){ +- arm64_skip_faulting_instruction(regs, 4); +- // skip anyways +- } +- //r = 0; + } + if(r){ + printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); + +From 4766addea82548a13c896a79cb4e5f2a883dfda5 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 13 Jan 2024 00:29:56 +0100 +Subject: [PATCH 06/18] SIMD stp + +--- + arch/arm64/kernel/compat_alignment.c | 103 +++++++++++++++++++++------ + 1 file changed, 83 insertions(+), 20 deletions(-) + +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index f14650d19cd573..104b0d334776ad 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ b/arch/arm64/kernel/compat_alignment.c +@@ -448,6 +448,15 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins + return 0; + } + ++int64_t extend_sign(int64_t in, int bits){ ++ bits--; ++ if(in & (1 << bits)){ ++ // extend sign ++ return (0xffffffffffffffff << bits) | in; ++ } ++ return in; ++} ++ + /*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ + uint8_t load = (instr >> 22) & 1; + uint8_t simd = (instr >> 26) & 1; +@@ -515,10 +524,16 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + if(desc->simd){ + // At least currently, there aren't any simd instructions supported that use more than one data register + //__uint128_t tmp; ++ ++ // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once + read_simd_reg(desc->reg1, data1); ++ read_simd_reg(desc->reg2, data2); + //data1[0] = tmp; + //data1[1] = *(((u64*)&tmp) + 1); +- printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); ++ ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); ++ if(desc->width < 128){ ++ return -1; ++ } + } else { + data1[0] = regs->regs[desc->reg1]; + data2[0] = regs->regs[desc->reg2]; +@@ -648,23 +663,29 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + +- int16_t imm = 0xffff & imm7; +- ++ int64_t imm = extend_sign(imm7, 7); ++ int immshift = 0; + desc->load = load; + desc->simd = simd; + + // opc controls the width +- switch(opc){ +- case 0: +- desc->width = 32; +- imm <<= 2; +- break; +- case 2: +- desc->width = 64; +- imm <<= 3; +- break; +- default: +- return -1; ++ if(simd){ ++ desc->width = 32 << opc; ++ immshift = 4 << opc; ++ imm <<= immshift; ++ } else { ++ switch(opc){ ++ case 0: ++ desc->width = 32; ++ imm <<= 2; ++ break; ++ case 2: ++ desc->width = 64; ++ imm <<= 3; ++ break; ++ default: ++ return -1; ++ } + } + + // op2 controls the indexing +@@ -689,15 +710,25 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; +- uint16_t imm12 = (instr >> 10) & 0xfff; ++ uint64_t imm12 = (instr >> 10) & 0xfff; + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + + uint8_t load = opc & 1; +- uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; +- printk("size: %d simd: %d opc: %d imm12: 
0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt);
++ uint8_t extend_sign = 0;// = ((opc & 2) >> 1 ) & !simd;
++ int width_shift = 0;
++
++ if(simd){
++ extend_sign = 0;
++ width_shift = size | ((opc & 2) << 1);
++ } else {
++ extend_sign = ((opc & 2) >> 1 );
++ width_shift = size;
++ }
++
++ ///printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt);
+ // when in simd mode, opc&2 is a third size bit. Otherwise, it's there for sign extension
+- int width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
++ //width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
+ desc->width = 8 << width_shift;
+
+ if((size & 1) && simd && (opc & 2)){
+@@ -709,7 +740,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
+ desc->extendSign = extend_sign;
+ u64 addr = regs->regs[Rn];
+ desc->addr = addr + (imm12 << width_shift);
+- printk("unsigned imm\n");
++ ///printk("unsigned imm\n");
+
+ return do_ls_fixup(instr, regs, desc);
+ }
+@@ -730,9 +761,14 @@ u64 extend_reg(u64 reg, int type, int shift){
+ int32_t stmpw = reg;
+ int64_t stmpdw = stmpw;
+ tmp = (u64)stmpdw;
++ } else {
++ printk("Other branch I forgot about previously!\n");
++ tmp = reg; // since the size stays the same, I don't think this makes a difference
+ }
+ }
+
++ ///printk("extend_reg: reg 0x%lx out (before shift) 0x%lx signed: %x\n", reg, tmp, is_signed);
++
+ return tmp << shift;
+ }
+
+@@ -828,7 +864,7 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
+ desc->data1_simd = *(((u64*)&tmp) + 1);*/
+ return do_ls_fixup(instr, regs, desc);
+ }
+- printk("SIMD: %d\n", simd);
++ ///printk("SIMD: %d\n", simd);
+ return 1;
+ }
+
+@@ -878,6 +914,31 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+ return r;
+ }
+
++uint32_t* seenCMDs;
++size_t seenCMDCount = 0;
++size_t seenCMDSize = 0;
++
++void instrDBG(u32 instr){
++ for(size_t i = 0; i < seenCMDCount; i++){
++ if(seenCMDs[i] == instr){
++ return;
++ }
++ }
++ if(seenCMDSize == 0){
++ seenCMDs = krealloc(seenCMDs, sizeof(uint32_t), GFP_KERNEL);
++ seenCMDSize = 1;
++ }
++
++ if(seenCMDCount >= seenCMDSize){
++ seenCMDs = krealloc(seenCMDs, seenCMDSize*2*sizeof(uint32_t), GFP_KERNEL);
++ seenCMDSize *= 2;
++ }
++
++ seenCMDs[seenCMDCount] = instr;
++ seenCMDCount++;
++ printk("New instruction: %x\n", instr);
++}
++
+ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){
+ unsigned long long instrptr;
+ u32 instr = 0;
+@@ -895,6 +956,8 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){
+ *
+ */
+
++ instrDBG(instr);
++
+ uint8_t op0;
+ int r;
+ struct fixupDescription desc = {0};
+
+From 9c03af884f842ea62b6f78476cde680246ad562e Mon Sep 17 00:00:00 2001
+From: Coreforge
+Date: Mon, 15 Jan 2024 20:37:21 +0100
+Subject: [PATCH 07/18] ldr, DOOM Eternal now works
+
+---
+ arch/arm64/kernel/compat_alignment.c | 114 +++++++++++++++++++++------
+ 1 file changed, 92 insertions(+), 22 deletions(-)
+
+diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c
+index 104b0d334776ad..04ece4d0a2fad8 100644
+--- a/arch/arm64/kernel/compat_alignment.c
++++ b/arch/arm64/kernel/compat_alignment.c
+@@ -515,28 +515,47 @@ void read_simd_reg(int reg, u64 dst[2]){
+ kernel_neon_end();
+ }
+
++
++void write_simd_reg(int reg, u64 src[2]){
++
++ if(!may_use_simd()){
++ printk("may_use_simd returned false!\n");
++ }
++ kernel_neon_begin();
++ if(current->thread.sve_state){
++ printk("SVE state is not NULL!\n");
++ }
++
++ 
*((u64*)(&current->thread.uw.fpsimd_state.vregs[reg])) = src[0];
++ *(((u64*)(&current->thread.uw.fpsimd_state.vregs[reg])) + 1) = src[1];
++
++ kernel_neon_end();
++}
++
+ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+ int r;
+- u64 data1[2];
+- u64 data2[2];
++ u64 data1[2] = {0,0};
++ u64 data2[2] = {0,0};
+
+ // the reg indices have to always be valid, even if the reg isn't being used
+- if(desc->simd){
+- // At least currently, there aren't any simd instructions supported that use more than one data register
+- //__uint128_t tmp;
+-
+- // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once
+- read_simd_reg(desc->reg1, data1);
+- read_simd_reg(desc->reg2, data2);
+- //data1[0] = tmp;
+- //data1[1] = *(((u64*)&tmp) + 1);
+- ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr);
+- if(desc->width < 128){
+- return -1;
++ if(!desc->load){
++ if(desc->simd){
++ // At least currently, there aren't any simd instructions supported that use more than one data register
++ //__uint128_t tmp;
++
++ // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once
++ read_simd_reg(desc->reg1, data1);
++ read_simd_reg(desc->reg2, data2);
++ //data1[0] = tmp;
++ //data1[1] = *(((u64*)&tmp) + 1);
++ ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr);
++ /*if(desc->width < 128){
++ return -1;
++ }*/
++ } else {
++ data1[0] = regs->regs[desc->reg1];
++ data2[0] = regs->regs[desc->reg2];
+ }
+- } else {
+- data1[0] = regs->regs[desc->reg1];
+- data2[0] = regs->regs[desc->reg2];
+ }
+
+ /*if(desc->width > 64){
+@@ -573,8 +592,54 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+ }
+ arm64_skip_faulting_instruction(regs, 4);
+ } else {
+- printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr);
+- return -1;
++ //printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr);
++
++ uint8_t* addr = desc->addr;
++ int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that
++
++ //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr);
++ int addrIt = 0;
++ for(int i = 0; i < bcount; i++){
++ uint8_t val;
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px (base was 0x%px)\n", addr, desc->addr);
++ return r;
++ }
++ *(((uint8_t*)data1) + addrIt) = val;
++ //desc->data1 >>= 8;
++ addrIt++;
++ addr++;
++ }
++
++ if(desc->simd){
++ write_simd_reg(desc->reg1, data1);
++ } else {
++ regs->regs[desc->reg1] = data1[0];
++ }
++
++ addrIt = 0;
++ if(desc->pair){
++ for(int i = 0; i < bcount; i++){
++ uint8_t val;
++ if((r=get_user(val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px (base was 0x%px)\n", addr, desc->addr);
++ return r;
++ }
++ *(((uint8_t*)data2) + addrIt) = val;
++ //desc->data2 >>= 8;
++ addrIt++;
++ addr++;
++ }
++
++ if(desc->simd){
++ write_simd_reg(desc->reg2, data2);
++ } else {
++ regs->regs[desc->reg2] = data2[0];
++ }
++ }
++ arm64_skip_faulting_instruction(regs, 4);
++
++
+ }
+ return 0;
+ }
+@@ -734,7 +799,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
+ if((size & 1) && simd && (opc & 2)){
+ return 1;
+ }
+-
++ desc->load = load;
+ desc->reg1 = Rt;
+ desc->simd = simd;
+ desc->extendSign = extend_sign;
+@@ -850,9 +915,10 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
+ desc->pair = 0;
+
+ int load = opc & 1;
+- if(load){
++ desc->load = load;
++ /*if(load){
+ return 1;
+- }
++ }*/
+ desc->reg1 = Rt;
+ if(simd){
+ desc->simd = 1;
+@@ -863,6 +929,10 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
+ desc->data1_simd = *(((u64*)&tmp) + 1);*/
+ return do_ls_fixup(instr, regs, desc);
++ } else {
++ desc->simd = 0;
++ desc->width = 8 << size;
++ return do_ls_fixup(instr, regs, desc);
+ }
+ ///printk("SIMD: %d\n", simd);
+ return 1;
+
+From 8b0c2de26675012385d1b35a65d662c03042dff1 Mon Sep 17 00:00:00 2001
+From: Coreforge
+Date: Fri, 4 Oct 2024 17:48:44 +0200
+Subject: [PATCH 08/18] some more alignment things
+
+---
+ arch/arm64/kernel/compat_alignment.c | 283 ++++++++++++++++++++++++---
+ 1 file changed, 260 insertions(+), 23 deletions(-)
+
+diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c
+index 04ece4d0a2fad8..88cd01a9e99ea1 100644
+--- a/arch/arm64/kernel/compat_alignment.c
++++ b/arch/arm64/kernel/compat_alignment.c
+@@ -413,6 +413,8 @@ potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48]
+ *
+ */
+
++#include
++
+ struct fixupDescription{
+ void* addr;
+
+@@ -433,9 +435,14 @@ struct fixupDescription{
+ int width; // width of the access in bits
+ int extendSign;
+ int extend_width;
++
++ // profiling
++ u64 starttime;
++ u64 decodedtime;
++ u64 endtime;
+ };
+
+-static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst)
++__attribute__((always_inline)) inline static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst)
+ {
+ __le32 instr = 0;
+ int fault;
+@@ -448,7 +455,7 @@
+ return 0;
+ }
+
+-int64_t extend_sign(int64_t in, int bits){
++__attribute__((always_inline)) inline int64_t extend_sign(int64_t in, int bits){
+ bits--;
+ if(in & (1 << bits)){
+ // extend sign
+@@ -497,7 +504,7 @@
+ }*/
+
+ // saves the contents of the simd register reg to dst
+-void 
read_simd_reg(int reg, u64 dst[2]){ ++__attribute__((always_inline)) inline void read_simd_reg(int reg, u64 dst[2]){ + struct user_fpsimd_state st = {0}; + //fpsimd_save_state(&st); + +@@ -516,7 +523,7 @@ void read_simd_reg(int reg, u64 dst[2]){ + } + + +-void write_simd_reg(int reg, u64 src[2]){ ++__attribute__((always_inline)) inline void write_simd_reg(int reg, u64 src[2]){ + + if(!may_use_simd()){ + printk("may_use_simd returned false!\n"); +@@ -532,11 +539,228 @@ void write_simd_reg(int reg, u64 src[2]){ + kernel_neon_end(); + } + ++// these try to use larger access widths than single bytes. Slower for small loads/stores, but it might speed larger ones up ++ ++__attribute__((always_inline)) inline int put_data2(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ ++ while(size){ ++ if(size >= 4 && (((u64)addr % 4) == 0)){ ++ if((r=put_user( (*(((uint32_t*)(data)))), (uint32_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 4; ++ data += 4; ++ size -= 4; ++ continue; ++ } ++ if(size >= 2 && (((u64)addr % 2) == 0)){ ++ if((r=put_user( (*(((uint16_t*)(data)))), (uint16_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 2; ++ data += 2; ++ size -= 2; ++ continue; ++ } ++ // I guess the if is redundant here ++ if(size >= 1){ ++ if((r=put_user( (*(((uint8_t*)(data)))), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 1; ++ data += 1; ++ size -= 1; ++ continue; ++ } ++ ++ } ++ ++ return r; ++} ++ ++__attribute__((always_inline)) inline int get_data2(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ uint32_t val32; ++ uint16_t val16; ++ uint8_t val8; ++ while(size){ ++ if(size >= 4 && (((u64)addr % 4) == 0)){ ++ if((r=get_user( val32, (uint32_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint32_t*)data) = val32; ++ addr += 4; ++ data += 4; ++ size -= 4; ++ continue; ++ } ++ if(size >= 2 && (((u64)addr % 2) == 0)){ ++ if((r=get_user( val16, (uint16_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint16_t*)data) = val16; ++ addr += 2; ++ data += 2; ++ size -= 2; ++ continue; ++ } ++ // I guess the if is redundant here ++ if(size >= 1){ ++ if((r=get_user( val8, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint8_t*)data) = val8; ++ addr += 1; ++ data += 1; ++ size -= 1; ++ continue; ++ } ++ ++ } ++ ++ return r; ++} ++ ++ ++// these should avoid some branching, but still use single byte accesses ++__attribute__((always_inline)) inline int put_data(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ int addrIt = 0; ++ ++ // with the fixed size loops, the compiler should be able to unroll them ++ // this should mean a lot less branching ++ switch(size){ ++ case 16: ++ for(int i = 0; i < 8; i++){ ++ if((r=put_user( (*(((uint8_t*)(data)) + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 8: ++ for(int i = 0; i < 4; i++){ ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 4: ++ for(int i = 0; i < 2; i++){ ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to 
write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 2: ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ // fall through ++ case 1: ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ break; ++ default: ++ printk("unsupported size %d\n", size); ++ } ++ ++ return r; ++} ++ ++__attribute__((always_inline)) inline int get_data(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ int addrIt = 0; ++ ++ // with the fixed size loops, the compiler should be able to unroll them ++ // this should mean a lot less branching ++ uint8_t val; ++ switch(size){ ++ case 16: ++ for(int i = 0; i < 8; i++){ ++ if((r=get_user( val, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *(data + addrIt) = val; ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 8: ++ for(int i = 0; i < 4; i++){ ++ if((r=get_user( val, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *(data + addrIt) = val; ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 4: ++ for(int i = 0; i < 2; i++){ ++ if((r=get_user( val, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *(data + addrIt) = val; ++ addrIt++; ++ addr++; ++ } ++ // fall through ++ case 2: ++ if((r=get_user( val, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *(data + addrIt) = val; ++ addrIt++; ++ addr++; ++ // fall through ++ case 1: ++ if((r=get_user( val, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *(data + addrIt) = val; ++ addrIt++; ++ addr++; ++ break; ++ default: ++ printk("unsupported size %d\n", size); ++ } ++ ++ return r; ++} ++ + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int r; + u64 data1[2] = {0,0}; + u64 data2[2] = {0,0}; +- ++ //desc->decodedtime = ktime_get_ns(); + // the reg indices have to always be valid, even if the reg isn't being used + if(!desc->load){ + if(desc->simd){ +@@ -570,25 +794,28 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int addrIt = 0; + for(int i = 0; i < bcount; i++){ + if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); ++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); + return r; + } + //desc->data1 >>= 8; + addrIt++; + addr++; + } +- ++ //put_data2(bcount, (uint8_t*)data1, addr); ++ //addr += bcount; + addrIt = 0; + if(desc->pair){ + for(int i = 0; i < bcount; i++){ + if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); ++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); + return r; + } + //desc->data2 >>= 8; + addrIt++; + addr++; + } ++ //put_data2(bcount, (uint8_t*)data2, addr); ++ addr += bcount; + } + arm64_skip_faulting_instruction(regs, 4); + } else { +@@ -599,7 +826,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + + //printk("Storing %d bytes (pair: %d) to 
0x%llx",bcount, desc->pair, desc->addr); + int addrIt = 0; +- for(int i = 0; i < bcount; i++){ ++ /*for(int i = 0; i < bcount; i++){ + uint8_t val; + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); +@@ -609,7 +836,9 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + //desc->data1 >>= 8; + addrIt++; + addr++; +- } ++ }*/ ++ get_data2(bcount, (uint8_t*)data1, addr); ++ addr += bcount; + + if(desc->simd){ + write_simd_reg(desc->reg1, data1); +@@ -619,7 +848,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + + addrIt = 0; + if(desc->pair){ +- for(int i = 0; i < bcount; i++){ ++ /*for(int i = 0; i < bcount; i++){ + uint8_t val; + if((r=get_user(val, (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); +@@ -629,8 +858,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + //desc->data2 >>= 8; + addrIt++; + addr++; +- } ++ }*/ + ++ get_data2(bcount, (uint8_t*)data2, addr); ++ addr += bcount; + if(desc->simd){ + write_simd_reg(desc->reg2, data1); + } else { +@@ -715,7 +946,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) + return 0; + } + +-int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++__attribute__((always_inline)) inline int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t op2; + uint8_t opc; + op2 = (instr >> 23) & 3; +@@ -729,15 +960,16 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc + uint8_t Rt = instr & 0x1f; + + int64_t imm = extend_sign(imm7, 7); +- int immshift = 0; ++ //int immshift = 0; + desc->load = load; + desc->simd = simd; + + // opc controls the width + if(simd){ + desc->width = 32 << opc; +- immshift = 4 << opc; +- imm <<= immshift; ++ //immshift = 4 << opc; ++ imm <<= 2; ++ imm <<= opc; + } else { + switch(opc){ + case 0: +@@ -771,7 +1003,7 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc + + } + +-int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++__attribute__((always_inline)) inline int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; +@@ -811,7 +1043,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription + } + + +-u64 extend_reg(u64 reg, int type, int shift){ ++__attribute__((always_inline)) inline u64 extend_reg(u64 reg, int type, int shift){ + + uint8_t is_signed = (type & 4) >> 2; + uint8_t input_width = type & 1; +@@ -837,7 +1069,7 @@ u64 extend_reg(u64 reg, int type, int shift){ + return tmp << shift; + } + +-int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++__attribute__((always_inline)) inline int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; +@@ -895,7 +1127,7 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d + return 0; + } + +-int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++__attribute__((always_inline)) inline int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs 
*regs, struct fixupDescription* desc){ + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; +@@ -938,7 +1170,7 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe + return 1; + } + +-int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++__attribute__((always_inline)) inline int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t op0; + uint8_t op1; + uint8_t op2; +@@ -1026,19 +1258,24 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ + * + */ + +- instrDBG(instr); ++ //instrDBG(instr); + + uint8_t op0; + int r; + struct fixupDescription desc = {0}; +- ++ //desc.starttime = ktime_get_ns(); + op0 = ((instr & 0x1E000000) >> 25); + if((op0 & 5) == 0x4){ + //printk("Load/Store\n"); + r = ls_fixup(instr, regs, &desc); ++ //desc.endtime = ktime_get_ns(); ++ /*printk("Trap timing: decoding: %ldns, mem ops: %ldns, total: %ldns\n", desc.decodedtime - desc.starttime, ++ desc.endtime - desc.decodedtime, desc.endtime - desc.starttime); ++ */ + if(r){ + printk("Faulting instruction: 0x%lx\n", instr); + } ++ + return r; + } else { + printk("Not handling instruction with op0 0x%x ",op0); + +From 07faf3d24389771aae3816c3ce881ab2d072eb21 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Tue, 8 Oct 2024 17:27:48 +0200 +Subject: [PATCH 09/18] another memcpy, tripped by vaapi + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +index 5df21529b3b13e..fdd7a3f54244a0 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +@@ -1056,7 +1056,7 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, + kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE); + + if (ring->funcs->parse_cs) { +- memcpy(ib->ptr, kptr, ib->length_dw * 4); ++ memcpy_fromio(ib->ptr, kptr, ib->length_dw * 4); + amdgpu_bo_kunmap(aobj); + + r = amdgpu_ring_parse_cs(ring, p, job, ib); + +From bdf561a53d4917957a16e7e75a5ce9a596137db9 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Mon, 14 Oct 2024 17:48:40 +0200 +Subject: [PATCH 10/18] added dc zva to the alignment trap + +--- + arch/arm64/kernel/compat_alignment.c | 94 +++++++++++++++++++++++++++- + 1 file changed, 92 insertions(+), 2 deletions(-) + +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index 88cd01a9e99ea1..a41c9b9f34ae61 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ b/arch/arm64/kernel/compat_alignment.c +@@ -414,6 +414,7 @@ potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48] + */ + + #include <linux/ktime.h> ++#include <asm/sysreg.h> + + struct fixupDescription{ + void* addr; +@@ -756,6 +757,38 @@ __attribute__((always_inline)) inline int get_data(int size, uint8_t* data, void + return r; + } + ++int memset_io_user(uint64_t size, uint8_t c, void* addr){ ++ int r = 0; ++ uint64_t pattern = c; ++ pattern |= pattern << 8; ++ pattern |= pattern << 16; ++ pattern |= pattern << 32; ++ uint64_t cnt = 0; ++ while(cnt < size){ ++ if((uint64_t)(addr + cnt) % 8){ ++ if((r = put_user(c, (uint8_t __user*)(addr + cnt)))){ ++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); ++ return r; ++ } ++ cnt++; ++ } else if(size - cnt >= 8){ ++ if((r = put_user(pattern, (uint64_t __user*)(addr + cnt)))){ ++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); ++ return r; ++
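// Worked example of the fill pattern built above (illustrative, not from the original series): ++ // for c == 0xAB the or-shift cascade widens it 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB, ++ // so each pass through this branch stores eight copies of c with a single aligned put_user. ++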
} ++ cnt += 8; ++ } else{ ++ if((r = put_user(c, (uint8_t __user*)(addr + cnt)))){ ++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); ++ return r; ++ } ++ cnt++; ++ } ++ ++ } ++ return r; ++} ++ + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int r; + u64 data1[2] = {0,0}; +@@ -1216,6 +1249,60 @@ __attribute__((always_inline)) inline int ls_fixup(u32 instr, struct pt_regs *re + return r; + } + ++__attribute__((always_inline)) inline int system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t op1; ++ uint8_t op2; ++ uint8_t CRn; ++ uint8_t CRm; ++ uint8_t Rt; ++ bool L; ++ int r = 0; ++ ++ op1 = (instr >> 16) & 0x7; ++ op2 = (instr >> 5) & 0x7; ++ CRn = (instr >> 12) & 0xf; ++ CRm = (instr >> 8) & 0xf; ++ L = (instr >> 21) & 1; ++ Rt = instr & 0x1f; ++ ++ if(!L){ ++ // SYS ++ // proper decoding would be nicer here, but I don't expect to see too many system instructions ++ if((op1 == 0x3) && (op2 == 1) && (CRn == 0x7) && (CRm == 4)){ ++ // dc zva zeroes the whole block containing the address, so align the base down ++ uint64_t dczid_el0 = read_sysreg_s(SYS_DCZID_EL0); ++ if(!((dczid_el0 >> DCZID_EL0_DZP_SHIFT) & 1)){ ++ uint16_t blksize = 4 << (dczid_el0 & 0xf); ++ r = memset_io_user(blksize, 0, (void*)(regs->user_regs.regs[Rt] & ~((uint64_t)blksize - 1))); ++ arm64_skip_faulting_instruction(regs, 4); ++ return r; ++ } else { ++ printk("DC ZVA is not allowed!\n"); ++ return 1; ++ } ++ } ++ } ++ ++ printk("Unhandled system instruction. op1=0x%x op2=0x%x CRn=0x%x CRm=0x%x\n", op1, op2, CRn, CRm); ++ return 1; ++} ++ ++__attribute__((always_inline)) inline int branch_except_system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ ++ uint8_t op0; ++ uint32_t op1; ++ uint8_t op2; ++ ++ op0 = (instr >> 29) & 0x7; ++ op1 = (instr >> 5) & 0x1fffff; ++ op2 = instr & 0x1f; ++ ++ if((op0 == 0x6) && (op1 & 0x1ec000) == 0x84000){ ++ return system_fixup(instr, regs, desc); ++ } ++ printk("Unhandled Branch/Exception generating/System instruction.
op0=0x%x op1=0x%x op2=0x%x\n", op0, op1, op2); ++ return 1; ++} ++ + uint32_t* seenCMDs; + size_t seenCMDCount = 0; + size_t seenCMDSize = 0; +@@ -1277,8 +1364,11 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ + } + + return r; +- } else { +- printk("Not handling instruction with op0 0x%x ",op0); ++ } else if((op0 & 0xe) == 0xa){ ++ // System instructions, needed for dc zva ++ return branch_except_system_fixup(instr, regs, &desc); ++ }else { ++ printk("Not handling instruction with op0 0x%x (instruction is 0x%08x)",op0, instr); + } + return -1; + } + +From 07ba8014d718fbddecfeb5e4e2aeb260be512482 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Thu, 17 Oct 2024 16:28:16 +0200 +Subject: [PATCH 11/18] remove trigger_backtrace from do_alignment_fault + +--- + arch/arm64/mm/fault.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index aaf73ecbbabf3a..368f4eb573e9a9 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -751,8 +751,6 @@ static int __kprobes do_translation_fault(unsigned long far, + static int do_alignment_fault(unsigned long far, unsigned long esr, + struct pt_regs *regs) + { +- //printk("Alignment fault: fixup enabled?: %d, user mode: %d pstate: 0x%llx\n", IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS), compat_user_mode(regs), regs->pstate); +- trigger_all_cpu_backtrace(); + if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) && + compat_user_mode(regs)) + return do_compat_alignment_fixup(far, regs); + +From f20948514dbf27353da9e95fc23c6f9c626c2513 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 19 Oct 2024 16:04:38 +0200 +Subject: [PATCH 12/18] gfx10 successful init + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 +++++++++--------- + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 6 +++--- + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 16 ++++++++-------- + .../display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c | 2 +- + 4 files changed, 21 insertions(+), 21 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +index d70855d7c61c1d..19b26fbc8e6ee6 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +@@ -672,9 +672,9 @@ psp_cmd_submit_buf(struct psp_context *psp, + if (psp->adev->no_hw_access) + return 0; + +- memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); ++ memset_io(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); + +- memcpy(psp->cmd_buf_mem, cmd, sizeof(struct psp_gfx_cmd_resp)); ++ memcpy_toio(psp->cmd_buf_mem, cmd, sizeof(struct psp_gfx_cmd_resp)); + + index = atomic_inc_return(&psp->fence_value); + ret = psp_ring_cmd_submit(psp, psp->cmd_buf_mc_addr, fence_mc_addr, index); +@@ -703,7 +703,7 @@ psp_cmd_submit_buf(struct psp_context *psp, + skip_unsupport = (psp->cmd_buf_mem->resp.status == TEE_ERROR_NOT_SUPPORTED || + psp->cmd_buf_mem->resp.status == PSP_ERR_UNKNOWN_COMMAND) && amdgpu_sriov_vf(psp->adev); + +- memcpy(&cmd->resp, &psp->cmd_buf_mem->resp, sizeof(struct psp_gfx_resp)); ++ memcpy_fromio(&cmd->resp, &psp->cmd_buf_mem->resp, sizeof(struct psp_gfx_resp)); + + /* In some cases, psp response status is not 0 even there is no + * problem while the command is submitted. 
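+/* Aside, an illustrative sketch (not part of the original series): these hunks swap plain + * memcpy()/memset() for the _io variants because psp->cmd_buf_mem and psp->fw_pri_buf live in + * BO mappings that the pi5 only tolerates with strictly sized, aligned accesses, while arm64's + * memcpy() may issue unaligned or overlapping loads and stores. The _io helpers copy with whole + * aligned words where possible, e.g. (off is a placeholder offset): + * + * void *kptr = amdgpu_bo_kptr(bo); // CPU mapping of the BO, treated as I/O memory + * memcpy_toio(kptr, cmd, sizeof(*cmd)); // CPU -> BO + * memcpy_fromio(&cmd->resp, kptr + off, sizeof(cmd->resp)); // BO -> CPU + */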
Some version of PSP FW +@@ -1027,8 +1027,8 @@ static int psp_rl_load(struct amdgpu_device *adev) + + cmd = acquire_psp_cmd_buf(psp); + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); +- memcpy(psp->fw_pri_buf, psp->rl.start_addr, psp->rl.size_bytes); ++ memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); ++ memcpy_toio(psp->fw_pri_buf, psp->rl.start_addr, psp->rl.size_bytes); + + cmd->cmd_id = GFX_CMD_ID_LOAD_IP_FW; + cmd->cmd.cmd_load_ip_fw.fw_phy_addr_lo = lower_32_bits(psp->fw_pri_mc_addr); +@@ -2876,7 +2876,7 @@ static int psp_load_fw(struct amdgpu_device *adev) + /* should not destroy ring, only stop */ + psp_ring_stop(psp, PSP_RING_TYPE__KM); + } else { +- memset(psp->fence_buf, 0, PSP_FENCE_BUFFER_SIZE); ++ memset_io(psp->fence_buf, 0, PSP_FENCE_BUFFER_SIZE); + + ret = psp_ring_init(psp, PSP_RING_TYPE__KM); + if (ret) { +@@ -3224,7 +3224,7 @@ int psp_ring_cmd_submit(struct psp_context *psp, + } + + /* Initialize KM RB frame */ +- memset(write_frame, 0, sizeof(struct psp_gfx_rb_frame)); ++ memset_io(write_frame, 0, sizeof(struct psp_gfx_rb_frame)); + + /* Update KM RB frame */ + write_frame->cmd_buf_addr_hi = upper_32_bits(cmd_buf_mc_addr); +@@ -3836,8 +3836,8 @@ void psp_copy_fw(struct psp_context *psp, uint8_t *start_addr, uint32_t bin_size + if (!drm_dev_enter(adev_to_drm(psp->adev), &idx)) + return; + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); +- memcpy(psp->fw_pri_buf, start_addr, bin_size); ++ memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); ++ memcpy_toio(psp->fw_pri_buf, start_addr, bin_size); + + drm_dev_exit(idx); + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +index 43f44cc201cb80..ac9f1e52361081 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +@@ -593,7 +593,7 @@ static int amdgpu_vcn_dec_get_create_msg(struct amdgpu_ring *ring, uint32_t hand + struct amdgpu_ib *ib) + { + struct amdgpu_device *adev = ring->adev; +- uint32_t *msg; ++ volatile uint32_t *msg; + int r, i; + + memset(ib, 0, sizeof(*ib)); +@@ -628,7 +628,7 @@ static int amdgpu_vcn_dec_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han + struct amdgpu_ib *ib) + { + struct amdgpu_device *adev = ring->adev; +- uint32_t *msg; ++ volatile uint32_t *msg; + int r, i; + + memset(ib, 0, sizeof(*ib)); +@@ -751,7 +751,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, + ib->ptr[ib->length_dw++] = cpu_to_le32(AMDGPU_VCN_IB_FLAG_DECODE_BUFFER); + decode_buffer = (struct amdgpu_vcn_decode_buffer *)&(ib->ptr[ib->length_dw]); + ib->length_dw += sizeof(struct amdgpu_vcn_decode_buffer) / 4; +- memset(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer)); ++ memset_io(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer)); + + decode_buffer->valid_buf_flag |= cpu_to_le32(AMDGPU_VCN_CMD_FLAG_MSG_BUFFER); + decode_buffer->msg_buffer_address_hi = cpu_to_le32(addr >> 32); +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +index 45ed97038df0c8..cb92b3292cfe94 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +@@ -4375,7 +4375,7 @@ static int gfx_v10_0_mec_init(struct amdgpu_device *adev) + return r; + } + +- memset(hpd, 0, mec_hpd_size); ++ memset_io(hpd, 0, mec_hpd_size); + + amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); + amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); +@@ -5580,7 +5580,7 @@ static void gfx_v10_0_rlc_backdoor_autoload_copy_ucode(struct amdgpu_device *ade + memcpy(ptr + toc_offset, fw_data, fw_size); + + if (fw_size 
< toc_fw_size) +- memset(ptr + toc_offset + fw_size, 0, toc_fw_size - fw_size); ++ memset_io(ptr + toc_offset + fw_size, 0, toc_fw_size - fw_size); + } + + static void gfx_v10_0_rlc_backdoor_autoload_copy_toc_ucode(struct amdgpu_device *adev) +@@ -6592,7 +6592,7 @@ static void gfx_v10_0_kiq_setting(struct amdgpu_ring *ring) + } + + static void gfx_v10_0_gfx_mqd_set_priority(struct amdgpu_device *adev, +- struct v10_gfx_mqd *mqd, ++ volatile struct v10_gfx_mqd *mqd, + struct amdgpu_mqd_prop *prop) + { + bool priority = 0; +@@ -6612,7 +6612,7 @@ static void gfx_v10_0_gfx_mqd_set_priority(struct amdgpu_device *adev, + static int gfx_v10_0_gfx_mqd_init(struct amdgpu_device *adev, void *m, + struct amdgpu_mqd_prop *prop) + { +- struct v10_gfx_mqd *mqd = m; ++ volatile struct v10_gfx_mqd *mqd = m; + uint64_t hqd_gpu_addr, wb_gpu_addr; + uint32_t tmp; + uint32_t rb_bufsz; +@@ -6699,7 +6699,7 @@ static int gfx_v10_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset) + int mqd_idx = ring - &adev->gfx.gfx_ring[0]; + + if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + mutex_lock(&adev->srbm_mutex); + nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); + amdgpu_ring_init_mqd(ring); +@@ -6769,7 +6769,7 @@ static int gfx_v10_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev) + static int gfx_v10_0_compute_mqd_init(struct amdgpu_device *adev, void *m, + struct amdgpu_mqd_prop *prop) + { +- struct v10_compute_mqd *mqd = m; ++ volatile struct v10_compute_mqd *mqd = m; + uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; + uint32_t tmp; + +@@ -7013,7 +7013,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring) + nv_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + } else { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + if (amdgpu_sriov_vf(adev) && adev->in_suspend) + amdgpu_ring_clear_ring(ring); + mutex_lock(&adev->srbm_mutex); +@@ -7037,7 +7037,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring, bool restore) + int mqd_idx = ring - &adev->gfx.compute_ring[0]; + + if (!restore && !amdgpu_in_reset(adev) && !adev->in_suspend) { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + mutex_lock(&adev->srbm_mutex); + nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); + amdgpu_ring_init_mqd(ring); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c +index 8083a553c60e88..d0724060083120 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c +@@ -333,7 +333,7 @@ static void dcn3_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + // should log failure + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + /* collect valid ranges, place in pmfw table */ + for (i = 0; i < WM_SET_COUNT; i++) + +From 93138d932bbdf36e48e84a92f5c309efb8a3c585 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 2 Nov 2024 18:26:35 +0100 +Subject: [PATCH 13/18] amdkfd alignment for arm64 + +--- + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 12 ++--- + .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 44 +++++++++---------- + 2 files changed, 28 insertions(+), 28 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 
d6037577c53278..0256706e47360e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -102,7 +102,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + +- memset(kq->eop_kernel_addr, 0, PAGE_SIZE); ++ memset_io(kq->eop_kernel_addr, 0, PAGE_SIZE); + } + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), +@@ -123,9 +123,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, + kq->wptr_kernel = kq->wptr_mem->cpu_ptr; + kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr; + +- memset(kq->pq_kernel_addr, 0, queue_size); +- memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); +- memset(kq->wptr_kernel, 0, dev->kfd->device_info.doorbell_size); ++ memset_io(kq->pq_kernel_addr, 0, queue_size); ++ memset_io(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); ++ memset_io(kq->wptr_kernel, 0, dev->kfd->device_info.doorbell_size); + + prop.queue_size = queue_size; + prop.is_interop = false; +@@ -234,8 +234,8 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq, + { + size_t available_size; + size_t queue_size_dwords; +- uint32_t wptr, rptr; +- uint64_t wptr64; ++ volatile uint32_t wptr, rptr; ++ volatile uint64_t wptr64; + unsigned int *queue_address; + + /* When rptr == wptr, the buffer is empty. +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +index 1695dd78ede8e6..4ea2ad969eaf44 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +@@ -32,20 +32,20 @@ + #include "gc/gc_10_1_0_sh_mask.h" + #include "amdgpu_amdkfd.h" + +-static inline struct v10_compute_mqd *get_mqd(void *mqd) ++static inline volatile struct v10_compute_mqd *get_mqd(void *mqd) + { +- return (struct v10_compute_mqd *)mqd; ++ return (volatile struct v10_compute_mqd *)mqd; + } + +-static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd) ++static inline volatile struct v10_sdma_mqd *get_sdma_mqd(void *mqd) + { +- return (struct v10_sdma_mqd *)mqd; ++ return (volatile struct v10_sdma_mqd *)mqd; + } + + static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo) + { +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (!minfo || !minfo->cu_mask.ptr) +@@ -67,7 +67,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, + m->compute_static_thread_mgmt_se3); + } + +-static void set_priority(struct v10_compute_mqd *m, struct queue_properties *q) ++static void set_priority(volatile struct v10_compute_mqd *m, struct queue_properties *q) + { + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +@@ -90,12 +90,12 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, + struct queue_properties *q) + { + uint64_t addr; +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + + m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + +- memset(m, 0, sizeof(struct v10_compute_mqd)); ++ memset_io(m, 0, sizeof(struct v10_compute_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; +@@ -165,7 +165,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) + { +- struct v10_compute_mqd *m; ++ volatile struct 
v10_compute_mqd *m; + + m = get_mqd(mqd); + +@@ -239,7 +239,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) + { +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + struct kfd_context_save_area_header header; + + m = get_mqd(mqd); +@@ -273,11 +273,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, + + static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst) + { +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + + m = get_mqd(mqd); + +- memcpy(mqd_dst, m, sizeof(struct v10_compute_mqd)); ++ memcpy_fromio(mqd_dst, m, sizeof(struct v10_compute_mqd)); + } + + static void restore_mqd(struct mqd_manager *mm, void **mqd, +@@ -287,12 +287,12 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd, + const void *ctl_stack_src, const u32 ctl_stack_size) + { + uint64_t addr; +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + + m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + +- memcpy(m, mqd_src, sizeof(*m)); ++ memcpy_toio(m, mqd_src, sizeof(*m)); + + *mqd = m; + if (gart_addr) +@@ -311,7 +311,7 @@ static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) + { +- struct v10_compute_mqd *m; ++ volatile struct v10_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + +@@ -345,11 +345,11 @@ static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) + { +- struct v10_sdma_mqd *m; ++ volatile struct v10_sdma_mqd *m; + + m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr; + +- memset(m, 0, sizeof(struct v10_sdma_mqd)); ++ memset_io(m, 0, sizeof(struct v10_sdma_mqd)); + + *mqd = m; + if (gart_addr) +@@ -364,7 +364,7 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) + { +- struct v10_sdma_mqd *m; ++ volatile struct v10_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) +@@ -392,11 +392,11 @@ static void checkpoint_mqd_sdma(struct mqd_manager *mm, + void *mqd_dst, + void *ctl_stack_dst) + { +- struct v10_sdma_mqd *m; ++ volatile struct v10_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + +- memcpy(mqd_dst, m, sizeof(struct v10_sdma_mqd)); ++ memcpy_fromio(mqd_dst, m, sizeof(struct v10_sdma_mqd)); + } + + static void restore_mqd_sdma(struct mqd_manager *mm, void **mqd, +@@ -407,12 +407,12 @@ static void restore_mqd_sdma(struct mqd_manager *mm, void **mqd, + const u32 ctl_stack_size) + { + uint64_t addr; +- struct v10_sdma_mqd *m; ++ volatile struct v10_sdma_mqd *m; + + m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + +- memcpy(m, mqd_src, sizeof(*m)); ++ memcpy_toio(m, mqd_src, sizeof(*m)); + + m->sdmax_rlcx_doorbell_offset = + qp->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + +From ad8164216710b72739150d5b8627708f998844ca Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 2 Nov 2024 22:28:55 +0100 +Subject: [PATCH 14/18] cleanup: added options for aarch64 alignment trap and + DRM force wc + +--- + arch/arm64/Kconfig | 8 + + arch/arm64/kernel/Makefile | 1 + + arch/arm64/kernel/compat_alignment.c | 989 +---------------------- + arch/arm64/kernel/compat_alignment_64.c | 995 ++++++++++++++++++++++++ + 
arch/arm64/mm/fault.c | 2 +- + drivers/gpu/drm/Kconfig | 10 + + include/drm/drm_cache.h | 3 + + 7 files changed, 1019 insertions(+), 989 deletions(-) + create mode 100644 arch/arm64/kernel/compat_alignment_64.c + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 942c02943bce43..4053e82da9c739 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1694,6 +1694,14 @@ config ARM64_TAGGED_ADDR_ABI + to system calls as pointer arguments. For details, see + Documentation/arch/arm64/tagged-address-abi.rst. + ++config ARM64_ALIGNMENT_FIXUPS ++ bool "Fix up misaligned loads and stores from userspace for 64-bit code" ++ default n ++ help ++ Userspace may incorrectly assume that certain memory does not need ++ any special alignment considerations, which may result in Bus Errors. ++ Enable to handle these faults in the kernel. ++ + menuconfig COMPAT + bool "Kernel support for 32-bit EL0" + depends on ARM64_4K_PAGES || EXPERT +diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile +index 2b112f3b75109a..bb8009f7b29f00 100644 +--- a/arch/arm64/kernel/Makefile ++++ b/arch/arm64/kernel/Makefile +@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ + sys_compat.o + obj-$(CONFIG_COMPAT) += sigreturn32.o + obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o ++obj-$(CONFIG_ARM64_ALIGNMENT_FIXUPS) += compat_alignment_64.o + obj-$(CONFIG_KUSER_HELPERS) += kuser32.o + obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o + obj-$(CONFIG_MODULES) += module.o module-plts.o +diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c +index a41c9b9f34ae61..a617734ea1faf6 100644 +--- a/arch/arm64/kernel/compat_alignment.c ++++ b/arch/arm64/kernel/compat_alignment.c +@@ -12,9 +12,7 @@ + #include + #include + +-#include +-#include +-#include ++ + + /* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 +@@ -387,988 +385,3 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) + + return 0; + } +- +-// arm64# +- +-/* +- *Happens with The Long Dark (also with steam) +- * +- *[ 6012.660803] Faulting instruction: 0x3d800020 +-[ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 +- * +- *[ 555.449651] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x1 op4 0x0 +-[ 555.449654] Faulting instruction: 0x3c810021 +- * +- * +- *[ 555.449663] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x2 op4 0x0 +-[ 555.449666] Faulting instruction: 0x3c820020 +- * +- *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 +-[ 555.449677] Faulting instruction: 0x3c830021 +- +-stur q1, [x1, #16] +-potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48] +- * +- * +- * +- */ +- +-#include <linux/ktime.h> +-#include <asm/sysreg.h> +- +-struct fixupDescription{ +- void* addr; +- +- // datax_simd has to be located directly after datax in memory +- /*u64 data1; +- u64 data1_simd; +- u64 data2; +- u64 data2_simd;*/ +- +- int reg1; +- int reg2; +- +- int Rs; // used for atomics (which don't get handled atomically) +- +- int simd; // wether or not this is a vector instruction +- int load; // 1 is it's a load, 0 if it's a store +- int pair; // 1 if it's a l/s pair instruction +- int width; // width of the access in bits +- int extendSign; +- int extend_width; +- +- // profiling +- u64 starttime; +- u64 decodedtime; +- u64 endtime; +-}; +- +-__attribute__((always_inline)) inline static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) +-{ +- __le32 instr = 0; +- int fault; +- +- fault = get_user(instr, ip); +- if (fault) +-
return fault; +- +- *inst = __le32_to_cpu(instr); +- return 0; +-} +- +-__attribute__((always_inline)) inline int64_t extend_sign(int64_t in, int bits){ +- bits--; +- if(in & (1 << bits)){ +- // extend sign +- return (0xffffffffffffffff << bits) | in; +- } +- return in; +-} +- +-/*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ +- uint8_t load = (instr >> 22) & 1; +- uint8_t simd = (instr >> 26) & 1; +- uint16_t imm7 = (instr >> 15) & 0x7f; +- uint8_t Rt2 = (instr >> 10) & 0x1f; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- +- int16_t imm = 0xffff & imm7; +- printk("Variant: 0x%x Load: %x SIMD: %x IMM: 0x%x Rt: 0x%x Rt2: 0x%x Rn: 0x%x\n", ((instr >> 30) & 3),load, simd, imm, Rt, Rt2, Rn); +- if(((instr >> 30) & 3) == 2){ +- // 64bit +- if(!load){ +- if(!simd){ +- // 64bit store +- u64 val1, val2; +- val1 = regs->regs[Rt]; +- val2 = regs->regs[Rt2]; +- u64 addr = regs->regs[Rn] + imm; +- printk("STP 64bit storing 0x%llx 0x%llx at 0x%llx\n", val1, val2, addr); +- // for the first reg. Byte by byte to avoid any alignment issues +- for(int i = 0; i < 8; i++){ +- uint8_t v = (val1 >> (i*8)) & 0xff; +- put_user(v, (uint8_t __user *)addr); +- addr++; +- } +- // second reg +- for(int i = 0; i < 8; i++){ +- uint8_t v = (val2 >> (i*8)) & 0xff; +- put_user(v, (uint8_t __user *)addr); +- addr++; +- } +- arm64_skip_faulting_instruction(regs, 4); +- } +- } +- } +- return 0; +-}*/ +- +-// saves the contents of the simd register reg to dst +-__attribute__((always_inline)) inline void read_simd_reg(int reg, u64 dst[2]){ +- struct user_fpsimd_state st = {0}; +- //fpsimd_save_state(&st); +- +- if(!may_use_simd()){ +- printk("may_use_simd returned false!\n"); +- } +- kernel_neon_begin(); +- if(current->thread.sve_state){ +- printk("SVE state is not NULL!\n"); +- } +- +- dst[0] = *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])); +- dst[1] = *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1); +- +- kernel_neon_end(); +-} +- +- +-__attribute__((always_inline)) inline void write_simd_reg(int reg, u64 src[2]){ +- +- if(!may_use_simd()){ +- printk("may_use_simd returned false!\n"); +- } +- kernel_neon_begin(); +- if(current->thread.sve_state){ +- printk("SVE state is not NULL!\n"); +- } +- +- *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) = src[0]; +- *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1) = src[1]; +- +- kernel_neon_end(); +-} +- +-// these try to use larger access widths than single bytes. 
Slower for small loads/stores, but it might speed larger ones up +- +-__attribute__((always_inline)) inline int put_data2(int size, uint8_t* data, void* addr){ +- int r = 0; +- +- while(size){ +- if(size >= 4 && (((u64)addr % 4) == 0)){ +- if((r=put_user( (*(((uint32_t*)(data)))), (uint32_t __user *)addr))){ +- printk("Failed to write data at 0x%px (%d)\n", addr,r); +- return r; +- } +- addr += 4; +- data += 4; +- size -= 4; +- continue; +- } +- if(size >= 2 && (((u64)addr % 2) == 0)){ +- if((r=put_user( (*(((uint16_t*)(data)))), (uint16_t __user *)addr))){ +- printk("Failed to write data at 0x%px (%d)\n", addr,r); +- return r; +- } +- addr += 2; +- data += 2; +- size -= 2; +- continue; +- } +- // I guess the if is redundant here +- if(size >= 1){ +- if((r=put_user( (*(((uint8_t*)(data)))), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (%d)\n", addr,r); +- return r; +- } +- addr += 1; +- data += 1; +- size -= 1; +- continue; +- } +- +- } +- +- return r; +-} +- +-__attribute__((always_inline)) inline int get_data2(int size, uint8_t* data, void* addr){ +- int r = 0; +- uint32_t val32; +- uint16_t val16; +- uint8_t val8; +- while(size){ +- if(size >= 4 && (((u64)addr % 4) == 0)){ +- if((r=get_user( val32, (uint32_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *((uint32_t*)data) = val32; +- addr += 4; +- data += 4; +- size -= 4; +- continue; +- } +- if(size >= 2 && (((u64)addr % 2) == 0)){ +- if((r=get_user( val16, (uint16_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *((uint16_t*)data) = val16; +- addr += 2; +- data += 2; +- size -= 2; +- continue; +- } +- // I guess the if is redundant here +- if(size >= 1){ +- if((r=get_user( val8, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *((uint8_t*)data) = val8; +- addr += 1; +- data += 1; +- size -= 1; +- continue; +- } +- +- } +- +- return r; +-} +- +- +-// these should avoid some branching, but still use single byte accesses +-__attribute__((always_inline)) inline int put_data(int size, uint8_t* data, void* addr){ +- int r = 0; +- int addrIt = 0; +- +- // with the fixed size loops, the compiler should be able to unroll them +- // this should mean a lot less branching +- switch(size){ +- case 16: +- for(int i = 0; i < 8; i++){ +- if((r=put_user( (*(((uint8_t*)(data)) + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px\n", addr); +- return r; +- } +- addrIt++; +- addr++; +- } +- // fall through +- case 8: +- for(int i = 0; i < 4; i++){ +- if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px\n", addr); +- return r; +- } +- addrIt++; +- addr++; +- } +- // fall through +- case 4: +- for(int i = 0; i < 2; i++){ +- if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px\n", addr); +- return r; +- } +- addrIt++; +- addr++; +- } +- // fall through +- case 2: +- if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px\n", addr); +- return r; +- } +- addrIt++; +- addr++; +- // fall through +- case 1: +- if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px\n", addr); +- return r; +- } +- addrIt++; +- addr++; +- break; +- default: +- printk("unsupported size %d\n", size); +- } +- +- return r; +-} +- +-__attribute__((always_inline)) inline int 
get_data(int size, uint8_t* data, void* addr){ +- int r = 0; +- int addrIt = 0; +- +- // with the fixed size loops, the compiler should be able to unroll them +- // this should mean a lot less branching +- uint8_t val; +- switch(size){ +- case 16: +- for(int i = 0; i < 8; i++){ +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *(data + addrIt) = val; +- addrIt++; +- addr++; +- } +- // fall through +- case 8: +- for(int i = 0; i < 4; i++){ +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *(data + addrIt) = val; +- addrIt++; +- addr++; +- } +- // fall through +- case 4: +- for(int i = 0; i < 2; i++){ +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *(data + addrIt) = val; +- addrIt++; +- addr++; +- } +- // fall through +- case 2: +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *(data + addrIt) = val; +- addrIt++; +- addr++; +- // fall through +- case 1: +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to read data at 0x%px\n", addr); +- return r; +- } +- *(data + addrIt) = val; +- addrIt++; +- addr++; +- break; +- default: +- printk("unsupported size %d\n", size); +- } +- +- return r; +-} +- +-int memset_io_user(uint64_t size, uint8_t c, void* addr){ +- int r = 0; +- uint64_t pattern = c; +- pattern |= pattern << 8; +- pattern |= pattern << 16; +- pattern |= pattern << 32; +- uint64_t cnt = 0; +- while(cnt < size){ +- if((uint64_t)(addr + cnt) % 8){ +- if((r = put_user(c, (uint8_t __user*) addr))){ +- printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); +- return r; +- } +- cnt++; +- } else if(size - cnt >= 8){ +- if((r = put_user(pattern, (uint64_t __user*) addr))){ +- printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); +- return r; +- } +- cnt += 8; +- } else{ +- if((r = put_user(c, (uint8_t __user*) addr))){ +- printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr); +- return r; +- } +- cnt++; +- } +- +- } +- return r; +-} +- +-int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- int r; +- u64 data1[2] = {0,0}; +- u64 data2[2] = {0,0}; +- //desc->decodedtime = ktime_get_ns(); +- // the reg indices have to always be valid, even if the reg isn't being used +- if(!desc->load){ +- if(desc->simd){ +- // At least currently, there aren't any simd instructions supported that use more than one data register +- //__uint128_t tmp; +- +- // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once +- read_simd_reg(desc->reg1, data1); +- read_simd_reg(desc->reg2, data2); +- //data1[0] = tmp; +- //data1[1] = *(((u64*)&tmp) + 1); +- ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); +- /*if(desc->width < 128){ +- return -1; +- }*/ +- } else { +- data1[0] = regs->regs[desc->reg1]; +- data2[0] = regs->regs[desc->reg2]; +- } +- } +- +- /*if(desc->width > 64){ +- printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width); +- return 1; +- }*/ +- if(!desc->load){ +- uint8_t* addr = desc->addr; +- int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that +- +- //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); +- int addrIt = 0; +- for(int i = 0; i < bcount; i++){ +- if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); +- return r; +- } +- //desc->data1 >>= 8; +- addrIt++; +- addr++; +- } +- //put_data2(bcount, (uint8_t*)data1, addr); +- //addr += bcount; +- addrIt = 0; +- if(desc->pair){ +- for(int i = 0; i < bcount; i++){ +- if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); +- return r; +- } +- //desc->data2 >>= 8; +- addrIt++; +- addr++; +- } +- //put_data2(bcount, (uint8_t*)data2, addr); +- addr += bcount; +- } +- arm64_skip_faulting_instruction(regs, 4); +- } else { +- //printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr); +- +- uint8_t* addr = desc->addr; +- int bcount = desc->width / 8; // since the field stores the width in bits. Honestly, there's no particular reason for that +- +- //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); +- int addrIt = 0; +- /*for(int i = 0; i < bcount; i++){ +- uint8_t val; +- if((r=get_user( val, (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); +- return r; +- } +- *(((uint8_t*)data1) + addrIt) = val; +- //desc->data1 >>= 8; +- addrIt++; +- addr++; +- }*/ +- get_data2(bcount, (uint8_t*)data1, addr); +- addr += bcount; +- +- if(desc->simd){ +- write_simd_reg(desc->reg1, data1); +- } else { +- regs->regs[desc->reg1] = data1[0]; +- } +- +- addrIt = 0; +- if(desc->pair){ +- /*for(int i = 0; i < bcount; i++){ +- uint8_t val; +- if((r=get_user(val, (uint8_t __user *)addr))){ +- printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); +- return r; +- } +- *(((uint8_t*)data2) + addrIt) = val; +- //desc->data2 >>= 8; +- addrIt++; +- addr++; +- }*/ +- +- get_data2(bcount, (uint8_t*)data2, addr); +- addr += bcount; +- if(desc->simd){ +- write_simd_reg(desc->reg2, data1); +- } else { +- regs->regs[desc->reg2] = data1[0]; +- } +- } +- arm64_skip_faulting_instruction(regs, 4); +- +- +- } +- return 0; +-} +- +-int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t size = (instr >> 30) & 3; +- uint8_t load = (instr >> 22) & 1; // acquire semantics, has no effect here, since it's not atomic anymore +- uint8_t Rs = (instr >> 16) & 0x1f; +- uint8_t Rt2 = (instr >> 10) & 0x1f; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- +- uint8_t o0 = (instr >> 15) & 1; // L, release semantics, has no effect here, since it's not atomic anymore +- +- if(Rt2 != 0x1f){ +- return -1; +- } +- +- switch(size){ +- case 0: +- desc->width = 8; +- break; +- case 1: +- desc->width = 16; +- break; +- case 2: +- desc->width = 32; +- break; +- case 3: +- desc->width = 64; +- break; +- } +- +- desc->addr = (void*)regs->regs[Rn]; +- u64 data1 = regs->regs[Rt]; +- +- // nearly everything from here on could be moved into another function if needed +- u64 cmpmask = (1 << desc->width) - 1; +- u64 cmpval = regs->regs[Rs] & cmpmask; +- +- u64 readval = 0; +- int bcount = desc->width / 8; +- u64 addr = desc->addr; +- int r; +- uint8_t tmp; +- +- printk("Atomic CAS not being done atomically at 0x%px, size %d\n",desc->addr, desc->width); +- +- 
for(int i = 0; i < bcount; i++){ +- if((r=get_user(tmp, (uint8_t __user *)addr))) +- return r; +- readval |= tmp; +- readval <<= 8; // maybe this could be read directly into regs->regs[Rs] +- addr++; +- } +- +- if((readval & cmpmask) == cmpval){ +- // swap +- addr = (u64)desc->addr; +- +- for(int i = 0; i < bcount; i++){ +- if((r=put_user(data1 & 0xff, (uint8_t __user *)addr))) +- return r; +- data1 >>= 8; +- addr++; +- } +- +- regs->regs[Rs] = readval; +- } +- +- arm64_skip_faulting_instruction(regs, 4); +- +- return 0; +-} +- +-__attribute__((always_inline)) inline int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t op2; +- uint8_t opc; +- op2 = (instr >> 23) & 3; +- opc = (instr >> 30) & 3; +- +- uint8_t load = (instr >> 22) & 1; +- uint8_t simd = (instr >> 26) & 1; +- uint16_t imm7 = (instr >> 15) & 0x7f; +- uint8_t Rt2 = (instr >> 10) & 0x1f; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- +- int64_t imm = extend_sign(imm7, 7); +- //int immshift = 0; +- desc->load = load; +- desc->simd = simd; +- +- // opc controls the width +- if(simd){ +- desc->width = 32 << opc; +- //immshift = 4 << opc; +- imm <<= 2; +- imm <<= opc; +- } else { +- switch(opc){ +- case 0: +- desc->width = 32; +- imm <<= 2; +- break; +- case 2: +- desc->width = 64; +- imm <<= 3; +- break; +- default: +- return -1; +- } +- } +- +- // op2 controls the indexing +- switch(op2){ +- case 2: +- // offset +- desc->addr = (void*)(regs->regs[Rn] + imm); +- break; +- default: +- return -1; +- } +- //desc->data1 = regs->regs[Rt]; +- //desc->data2 = regs->regs[Rt2]; +- desc->reg1 = Rt; +- desc->reg2 = Rt2; +- +- return do_ls_fixup(instr, regs, desc); +- +-} +- +-__attribute__((always_inline)) inline int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t size = (instr >> 30) & 3; +- uint8_t simd = (instr >> 26) & 1; +- uint8_t opc = (instr >> 22) & 3; +- uint64_t imm12 = (instr >> 10) & 0xfff; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- +- uint8_t load = opc & 1; +- uint8_t extend_sign = 0;// = ((opc & 2) >> 1 ) & !simd; +- int width_shift = 0; +- +- if(simd){ +- extend_sign = 0; +- width_shift = size | ((opc & 2) << 1); +- } else { +- extend_sign = ((opc & 2) >> 1 ); +- width_shift = size; +- } +- +- ///printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt); +- // when in simd mode, opc&2 is a third size bit. 
Otherwise, it's there for sign extension +- //width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); +- desc->width = 8 << width_shift; +- +- if((size & 1) && simd && (opc & 2)){ +- return 1; +- } +- desc->load = load; +- desc->reg1 = Rt; +- desc->simd = simd; +- desc->extendSign = extend_sign; +- u64 addr = regs->regs[Rn]; +- desc->addr = addr + (imm12 << width_shift); +- ///printk("unsigned imm\n"); +- +- return do_ls_fixup(instr, regs, desc); +-} +- +- +-__attribute__((always_inline)) inline u64 extend_reg(u64 reg, int type, int shift){ +- +- uint8_t is_signed = (type & 4) >> 2; +- uint8_t input_width = type & 1; +- +- u64 tmp; +- if(!is_signed){ +- tmp = reg; +- } else { +- if(input_width == 0){ +- // 32bit, needs to be extended to 64 +- // I hope the compiler just does this kind of automatically with these types +- int32_t stmpw = reg; +- int64_t stmpdw = stmpw; +- tmp = (u64)stmpdw; +- } else { +- printk("Other branch I forgor about previously!\n"); +- tmp = reg; // since the size stays the same, I don't think this makes a difference +- } +- } +- +- ///printk("extend_reg: reg 0x%lx out (before shift) 0x%lx signed: %x\n", reg, tmp, is_signed); +- +- return tmp << shift; +-} +- +-__attribute__((always_inline)) inline int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t size = (instr >> 30) & 3; +- uint8_t simd = (instr >> 26) & 1; +- uint8_t opc = (instr >> 22) & 3; +- uint8_t option = (instr >> 13) & 5; +- uint8_t Rm = (instr >> 16) & 0x1f; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- uint8_t S = (instr >> 12) & 1; +- int width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); +- // size==0 seems to be a bit special +- // opc&2 is sign, opc&1 is load (for most instructions anyways) +- +- uint8_t load = opc & 1; +- uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; +- desc->pair = 0; +- +- desc->simd = simd; +- desc->width = 8 << width_shift; +- +- // the simd instructions make this a bit weird +- if(extend_sign){ +- if(load){ +- desc->extend_width = 32; +- } else { +- desc->extend_width = 64; +- } +- desc->load = 1; +- } else { +- desc->load = load; +- } +- +- desc->extendSign = extend_sign; // needed for load, which isn't implemented yet +- +- u64 offset = 0; +- u64 addr = 0; +- addr = regs->regs[Rn]; +- if(simd){ +- int shift = 0; +- if(S) shift = width_shift; +- offset = extend_reg(regs->regs[Rm], option, shift); +- } else { +- int shift = 0; +- if(S) shift = 2 << ((size & 1) & ((size >> 1) & 1)); +- +- offset = extend_reg(regs->regs[Rm], option, shift); +- } +- +- addr += offset; +- +- //desc->data1 = regs->regs[Rt]; +- desc->reg1 = Rt; +- desc->addr = (void*)addr; +- +- return do_ls_fixup(instr, regs, desc); +- return 0; +-} +- +-__attribute__((always_inline)) inline int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t size = (instr >> 30) & 3; +- uint8_t simd = (instr >> 26) & 1; +- uint8_t opc = (instr >> 22) & 3; +- uint16_t imm9 = (instr >> 12) & 0x1ff; +- uint8_t Rn = (instr >> 5) & 0x1f; +- uint8_t Rt = instr & 0x1f; +- +- int16_t fullImm = 0; +- // sign extend it +- if(imm9 & 0x100){ +- fullImm = 0xfe00 | imm9; +- } else { +- fullImm = imm9; +- } +- u64 addr = regs->regs[Rn]; +- desc->addr = addr + fullImm; +- desc->pair = 0; +- +- int load = opc & 1; +- desc->load = load; +- /*if(load){ +- return 1; +- }*/ +- desc->reg1 = Rt; +- if(simd){ +- desc->simd = 1; +- desc->width = 8 << (size | ((opc & 2) << 1)); +- // assuming store +- /*__uint128_t tmp; 
+- read_simd_reg(Rt, &tmp); +- desc->data1 = tmp; +- desc->data1_simd = *(((u64*)&tmp) + 1);*/ +- return do_ls_fixup(instr, regs, desc); +- } else { +- desc->simd = 0; +- desc->width = 8 << size; +- return do_ls_fixup(instr, regs, desc); +- } +- ///printk("SIMD: %d\n", simd); +- return 1; +-} +- +-__attribute__((always_inline)) inline int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t op0; +- uint8_t op1; +- uint8_t op2; +- uint8_t op3; +- uint8_t op4; +- +- int r = 1; +- +- op0 = (instr >> 28) & 0xf; +- op1 = (instr >> 26) & 1; +- op2 = (instr >> 23) & 3; +- op3 = (instr >> 16) & 0x3f; +- op4 = (instr >> 10) & 3; +- +- if((op0 & 3) == 2){ +- desc->pair = 1; +- r = ls_pair_fixup(instr, regs, desc); +- } +- if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){ +- // compare and swap +- r = ls_cas_fixup(instr, regs, desc); +- } +- if((op0 & 3) == 3 && (op2 & 3) == 3){ +- //load/store unsigned immediate +- desc->pair = 0; +- +- } +- if((op0 & 3) == 3 && ((op2 & 2) == 2)){ +- // register unsigned immediate +- r = ls_reg_unsigned_imm(instr, regs, desc); +- } +- if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){ +- // register offset load/store +- r = lsr_offset_fixup(instr, regs, desc); +- } +- if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){ +- // register load/store unscaled immediate +- r = lsr_unscaled_immediate_fixup(instr, regs, desc); +- } +- if(r){ +- printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); +- } +- return r; +-} +- +-__attribute__((always_inline)) inline int system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t op1; +- uint8_t op2; +- uint8_t CRn; +- uint8_t CRm; +- uint8_t Rt; +- bool L; +- int r = 0; +- +- op1 = (instr >> 16) & 0x7; +- op2 = (instr >> 5) & 0x7; +- CRn = (instr >> 12) & 0xf; +- CRm = (instr >> 8) & 0xf; +- L = (instr >> 21) & 1; +- Rt = instr & 0x1f; +- +- if(!L){ +- // SYS +- // proper decoding would be nicer here, but I don't expect to see too many system instructions +- if((op1 == 0x3) && (op2 == 1) && (CRn = 0x7) && (CRm == 4)){ +- // dc zva +- uint64_t dczid_el0 = read_sysreg_s(SYS_DCZID_EL0); +- if(!((dczid_el0 >> DCZID_EL0_DZP_SHIFT) & 1)){ +- uint16_t blksize = 4 << (dczid_el0 & 0xf); +- r = memset_io_user(blksize, 0, regs->user_regs.regs[Rt]); +- arm64_skip_faulting_instruction(regs, 4); +- return r; +- } else { +- printk("DC ZVA is not allowed!\n"); +- return 1; +- } +- } +- } +- +- printk("Unhandled system instruction. op1=0x%x op2=0x%x CRn=0x%x CRm=0x%x\n", op1, op2, CRn, CRm); +- return 1; +-} +- +-__attribute__((always_inline)) inline int branch_except_system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ +- uint8_t op0; +- uint32_t op1; +- uint8_t op2; +- +- op0 = (instr >> 29) & 0x7; +- op1 = (instr >> 5) & 0x1fffff; +- op2 = instr & 0x1f; +- +- if((op0 == 0x6) && (op1 & 0x1ec000) == 0x84000){ +- return system_fixup(instr, regs, desc); +- } +- printk("Unhandled Branch/Exception generating/System instruction. 
op0=0x%x op1=0x%x op2=0x%x\n", op0, op1, op2); +- return 1; +-} +- +-uint32_t* seenCMDs; +-size_t seenCMDCount = 0; +-size_t seenCMDSize = 0; +- +-void instrDBG(u32 instr){ +- for(size_t i = 0; i < seenCMDCount; i++){ +- if(seenCMDs[i] == instr){ +- return; +- } +- } +- if(seenCMDSize == 0){ +- seenCMDs = krealloc(seenCMDs, 1, GFP_KERNEL); +- seenCMDSize = 1; +- } +- +- if(seenCMDCount >= seenCMDSize){ +- seenCMDs = krealloc(seenCMDs, seenCMDSize*2, GFP_KERNEL); +- seenCMDSize *= 2; +- } +- +- seenCMDs[seenCMDCount] = instr; +- seenCMDCount++; +- printk("New instruction: %x", instr); +-} +- +-int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ +- unsigned long long instrptr; +- u32 instr = 0; +- +- instrptr = instruction_pointer(regs); +- //printk("Alignment fixup\n"); +- +- if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){ +- printk("Failed to get aarch64 instruction\n"); +- return 1; +- } +- +- /** +- * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0] +- * +- */ +- +- //instrDBG(instr); +- +- uint8_t op0; +- int r; +- struct fixupDescription desc = {0}; +- //desc.starttime = ktime_get_ns(); +- op0 = ((instr & 0x1E000000) >> 25); +- if((op0 & 5) == 0x4){ +- //printk("Load/Store\n"); +- r = ls_fixup(instr, regs, &desc); +- //desc.endtime = ktime_get_ns(); +- /*printk("Trap timing: decoding: %ldns, mem ops: %ldns, total: %ldns\n", desc.decodedtime - desc.starttime, +- desc.endtime - desc.decodedtime, desc.endtime - desc.starttime); +- */ +- if(r){ +- printk("Faulting instruction: 0x%lx\n", instr); +- } +- +- return r; +- } else if((op0 & 0xe) == 0xa){ +- // System instructions, needed for dc zva +- return branch_except_system_fixup(instr, regs, &desc); +- }else { +- printk("Not handling instruction with op0 0x%x (instruction is 0x%08x)",op0, instr); +- } +- return -1; +-} +diff --git a/arch/arm64/kernel/compat_alignment_64.c b/arch/arm64/kernel/compat_alignment_64.c +new file mode 100644 +index 00000000000000..adfd0523094bc1 +--- /dev/null ++++ b/arch/arm64/kernel/compat_alignment_64.c +@@ -0,0 +1,995 @@ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/* ++ *Happens with The Long Dark (also with steam) ++ * ++ *[ 6012.660803] Faulting instruction: 0x3d800020 ++[ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 ++ * ++ *[ 555.449651] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x1 op4 0x0 ++[ 555.449654] Faulting instruction: 0x3c810021 ++ * ++ * ++ *[ 555.449663] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x2 op4 0x0 ++[ 555.449666] Faulting instruction: 0x3c820020 ++ * ++ *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 ++[ 555.449677] Faulting instruction: 0x3c830021 ++ ++stur q1, [x1, #16] ++potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48] ++ * ++ * ++ * ++ */ ++ ++ ++struct fixupDescription{ ++ void* addr; ++ ++ // datax_simd has to be located directly after datax in memory ++ /*u64 data1; ++ u64 data1_simd; ++ u64 data2; ++ u64 data2_simd;*/ ++ ++ int reg1; ++ int reg2; ++ ++ int Rs; // used for atomics (which don't get handled atomically) ++ ++ int simd; // whether or not this is a vector instruction ++ int load; // 1 if it's a load, 0 if it's a store ++ int pair; // 1 if it's a l/s pair instruction ++ int width; // width of the access in bits ++ int extendSign; ++ int extend_width; ++ ++ // profiling ++ u64 starttime; ++ u64 decodedtime; ++ u64 endtime; ++}; ++ ++__attribute__((always_inline)) inline static int
alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) ++{ ++ __le32 instr = 0; ++ int fault; ++ ++ fault = get_user(instr, ip); ++ if (fault) ++ return fault; ++ ++ *inst = __le32_to_cpu(instr); ++ return 0; ++} ++ ++__attribute__((always_inline)) inline int64_t extend_sign(int64_t in, int bits){ ++ bits--; ++ if(in & (1 << bits)){ ++ // extend sign ++ return (0xffffffffffffffff << bits) | in; ++ } ++ return in; ++} ++ ++/*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ ++ uint8_t load = (instr >> 22) & 1; ++ uint8_t simd = (instr >> 26) & 1; ++ uint16_t imm7 = (instr >> 15) & 0x7f; ++ uint8_t Rt2 = (instr >> 10) & 0x1f; ++ uint8_t Rn = (instr >> 5) & 0x1f; ++ uint8_t Rt = instr & 0x1f; ++ ++ int16_t imm = 0xffff & imm7; ++ printk("Variant: 0x%x Load: %x SIMD: %x IMM: 0x%x Rt: 0x%x Rt2: 0x%x Rn: 0x%x\n", ((instr >> 30) & 3),load, simd, imm, Rt, Rt2, Rn); ++ if(((instr >> 30) & 3) == 2){ ++ // 64bit ++ if(!load){ ++ if(!simd){ ++ // 64bit store ++ u64 val1, val2; ++ val1 = regs->regs[Rt]; ++ val2 = regs->regs[Rt2]; ++ u64 addr = regs->regs[Rn] + imm; ++ printk("STP 64bit storing 0x%llx 0x%llx at 0x%llx\n", val1, val2, addr); ++ // for the first reg. Byte by byte to avoid any alignment issues ++ for(int i = 0; i < 8; i++){ ++ uint8_t v = (val1 >> (i*8)) & 0xff; ++ put_user(v, (uint8_t __user *)addr); ++ addr++; ++ } ++ // second reg ++ for(int i = 0; i < 8; i++){ ++ uint8_t v = (val2 >> (i*8)) & 0xff; ++ put_user(v, (uint8_t __user *)addr); ++ addr++; ++ } ++ arm64_skip_faulting_instruction(regs, 4); ++ } ++ } ++ } ++ return 0; ++}*/ ++ ++// saves the contents of the simd register reg to dst ++__attribute__((always_inline)) inline void read_simd_reg(int reg, u64 dst[2]){ ++ struct user_fpsimd_state st = {0}; ++ //fpsimd_save_state(&st); ++ ++ if(!may_use_simd()){ ++ printk("may_use_simd returned false!\n"); ++ } ++ kernel_neon_begin(); ++ if(current->thread.sve_state){ ++ printk("SVE state is not NULL!\n"); ++ } ++ ++ dst[0] = *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])); ++ dst[1] = *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1); ++ ++ kernel_neon_end(); ++} ++ ++ ++__attribute__((always_inline)) inline void write_simd_reg(int reg, u64 src[2]){ ++ ++ if(!may_use_simd()){ ++ printk("may_use_simd returned false!\n"); ++ } ++ kernel_neon_begin(); ++ if(current->thread.sve_state){ ++ printk("SVE state is not NULL!\n"); ++ } ++ ++ *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) = src[0]; ++ *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1) = src[1]; ++ ++ kernel_neon_end(); ++} ++ ++// these try to use larger access widths than single bytes. 
Slower for small loads/stores, but it might speed larger ones up ++ ++__attribute__((always_inline)) inline int put_data2(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ ++ while(size){ ++ if(size >= 4 && (((u64)addr % 4) == 0)){ ++ if((r=put_user( (*(((uint32_t*)(data)))), (uint32_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 4; ++ data += 4; ++ size -= 4; ++ continue; ++ } ++ if(size >= 2 && (((u64)addr % 2) == 0)){ ++ if((r=put_user( (*(((uint16_t*)(data)))), (uint16_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 2; ++ data += 2; ++ size -= 2; ++ continue; ++ } ++ // I guess the if is redundant here ++ if(size >= 1){ ++ if((r=put_user( (*(((uint8_t*)(data)))), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px (%d)\n", addr,r); ++ return r; ++ } ++ addr += 1; ++ data += 1; ++ size -= 1; ++ continue; ++ } ++ ++ } ++ ++ return r; ++} ++ ++__attribute__((always_inline)) inline int get_data2(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ uint32_t val32; ++ uint16_t val16; ++ uint8_t val8; ++ while(size){ ++ if(size >= 4 && (((u64)addr % 4) == 0)){ ++ if((r=get_user( val32, (uint32_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint32_t*)data) = val32; ++ addr += 4; ++ data += 4; ++ size -= 4; ++ continue; ++ } ++ if(size >= 2 && (((u64)addr % 2) == 0)){ ++ if((r=get_user( val16, (uint16_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint16_t*)data) = val16; ++ addr += 2; ++ data += 2; ++ size -= 2; ++ continue; ++ } ++ // I guess the if is redundant here ++ if(size >= 1){ ++ if((r=get_user( val8, (uint8_t __user *)addr))){ ++ printk("Failed to read data at 0x%px\n", addr); ++ return r; ++ } ++ *((uint8_t*)data) = val8; ++ addr += 1; ++ data += 1; ++ size -= 1; ++ continue; ++ } ++ ++ } ++ ++ return r; ++} ++ ++ ++// these should avoid some branching, but still use single byte accesses ++__attribute__((always_inline)) inline int put_data(int size, uint8_t* data, void* addr){ ++ int r = 0; ++ int addrIt = 0; ++ ++ // with the fixed size loops, the compiler should be able to unroll them ++ // this should mean a lot less branching ++ switch(size){ ++ case 16: ++ for(int i = 0; i < 8; i++){ ++ if((r=put_user( (*(((uint8_t*)(data)) + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ //__attribute__((fallthrough)); ++ case 8: ++ for(int i = 0; i < 4; i++){ ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ //__attribute__((fallthrough)); ++ case 4: ++ for(int i = 0; i < 2; i++){ ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ } ++ //__attribute__ ((fallthrough)); ++ case 2: ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ //__attribute__ ((fallthrough)); ++ case 1: ++ if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ ++ printk("Failed to write data at 0x%px\n", addr); ++ return r; ++ } ++ addrIt++; ++ addr++; ++ break; ++ default: ++ printk("unsupported size %d\n", size); ++ } 
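++ /* Note: the cases above fall through on purpose, so every power-of-two
++ * size is written as a cascade of smaller chunks: 16 -> 8+4+2+1+1,
++ * 8 -> 4+2+1+1, 4 -> 2+1+1 and 2 -> 1+1 bytes. get_data() below mirrors
++ * the same structure for reads. */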
++
++ return r;
++}
++
++__attribute__((always_inline)) inline int get_data(int size, uint8_t* data, void* addr){
++ int r = 0;
++ int addrIt = 0;
++
++ // with the fixed size loops, the compiler should be able to unroll them
++ // this should mean a lot less branching
++ uint8_t val;
++ switch(size){
++ case 16:
++ for(int i = 0; i < 8; i++){
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px\n", addr);
++ return r;
++ }
++ *(data + addrIt) = val;
++ addrIt++;
++ addr++;
++ }
++ // fall through
++ case 8:
++ for(int i = 0; i < 4; i++){
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px\n", addr);
++ return r;
++ }
++ *(data + addrIt) = val;
++ addrIt++;
++ addr++;
++ }
++ // fall through
++ case 4:
++ for(int i = 0; i < 2; i++){
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px\n", addr);
++ return r;
++ }
++ *(data + addrIt) = val;
++ addrIt++;
++ addr++;
++ }
++ // fall through
++ case 2:
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px\n", addr);
++ return r;
++ }
++ *(data + addrIt) = val;
++ addrIt++;
++ addr++;
++ // fall through
++ case 1:
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px\n", addr);
++ return r;
++ }
++ *(data + addrIt) = val;
++ addrIt++;
++ addr++;
++ break;
++ default:
++ printk("unsupported size %d\n", size);
++ }
++
++ return r;
++}
++
++int memset_io_user(uint64_t size, uint8_t c, void* addr){
++ int r = 0;
++ uint64_t pattern = c;
++ pattern |= pattern << 8;
++ pattern |= pattern << 16;
++ pattern |= pattern << 32;
++ uint64_t cnt = 0;
++ while(cnt < size){
++ if((uint64_t)(addr + cnt) % 8){
++ // single bytes until the address is 8-byte aligned
++ if((r = put_user(c, (uint8_t __user*)(addr + cnt)))){
++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr);
++ return r;
++ }
++ cnt++;
++ } else if(size - cnt >= 8){
++ if((r = put_user(pattern, (uint64_t __user*)(addr + cnt)))){
++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr);
++ return r;
++ }
++ cnt += 8;
++ } else{
++ if((r = put_user(c, (uint8_t __user*)(addr + cnt)))){
++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr + cnt, r, addr);
++ return r;
++ }
++ cnt++;
++ }
++
++ }
++ return r;
++}
++
++int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ int r;
++ u64 data1[2] = {0,0};
++ u64 data2[2] = {0,0};
++ //desc->decodedtime = ktime_get_ns();
++ // the reg indices have to always be valid, even if the reg isn't being used
++ if(!desc->load){
++ if(desc->simd){
++ // At least currently, there aren't any simd instructions supported that use more than one data register
++ //__uint128_t tmp;
++
++ // probably better for performance to read both registers with one function so kernel_neon_* doesn't have to be called more than once
++ read_simd_reg(desc->reg1, data1);
++ read_simd_reg(desc->reg2, data2);
++ //data1[0] = tmp;
++ //data1[1] = *(((u64*)&tmp) + 1);
++ ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr);
++ /*if(desc->width < 128){
++ return -1;
++ }*/
++ } else {
++ data1[0] = regs->regs[desc->reg1];
++ data2[0] = regs->regs[desc->reg2];
++ }
++ }
++
++ /*if(desc->width > 64){
++ printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width);
++ return 1;
++ }*/
++ if(!desc->load){
++ uint8_t* addr = desc->addr;
++ int bcount = desc->width / 8; // since the field stores the width in bits. Honestly, there's no particular reason for that
++
++ //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr);
++ int addrIt = 0;
++ for(int i = 0; i < bcount; i++){
++ if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){
++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr);
++ return r;
++ }
++ //desc->data1 >>= 8;
++ addrIt++;
++ addr++;
++ }
++ //put_data2(bcount, (uint8_t*)data1, addr);
++ //addr += bcount;
++ addrIt = 0;
++ if(desc->pair){
++ for(int i = 0; i < bcount; i++){
++ if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff), (uint8_t __user *)addr))){
++ printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr);
++ return r;
++ }
++ //desc->data2 >>= 8;
++ addrIt++;
++ addr++;
++ }
++ //put_data2(bcount, (uint8_t*)data2, addr);
++ addr += bcount;
++ }
++ arm64_skip_faulting_instruction(regs, 4);
++ } else {
++ //printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr);
++
++ uint8_t* addr = desc->addr;
++ int bcount = desc->width / 8; // since the field stores the width in bits. Honestly, there's no particular reason for that
++
++ int addrIt = 0;
++ /*for(int i = 0; i < bcount; i++){
++ uint8_t val;
++ if((r=get_user( val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px (base was 0x%px)\n", addr, desc->addr);
++ return r;
++ }
++ *(((uint8_t*)data1) + addrIt) = val;
++ //desc->data1 >>= 8;
++ addrIt++;
++ addr++;
++ }*/
++ if((r = get_data2(bcount, (uint8_t*)data1, addr)))
++ return r;
++ addr += bcount;
++
++ if(desc->simd){
++ write_simd_reg(desc->reg1, data1);
++ } else {
++ regs->regs[desc->reg1] = data1[0];
++ }
++
++ addrIt = 0;
++ if(desc->pair){
++ /*for(int i = 0; i < bcount; i++){
++ uint8_t val;
++ if((r=get_user(val, (uint8_t __user *)addr))){
++ printk("Failed to read data at 0x%px (base was 0x%px)\n", addr, desc->addr);
++ return r;
++ }
++ *(((uint8_t*)data2) + addrIt) = val;
++ //desc->data2 >>= 8;
++ addrIt++;
++ addr++;
++ }*/
++
++ if((r = get_data2(bcount, (uint8_t*)data2, addr)))
++ return r;
++ addr += bcount;
++ // the second register of a pair gets the second buffer
++ if(desc->simd){
++ write_simd_reg(desc->reg2, data2);
++ } else {
++ regs->regs[desc->reg2] = data2[0];
++ }
++ }
++ arm64_skip_faulting_instruction(regs, 4);
++
++
++ }
++ return 0;
++}
++
++int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t size = (instr >> 30) & 3;
++ uint8_t load = (instr >> 22) & 1; // acquire semantics, has no effect here, since it's not atomic anymore
++ uint8_t Rs = (instr >> 16) & 0x1f;
++ uint8_t Rt2 = (instr >> 10) & 0x1f;
++ uint8_t Rn = (instr >> 5) & 0x1f;
++ uint8_t Rt = instr & 0x1f;
++
++ uint8_t o0 = (instr >> 15) & 1; // L, release semantics, has no effect here, since it's not atomic anymore
++
++ if(Rt2 != 0x1f){
++ return -1;
++ }
++
++ switch(size){
++ case 0:
++ desc->width = 8;
++ break;
++ case 1:
++ desc->width = 16;
++ break;
++ case 2:
++ desc->width = 32;
++ break;
++ case 3:
++ desc->width = 64;
++ break;
++ }
++
++ desc->addr = (void*)regs->regs[Rn];
++ u64 data1 = regs->regs[Rt];
++
++ // nearly everything from here on could be moved into another function if needed
++ u64 cmpmask = (desc->width == 64) ? ~0ULL : ((1ULL << desc->width) - 1); // 1 << 64 would be undefined
++ u64 cmpval = regs->regs[Rs] & cmpmask;
++
++ u64 readval = 0;
++ int bcount = desc->width / 8;
++ u64 addr = (u64)desc->addr;
++ int r;
++ uint8_t tmp;
++
++ printk("Atomic CAS not being done atomically at 0x%px, size %d\n", desc->addr, desc->width);
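++ /* Emulated CAS semantics: read the old value byte by byte, compare the
++ * low bits against Rs, store Rt on a match, and always hand the value
++ * that was read back to Rs. None of this is atomic; it is presumably
++ * tolerable only because these accesses target GPU memory that no other
++ * CPU is contending on. */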
++
++ for(int i = 0; i < bcount; i++){
++ if((r=get_user(tmp, (uint8_t __user *)addr)))
++ return r;
++ readval |= ((u64)tmp) << (i * 8); // assemble little-endian, lowest byte first; maybe this could be read directly into regs->regs[Rs]
++ addr++;
++ }
++
++ if((readval & cmpmask) == cmpval){
++ // swap
++ addr = (u64)desc->addr;
++
++ for(int i = 0; i < bcount; i++){
++ if((r=put_user(data1 & 0xff, (uint8_t __user *)addr)))
++ return r;
++ data1 >>= 8;
++ addr++;
++ }
++ }
++
++ // CAS hands the value it read back to Rs whether or not the comparison succeeded
++ regs->regs[Rs] = readval;
++
++ arm64_skip_faulting_instruction(regs, 4);
++
++ return 0;
++}
++
++__attribute__((always_inline)) inline int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t op2;
++ uint8_t opc;
++ op2 = (instr >> 23) & 3;
++ opc = (instr >> 30) & 3;
++
++ uint8_t load = (instr >> 22) & 1;
++ uint8_t simd = (instr >> 26) & 1;
++ uint16_t imm7 = (instr >> 15) & 0x7f;
++ uint8_t Rt2 = (instr >> 10) & 0x1f;
++ uint8_t Rn = (instr >> 5) & 0x1f;
++ uint8_t Rt = instr & 0x1f;
++
++ int64_t imm = extend_sign(imm7, 7);
++ //int immshift = 0;
++ desc->load = load;
++ desc->simd = simd;
++
++ // opc controls the width
++ if(simd){
++ desc->width = 32 << opc;
++ //immshift = 4 << opc;
++ imm <<= 2;
++ imm <<= opc;
++ } else {
++ switch(opc){
++ case 0:
++ desc->width = 32;
++ imm <<= 2;
++ break;
++ case 2:
++ desc->width = 64;
++ imm <<= 3;
++ break;
++ default:
++ return -1;
++ }
++ }
++
++ // op2 controls the indexing
++ switch(op2){
++ case 2:
++ // offset
++ desc->addr = (void*)(regs->regs[Rn] + imm);
++ break;
++ default:
++ return -1;
++ }
++ //desc->data1 = regs->regs[Rt];
++ //desc->data2 = regs->regs[Rt2];
++ desc->reg1 = Rt;
++ desc->reg2 = Rt2;
++
++ return do_ls_fixup(instr, regs, desc);
++
++}
++
++__attribute__((always_inline)) inline int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t size = (instr >> 30) & 3;
++ uint8_t simd = (instr >> 26) & 1;
++ uint8_t opc = (instr >> 22) & 3;
++ uint64_t imm12 = (instr >> 10) & 0xfff;
++ uint8_t Rn = (instr >> 5) & 0x1f;
++ uint8_t Rt = instr & 0x1f;
++
++ uint8_t load = opc & 1;
++ uint8_t extend_sign = 0;// = ((opc & 2) >> 1 ) & !simd;
++ int width_shift = 0;
++
++ if(simd){
++ extend_sign = 0;
++ width_shift = size | ((opc & 2) << 1);
++ } else {
++ extend_sign = ((opc & 2) >> 1 );
++ width_shift = size;
++ }
++
++ ///printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt);
++ // when in simd mode, opc&2 is a third size bit. Otherwise, it's there for sign extension
++ //width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
++ desc->width = 8 << width_shift;
++
++ if((size & 1) && simd && (opc & 2)){
++ return 1;
++ }
++ desc->load = load;
++ desc->reg1 = Rt;
++ desc->simd = simd;
++ desc->extendSign = extend_sign;
++ u64 addr = regs->regs[Rn];
++ desc->addr = (void*)(addr + (imm12 << width_shift));
++ ///printk("unsigned imm\n");
++
++ return do_ls_fixup(instr, regs, desc);
++}
++
++
++__attribute__((always_inline)) inline u64 extend_reg(u64 reg, int type, int shift){
++
++ uint8_t is_signed = (type & 4) >> 2;
++ uint8_t input_width = type & 1;
++
++ u64 tmp;
++ if(!is_signed){
++ tmp = reg;
++ } else {
++ if(input_width == 0){
++ // 32bit, needs to be extended to 64
++ // I hope the compiler just does this kind of automatically with these types
++ int32_t stmpw = reg;
++ int64_t stmpdw = stmpw;
++ tmp = (u64)stmpdw;
++ } else {
++ printk("Other branch I forgot about previously!\n");
++ tmp = reg; // since the size stays the same, I don't think this makes a difference
++ }
++ }
++
++ ///printk("extend_reg: reg 0x%lx out (before shift) 0x%lx signed: %x\n", reg, tmp, is_signed);
++
++ return tmp << shift;
++}
++
++__attribute__((always_inline)) inline int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t size = (instr >> 30) & 3;
++ uint8_t simd = (instr >> 26) & 1;
++ uint8_t opc = (instr >> 22) & 3;
++ uint8_t option = (instr >> 13) & 5; // bits 0 and 2 of the option field are all extend_reg() looks at
++ uint8_t Rm = (instr >> 16) & 0x1f;
++ uint8_t Rn = (instr >> 5) & 0x1f;
++ uint8_t Rt = instr & 0x1f;
++ uint8_t S = (instr >> 12) & 1;
++ int width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
++ // size==0 seems to be a bit special
++ // opc&2 is sign, opc&1 is load (for most instructions anyways)
++
++ uint8_t load = opc & 1;
++ uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd;
++ desc->pair = 0;
++
++ desc->simd = simd;
++ desc->width = 8 << width_shift;
++
++ // the simd instructions make this a bit weird
++ if(extend_sign){
++ if(load){
++ desc->extend_width = 32;
++ } else {
++ desc->extend_width = 64;
++ }
++ desc->load = 1;
++ } else {
++ desc->load = load;
++ }
++
++ desc->extendSign = extend_sign; // needed for load, which isn't implemented yet
++
++ u64 offset = 0;
++ u64 addr = 0;
++ addr = regs->regs[Rn];
++ if(simd){
++ int shift = 0;
++ if(S) shift = width_shift;
++ offset = extend_reg(regs->regs[Rm], option, shift);
++ } else {
++ int shift = 0;
++ if(S) shift = size; // the register offset is scaled by the access size
++
++ offset = extend_reg(regs->regs[Rm], option, shift);
++ }
++
++ addr += offset;
++
++ //desc->data1 = regs->regs[Rt];
++ desc->reg1 = Rt;
++ desc->addr = (void*)addr;
++
++ return do_ls_fixup(instr, regs, desc);
++}
++
++__attribute__((always_inline)) inline int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t size = (instr >> 30) & 3;
++ uint8_t simd = (instr >> 26) & 1;
++ uint8_t opc = (instr >> 22) & 3;
++ uint16_t imm9 = (instr >> 12) & 0x1ff;
++ uint8_t Rn = (instr >> 5) & 0x1f;
++ uint8_t Rt = instr & 0x1f;
++
++ int16_t fullImm = 0;
++ // sign extend it
++ if(imm9 & 0x100){
++ fullImm = 0xfe00 | imm9;
++ } else {
++ fullImm = imm9;
++ }
++ u64 addr = regs->regs[Rn];
++ desc->addr = (void*)(addr + fullImm);
++ desc->pair = 0;
++
++ int load = opc & 1;
++ desc->load = load;
++ /*if(load){
++ return 1;
++ }*/
++ desc->reg1 = Rt;
++ if(simd){
++ desc->simd = 1;
++ desc->width = 8 << (size | ((opc & 2) << 1));
++ // assuming store
++ /*__uint128_t tmp;
++ read_simd_reg(Rt, &tmp);
++ desc->data1 = tmp;
++ desc->data1_simd = *(((u64*)&tmp) + 1);*/
++ return do_ls_fixup(instr, regs, desc);
++ } else {
++ desc->simd = 0;
++ desc->width = 8 << size;
++ return do_ls_fixup(instr, regs, desc);
++ }
++ ///printk("SIMD: %d\n", simd);
++ return 1;
++}
++
++__attribute__((always_inline)) inline int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t op0;
++ uint8_t op1;
++ uint8_t op2;
++ uint8_t op3;
++ uint8_t op4;
++
++ int r = 1;
++
++ op0 = (instr >> 28) & 0xf;
++ op1 = (instr >> 26) & 1;
++ op2 = (instr >> 23) & 3;
++ op3 = (instr >> 16) & 0x3f;
++ op4 = (instr >> 10) & 3;
++
++ if((op0 & 3) == 2){
++ desc->pair = 1;
++ r = ls_pair_fixup(instr, regs, desc);
++ }
++ if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){
++ // compare and swap
++ r = ls_cas_fixup(instr, regs, desc);
++ }
++ if((op0 & 3) == 3 && (op2 & 3) == 3){
++ //load/store unsigned immediate
++ desc->pair = 0;
++
++ }
++ if((op0 & 3) == 3 && ((op2 & 2) == 2)){
++ // register unsigned immediate
++ r = ls_reg_unsigned_imm(instr, regs, desc);
++ }
++ if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){
++ // register offset load/store
++ r = lsr_offset_fixup(instr, regs, desc);
++ }
++ if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){
++ // register load/store unscaled immediate
++ r = lsr_unscaled_immediate_fixup(instr, regs, desc);
++ }
++ if(r){
++ printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4);
++ }
++ return r;
++}
++
++__attribute__((always_inline)) inline int system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t op1;
++ uint8_t op2;
++ uint8_t CRn;
++ uint8_t CRm;
++ uint8_t Rt;
++ bool L;
++ int r = 0;
++
++ op1 = (instr >> 16) & 0x7;
++ op2 = (instr >> 5) & 0x7;
++ CRn = (instr >> 12) & 0xf;
++ CRm = (instr >> 8) & 0xf;
++ L = (instr >> 21) & 1;
++ Rt = instr & 0x1f;
++
++ if(!L){
++ // SYS
++ // proper decoding would be nicer here, but I don't expect to see too many system instructions
++ if((op1 == 0x3) && (op2 == 1) && (CRn == 0x7) && (CRm == 4)){
++ // dc zva
++ uint64_t dczid_el0 = read_sysreg_s(SYS_DCZID_EL0);
++ if(!((dczid_el0 >> DCZID_EL0_DZP_SHIFT) & 1)){
++ uint16_t blksize = 4 << (dczid_el0 & 0xf);
++ r = memset_io_user(blksize, 0, (void*)regs->user_regs.regs[Rt]);
++ arm64_skip_faulting_instruction(regs, 4);
++ return r;
++ } else {
++ printk("DC ZVA is not allowed (DCZID_EL0.DZP is set)!\n");
++ return 1;
++ }
++ }
++ }
++
++ printk("Unhandled system instruction. op1=0x%x op2=0x%x CRn=0x%x CRm=0x%x\n", op1, op2, CRn, CRm);
++ return 1;
++}
++
++__attribute__((always_inline)) inline int branch_except_system_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
++ uint8_t op0;
++ uint32_t op1;
++ uint8_t op2;
++
++ op0 = (instr >> 29) & 0x7;
++ op1 = (instr >> 5) & 0x1fffff;
++ op2 = instr & 0x1f;
++
++ if((op0 == 0x6) && (op1 & 0x1ec000) == 0x84000){
++ return system_fixup(instr, regs, desc);
++ }
++ printk("Unhandled Branch/Exception generating/System instruction. op0=0x%x op1=0x%x op2=0x%x\n", op0, op1, op2);
++ return 1;
++}
++
++uint32_t* seenCMDs;
++size_t seenCMDCount = 0;
++size_t seenCMDSize = 0;
++
++void instrDBG(u32 instr){
++ for(size_t i = 0; i < seenCMDCount; i++){
++ if(seenCMDs[i] == instr){
++ return;
++ }
++ }
++ if(seenCMDSize == 0){
++ seenCMDs = krealloc(seenCMDs, sizeof(uint32_t), GFP_KERNEL); // krealloc takes the size in bytes
++ seenCMDSize = 1;
++ }
++
++ if(seenCMDCount >= seenCMDSize){
++ seenCMDs = krealloc(seenCMDs, seenCMDSize * 2 * sizeof(uint32_t), GFP_KERNEL);
++ seenCMDSize *= 2;
++ }
++
++ seenCMDs[seenCMDCount] = instr;
++ seenCMDCount++;
++ printk("New instruction: %x\n", instr);
++}
++
++int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){
++ unsigned long long instrptr;
++ u32 instr = 0;
++
++ instrptr = instruction_pointer(regs);
++ //printk("Alignment fixup\n");
++
++ if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){
++ printk("Failed to get aarch64 instruction\n");
++ return 1;
++ }
++
++ /**
++ * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0]
++ *
++ */
++
++ //instrDBG(instr);
++
++ uint8_t op0;
++ int r;
++ struct fixupDescription desc = {0};
++ //desc.starttime = ktime_get_ns();
++ op0 = ((instr & 0x1E000000) >> 25);
++ if((op0 & 5) == 0x4){
++ //printk("Load/Store\n");
++ r = ls_fixup(instr, regs, &desc);
++ //desc.endtime = ktime_get_ns();
++ /*printk("Trap timing: decoding: %ldns, mem ops: %ldns, total: %ldns\n", desc.decodedtime - desc.starttime,
++ desc.endtime - desc.decodedtime, desc.endtime - desc.starttime);
++ */
++ if(r){
++ printk("Faulting instruction: 0x%x\n", instr);
++ }
++
++ return r;
++ } else if((op0 & 0xe) == 0xa){
++ // System instructions, needed for dc zva
++ return branch_except_system_fixup(instr, regs, &desc);
++ } else {
++ printk("Not handling instruction with op0 0x%x (instruction is 0x%08x)\n", op0, instr);
++ }
++ return -1;
++}
+diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
+index 368f4eb573e9a9..dcc00362c9ba8c 100644
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -755,7 +755,7 @@ static int do_alignment_fault(unsigned long far, unsigned long esr,
+ compat_user_mode(regs))
+ return do_compat_alignment_fixup(far, regs);
+
+- if(user_mode(regs)){
++ if(IS_ENABLED(CONFIG_ARM64_ALIGNMENT_FIXUPS) && user_mode(regs)){
+ // aarch64 user mode
+ if(do_alignment_fixup(far, regs) == 0){
+ return 0;
+diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
+index 8e0d481e00e57e..ed1e3f5c2e76c1 100644
+--- a/drivers/gpu/drm/Kconfig
++++ b/drivers/gpu/drm/Kconfig
+@@ -31,6 +31,16 @@ menuconfig DRM
+
+ if DRM
+
++config DRM_ARCH_CAN_WC
++ bool "Force Architecture can write-combine memory"
++ depends on DRM
++ default n
++ help
++ Enables write-combining even if it is not enabled by default.
++ Only use if the target systems support write-combining on
++ the memory used by the graphics adapters.
++ If in doubt, say 'N' ++ + config DRM_MIPI_DBI + tristate + depends on DRM +diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h +index 08e0e3ffad1319..6deef42ba79f30 100644 +--- a/include/drm/drm_cache.h ++++ b/include/drm/drm_cache.h +@@ -45,6 +45,9 @@ bool drm_need_swiotlb(int dma_bits); + + static inline bool drm_arch_can_wc_memory(void) + { ++#if defined(CONFIG_DRM_ARCH_CAN_WC) ++ return true; ++#endif + #if defined(CONFIG_PPC) && !defined(CONFIG_NOT_COHERENT_CACHE) + return false; + #elif defined(CONFIG_MIPS) && defined(CONFIG_CPU_LOONGSON64) + +From 4233314c9233352db246e079635deeb1dd0e0098 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Wed, 20 Nov 2024 02:57:24 +0100 +Subject: [PATCH 15/18] rx7000 + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 8 +++--- + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 16 ++++++------ + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 12 ++++----- + drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 8 +++--- + drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c | 8 +++--- + .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 26 +++++++++---------- + .../display/dc/clk_mgr/dcn301/vg_clk_mgr.c | 4 +-- + .../display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 4 +-- + .../dc/clk_mgr/dcn314/dcn314_clk_mgr.c | 4 +-- + .../dc/clk_mgr/dcn315/dcn315_clk_mgr.c | 4 +-- + .../dc/clk_mgr/dcn316/dcn316_clk_mgr.c | 4 +-- + .../display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 2 +- + 13 files changed, 51 insertions(+), 51 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +index 7d4b540340e021..f29313e8266517 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +@@ -291,7 +291,7 @@ int amdgpu_mes_create_process(struct amdgpu_device *adev, int pasid, + DRM_ERROR("failed to allocate process context bo\n"); + goto clean_up_memory; + } +- memset(process->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); ++ memset_io(process->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + + /* + * Avoid taking any other locks under MES lock to avoid circular +@@ -415,7 +415,7 @@ int amdgpu_mes_add_gang(struct amdgpu_device *adev, int pasid, + DRM_ERROR("failed to allocate process context bo\n"); + goto clean_up_mem; + } +- memset(gang->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE); ++ memset_io(gang->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE); + + /* + * Avoid taking any other locks under MES lock to avoid circular +@@ -563,7 +563,7 @@ static int amdgpu_mes_queue_alloc_mqd(struct amdgpu_device *adev, + dev_warn(adev->dev, "failed to create queue mqd bo (%d)", r); + return r; + } +- memset(q->mqd_cpu_ptr, 0, mqd_size); ++ memset_io(q->mqd_cpu_ptr, 0, mqd_size); + + r = amdgpu_bo_reserve(q->mqd_obj, false); + if (unlikely(r != 0)) +@@ -1279,7 +1279,7 @@ int amdgpu_mes_ctx_alloc_meta_data(struct amdgpu_device *adev, + if (!ctx_data->meta_data_obj) + return -ENOMEM; + +- memset(ctx_data->meta_data_ptr, 0, ++ memset_io(ctx_data->meta_data_ptr, 0, + sizeof(struct amdgpu_mes_ctx_meta_data)); + + return 0; +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +index cb92b3292cfe94..e4caa01d93dbc1 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +@@ -5577,7 +5577,7 @@ static void gfx_v10_0_rlc_backdoor_autoload_copy_ucode(struct amdgpu_device *ade + if (fw_size > toc_fw_size) + fw_size = toc_fw_size; + +- memcpy(ptr + toc_offset, fw_data, fw_size); ++ memcpy_toio(ptr + toc_offset, 
fw_data, fw_size); + + if (fw_size < toc_fw_size) + memset_io(ptr + toc_offset + fw_size, 0, toc_fw_size - fw_size); +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +index 84cf5fd297b7f6..4175ed34ad3205 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +@@ -938,7 +938,7 @@ static int gfx_v11_0_mec_init(struct amdgpu_device *adev) + return r; + } + +- memset(hpd, 0, mec_hpd_size); ++ memset_io(hpd, 0, mec_hpd_size); + + amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); + amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); +@@ -1240,10 +1240,10 @@ static void gfx_v11_0_rlc_backdoor_autoload_copy_ucode(struct amdgpu_device *ade + if (fw_size > toc_fw_size) + fw_size = toc_fw_size; + +- memcpy(ptr + toc_offset, fw_data, fw_size); ++ memcpy_toio(ptr + toc_offset, fw_data, fw_size); + + if (fw_size < toc_fw_size) +- memset(ptr + toc_offset + fw_size, 0, toc_fw_size - fw_size); ++ memset_io(ptr + toc_offset + fw_size, 0, toc_fw_size - fw_size); + + if ((id != SOC21_FIRMWARE_ID_RS64_PFP) && (id != SOC21_FIRMWARE_ID_RS64_ME)) + *(uint64_t *)fw_autoload_mask |= 1ULL << id; +@@ -3904,7 +3904,7 @@ static void gfx_v11_0_gfx_mqd_set_priority(struct amdgpu_device *adev, + static int gfx_v11_0_gfx_mqd_init(struct amdgpu_device *adev, void *m, + struct amdgpu_mqd_prop *prop) + { +- struct v11_gfx_mqd *mqd = m; ++ volatile struct v11_gfx_mqd *mqd = m; + uint64_t hqd_gpu_addr, wb_gpu_addr; + uint32_t tmp; + uint32_t rb_bufsz; +@@ -3991,7 +3991,7 @@ static int gfx_v11_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset) + int mqd_idx = ring - &adev->gfx.gfx_ring[0]; + + if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + mutex_lock(&adev->srbm_mutex); + soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); + amdgpu_ring_init_mqd(ring); +@@ -4045,7 +4045,7 @@ static int gfx_v11_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev) + static int gfx_v11_0_compute_mqd_init(struct amdgpu_device *adev, void *m, + struct amdgpu_mqd_prop *prop) + { +- struct v11_compute_mqd *mqd = m; ++ volatile struct v11_compute_mqd *mqd = m; + uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; + uint32_t tmp; + +@@ -4304,7 +4304,7 @@ static int gfx_v11_0_kiq_init_queue(struct amdgpu_ring *ring) + soc21_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + } else { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + if (amdgpu_sriov_vf(adev) && adev->in_suspend) + amdgpu_ring_clear_ring(ring); + mutex_lock(&adev->srbm_mutex); +@@ -4328,7 +4328,7 @@ static int gfx_v11_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset) + int mqd_idx = ring - &adev->gfx.compute_ring[0]; + + if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { +- memset((void *)mqd, 0, sizeof(*mqd)); ++ memset_io((void *)mqd, 0, sizeof(*mqd)); + mutex_lock(&adev->srbm_mutex); + soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); + amdgpu_ring_init_mqd(ring); +diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +index 7a773fcd7752c2..e2a71a7edd28a8 100644 +--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +@@ -794,7 +794,7 @@ static int mes_v11_0_allocate_ucode_buffer(struct amdgpu_device *adev, + return r; + } + +- memcpy(adev->mes.ucode_fw_ptr[pipe], fw_data, fw_size); ++ memcpy_toio(adev->mes.ucode_fw_ptr[pipe], fw_data, 
fw_size); + + amdgpu_bo_kunmap(adev->mes.ucode_fw_obj[pipe]); + amdgpu_bo_unreserve(adev->mes.ucode_fw_obj[pipe]); +@@ -835,7 +835,7 @@ static int mes_v11_0_allocate_ucode_data_buffer(struct amdgpu_device *adev, + return r; + } + +- memcpy(adev->mes.data_fw_ptr[pipe], fw_data, fw_size); ++ memcpy_toio(adev->mes.data_fw_ptr[pipe], fw_data, fw_size); + + amdgpu_bo_kunmap(adev->mes.data_fw_obj[pipe]); + amdgpu_bo_unreserve(adev->mes.data_fw_obj[pipe]); +@@ -1023,7 +1023,7 @@ static int mes_v11_0_allocate_eop_buf(struct amdgpu_device *adev, + return r; + } + +- memset(eop, 0, ++ memset_io(eop, 0, + adev->mes.eop_gpu_obj[pipe]->tbo.base.size); + + amdgpu_bo_kunmap(adev->mes.eop_gpu_obj[pipe]); +@@ -1034,11 +1034,11 @@ static int mes_v11_0_allocate_eop_buf(struct amdgpu_device *adev, + + static int mes_v11_0_mqd_init(struct amdgpu_ring *ring) + { +- struct v11_compute_mqd *mqd = ring->mqd_ptr; ++ volatile struct v11_compute_mqd *mqd = ring->mqd_ptr; + uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; + uint32_t tmp; + +- memset(mqd, 0, sizeof(*mqd)); ++ memset_io(mqd, 0, sizeof(*mqd)); + + mqd->header = 0xC0310800; + mqd->compute_pipelinestat_enable = 0x00000001; +@@ -1326,7 +1326,7 @@ static int mes_v11_0_mqd_sw_init(struct amdgpu_device *adev, + return r; + } + +- memset(ring->mqd_ptr, 0, mqd_size); ++ memset_io(ring->mqd_ptr, 0, mqd_size); + + /* prepare MQD backup */ + adev->mes.mqd_backup[pipe] = kmalloc(mqd_size, GFP_KERNEL); +diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +index 51e470e8d67d9e..5cda5635aaf340 100644 +--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +@@ -236,10 +236,10 @@ static int psp_v13_0_bootloader_load_component(struct psp_context *psp, + if (ret) + return ret; + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); ++ memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); + + /* Copy PSP KDB binary to memory */ +- memcpy(psp->fw_pri_buf, bin_desc->start_addr, bin_desc->size_bytes); ++ memcpy_toio(psp->fw_pri_buf, bin_desc->start_addr, bin_desc->size_bytes); + + /* Provide the PSP KDB to bootloader */ + WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_36, +@@ -313,10 +313,10 @@ static int psp_v13_0_bootloader_load_sos(struct psp_context *psp) + if (ret) + return ret; + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); ++ memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); + + /* Copy Secure OS binary to PSP memory */ +- memcpy(psp->fw_pri_buf, psp->sos.start_addr, psp->sos.size_bytes); ++ memcpy_toio(psp->fw_pri_buf, psp->sos.start_addr, psp->sos.size_bytes); + + /* Provide the PSP secure OS to bootloader */ + WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_36, +diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c +index eaa5512a21dacd..45900865b7a51a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c ++++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0_4.c +@@ -107,10 +107,10 @@ static int psp_v13_0_4_bootloader_load_component(struct psp_context *psp, + if (ret) + return ret; + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); ++ memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); + + /* Copy PSP KDB binary to memory */ +- memcpy(psp->fw_pri_buf, bin_desc->start_addr, bin_desc->size_bytes); ++ memcpy_toio(psp->fw_pri_buf, bin_desc->start_addr, bin_desc->size_bytes); + + /* Provide the PSP KDB to bootloader */ + WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_36, +@@ -170,10 +170,10 @@ static int psp_v13_0_4_bootloader_load_sos(struct psp_context *psp) + if (ret) + return ret; + +- memset(psp->fw_pri_buf, 0, PSP_1_MEG); ++ 
memset_io(psp->fw_pri_buf, 0, PSP_1_MEG); + + /* Copy Secure OS binary to PSP memory */ +- memcpy(psp->fw_pri_buf, psp->sos.start_addr, psp->sos.size_bytes); ++ memcpy_toio(psp->fw_pri_buf, psp->sos.start_addr, psp->sos.size_bytes); + + /* Provide the PSP secure OS to bootloader */ + WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_36, +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +index 3c0ae28c5923b5..8abb0f2d027543 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +@@ -44,7 +44,7 @@ static inline struct v11_sdma_mqd *get_sdma_mqd(void *mqd) + static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo) + { +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; + bool has_wa_flag = minfo && (minfo->update_flag & (UPDATE_FLAG_DBG_WA_ENABLE | + UPDATE_FLAG_DBG_WA_DISABLE)); +@@ -125,7 +125,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, + struct queue_properties *q) + { + uint64_t addr; +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + int size; + uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff; + +@@ -137,7 +137,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, + else + size = sizeof(struct v11_compute_mqd); + +- memset(m, 0, size); ++ memset_io(m, 0, size); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; +@@ -219,7 +219,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) + { +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + + m = get_mqd(mqd); + +@@ -281,7 +281,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, + + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) + { +- struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd; ++ volatile struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd; + + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); + } +@@ -292,7 +292,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) + { +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + struct kfd_context_save_area_header header; + + m = get_mqd(mqd); +@@ -325,11 +325,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, + + static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst) + { +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + + m = get_mqd(mqd); + +- memcpy(mqd_dst, m, sizeof(struct v11_compute_mqd)); ++ memcpy_fromio(mqd_dst, m, sizeof(struct v11_compute_mqd)); + } + + static void restore_mqd(struct mqd_manager *mm, void **mqd, +@@ -339,12 +339,12 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd, + const void *ctl_stack_src, const u32 ctl_stack_size) + { + uint64_t addr; +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + + m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + +- memcpy(m, mqd_src, sizeof(*m)); ++ memcpy_toio(m, mqd_src, sizeof(*m)); + + *mqd = m; + if (gart_addr) +@@ -364,7 +364,7 @@ static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) + { +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + + init_mqd(mm, 
mqd, mqd_mem_obj, gart_addr, q); + +@@ -379,7 +379,7 @@ static int destroy_hiq_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id) + { + int err; +- struct v11_compute_mqd *m; ++ volatile struct v11_compute_mqd *m; + u32 doorbell_off; + + m = get_mqd(mqd); +@@ -408,7 +408,7 @@ static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + else + size = sizeof(struct v11_sdma_mqd); + +- memset(m, 0, size); ++ memset_io(m, 0, size); + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c +index 9e2ef0e724fcf6..1e8f19564ea745 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c +@@ -451,7 +451,7 @@ static void vg_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table || clk_mgr_vgh->smu_wm_set.mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + vg_build_watermark_ranges(clk_mgr_base->bw_params, table); + +@@ -649,7 +649,7 @@ static void vg_get_dpm_table_from_smu(struct clk_mgr_internal *clk_mgr, + if (!table || smu_dpm_clks->mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn301_smu_set_dram_addr_high(clk_mgr, + smu_dpm_clks->mc_address.high_part); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c +index bc123f1884da32..6fa8916791e6db 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c +@@ -486,7 +486,7 @@ static void dcn31_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table || clk_mgr_dcn31->smu_wm_set.mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn31_build_watermark_ranges(clk_mgr_base->bw_params, table); + +@@ -508,7 +508,7 @@ static void dcn31_get_dpm_table_from_smu(struct clk_mgr_internal *clk_mgr, + if (!table || smu_dpm_clks->mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn31_smu_set_dram_addr_high(clk_mgr, + smu_dpm_clks->mc_address.high_part); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c +index 91d872d6d392b1..7d7f2f1070f112 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c +@@ -550,7 +550,7 @@ static void dcn314_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table || clk_mgr_dcn314->smu_wm_set.mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn314_build_watermark_ranges(clk_mgr_base->bw_params, table); + +@@ -572,7 +572,7 @@ static void dcn314_get_dpm_table_from_smu(struct clk_mgr_internal *clk_mgr, + if (!table || smu_dpm_clks->mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn314_smu_set_dram_addr_high(clk_mgr, + smu_dpm_clks->mc_address.high_part); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c +index 
a0fb4481d2f1b1..2cca0d6f747282 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c +@@ -445,7 +445,7 @@ static void dcn315_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table || clk_mgr_dcn315->smu_wm_set.mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn315_build_watermark_ranges(clk_mgr_base->bw_params, table); + +@@ -467,7 +467,7 @@ static void dcn315_get_dpm_table_from_smu(struct clk_mgr_internal *clk_mgr, + if (!table || smu_dpm_clks->mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn315_smu_set_dram_addr_high(clk_mgr, + smu_dpm_clks->mc_address.high_part); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +index c3e50c3aaa609e..24470daa33eb08 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +@@ -407,7 +407,7 @@ static void dcn316_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table || clk_mgr_dcn316->smu_wm_set.mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn316_build_watermark_ranges(clk_mgr_base->bw_params, table); + +@@ -429,7 +429,7 @@ static void dcn316_get_dpm_table_from_smu(struct clk_mgr_internal *clk_mgr, + if (!table || smu_dpm_clks->mc_address.quad_part == 0) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + dcn316_smu_set_dram_addr_high(clk_mgr, + smu_dpm_clks->mc_address.high_part); +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +index 084994c650c4c9..1b90d785d4a912 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +@@ -977,7 +977,7 @@ static void dcn32_notify_wm_ranges(struct clk_mgr *clk_mgr_base) + if (!table) + return; + +- memset(table, 0, sizeof(*table)); ++ memset_io(table, 0, sizeof(*table)); + + /* collect valid ranges, place in pmfw table */ + for (i = 0; i < WM_SET_COUNT; i++) + +From e3238906ace9c9f10dbbebc264042df4849f040c Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Thu, 21 Nov 2024 17:12:50 +0100 +Subject: [PATCH 16/18] sdma_6 mqd struct fix + +--- + drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +index 208a1fa9d4e7f2..c902d8c7156c09 100644 +--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +@@ -833,7 +833,7 @@ static int sdma_v6_0_start(struct amdgpu_device *adev) + static int sdma_v6_0_mqd_init(struct amdgpu_device *adev, void *mqd, + struct amdgpu_mqd_prop *prop) + { +- struct v11_sdma_mqd *m = mqd; ++ volatile struct v11_sdma_mqd *m = mqd; + uint64_t wb_gpu_addr; + + m->sdmax_rlcx_rb_cntl = + +From e73428d90788a947219a9b9ab18a7fdccead2322 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Sat, 11 Jan 2025 17:23:41 +0100 +Subject: [PATCH 17/18] fix a memset in amdgpu_seq64 + +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +index e22cb2b5cd9264..90661a88430ba0 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +@@ -240,7 +240,7 @@ int amdgpu_seq64_init(struct amdgpu_device *adev) + return r; + } + +- memset(adev->seq64.cpu_base_addr, 0, AMDGPU_VA_RESERVED_SEQ64_SIZE); ++ memset_io(adev->seq64.cpu_base_addr, 0, AMDGPU_VA_RESERVED_SEQ64_SIZE); + + adev->seq64.num_sem = AMDGPU_MAX_SEQ64_SLOTS; + memset(&adev->seq64.used, 0, sizeof(adev->seq64.used)); + +From 92f6edf75632d7ee0673027ffed92acdc3767871 Mon Sep 17 00:00:00 2001 +From: Coreforge +Date: Fri, 7 Mar 2025 17:32:31 +0100 +Subject: [PATCH 18/18] some gfx9 (VEGA) fixes + +--- + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 36 +++++++++++++-------------- + 1 file changed, 18 insertions(+), 18 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +index 114653a0b57013..6eb0b2e5192bc4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +@@ -1211,7 +1211,7 @@ static int gfx_v9_0_ring_test_ib(struct amdgpu_ring *ring, long timeout) + + gpu_addr = adev->wb.gpu_addr + (index * 4); + adev->wb.wb[index] = cpu_to_le32(0xCAFEDEAD); +- memset(&ib, 0, sizeof(ib)); ++ memset_io(&ib, 0, sizeof(ib)); + + r = amdgpu_ib_get(adev, NULL, 20, AMDGPU_IB_POOL_DIRECT, &ib); + if (r) +@@ -1884,7 +1884,7 @@ static int gfx_v9_0_mec_init(struct amdgpu_device *adev) + return r; + } + +- memset(hpd, 0, mec_hpd_size); ++ memset_io(hpd, 0, mec_hpd_size); + + amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); + amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); +@@ -1908,7 +1908,7 @@ static int gfx_v9_0_mec_init(struct amdgpu_device *adev) + return r; + } + +- memcpy(fw, fw_data, fw_size); ++ memcpy_toio(fw, fw_data, fw_size); + + amdgpu_bo_kunmap(adev->gfx.mec.mec_fw_obj); + amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj); +@@ -3471,7 +3471,7 @@ static void gfx_v9_0_kiq_setting(struct amdgpu_ring *ring) + WREG32_SOC15_RLC(GC, 0, mmRLC_CP_SCHEDULERS, tmp); + } + +-static void gfx_v9_0_mqd_set_priority(struct amdgpu_ring *ring, struct v9_mqd *mqd) ++static void gfx_v9_0_mqd_set_priority(struct amdgpu_ring *ring, volatile struct v9_mqd *mqd) + { + struct amdgpu_device *adev = ring->adev; + +@@ -3487,7 +3487,7 @@ static void gfx_v9_0_mqd_set_priority(struct amdgpu_ring *ring, struct v9_mqd *m + static int gfx_v9_0_mqd_init(struct amdgpu_ring *ring) + { + struct amdgpu_device *adev = ring->adev; +- struct v9_mqd *mqd = ring->mqd_ptr; ++ volatile struct v9_mqd *mqd = ring->mqd_ptr; + uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; + uint32_t tmp; + +@@ -3786,7 +3786,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring) + if (amdgpu_in_reset(adev) && tmp_mqd->cp_hqd_pq_control){ + /* for GPU_RESET case , reset MQD to a clean status */ + if (adev->gfx.kiq[0].mqd_backup) +- memcpy(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct v9_mqd_allocation)); ++ memcpy_toio(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct v9_mqd_allocation)); + + /* reset ring buffer */ + ring->wptr = 0; +@@ -3798,7 +3798,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring) + soc15_grbm_select(adev, 0, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + } else { +- memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); ++ memset_io((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); + ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; + ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; + if 
(amdgpu_sriov_vf(adev) && adev->in_suspend) +@@ -3811,7 +3811,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring) + mutex_unlock(&adev->srbm_mutex); + + if (adev->gfx.kiq[0].mqd_backup) +- memcpy(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct v9_mqd_allocation)); ++ memcpy_fromio(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct v9_mqd_allocation)); + } + + return 0; +@@ -3831,7 +3831,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring, bool restore) + + if (!restore && (!tmp_mqd->cp_hqd_pq_control || + (!amdgpu_in_reset(adev) && !adev->in_suspend))) { +- memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); ++ memset_io((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); + ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; + ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; + mutex_lock(&adev->srbm_mutex); +@@ -3841,11 +3841,11 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring, bool restore) + mutex_unlock(&adev->srbm_mutex); + + if (adev->gfx.mec.mqd_backup[mqd_idx]) +- memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation)); ++ memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation)); + } else { + /* restore MQD to a clean status */ + if (adev->gfx.mec.mqd_backup[mqd_idx]) +- memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); ++ memcpy_toio(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); + /* reset ring buffer */ + ring->wptr = 0; + atomic64_set((atomic64_t *)ring->wptr_cpu_addr, 0); +@@ -4637,7 +4637,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) + total_size += sizeof(sgpr_init_compute_shader); + + /* allocate an indirect buffer to put the commands in */ +- memset(&ib, 0, sizeof(ib)); ++ memset_io(&ib, 0, sizeof(ib)); + r = amdgpu_ib_get(adev, NULL, total_size, + AMDGPU_IB_POOL_DIRECT, &ib); + if (r) { +@@ -5456,12 +5456,12 @@ static void gfx_v9_0_ring_patch_ce_meta(struct amdgpu_ring *ring, + } + + if (offset + (payload_size >> 2) <= ring->buf_mask + 1) { +- memcpy((void *)&ring->ring[offset], ce_payload_cpu_addr, payload_size); ++ memcpy_toio((void *)&ring->ring[offset], ce_payload_cpu_addr, payload_size); + } else { +- memcpy((void *)&ring->ring[offset], ce_payload_cpu_addr, ++ memcpy_toio((void *)&ring->ring[offset], ce_payload_cpu_addr, + (ring->buf_mask + 1 - offset) << 2); + payload_size -= (ring->buf_mask + 1 - offset) << 2; +- memcpy((void *)&ring->ring[0], ++ memcpy_toio((void *)&ring->ring[0], + ce_payload_cpu_addr + ((ring->buf_mask + 1 - offset) << 2), + payload_size); + } +@@ -5491,12 +5491,12 @@ static void gfx_v9_0_ring_patch_de_meta(struct amdgpu_ring *ring, + IB_COMPLETION_STATUS_PREEMPTED; + + if (offset + (payload_size >> 2) <= ring->buf_mask + 1) { +- memcpy((void *)&ring->ring[offset], de_payload_cpu_addr, payload_size); ++ memcpy_toio((void *)&ring->ring[offset], de_payload_cpu_addr, payload_size); + } else { +- memcpy((void *)&ring->ring[offset], de_payload_cpu_addr, ++ memcpy_toio((void *)&ring->ring[offset], de_payload_cpu_addr, + (ring->buf_mask + 1 - offset) << 2); + payload_size -= (ring->buf_mask + 1 - offset) << 2; +- memcpy((void *)&ring->ring[0], ++ memcpy_toio((void *)&ring->ring[0], + de_payload_cpu_addr + ((ring->buf_mask + 1 - offset) << 2), + payload_size); + } diff --git a/buildroot-external/board/raspberrypi/rpi5-64/cmdline.txt b/buildroot-external/board/raspberrypi/rpi5-64/cmdline.txt index 9466f253c5c..26e0dbf85e7 100644 --- 
a/buildroot-external/board/raspberrypi/rpi5-64/cmdline.txt +++ b/buildroot-external/board/raspberrypi/rpi5-64/cmdline.txt @@ -1 +1 @@ -zram.enabled=1 zram.num_devices=3 rootwait cgroup_enable=memory fsck.repair=yes console=tty0 root=PARTUUID=8d3d53e3-6d49-4c38-8349-aff6859e82fd ro rauc.slot=A +zram.enabled=1 zram.num_devices=3 rootwait cgroup_enable=memory fsck.repair=yes console=tty0 root=PARTUUID=8d3d53e3-6d49-4c38-8349-aff6859e82fd ro rauc.slot=A numa=fake=0 numa=off amdgpu.runpm=0 diff --git a/buildroot-external/board/raspberrypi/rpi5-64/config.txt b/buildroot-external/board/raspberrypi/rpi5-64/config.txt index 86e54c4e1a5..79bc2a93989 100755 --- a/buildroot-external/board/raspberrypi/rpi5-64/config.txt +++ b/buildroot-external/board/raspberrypi/rpi5-64/config.txt @@ -7,6 +7,11 @@ #dtparam=i2s=on #dtparam=spi=on +# use external antenna +dtparam=ant2 +# use PCIe gen 3 +#dtparam=pciex1_gen=3 + # Enable audio (loads snd_bcm2835) dtparam=audio=on diff --git a/buildroot-external/configs/rpi5_64_defconfig b/buildroot-external/configs/rpi5_64_defconfig index ef23584b2c3..c1f62f217cb 100644 --- a/buildroot-external/configs/rpi5_64_defconfig +++ b/buildroot-external/configs/rpi5_64_defconfig @@ -74,6 +74,7 @@ BR2_PACKAGE_LINUX_FIRMWARE_RTL_RTW88=y BR2_PACKAGE_LINUX_FIRMWARE_RTL_RTW89=y BR2_PACKAGE_LINUX_FIRMWARE_RTL_815X=y BR2_PACKAGE_LINUX_FIRMWARE_USB_SERIAL_TI=y +BR2_PACKAGE_LINUX_FIRMWARE_AMDGPU=y BR2_PACKAGE_DBUS_BROKER=y BR2_PACKAGE_FLASHROM=y BR2_PACKAGE_GPTFDISK=y diff --git a/buildroot-external/kernel/v6.12.y/device-support-pci.config b/buildroot-external/kernel/v6.12.y/device-support-pci.config index 8569bc9c704..fe7aa583e2e 100644 --- a/buildroot-external/kernel/v6.12.y/device-support-pci.config +++ b/buildroot-external/kernel/v6.12.y/device-support-pci.config @@ -49,3 +49,22 @@ CONFIG_SND_HDA_CODEC_HDMI=m # NVMe SSD support CONFIG_NVME_CORE=y CONFIG_BLK_DEV_NVME=y + +# AMDGPU support +# $ diff --unchanged-line-format= --old-line-format= --new-line-format='%L' .config_rpi .config +CONFIG_ARM64_ALIGNMENT_FIXUPS=y +CONFIG_COMPAT_ALIGNMENT_FIXUPS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_HMM_MIRROR=y +CONFIG_DRM_ARCH_CAN_WC=y +CONFIG_DRM_DISPLAY_DP_HELPER=y +CONFIG_DRM_DISPLAY_HDCP_HELPER=y +CONFIG_DRM_EXEC=m +CONFIG_DRM_BUDDY=m +CONFIG_DRM_SUBALLOC_HELPER=m +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_USERPTR=y +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_FP=y +CONFIG_HSA_AMD=y +CONFIG_INTERVAL_TREE=y diff --git a/buildroot-external/package/hassio/dind-import-containers.sh b/buildroot-external/package/hassio/dind-import-containers.sh index 019e83e2a10..4b85248d4ef 100755 --- a/buildroot-external/package/hassio/dind-import-containers.sh +++ b/buildroot-external/package/hassio/dind-import-containers.sh @@ -31,3 +31,5 @@ mkdir -p "/data/supervisor/apparmor" wget -O "/data/supervisor/apparmor/hassio-supervisor" "${APPARMOR_URL}" echo "{ \"channel\": \"${channel}\" }" > /data/supervisor/updater.json + +echo '{ "repositories": [ "https://github.com/sanctuary-systems-com/llm-addons", "https://github.com/hacs/addons" ] }' > /data/supervisor/store.json diff --git a/cert.pem b/cert.pem new file mode 100644 index 00000000000..2d9dca3ec22 --- /dev/null +++ b/cert.pem @@ -0,0 +1,32 @@ +-----BEGIN CERTIFICATE----- +MIIFbTCCA1WgAwIBAgIUR7MWBlXpqPJWcvBWRrm1zgcJdEAwDQYJKoZIhvcNAQEL +BQAwRjEPMA0GA1UECgwGSGFzc09TMTMwMQYDVQQDDCpIYXNzT1MgU2VsZi1zaWdu +ZWQgRGV2ZWxvcG1lbnQgQ2VydGlmaWNhdGUwHhcNMjUwODE4MTUwNjQ3WhcNMzUw +ODE2MTUwNjQ3WjBGMQ8wDQYDVQQKDAZIYXNzT1MxMzAxBgNVBAMMKkhhc3NPUyBT 
+ZWxmLXNpZ25lZCBEZXZlbG9wbWVudCBDZXJ0aWZpY2F0ZTCCAiIwDQYJKoZIhvcN +AQEBBQADggIPADCCAgoCggIBAOqAiTwhfIOCsVQFIT5FlgQHvFeXowZIanOxAtDo +SJPCVQeO/AQe1oMteCGRhY5Yk+U65DtauIkDOIOST9CGwo8NKkXpONaysdngp1zi +M2hd9TBnWic/WmbL0bidkSazOVk6jeU477uJZ4l8iu6q2h9TOgH48FxdeP8gcMfD +2AfXAqQlJniwdyhjwD446gkgSLu6lkSnOsjZNFim0ol9g34w7wUAbT6Qgm+e/mCy +SRCDpOZLE3Y2JatZa7VXWxbBYdsKk/i3hH5Li4RXoX7zYtZ3BTHIzxAKnlqXl33D +HjJ9dvlNl15RoF4bHF9xXgIiD4E26k7C4RQNYCGDkZj0lfyh0UmU4UWha8KEzIXx +hpPmjRaHmG6zaPw8E8Sxftqp+D9AL9nk9LwrPMduwZT+Dm25BfO0wojJ22Pfa9Z5 +9E75LHrWsV90O+ZTPlrm94EjgeD8Q87flfHNTnHObd3rPflFTBfYWoVu+gCWgE+2 +JZZfbkYst/wR3GS+0GmwlSFwvRjNsAyJ87GW6O1GLfHf7qeQkRjQf/egHe9epmo2 +RYvaREo9LGVM5v0oyGlAnpSqJyK287G+MJfXuOkaJNACgwhPhWSp7ZZ+2FG72dbT +ZmVVhaxF6Vucpmy5qIFGgFLEKDWhso8kSyvhXffBF5NcD9Z6l47xbcVijRTkrlyR +CDzvAgMBAAGjUzBRMB0GA1UdDgQWBBTbPRtPg3rhDEHGpp4GEajwP7G+GDAfBgNV +HSMEGDAWgBTbPRtPg3rhDEHGpp4GEajwP7G+GDAPBgNVHRMBAf8EBTADAQH/MA0G +CSqGSIb3DQEBCwUAA4ICAQAyCkyL/RtHOGLVXV2EPdgEfqjidddExUBS1gsD0axS +znFW2VO7jwjRVHYpOlOxSD8kJiZB3OpVzwMIcTZ+uan4lNIxl7RhPYflK854j0mS +LYKqv7XXChLaufQ4wuzJjlNf2WJdov1Ej1t3vVLHnJAXO37tzmp0r/yPjq6fUOEZ +YEcndh5eaVh3O0fJaFgRuYFvCv4VnxwBbCRIosEps/pBFFw5jsFCg7b3omT0ROOc +27lEceDTv1rnMBMP5ogpiCNF/lUpZFF9JNdtR4+NPfauWZk4/ZsssCRyCpj5igEZ +UeAXuTQg83KmtO/MXBdQCRkC1QnRaYpaQeMijG+J+8S50q9BlhoDGMgEuEFqj/KL +1wxlhorgtT+joBkxrPa+zb/t4n2K3Oq35Cse6hnW4k0ejEUXfl4z5jAfD6wJDt9f +ws4hoYYljC4s8MfG0ZL7K/OAM1fLM5+bAENZ9qsIMAAowV9/aIV7/HPiMqlQQjd6 +w6ym0cuGUfvT4mOnIPIQPSzaC+s68COZ/ZZojRmRUy5ukN2RrkapyHOiIqXGT4+0 +07vZjchTwT8TVo04c5RzxfUBmJhylRKIRhbDKWvSR7dNCEeVHW6Gty6/HWxikK3b +d6sYvraOMRhDN395SYuA5+P36uB+Z1mo94nTmiQDI+dFsEON2hbTebyQE+bTzyvO +Jg== +-----END CERTIFICATE-----
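
For reference, the instruction decoding done by do_alignment_fixup() and ls_pair_fixup() in the alignment patch above can be sanity-checked from plain userspace C. The sketch below is an illustration rather than part of any patch: it decodes 0xa9000c02, the "stp x2, x3, [x0]" recorded in the patch's own list of seen faults, using the same shifts and masks as the kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t instr = 0xa9000c02; /* stp x2, x3, [x0], from the fault list */

    /* top-level dispatch, as in do_alignment_fixup() */
    uint8_t op0 = (instr >> 25) & 0xf;   /* 0x4 -> load/store group */

    /* pair decode, as in ls_fixup()/ls_pair_fixup() */
    uint8_t group = (instr >> 28) & 0xf; /* (group & 3) == 2 -> pair */
    uint8_t opc = (instr >> 30) & 3;     /* 2 -> 64-bit registers */
    uint8_t load = (instr >> 22) & 1;    /* 0 -> store */
    uint8_t Rt = instr & 0x1f;
    uint8_t Rt2 = (instr >> 10) & 0x1f;
    uint8_t Rn = (instr >> 5) & 0x1f;
    int64_t imm = (instr >> 15) & 0x7f;

    if (imm & 0x40)             /* sign-extend the 7-bit immediate */
        imm |= ~0x7fLL;
    imm <<= (opc == 2) ? 3 : 2; /* scale by the access size */

    printf("op0=0x%x pair=%d %s x%d, x%d, [x%d, #%lld]\n",
           op0, (group & 3) == 2, load ? "ldp" : "stp",
           Rt, Rt2, Rn, (long long)imm);
    return 0; /* prints: op0=0x4 pair=1 stp x2, x3, [x0, #0] */
}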