Skip to content

Commit f83afe6

Browse files
davemgreen authored and tstellar committed
[ARM] Ensure instructions are simplified prior to GatherScatter lowering.
Surprisingly, not all instructions are always simplified after unrolling and before MVE gather/scatter lowering. Notably, dead gather operations can be left around, which cause the gather/scatter lowering pass to crash if there are multiple gathers, some of which are dead. This patch ensures they are simplified before we modify anything, which can change some of the existing tests, including making them no longer test what they originally tested. To keep those tests as they were, this patch uses a combination of disabling the gather/scatter lowering pass and adjusting the tests. Differential Revision: https://reviews.llvm.org/D103150
1 parent 1a8f0b9 commit f83afe6

File tree

9 files changed

+96
-52
lines changed

9 files changed

+96
-52
lines changed

llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -1166,6 +1166,8 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
11661166
bool Changed = false;
11671167

11681168
for (BasicBlock &BB : F) {
1169+
SimplifyInstructionsInBlock(&BB);
1170+
11691171
for (Instruction &I : BB) {
11701172
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
11711173
if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&

llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s
2+
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s
33

44
define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
55
; CHECK-LABEL: remat_vctp:

llvm/test/CodeGen/Thumb2/lsll0.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
33

44
define void @_Z4loopPxS_iS_i(i64* %d) {
55
; CHECK-LABEL: _Z4loopPxS_iS_i:

llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

+42-32
Original file line numberDiff line numberDiff line change
@@ -321,26 +321,29 @@ end:
321321
ret void;
322322
}
323323

324-
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
324+
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
325325
; CHECK-LABEL: non_gatscat_use1:
326326
; CHECK: @ %bb.0: @ %vector.ph
327-
; CHECK-NEXT: .vsave {d8, d9}
328-
; CHECK-NEXT: vpush {d8, d9}
329-
; CHECK-NEXT: adr r3, .LCPI7_0
330-
; CHECK-NEXT: vmov.i32 q0, #0x8
331-
; CHECK-NEXT: vldrw.u32 q2, [r3]
327+
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
328+
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
329+
; CHECK-NEXT: adr.w r12, .LCPI7_0
330+
; CHECK-NEXT: vmov.i32 q0, #0x9
331+
; CHECK-NEXT: vldrw.u32 q3, [r12]
332332
; CHECK-NEXT: vmov.i32 q1, #0xc
333+
; CHECK-NEXT: vmov.i32 q2, #0x8
333334
; CHECK-NEXT: .LBB7_1: @ %vector.body
334335
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
335-
; CHECK-NEXT: vadd.i32 q3, q2, q0
336-
; CHECK-NEXT: vmlas.u32 q2, q1, r0
337-
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
336+
; CHECK-NEXT: vadd.i32 q4, q3, q2
337+
; CHECK-NEXT: vmul.i32 q5, q3, q0
338+
; CHECK-NEXT: vmlas.u32 q3, q1, r0
338339
; CHECK-NEXT: subs r2, #4
339-
; CHECK-NEXT: vmov q2, q3
340-
; CHECK-NEXT: vstrb.8 q4, [r1], #16
340+
; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
341+
; CHECK-NEXT: vmov q3, q4
342+
; CHECK-NEXT: vstrw.32 q5, [r3]
343+
; CHECK-NEXT: vstrb.8 q6, [r1], #16
341344
; CHECK-NEXT: bne .LBB7_1
342345
; CHECK-NEXT: @ %bb.2: @ %end
343-
; CHECK-NEXT: vpop {d8, d9}
346+
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
344347
; CHECK-NEXT: bx lr
345348
; CHECK-NEXT: .p2align 4
346349
; CHECK-NEXT: @ %bb.3:
@@ -364,6 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
364367
%4 = bitcast i32* %3 to <4 x i32>*
365368
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
366369
%non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
370+
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
367371
%index.next = add i32 %index, 4
368372
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
369373
%5 = icmp eq i32 %index.next, %n.vec
@@ -373,26 +377,31 @@ end:
373377
ret void;
374378
}
375379

376-
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
380+
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
377381
; CHECK-LABEL: non_gatscat_use2:
378382
; CHECK: @ %bb.0: @ %vector.ph
379-
; CHECK-NEXT: .vsave {d8, d9}
380-
; CHECK-NEXT: vpush {d8, d9}
381-
; CHECK-NEXT: adr r3, .LCPI8_0
382-
; CHECK-NEXT: vmov.i32 q0, #0x8
383-
; CHECK-NEXT: vldrw.u32 q2, [r3]
384-
; CHECK-NEXT: vmov.i32 q1, #0xc
383+
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
384+
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
385+
; CHECK-NEXT: adr.w r12, .LCPI8_0
386+
; CHECK-NEXT: vmov.i32 q0, #0x12
387+
; CHECK-NEXT: vldrw.u32 q4, [r12]
388+
; CHECK-NEXT: vmov.i32 q1, #0x9
389+
; CHECK-NEXT: vmov.i32 q2, #0x8
390+
; CHECK-NEXT: vmov.i32 q3, #0xc
385391
; CHECK-NEXT: .LBB8_1: @ %vector.body
386392
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
387-
; CHECK-NEXT: vadd.i32 q3, q2, q0
388-
; CHECK-NEXT: vmlas.u32 q2, q1, r0
389-
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
393+
; CHECK-NEXT: vadd.i32 q5, q4, q2
394+
; CHECK-NEXT: vmul.i32 q6, q4, q1
395+
; CHECK-NEXT: vmlas.u32 q4, q3, r0
390396
; CHECK-NEXT: subs r2, #4
391-
; CHECK-NEXT: vmov q2, q3
392-
; CHECK-NEXT: vstrb.8 q4, [r1], #16
397+
; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
398+
; CHECK-NEXT: vadd.i32 q4, q6, q0
399+
; CHECK-NEXT: vstrw.32 q4, [r3]
400+
; CHECK-NEXT: vmov q4, q5
401+
; CHECK-NEXT: vstrb.8 q7, [r1], #16
393402
; CHECK-NEXT: bne .LBB8_1
394403
; CHECK-NEXT: @ %bb.2: @ %end
395-
; CHECK-NEXT: vpop {d8, d9}
404+
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
396405
; CHECK-NEXT: bx lr
397406
; CHECK-NEXT: .p2align 4
398407
; CHECK-NEXT: @ %bb.3:
@@ -416,6 +425,7 @@ vector.body: ; preds = %vector.body, %vecto
416425
%4 = bitcast i32* %3 to <4 x i32>*
417426
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
418427
%non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
428+
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
419429
%index.next = add i32 %index, 4
420430
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
421431
%5 = icmp eq i32 %index.next, %n.vec
@@ -844,12 +854,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
844854
; CHECK-NEXT: add.w r8, r7, #10
845855
; CHECK-NEXT: adr r7, .LCPI11_0
846856
; CHECK-NEXT: ldr r1, [sp, #96]
847-
; CHECK-NEXT: vdup.32 q1, r2
848-
; CHECK-NEXT: vldrw.u32 q0, [r7]
857+
; CHECK-NEXT: vdup.32 q0, r2
858+
; CHECK-NEXT: vldrw.u32 q1, [r7]
849859
; CHECK-NEXT: mov.w r10, #0
850860
; CHECK-NEXT: mov.w r9, #6
851861
; CHECK-NEXT: movs r6, #11
852-
; CHECK-NEXT: vshl.i32 q1, q1, #2
862+
; CHECK-NEXT: vshl.i32 q0, q0, #2
853863
; CHECK-NEXT: movs r5, #0
854864
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
855865
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -884,10 +894,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
884894
; CHECK-NEXT: mul r4, r11, r6
885895
; CHECK-NEXT: vdup.32 q3, r5
886896
; CHECK-NEXT: vdup.32 q2, r7
887-
; CHECK-NEXT: vadd.i32 q4, q0, r4
897+
; CHECK-NEXT: vadd.i32 q4, q1, r4
888898
; CHECK-NEXT: vmla.u32 q3, q4, r2
889899
; CHECK-NEXT: adds r4, #113
890-
; CHECK-NEXT: vadd.i32 q4, q0, r4
900+
; CHECK-NEXT: vadd.i32 q4, q1, r4
891901
; CHECK-NEXT: mov r4, r8
892902
; CHECK-NEXT: vmla.u32 q2, q4, r2
893903
; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -897,8 +907,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
897907
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
898908
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
899909
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
900-
; CHECK-NEXT: vadd.i32 q5, q2, q1
901-
; CHECK-NEXT: vadd.i32 q4, q3, q1
910+
; CHECK-NEXT: vadd.i32 q5, q2, q0
911+
; CHECK-NEXT: vadd.i32 q4, q3, q0
902912
; CHECK-NEXT: subs r4, #4
903913
; CHECK-NEXT: vadd.i32 q2, q6, r2
904914
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
3+
4+
; This file has some unused gathers, making sure that they do not cause
5+
; problems as the function gets simplified.
6+
7+
define arm_aapcs_vfpcc void @unused1(<4 x i32*> %offs) {
8+
; CHECK-LABEL: unused1:
9+
; CHECK: @ %bb.0: @ %entry
10+
; CHECK-NEXT: bx lr
11+
entry:
12+
%gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
13+
ret void
14+
}
15+
16+
define arm_aapcs_vfpcc void @unused2(<4 x i32*> %offs) {
17+
; CHECK-LABEL: unused2:
18+
; CHECK: @ %bb.0: @ %entry
19+
; CHECK-NEXT: bx lr
20+
entry:
21+
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
22+
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
23+
ret void
24+
}
25+
26+
define arm_aapcs_vfpcc void @unused2_used(<4 x i32*> %offs) {
27+
; CHECK-LABEL: unused2_used:
28+
; CHECK: @ %bb.0: @ %entry
29+
; CHECK-NEXT: bx lr
30+
entry:
31+
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
32+
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
33+
%unused = add <4 x i32> %gather1, %gather2
34+
ret void
35+
}
36+
37+
38+
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

llvm/test/CodeGen/Thumb2/mve-phireg.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
2+
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
33

44
; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.
55

llvm/test/CodeGen/Thumb2/mve-pred-xor.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ entry:
170170
define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) {
171171
; CHECK-LABEL: cmpugez_v4i1:
172172
; CHECK: @ %bb.0: @ %entry
173-
; CHECK-NEXT: vcmp.i32 ne, q0, zr
174-
; CHECK-NEXT: vpsel q0, q0, q1
173+
; CHECK-NEXT: vcmp.i32 eq, q0, zr
174+
; CHECK-NEXT: vpsel q0, q1, q0
175175
; CHECK-NEXT: bx lr
176176
entry:
177177
%c1 = icmp eq <4 x i32> %a, zeroinitializer

llvm/test/CodeGen/Thumb2/mve-selectcc.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
33

44
define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) {
55
; CHECK-LABEL: test_v4i32:

llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll

+8-14
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,6 @@ entry:
7070
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
7171
; CHECK-LABEL: vqdmulh_i16_c:
7272
; CHECK: @ %bb.0: @ %entry
73-
; CHECK-NEXT: .vsave {d8, d9}
74-
; CHECK-NEXT: vpush {d8, d9}
7573
; CHECK-NEXT: vmov q2, q0
7674
; CHECK-NEXT: vmov.u16 r0, q0[2]
7775
; CHECK-NEXT: vmov.u16 r1, q0[0]
@@ -87,35 +85,32 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
8785
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
8886
; CHECK-NEXT: vmov.u16 r1, q2[4]
8987
; CHECK-NEXT: vmullb.s16 q0, q3, q0
90-
; CHECK-NEXT: vmov.i32 q3, #0x7fff
9188
; CHECK-NEXT: vshl.i32 q0, q0, #10
9289
; CHECK-NEXT: vshr.s32 q0, q0, #10
93-
; CHECK-NEXT: vshr.s32 q0, q0, #15
94-
; CHECK-NEXT: vmin.s32 q4, q0, q3
95-
; CHECK-NEXT: vmov r0, s16
90+
; CHECK-NEXT: vshr.s32 q3, q0, #15
91+
; CHECK-NEXT: vmov r0, s12
9692
; CHECK-NEXT: vmov.16 q0[0], r0
97-
; CHECK-NEXT: vmov r0, s17
93+
; CHECK-NEXT: vmov r0, s13
9894
; CHECK-NEXT: vmov.16 q0[1], r0
99-
; CHECK-NEXT: vmov r0, s18
95+
; CHECK-NEXT: vmov r0, s14
10096
; CHECK-NEXT: vmov.16 q0[2], r0
101-
; CHECK-NEXT: vmov r0, s19
97+
; CHECK-NEXT: vmov r0, s15
10298
; CHECK-NEXT: vmov.16 q0[3], r0
10399
; CHECK-NEXT: vmov.u16 r0, q2[6]
104-
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
100+
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
105101
; CHECK-NEXT: vmov.u16 r0, q2[7]
106102
; CHECK-NEXT: vmov.u16 r1, q2[5]
107-
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
103+
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
108104
; CHECK-NEXT: vmov.u16 r0, q1[6]
109105
; CHECK-NEXT: vmov.u16 r1, q1[4]
110106
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
111107
; CHECK-NEXT: vmov.u16 r0, q1[7]
112108
; CHECK-NEXT: vmov.u16 r1, q1[5]
113109
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
114-
; CHECK-NEXT: vmullb.s16 q1, q2, q4
110+
; CHECK-NEXT: vmullb.s16 q1, q2, q3
115111
; CHECK-NEXT: vshl.i32 q1, q1, #10
116112
; CHECK-NEXT: vshr.s32 q1, q1, #10
117113
; CHECK-NEXT: vshr.s32 q1, q1, #15
118-
; CHECK-NEXT: vmin.s32 q1, q1, q3
119114
; CHECK-NEXT: vmov r0, s4
120115
; CHECK-NEXT: vmov.16 q0[4], r0
121116
; CHECK-NEXT: vmov r0, s5
@@ -124,7 +119,6 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
124119
; CHECK-NEXT: vmov.16 q0[6], r0
125120
; CHECK-NEXT: vmov r0, s7
126121
; CHECK-NEXT: vmov.16 q0[7], r0
127-
; CHECK-NEXT: vpop {d8, d9}
128122
; CHECK-NEXT: bx lr
129123
entry:
130124
%l2 = sext <8 x i16> %s0 to <8 x i22>

0 commit comments

Comments
 (0)