@@ -321,26 +321,29 @@ end:
321
321
ret void ;
322
322
}
323
323
324
- define arm_aapcs_vfpcc void @non_gatscat_use1 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec ) {
324
+ define arm_aapcs_vfpcc void @non_gatscat_use1 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec , < 4 x i32 >* %x ) {
325
325
; CHECK-LABEL: non_gatscat_use1:
326
326
; CHECK: @ %bb.0: @ %vector.ph
327
- ; CHECK-NEXT: .vsave {d8, d9}
328
- ; CHECK-NEXT: vpush {d8, d9}
329
- ; CHECK-NEXT: adr r3 , .LCPI7_0
330
- ; CHECK-NEXT: vmov.i32 q0, #0x8
331
- ; CHECK-NEXT: vldrw.u32 q2 , [r3 ]
327
+ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
328
+ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
329
+ ; CHECK-NEXT: adr.w r12, .LCPI7_0
330
+ ; CHECK-NEXT: vmov.i32 q0, #0x9
331
+ ; CHECK-NEXT: vldrw.u32 q3, [r12]
332
332
; CHECK-NEXT: vmov.i32 q1, #0xc
333
+ ; CHECK-NEXT: vmov.i32 q2, #0x8
333
334
; CHECK-NEXT: .LBB7_1: @ %vector.body
334
335
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
335
- ; CHECK-NEXT: vadd.i32 q3, q2, q0
336
- ; CHECK-NEXT: vmlas.u32 q2, q1, r0
337
- ; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
336
+ ; CHECK-NEXT: vadd.i32 q4, q3, q2
337
+ ; CHECK-NEXT: vmul.i32 q5, q3, q0
338
+ ; CHECK-NEXT: vmlas.u32 q3, q1, r0
338
339
; CHECK-NEXT: subs r2, #4
339
- ; CHECK-NEXT: vmov q2, q3
340
- ; CHECK-NEXT: vstrb.8 q4, [r1], #16
340
+ ; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
341
+ ; CHECK-NEXT: vmov q3, q4
342
+ ; CHECK-NEXT: vstrw.32 q5, [r3]
343
+ ; CHECK-NEXT: vstrb.8 q6, [r1], #16
341
344
; CHECK-NEXT: bne .LBB7_1
342
345
; CHECK-NEXT: @ %bb.2: @ %end
343
- ; CHECK-NEXT: vpop {d8, d9}
346
+ ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
344
347
; CHECK-NEXT: bx lr
345
348
; CHECK-NEXT: .p2align 4
346
349
; CHECK-NEXT: @ %bb.3:
@@ -364,6 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
364
367
%4 = bitcast i32* %3 to <4 x i32 >*
365
368
store <4 x i32 > %wide.masked.gather , <4 x i32 >* %4 , align 4
366
369
%non_gatscat_use = mul <4 x i32 > %0 , <i32 3 , i32 3 , i32 3 , i32 3 >
370
+ store <4 x i32 > %non_gatscat_use , <4 x i32 >* %x , align 4
367
371
%index.next = add i32 %index , 4
368
372
%vec.ind.next = add <4 x i32 > %vec.ind , <i32 8 , i32 8 , i32 8 , i32 8 >
369
373
%5 = icmp eq i32 %index.next , %n.vec
@@ -373,26 +377,31 @@ end:
373
377
ret void ;
374
378
}
375
379
376
- define arm_aapcs_vfpcc void @non_gatscat_use2 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec ) {
380
+ define arm_aapcs_vfpcc void @non_gatscat_use2 (i32* noalias nocapture readonly %data , i32* noalias nocapture %dst , i32 %n.vec , < 4 x i32 >* %x ) {
377
381
; CHECK-LABEL: non_gatscat_use2:
378
382
; CHECK: @ %bb.0: @ %vector.ph
379
- ; CHECK-NEXT: .vsave {d8, d9}
380
- ; CHECK-NEXT: vpush {d8, d9}
381
- ; CHECK-NEXT: adr r3, .LCPI8_0
382
- ; CHECK-NEXT: vmov.i32 q0, #0x8
383
- ; CHECK-NEXT: vldrw.u32 q2, [r3]
384
- ; CHECK-NEXT: vmov.i32 q1, #0xc
383
+ ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
384
+ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
385
+ ; CHECK-NEXT: adr.w r12, .LCPI8_0
386
+ ; CHECK-NEXT: vmov.i32 q0, #0x12
387
+ ; CHECK-NEXT: vldrw.u32 q4, [r12]
388
+ ; CHECK-NEXT: vmov.i32 q1, #0x9
389
+ ; CHECK-NEXT: vmov.i32 q2, #0x8
390
+ ; CHECK-NEXT: vmov.i32 q3, #0xc
385
391
; CHECK-NEXT: .LBB8_1: @ %vector.body
386
392
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
387
- ; CHECK-NEXT: vadd.i32 q3, q2, q0
388
- ; CHECK-NEXT: vmlas.u32 q2, q1, r0
389
- ; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
393
+ ; CHECK-NEXT: vadd.i32 q5, q4, q2
394
+ ; CHECK-NEXT: vmul.i32 q6, q4, q1
395
+ ; CHECK-NEXT: vmlas.u32 q4, q3, r0
390
396
; CHECK-NEXT: subs r2, #4
391
- ; CHECK-NEXT: vmov q2, q3
392
- ; CHECK-NEXT: vstrb.8 q4, [r1], #16
397
+ ; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
398
+ ; CHECK-NEXT: vadd.i32 q4, q6, q0
399
+ ; CHECK-NEXT: vstrw.32 q4, [r3]
400
+ ; CHECK-NEXT: vmov q4, q5
401
+ ; CHECK-NEXT: vstrb.8 q7, [r1], #16
393
402
; CHECK-NEXT: bne .LBB8_1
394
403
; CHECK-NEXT: @ %bb.2: @ %end
395
- ; CHECK-NEXT: vpop {d8, d9}
404
+ ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
396
405
; CHECK-NEXT: bx lr
397
406
; CHECK-NEXT: .p2align 4
398
407
; CHECK-NEXT: @ %bb.3:
@@ -416,6 +425,7 @@ vector.body: ; preds = %vector.body, %vecto
416
425
%4 = bitcast i32* %3 to <4 x i32 >*
417
426
store <4 x i32 > %wide.masked.gather , <4 x i32 >* %4 , align 4
418
427
%non_gatscat_use = mul <4 x i32 > %1 , <i32 3 , i32 3 , i32 3 , i32 3 >
428
+ store <4 x i32 > %non_gatscat_use , <4 x i32 >* %x , align 4
419
429
%index.next = add i32 %index , 4
420
430
%vec.ind.next = add <4 x i32 > %vec.ind , <i32 8 , i32 8 , i32 8 , i32 8 >
421
431
%5 = icmp eq i32 %index.next , %n.vec
@@ -844,12 +854,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
844
854
; CHECK-NEXT: add.w r8, r7, #10
845
855
; CHECK-NEXT: adr r7, .LCPI11_0
846
856
; CHECK-NEXT: ldr r1, [sp, #96]
847
- ; CHECK-NEXT: vdup.32 q1 , r2
848
- ; CHECK-NEXT: vldrw.u32 q0 , [r7]
857
+ ; CHECK-NEXT: vdup.32 q0, r2
858
+ ; CHECK-NEXT: vldrw.u32 q1, [r7]
849
859
; CHECK-NEXT: mov.w r10, #0
850
860
; CHECK-NEXT: mov.w r9, #6
851
861
; CHECK-NEXT: movs r6, #11
852
- ; CHECK-NEXT: vshl.i32 q1, q1 , #2
862
+ ; CHECK-NEXT: vshl.i32 q0, q0, #2
853
863
; CHECK-NEXT: movs r5, #0
854
864
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
855
865
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -884,10 +894,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
884
894
; CHECK-NEXT: mul r4, r11, r6
885
895
; CHECK-NEXT: vdup.32 q3, r5
886
896
; CHECK-NEXT: vdup.32 q2, r7
887
- ; CHECK-NEXT: vadd.i32 q4, q0 , r4
897
+ ; CHECK-NEXT: vadd.i32 q4, q1, r4
888
898
; CHECK-NEXT: vmla.u32 q3, q4, r2
889
899
; CHECK-NEXT: adds r4, #113
890
- ; CHECK-NEXT: vadd.i32 q4, q0 , r4
900
+ ; CHECK-NEXT: vadd.i32 q4, q1, r4
891
901
; CHECK-NEXT: mov r4, r8
892
902
; CHECK-NEXT: vmla.u32 q2, q4, r2
893
903
; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -897,8 +907,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
897
907
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
898
908
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
899
909
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
900
- ; CHECK-NEXT: vadd.i32 q5, q2, q1
901
- ; CHECK-NEXT: vadd.i32 q4, q3, q1
910
+ ; CHECK-NEXT: vadd.i32 q5, q2, q0
911
+ ; CHECK-NEXT: vadd.i32 q4, q3, q0
902
912
; CHECK-NEXT: subs r4, #4
903
913
; CHECK-NEXT: vadd.i32 q2, q6, r2
904
914
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
0 commit comments