Skip to content

Commit 693eb0b

Browse files
vmustyaigcbot
authored andcommitted
Extend dpas loop peeling heuristic in VC
VC should match more complex dpas loop cases, when the accumulator vector is joined and accessed by rdregion operations.
1 parent 91ea128 commit 693eb0b

File tree

2 files changed

+186
-29
lines changed

2 files changed

+186
-29
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp

+71-29
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ static cl::opt<unsigned> PeelLoopDpasNullAccMaxBlocks(
8888
cl::desc("Max number of a loop basic blocks to peel, when the loop has "
8989
"dpas instructions with zero-initialized accumulator operand"));
9090
static cl::opt<unsigned> PeelLoopDpasNullAccMaxInstr(
91-
"vc-peel-loops-dpas-null-acc-max-instr", cl::init(128), cl::Hidden,
91+
"vc-peel-loops-dpas-null-acc-max-instr", cl::init(192), cl::Hidden,
9292
cl::desc("Max number of a loop instructions to peel, when the loop has "
9393
"dpas instructions with zero-initialized accumulator operand"));
9494
static cl::opt<unsigned> PeelLoopDpasNullAccMin(
@@ -102,6 +102,23 @@ static std::string getDL(bool Is64Bit) {
102102
: "e-p:32:32-p3:32:32-p6:32:32-i64:64-n8:16:32:64";
103103
}
104104

105+
namespace {
106+
bool isDpasAccumulator(const Value *V) {
107+
if (!V->hasOneUse())
108+
return false;
109+
110+
const auto *User = V->user_back();
111+
const auto IID = vc::getAnyIntrinsicID(User);
112+
113+
if (IID != GenXIntrinsic::genx_dpas && IID != GenXIntrinsic::genx_dpas2)
114+
return true;
115+
116+
const auto *CI = cast<CallInst>(User);
117+
return CI->getArgOperand(0) == V;
118+
}
119+
120+
} // namespace
121+
105122
namespace llvm {
106123

107124
//===----------------------------------------------------------------------===//
@@ -255,35 +272,60 @@ void GenXTTIImpl::getPeelingPreferences(
255272
if (L->getNumBlocks() > PeelLoopDpasNullAccMaxBlocks)
256273
return;
257274

275+
// Match the following two cases:
276+
// 1. Dpas accumulator is a phi node with a zero value from the pre-loop
277+
// basic block.
278+
// %acc = phi <N x Ty> [ zeroinitializer, %entry ], [ %dst, %loop ]
279+
// %dst = call @llvm.genx.dpas(%acc, ...)
280+
// 2. Dpas accumulator is a rdregion taking a piece of a large vector. The
281+
// large vector is a phi node with a zero value taken from the pre-loop basic
282+
// block.
283+
// %phi = phi <N x ty>
284+
// %acc = call @llvm.genx.rdregion(%phi, ...)
285+
// %dst = call @llvm.genx.dpas(%acc, ...)
286+
// %wrregion = call @llvm.genx.wrregion(%phi, %dst, ...)
287+
258288
const auto PhiNodes = L->getHeader()->phis();
259-
const unsigned NumDpasZeroAcc = llvm::count_if(PhiNodes, [](const auto &Phi) {
260-
// Check vector Phi nodes
261-
if (!Phi.getType()->isVectorTy() || !Phi.hasOneUse() ||
262-
Phi.getNumIncomingValues() != 2)
263-
return false;
264-
265-
// The only user of the Phi node must be dpas intrinsic
266-
const auto *User = Phi.user_back();
267-
268-
// Only check dpas intrinsics
269-
switch (vc::getAnyIntrinsicID(User)) {
270-
default:
271-
return false;
272-
case GenXIntrinsic::genx_dpas:
273-
case GenXIntrinsic::genx_dpas2:
274-
break;
275-
}
276-
277-
// Only allow dpas accumulator
278-
if (const auto *CI = cast<CallInst>(User); CI->getArgOperand(0) != &Phi)
279-
return false;
280-
281-
// Check if one of the Phi inputs is constant zero
282-
return llvm::any_of(Phi.incoming_values(), [](const auto &V) {
283-
const auto *C = dyn_cast<Constant>(&V);
284-
return C && C->isZeroValue();
285-
});
286-
});
289+
const uint64_t NumDpasZeroAcc = std::accumulate(
290+
std::begin(PhiNodes), std::end(PhiNodes), 0ull,
291+
[this](uint64_t Acc, const auto &Phi) {
292+
// Check vector Phi nodes
293+
const auto *PhiVTy = dyn_cast<IGCLLVM::FixedVectorType>(Phi.getType());
294+
if (!PhiVTy || Phi.getNumIncomingValues() != 2)
295+
return Acc;
296+
297+
// Check if one of the Phi inputs is constant zero
298+
bool IsZeroAcc = llvm::any_of(Phi.incoming_values(), [](const auto &V) {
299+
const auto *C = dyn_cast<Constant>(&V);
300+
return C && C->isZeroValue();
301+
});
302+
if (!IsZeroAcc)
303+
return Acc;
304+
305+
// Simple case. The only user of the Phi node must be dpas intrinsic.
306+
if (isDpasAccumulator(&Phi))
307+
return Acc + 1;
308+
309+
// If vector decomposition is disabled, we cannot simplify the region
310+
// access chain. So, it doesn't make sense to peel the loop.
311+
if (ST.disableVectorDecomposition())
312+
return Acc;
313+
314+
if (!Phi.hasNUses(2))
315+
return Acc;
316+
317+
// Find the rdregion used as a dpas accumulator
318+
const auto It = llvm::find_if(Phi.users(), [](const Value *V) {
319+
return GenXIntrinsic::isRdRegion(V) && isDpasAccumulator(V);
320+
});
321+
if (It == Phi.user_end())
322+
return Acc;
323+
324+
// Assume that the whole phi node value is used as an accumulator source
325+
// for multiple similar dpas instructions.
326+
const auto *DpasVTy = cast<IGCLLVM::FixedVectorType>((*It)->getType());
327+
return Acc + (PhiVTy->getNumElements() / DpasVTy->getNumElements());
328+
});
287329

288330
if (NumDpasZeroAcc < PeelLoopDpasNullAccMin)
289331
return;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: %opt %use_old_pass_manager% -loop-unroll -vc-peel-loops-dpas-null-acc=true -march=genx64 -mcpu=XeHPC -S < %s | FileCheck %s
10+
11+
target datalayout = "e-p:64:64-p3:32:32-p6:32:32-i64:64-n8:16:32:64"
12+
target triple = "spir64-unknown-unknown"
13+
14+
; Function Attrs: nofree nosync nounwind readnone
15+
declare <64 x i32> @llvm.genx.rdregioni.v64i32.v256i32.i16(<256 x i32>, i32, i32, i32, i16, i32) #0
16+
17+
; Function Attrs: nofree nosync nounwind readnone
18+
declare <128 x i32> @llvm.genx.rdregioni.v128i32.v512i32.i16(<512 x i32>, i32, i32, i32, i16, i32) #0
19+
20+
; Function Attrs: nofree nosync nounwind readnone
21+
declare <512 x i32> @llvm.genx.wrregioni.v512i32.v128i32.i16.i1(<512 x i32>, <128 x i32>, i32, i32, i32, i16, i32, i1) #0
22+
23+
; Function Attrs: nofree nosync nounwind readnone
24+
declare <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>, <128 x i32>, <64 x i32>, i32, i32, i32, i32, i32, i32) #0
25+
26+
; Function Attrs: noinline nounwind
27+
define dllexport spir_kernel void @kernel(i8 addrspace(1)* %0, i8 addrspace(1)* %1, i8 addrspace(1)* %2) local_unnamed_addr #1 !spirv.ParameterDecorations !8 !intel_reqd_sub_group_size !11 {
28+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
29+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
30+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
31+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
32+
%4 = ptrtoint i8 addrspace(1)* %1 to i64
33+
%5 = ptrtoint i8 addrspace(1)* %2 to i64
34+
br label %6
35+
36+
6: ; preds = %6, %3
37+
; CHECK: phi <512 x i32>
38+
39+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
40+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
41+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
42+
; CHECK: call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32>
43+
44+
%indvars.iv159 = phi i64 [ 0, %3 ], [ %indvars.iv.next160, %6 ]
45+
%indvars.iv = phi i64 [ 0, %3 ], [ %indvars.iv.next, %6 ]
46+
%.0140155 = phi i32 [ 0, %3 ], [ %23, %6 ]
47+
%phiacc = phi <512 x i32> [ zeroinitializer, %3 ], [ %dst3, %6 ]
48+
%7 = shl nsw i64 %indvars.iv159, 2
49+
%8 = add i64 %7, %4
50+
%9 = inttoptr i64 %8 to <256 x i32> addrspace(1)*
51+
%10 = load <256 x i32>, <256 x i32> addrspace(1)* %9, align 16
52+
%11 = shl nsw i64 %indvars.iv, 2
53+
%12 = add i64 %11, %5
54+
%13 = inttoptr i64 %12 to <128 x i32> addrspace(1)*
55+
%14 = load <128 x i32>, <128 x i32> addrspace(1)* %13, align 16
56+
%indvars.iv.next160 = add nuw nsw i64 %indvars.iv159, 256
57+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 128
58+
59+
%15 = call <64 x i32> @llvm.genx.rdregioni.v64i32.v256i32.i16(<256 x i32> %10, i32 8, i32 8, i32 1, i16 0, i32 8)
60+
%acc0 = call <128 x i32> @llvm.genx.rdregioni.v128i32.v512i32.i16(<512 x i32> %phiacc, i32 1, i32 1, i32 0, i16 0, i32 0)
61+
%16 = call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32> %acc0, <128 x i32> %14, <64 x i32> %15, i32 8, i32 8, i32 8, i32 8, i32 1, i32 1)
62+
%dst0 = call <512 x i32> @llvm.genx.wrregioni.v512i32.v128i32.i16.i1(<512 x i32> %phiacc, <128 x i32> %16, i32 1, i32 1, i32 0, i16 0, i32 0, i1 true)
63+
64+
%17 = call <64 x i32> @llvm.genx.rdregioni.v64i32.v256i32.i16(<256 x i32> %10, i32 8, i32 8, i32 1, i16 256, i32 8)
65+
%acc1 = call <128 x i32> @llvm.genx.rdregioni.v128i32.v512i32.i16(<512 x i32> %dst0, i32 1, i32 1, i32 0, i16 512, i32 0)
66+
%18 = call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32> %acc1, <128 x i32> %14, <64 x i32> %15, i32 8, i32 8, i32 8, i32 8, i32 1, i32 1)
67+
%dst1 = call <512 x i32> @llvm.genx.wrregioni.v512i32.v128i32.i16.i1(<512 x i32> %dst0, <128 x i32> %18, i32 1, i32 1, i32 0, i16 512, i32 0, i1 true)
68+
69+
%19 = call <64 x i32> @llvm.genx.rdregioni.v64i32.v256i32.i16(<256 x i32> %10, i32 8, i32 8, i32 1, i16 512, i32 8)
70+
%acc2 = call <128 x i32> @llvm.genx.rdregioni.v128i32.v512i32.i16(<512 x i32> %dst1, i32 1, i32 1, i32 0, i16 1024, i32 0)
71+
%20 = call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32> %acc2, <128 x i32> %14, <64 x i32> %15, i32 8, i32 8, i32 8, i32 8, i32 1, i32 1)
72+
%dst2 = call <512 x i32> @llvm.genx.wrregioni.v512i32.v128i32.i16.i1(<512 x i32> %dst1, <128 x i32> %20, i32 1, i32 1, i32 0, i16 1024, i32 0, i1 true)
73+
74+
%21 = call <64 x i32> @llvm.genx.rdregioni.v64i32.v256i32.i16(<256 x i32> %10, i32 8, i32 8, i32 1, i16 768, i32 8)
75+
%acc3 = call <128 x i32> @llvm.genx.rdregioni.v128i32.v512i32.i16(<512 x i32> %dst2, i32 1, i32 1, i32 0, i16 1536, i32 0)
76+
%22 = call <128 x i32> @llvm.genx.dpas2.v128i32.v128i32.v128i32.v64i32(<128 x i32> %acc3, <128 x i32> %14, <64 x i32> %15, i32 8, i32 8, i32 8, i32 8, i32 1, i32 1)
77+
%dst3 = call <512 x i32> @llvm.genx.wrregioni.v512i32.v128i32.i16.i1(<512 x i32> %dst2, <128 x i32> %22, i32 1, i32 1, i32 0, i16 1536, i32 0, i1 true)
78+
79+
%23 = add nuw nsw i32 %.0140155, 1
80+
%exitcond.not = icmp eq i32 %23, 16
81+
br i1 %exitcond.not, label %24, label %6
82+
83+
24: ; preds = %6
84+
%res = phi <512 x i32> [ %dst3, %6 ]
85+
%25 = ptrtoint i8 addrspace(1)* %0 to i64
86+
%26 = bitcast i8 addrspace(1)* %0 to <512 x i32> addrspace(1)*
87+
store <512 x i32> %res, <512 x i32> addrspace(1)* %26, align 16
88+
ret void
89+
}
90+
91+
attributes #0 = { nofree nosync nounwind readnone "target-cpu"="XeHPC" }
92+
attributes #1 = { noinline nounwind "CMGenxMain" "oclrt"="1" "target-cpu"="XeHPC" }
93+
94+
!spirv.MemoryModel = !{!0}
95+
!opencl.enable.FP_CONTRACT = !{}
96+
!spirv.Source = !{!1}
97+
!opencl.spir.version = !{!2}
98+
!opencl.ocl.version = !{!1}
99+
!opencl.used.extensions = !{!3}
100+
!opencl.used.optional.core.features = !{!3}
101+
!spirv.Generator = !{!4}
102+
!genx.kernels = !{!5}
103+
104+
!0 = !{i32 2, i32 2}
105+
!1 = !{i32 0, i32 0}
106+
!2 = !{i32 1, i32 2}
107+
!3 = !{}
108+
!4 = !{i16 6, i16 14}
109+
!5 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @kernel, !"kernel", !6, i32 0, i32 0, !6, !7, i32 0}
110+
!6 = !{i32 0, i32 0, i32 0}
111+
!7 = !{!"svmptr_t", !"svmptr_t", !"svmptr_t"}
112+
!8 = !{!9, !9, !9}
113+
!9 = !{!10}
114+
!10 = !{i32 5625, i32 0}
115+
!11 = !{i32 1}

0 commit comments

Comments
 (0)