Skip to content

Commit 046c2e8

Browse files
mbelicki authored and igcbot committed
Add LSC prefetch, new API to access 2d block write from OpenCL-C
This patch adds support for prefetch instructions and adds an additional API to emit 2D block writes from OpenCL-C.
1 parent 0b45589 commit 046c2e8

File tree

7 files changed

+135
-4
lines changed

7 files changed

+135
-4
lines changed

IGC/BiFModule/Languages/OpenCL/IBiF_Sub_Groups.cl

+54
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,60 @@ DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_FLAT(intel_subgroup_block_read_transpose_u64
870870

871871
#endif // defined(cl_intel_subgroup_extended_block_read)
872872

873+
#if defined(cl_intel_subgroup_extended_block_read_cacheopts)
874+
// Defines FUNC_NAME, a wrapper returning TYPE around the builtin INTERNAL_FUNC
// that additionally takes a load cache-control argument (enum LSC_LDCC).
// The wrapper reinterprets base_address as a long via as_long() and forwards
// width, height and pitch each decremented by one; coord and cache_control are
// passed through unchanged.
// This macro is also instantiated with TYPE = void for the
// intel_subgroup_block_prefetch_* wrappers; in that case the `return`
// statement forwards the void result of INTERNAL_FUNC.
#define DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(FUNC_NAME, TYPE, INTERNAL_FUNC) \
875+
INLINE TYPE FUNC_NAME( __global void* base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control ) \
876+
{ \
877+
long baseoffset = as_long(base_address); \
878+
int width_minus_one = width - 1; \
879+
int height_minus_one = height - 1; \
880+
int pitch_minus_one = pitch - 1; \
881+
return INTERNAL_FUNC(baseoffset, width_minus_one, height_minus_one, pitch_minus_one, coord, cache_control); \
882+
}
883+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u8_m1k32v2, ushort2, __builtin_IB_subgroup_block_read_cacheopts_u8_m1k32v2)
884+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u8_m2k32v2, ushort4, __builtin_IB_subgroup_block_read_cacheopts_u8_m2k32v2)
885+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u8_m4k32v2, ushort8, __builtin_IB_subgroup_block_read_cacheopts_u8_m4k32v2)
886+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u8_m8k32v2, ushort16, __builtin_IB_subgroup_block_read_cacheopts_u8_m8k32v2)
887+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u16_m1k16v2, ushort2, __builtin_IB_subgroup_block_read_cacheopts_u16_m1k16v2)
888+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u16_m2k16v2, ushort4, __builtin_IB_subgroup_block_read_cacheopts_u16_m2k16v2)
889+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u16_m4k16v2, ushort8, __builtin_IB_subgroup_block_read_cacheopts_u16_m4k16v2)
890+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_u16_m8k16v2, ushort16, __builtin_IB_subgroup_block_read_cacheopts_u16_m8k16v2)
891+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_transform_u8_k32, uint8, __builtin_IB_subgroup_block_read_cacheopts_transform_u8_k32)
892+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_transform_u16_k16, uint8, __builtin_IB_subgroup_block_read_cacheopts_transform_u16_k16)
893+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_transpose_u32_k8, uint8, __builtin_IB_subgroup_block_read_cacheopts_transpose_u32_k8)
894+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_read_cacheopts_transpose_u64_k4, ulong4,__builtin_IB_subgroup_block_read_cacheopts_transpose_u64_k4)
895+
896+
// Defines FUNC_NAME, a void wrapper around the builtin INTERNAL_FUNC that
// writes `val` (of type TYPE) and additionally takes a store cache-control
// argument (enum LSC_STCC).
// The wrapper reinterprets base_address as a long via as_long() and forwards
// width, height and pitch each decremented by one. Argument order passed to
// INTERNAL_FUNC: base offset, width-1, height-1, pitch-1, coord, val,
// cache_control (val precedes cache_control).
#define DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(FUNC_NAME, TYPE, INTERNAL_FUNC) \
897+
INLINE void FUNC_NAME( __global void* base_address, int width, int height, int pitch, int2 coord, TYPE val, enum LSC_STCC cache_control ) \
898+
{ \
899+
long baseoffset = as_long(base_address); \
900+
int width_minus_one = width - 1; \
901+
int height_minus_one = height - 1; \
902+
int pitch_minus_one = pitch - 1; \
903+
INTERNAL_FUNC(baseoffset, width_minus_one, height_minus_one, pitch_minus_one, coord, val, cache_control); \
904+
}
905+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u8_m1k32v1, ushort, __builtin_IB_subgroup_block_write_cacheopts_u8_m1k32v1)
906+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u8_m2k32v1, ushort2, __builtin_IB_subgroup_block_write_cacheopts_u8_m2k32v1)
907+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u8_m4k32v1, ushort4, __builtin_IB_subgroup_block_write_cacheopts_u8_m4k32v1)
908+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u8_m8k32v1, ushort8, __builtin_IB_subgroup_block_write_cacheopts_u8_m8k32v1)
909+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u16_m1k16v1, ushort, __builtin_IB_subgroup_block_write_cacheopts_u16_m1k16v1)
910+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u16_m2k16v1, ushort2, __builtin_IB_subgroup_block_write_cacheopts_u16_m2k16v1)
911+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u16_m4k16v1, ushort4, __builtin_IB_subgroup_block_write_cacheopts_u16_m4k16v1)
912+
DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheopts_u16_m8k16v1, ushort8, __builtin_IB_subgroup_block_write_cacheopts_u16_m8k16v1)
913+
914+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u8_m1k32v2, void, __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v2)
915+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u8_m2k32v2, void, __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v2)
916+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u8_m4k32v2, void, __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v2)
917+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u8_m8k32v2, void, __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v2)
918+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u16_m1k16v2, void, __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v2)
919+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u16_m2k16v2, void, __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v2)
920+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u16_m4k16v2, void, __builtin_IB_subgroup_block_read_prefetch_u16_m4k16v2)
921+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_u16_m8k16v2, void, __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2)
922+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_transform_u8_k32, void, __builtin_IB_subgroup_block_read_prefetch_transform_u8_k32)
923+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_transform_u16_k16, void, __builtin_IB_subgroup_block_read_prefetch_transform_u16_k16)
924+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_transpose_u32_k8, void, __builtin_IB_subgroup_block_read_prefetch_transpose_u32_k8)
925+
DEFN_INTEL_SUB_GROUP_BLOCK_READ_LSC_CACHEOPTS(intel_subgroup_block_prefetch_transpose_u64_k4, void, __builtin_IB_subgroup_block_read_prefetch_transpose_u64_k4)
926+
#endif // defined(cl_intel_subgroup_extended_block_read_cacheopts)
873927

874928
#if defined(cl_khr_subgroup_shuffle)
875929
#define DEFN_SUB_GROUP_SHUFFLE(TYPE, SPV_TYPE, TYPE_ABBR) \

IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h

+40
Original file line numberDiff line numberDiff line change
@@ -2566,6 +2566,46 @@ ulong4 intel_subgroup_block_read_transpose_u64_k4(__global void *base_address,
25662566

25672567
#endif //defined(cl_intel_subgroup_extended_block_read)
25682568

2569+
#ifdef cl_intel_subgroup_extended_block_read_cacheopts
2570+
extern enum LSC_LDCC;
2571+
extern enum LSC_STCC;
2572+
2573+
ushort2 intel_subgroup_block_read_cacheopts_u8_m1k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2574+
ushort4 intel_subgroup_block_read_cacheopts_u8_m2k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2575+
ushort8 intel_subgroup_block_read_cacheopts_u8_m4k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2576+
ushort16 intel_subgroup_block_read_cacheopts_u8_m8k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2577+
ushort2 intel_subgroup_block_read_cacheopts_u16_m1k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2578+
ushort4 intel_subgroup_block_read_cacheopts_u16_m2k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2579+
ushort8 intel_subgroup_block_read_cacheopts_u16_m4k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2580+
ushort16 intel_subgroup_block_read_cacheopts_u16_m8k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2581+
uint8 intel_subgroup_block_read_cacheopts_transform_u8_k32(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2582+
uint8 intel_subgroup_block_read_cacheopts_transform_u16_k16(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2583+
uint8 intel_subgroup_block_read_cacheopts_transpose_u32_k8(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2584+
ulong4 intel_subgroup_block_read_cacheopts_transpose_u64_k4(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2585+
2586+
void intel_subgroup_block_write_cacheopts_u8_m1k32v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort val, enum LSC_STCC cache_control);
2587+
void intel_subgroup_block_write_cacheopts_u8_m2k32v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort2 val, enum LSC_STCC cache_control);
2588+
void intel_subgroup_block_write_cacheopts_u8_m4k32v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort4 val, enum LSC_STCC cache_control);
2589+
void intel_subgroup_block_write_cacheopts_u8_m8k32v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort8 val, enum LSC_STCC cache_control);
2590+
void intel_subgroup_block_write_cacheopts_u16_m1k16v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort val, enum LSC_STCC cache_control);
2591+
void intel_subgroup_block_write_cacheopts_u16_m2k16v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort2 val, enum LSC_STCC cache_control);
2592+
void intel_subgroup_block_write_cacheopts_u16_m4k16v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort4 val, enum LSC_STCC cache_control);
2593+
void intel_subgroup_block_write_cacheopts_u16_m8k16v1(__global void *base_address, int width, int height, int pitch, int2 coord, ushort8 val, enum LSC_STCC cache_control);
2594+
2595+
void intel_subgroup_block_prefetch_u8_m1k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2596+
void intel_subgroup_block_prefetch_u8_m2k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2597+
void intel_subgroup_block_prefetch_u8_m4k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2598+
void intel_subgroup_block_prefetch_u8_m8k32v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2599+
void intel_subgroup_block_prefetch_u16_m1k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2600+
void intel_subgroup_block_prefetch_u16_m2k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2601+
void intel_subgroup_block_prefetch_u16_m4k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2602+
void intel_subgroup_block_prefetch_u16_m8k16v2(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2603+
void intel_subgroup_block_prefetch_transform_u8_k32(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2604+
void intel_subgroup_block_prefetch_transform_u16_k16(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2605+
void intel_subgroup_block_prefetch_transpose_u32_k8(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2606+
void intel_subgroup_block_prefetch_transpose_u64_k4(__global void *base_address, int width, int height, int pitch, int2 coord, enum LSC_LDCC cache_control);
2607+
#endif //defined(cl_intel_subgroup_extended_block_read_cacheopts)
2608+
25692609
void global_barrier();
25702610

25712611
//

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -8844,6 +8844,7 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
88448844
case GenISAIntrinsic::GenISA_LSCAtomicFP32:
88458845
case GenISAIntrinsic::GenISA_LSCAtomicInts:
88468846
case GenISAIntrinsic::GenISA_LSC2DBlockRead:
8847+
case GenISAIntrinsic::GenISA_LSC2DBlockPrefetch:
88478848
case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
88488849
emitLSCIntrinsic(inst);
88498850
break;

IGC/Compiler/CISACodeGen/helper.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -1794,6 +1794,7 @@ namespace IGC
17941794
{
17951795
case GenISAIntrinsic::GenISA_simdBlockRead:
17961796
case GenISAIntrinsic::GenISA_LSC2DBlockRead:
1797+
case GenISAIntrinsic::GenISA_LSC2DBlockPrefetch:
17971798
case GenISAIntrinsic::GenISA_LSCLoad:
17981799
case GenISAIntrinsic::GenISA_LSCLoadBlock:
17991800
case GenISAIntrinsic::GenISA_LSCPrefetch:

IGC/Compiler/Optimizer/OpenCLPasses/LSCFuncs/LSCFuncsResolution.cpp

+20-4
Original file line numberDiff line numberDiff line change
@@ -374,8 +374,11 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
374374
unsigned int subGrpSize = funcInfoMD->getSubGroupSize()->getSIMDSize();
375375

376376
funcName.consume_front("_flat");
377+
bool isPrefetch = funcName.consume_front("_prefetch");
378+
bool hasCacheOpts = funcName.consume_front("_cacheopts") || isPrefetch;
377379
uint32_t isTranspose = funcName.consume_front("_transpose") ? 1 : 0;
378380
uint32_t isVnniTransform = funcName.consume_front("_transform") ? 1 : 0;
381+
hasCacheOpts |= funcName.consume_front("_cacheopts");
379382

380383
uint32_t elemSize = 0;
381384
if (funcName.consume_front("_u8"))
@@ -597,24 +600,37 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
597600
args.push_back(isVnniTransformConstant);
598601

599602

600-
args.push_back(getConstantInt32(LSC_L1DEF_L3DEF));
603+
if (hasCacheOpts)
604+
{
605+
unsigned cacheOptsId = isRead ? 5 : 6;
606+
args.push_back(getCacheControlOpts(cacheOptsId));
607+
}
608+
else
609+
{
610+
args.push_back(getConstantInt32(LSC_L1DEF_L3DEF));
611+
}
601612

602613
Function* BlockFunc = nullptr;
603614
if (isRead)
604615
{
605616
BlockFunc = GenISAIntrinsic::getDeclaration(
606617
CI.getCalledFunction()->getParent(),
607-
GenISAIntrinsic::GenISA_LSC2DBlockRead,
618+
isPrefetch ? GenISAIntrinsic::GenISA_LSC2DBlockPrefetch : GenISAIntrinsic::GenISA_LSC2DBlockRead,
608619
CI.getCalledFunction()->getReturnType());
609620
}
610621
else
611622
{
612623
uint32_t blockWriteDstOperandId = 5;
613-
args.push_back(CI.getArgOperand(blockWriteDstOperandId));
624+
if (hasCacheOpts)
625+
{
626+
blockWriteDstOperandId = 6;
627+
}
628+
Value *dst = CI.getArgOperand(blockWriteDstOperandId);
629+
args.push_back(dst);
614630
BlockFunc = GenISAIntrinsic::getDeclaration(
615631
CI.getCalledFunction()->getParent(),
616632
GenISAIntrinsic::GenISA_LSC2DBlockWrite,
617-
CI.getArgOperand(5)->getType());
633+
dst->getType());
618634
}
619635

620636
Instruction* BlockOp = CallInst::Create(BlockFunc, args, "", &CI);

IGC/Compiler/Optimizer/OpenCLPasses/ScalarArgAsPointer/ScalarArgAsPointer.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ void ScalarArgAsPointerAnalysis::visitCallInst(CallInst& CI)
110110
GenISAIntrinsic::ID const id = I->getIntrinsicID();
111111

112112
if (id == GenISAIntrinsic::GenISA_LSC2DBlockRead ||
113+
id == GenISAIntrinsic::GenISA_LSC2DBlockPrefetch ||
113114
id == GenISAIntrinsic::GenISA_LSC2DBlockWrite)
114115
{
115116
return analyzeValue(I->getOperand(0));

IGC/GenISAIntrinsics/Intrinsic_definitions.py

+18
Original file line numberDiff line numberDiff line change
@@ -2710,6 +2710,24 @@
27102710
("anyint", "stored value")],
27112711
"None"]],
27122712
####################################################################################################
2713+
"GenISA_LSC2DBlockPrefetch": ["LSC 2d block prefetch",
2714+
[("void", "nothing is returned"),
2715+
[("long", "flat image base offset"),
2716+
("int", "flat image base width"),
2717+
("int", "flat image base height"),
2718+
("int", "flat image base pitch"),
2719+
("int", "offset x"),
2720+
("int", "offset y"),
2721+
("int", "elemSize"),
2722+
("int", "tile width"),
2723+
("int", "tile height"),
2724+
("int", "V - num blocks (2 for simple 2d block read)"),
2725+
("bool", "transpose"),
2726+
("bool", "vnni transform (for transpose+transform use transpose "+\
2727+
"only and elemSize 32)"),
2728+
("int", "cache controls options (LSC_CACHE_OPTS)")],
2729+
"None"]],
2730+
####################################################################################################
27132731
"GenISA_LSCAtomicFP32": ["LSC atomic FP32 add,sub,min,max,fcas",
27142732
[("float", "return old value"),
27152733
[("anyptr", "memory pointer: ugm, ugml, tgm, slm"),

0 commit comments

Comments
 (0)