@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
8080 let assemblyFormat = "`<` $value `>`";
8181}
8282
83+ //===----------------------------------------------------------------------===//
84+ // AMDGPU Type definitions
85+ //===----------------------------------------------------------------------===//
86+
8387class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
8488 : TypeDef<AMDGPU_Dialect, name, traits> {
8589 let mnemonic = typeMnemonic;
8690}
8791
88- //===----------------------------------------------------------------------===//
89- // AMDGPU Type definitions
90- //===----------------------------------------------------------------------===//
91-
9292def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
9393 let summary = "Pair of base addresses that move data between LDS and global storage.";
9494 let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
104104 let assemblyFormat = "`<` $elementType `>`";
105105}
106106
107+ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
108+ let summary = "Descriptors used in tensor store/load operations.";
109+ let description = [{
110+ This type is opaque and corresponds to the two or four descriptor groups
111+ used in tensor_load_to_lds or tensor_store_from_lds.
112+ }];
113+
114+ }
115+
107116//===----------------------------------------------------------------------===//
108117// AMDGPU Op definitions
109118//===----------------------------------------------------------------------===//
@@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp :
12221231 AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
12231232 Arguments<(ins
12241233 Arg<AnyMemRef, "buffer to read from">:$src,
1225- Variadic<Index>:$srcIndices ,
1234+ Variadic<Index>:$src_indices ,
12261235 Arg<AnyMemRef, "buffer to write to">:$dst,
1227- Variadic<Index>:$dstIndices )>,
1236+ Variadic<Index>:$dst_indices )>,
12281237 Results<(outs AMDGPU_TDMBaseType: $base)> {
12291238
12301239 // TODO:
12311240 // * Add verifiers such that one of the memrefs is from LDS and the other global.
1232- // * Add verifiers to make sure that the type is in the correct direction.
12331241 // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
12341242
12351243 let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp :
12401248 This operation creates a value corresponding to the tensor descriptor (D#) group 0
12411249 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
12421250
1251+ For example:
1252+
1253+ ```mlir
1254+ %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1255+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1256+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1257+ ```
1258+
1259+ is lowered to
1260+
1261+ ```mlir
1262+ // pseudocode
1263+ %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
1264+ %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
1265+ %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
1266+ // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to !amdgpu.tdm_base<i32>
1267+
1268+ // The base will be used when constructing dgroup0
1269+ // when lowering amdgpu.make_dma_descriptor
1270+ %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
1271+ %dgroup0_1 = llvm.insertvalue %base_2, %dgroup0_0 : ....
1272+
1273+ // When lowering amdgpu.tensor_load_to_lds
1274+ rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
1275+ ```
1276+
12431277 These tensor DMA operations were introduced in gfx1250.
12441278 }];
12451279
12461280 let assemblyFormat = [{
1247- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
1281+ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
1282+ }];
1283+ }
1284+
1285+ def AMDGPU_MakeDmaDescriptorOp :
1286+ AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
1287+ Arguments<(ins
1288+ AMDGPU_TDMBaseType: $base,
1289+ Variadic<Index>: $global_dynamic_sizes,
1290+ DenseI64ArrayAttr: $global_static_sizes,
1291+ Variadic<Index>: $global_dynamic_strides,
1292+ DenseI64ArrayAttr: $global_static_strides,
1293+ Variadic<Index>: $shared_dynamic_sizes,
1294+ DenseI64ArrayAttr: $shared_static_sizes,
1295+ Optional<Index>: $pad,
1296+ Optional<Index>: $pad_every,
1297+ Optional<AnyMemRef>: $atomic_barrier_address,
1298+ Variadic<Index>: $atomic_barrier_indices,
1299+ Optional<Index>: $global_increment,
1300+ Optional<Index>: $lds_increment,
1301+ Optional<Index>: $iteration_count)>,
1302+ Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
1303+
1304+ let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
1305+ let description = [{
1306+ Make all descriptor groups needed by tensor memory operations.
1307+
1308+ The $base operand corresponds to the pair of base addresses: one must be an address in LDS
1309+ while the other must be a global memory location.
1310+
1311+ $global_{static/dynamic}_sizes determine the size of the tensor.
1312+ $global_{static/dynamic}_strides determine the strides of the tensor.
1313+ $shared_{static/dynamic}_sizes determine the size of the tile.
1314+
1315+ Padding can be applied to the LDS address when copying from memory to LDS,
1316+ but not when copying from LDS to memory.
1317+ The values in the padded target addresses remain the same as before the operation was applied.
1318+
1319+ 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
1320+ $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
1321+ $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
1322+ $iteration_count determines how many times to iterate.
1323+
1324+ ```mlir
1325+ // Example of moving a two-dimensional tensor to LDS.
1326+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1327+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1328+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1329+
1330+ // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
1331+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1332+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1333+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1334+ ```
1335+ }];
1336+
1337+ let assemblyFormat = [{
1338+ $base
1339+ `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
1340+ `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
1341+ `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
1342+ ( `padShared` `(` $pad^ `every` $pad_every `)` )?
1343+ ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
1344+ `:` type($atomic_barrier_address) `)`)?
1345+ ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
1346+ attr-dict `:` qualified(type($base)) `->` type(results)
12481347 }];
1348+
1349+ let hasVerifier = 1;
12491350}
12501351
12511352#endif // AMDGPU
0 commit comments