Skip to content

Commit 0346b6f

Browse files
amd-eochoalokuhar
authored and committed
[mlir][amdgpu] Add amdgpu.make_dma_descriptor (llvm#169407)
Co-authored-by: Jakub Kuderski <[email protected]>
1 parent a27597e commit 0346b6f

File tree

4 files changed

+249
-12
lines changed

4 files changed

+249
-12
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 109 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
8080
let assemblyFormat = "`<` $value `>`";
8181
}
8282

83+
//===----------------------------------------------------------------------===//
84+
// AMDGPU Type definitions
85+
//===----------------------------------------------------------------------===//
86+
8387
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
8488
: TypeDef<AMDGPU_Dialect, name, traits> {
8589
let mnemonic = typeMnemonic;
8690
}
8791

88-
//===----------------------------------------------------------------------===//
89-
// AMDGPU Type definitions
90-
//===----------------------------------------------------------------------===//
91-
9292
def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
9393
let summary = "Pair of base addresses that move data between LDS and global storage.";
9494
let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
104104
let assemblyFormat = "`<` $elementType `>`";
105105
}
106106

107+
def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
108+
let summary = "Descriptors used in tensor store/load operations.";
109+
let description = [{
110+
This type is opaque and corresponds to the two or four descriptor groups
111+
used in tensor_load_to_lds or tensor_store_from_lds.
112+
}];
113+
114+
}
115+
107116
//===----------------------------------------------------------------------===//
108117
// AMDGPU Op definitions
109118
//===----------------------------------------------------------------------===//
@@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp :
12221231
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
12231232
Arguments<(ins
12241233
Arg<AnyMemRef, "buffer to read from">:$src,
1225-
Variadic<Index>:$srcIndices,
1234+
Variadic<Index>:$src_indices,
12261235
Arg<AnyMemRef, "buffer to write to">:$dst,
1227-
Variadic<Index>:$dstIndices)>,
1236+
Variadic<Index>:$dst_indices)>,
12281237
Results<(outs AMDGPU_TDMBaseType: $base)> {
12291238

12301239
// TODO:
12311240
// * Add verifiers such that one of the memrefs is from LDS and the other global.
1232-
// * Add verifiers to make sure that the type is in the correct direction.
12331241
// * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
12341242

12351243
let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp :
12401248
This operation creates a value corresponding to the tensor descriptor (D#) group 0
12411249
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
12421250

1251+
For example:
1252+
1253+
```mlir
1254+
%base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1255+
%descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1256+
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1257+
```
1258+
1259+
to
1260+
1261+
```mlir
1262+
// pseudocode
1263+
%base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
1264+
%base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
1265+
%base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
1266+
// type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to amdgpu.tdm_base<i32>
1267+
1268+
// The base will be used when constructing dgroup0
1269+
// when lowering amdgpu.make_dma_descriptor
1270+
%dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
1271+
%dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : ....
1272+
1273+
// When lowering amdgpu.tensor_load_to_lds
1274+
rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
1275+
```
1276+
12431277
These tensor DMA operations were introduced in gfx1250.
12441278
}];
12451279

12461280
let assemblyFormat = [{
1247-
$src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
1281+
$src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
1282+
}];
1283+
}
1284+
1285+
def AMDGPU_MakeDmaDescriptorOp :
1286+
AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
1287+
Arguments<(ins
1288+
AMDGPU_TDMBaseType: $base,
1289+
Variadic<Index>: $global_dynamic_sizes,
1290+
DenseI64ArrayAttr: $global_static_sizes,
1291+
Variadic<Index>: $global_dynamic_strides,
1292+
DenseI64ArrayAttr: $global_static_strides,
1293+
Variadic<Index>: $shared_dynamic_sizes,
1294+
DenseI64ArrayAttr: $shared_static_sizes,
1295+
Optional<Index>: $pad,
1296+
Optional<Index>: $pad_every,
1297+
Optional<AnyMemRef>: $atomic_barrier_address,
1298+
Variadic<Index>: $atomic_barrier_indices,
1299+
Optional<Index>: $global_increment,
1300+
Optional<Index>: $lds_increment,
1301+
Optional<Index>: $iteration_count)>,
1302+
Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
1303+
1304+
let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
1305+
let description = [{
1306+
Make all descriptor groups needed by tensor memory operations.
1307+
1308+
The $base operand corresponds to the base pair addresses, one must be an address in LDS
1309+
while the other must be a global memory location.
1310+
1311+
$global_{static/dynamic}_sizes determine the size of the tensor.
1312+
$global_{static/dynamic}_strides determine the strides of the tensor.
1313+
$shared_{static/dynamic}_sizes determine the size of the tile.
1314+
1315+
Padding can be applied to the LDS address when copying from memory to LDS,
1316+
but not when copying from LDS to memory.
1317+
The values in the padded target addresses remain the same as before the operation was applied.
1318+
1319+
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
1320+
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
1321+
$lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
1322+
$iteration_count determines how many times to iterate.
1323+
1324+
```mlir
1325+
// Example of moving a two-dimensional tensor to LDS.
1326+
%base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1327+
%descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1328+
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1329+
1330+
// Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
1331+
%base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
1332+
%descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
1333+
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
1334+
```
1335+
}];
1336+
1337+
let assemblyFormat = [{
1338+
$base
1339+
`globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
1340+
`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
1341+
`sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
1342+
( `padShared` `(` $pad^ `every` $pad_every `)` )?
1343+
( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
1344+
`:` type($atomic_barrier_address) `)`)?
1345+
( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
1346+
attr-dict `:` qualified(type($base)) `->` type(results)
12481347
}];
1348+
1349+
let hasVerifier = 1;
12491350
}
12501351

12511352
#endif // AMDGPU

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,44 @@ LogicalResult TransposeLoadOp::verify() {
705705
return success();
706706
}
707707

708+
//===----------------------------------------------------------------------===//
709+
// MakeDmaDescriptorOp
710+
//===----------------------------------------------------------------------===//
711+
712+
LogicalResult MakeDmaDescriptorOp::verify() {
713+
ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
714+
715+
if (globalStaticStrides.empty()) {
716+
return emitOpError("strides must not be empty.");
717+
}
718+
if (globalStaticStrides.back() != 1) {
719+
return emitOpError("strides for the innermost dimension must be 1.");
720+
}
721+
722+
ArrayRef<int64_t> globalStaticSizes = getGlobalStaticSizes();
723+
size_t rank = globalStaticSizes.size();
724+
if (rank != globalStaticStrides.size()) {
725+
return emitOpError("strides and sizes must have same rank.");
726+
}
727+
728+
ArrayRef<int64_t> sharedStaticSizes = getSharedStaticSizes();
729+
if (rank != sharedStaticSizes.size()) {
730+
return emitOpError("tensor must have same rank as tile.");
731+
}
732+
733+
if (Value atomicBarrierAddress = getAtomicBarrierAddress()) {
734+
MemRefType atomicBarrierAddressType =
735+
cast<MemRefType>(atomicBarrierAddress.getType());
736+
bool barrierInLDS =
737+
hasWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace());
738+
if (!barrierInLDS) {
739+
return emitOpError("atomic barrier address must be in LDS.");
740+
}
741+
}
742+
743+
return success();
744+
}
745+
708746
//===----------------------------------------------------------------------===//
709747
// ScaledMFMAOp
710748
//===----------------------------------------------------------------------===//

mlir/test/Dialect/AMDGPU/invalid.mlir

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,50 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
354354
%0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
355355
func.return %0 : vector<16xf32>
356356
}
357+
358+
// -----
359+
360+
func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
361+
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}}
362+
amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] sharedSize [0] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
363+
}
364+
365+
// -----
366+
367+
// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides
368+
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
369+
func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32>) {
370+
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}}
371+
amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
372+
func.return
373+
}
374+
375+
// -----
376+
377+
// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride
378+
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
379+
func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<i32>) {
380+
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
381+
amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
382+
func.return
383+
}
384+
385+
// -----
386+
387+
// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes
388+
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
389+
func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base<i32>) {
390+
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}}
391+
amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
392+
func.return
393+
}
394+
395+
// -----
396+
397+
// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank
398+
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
399+
func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base<i32>) {
400+
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}}
401+
amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
402+
func.return
403+
}

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -689,11 +689,62 @@ func.func @memory_counter_wait() {
689689
// CHECK-LABEL: func @make_dma_base
690690
// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space<workgroup>>)
691691
func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space<workgroup>>) {
692-
// CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
693-
amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
692+
// CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
693+
amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
694694

695-
// CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
696-
amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
695+
// CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
696+
amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
697697
func.return
698698
}
699699

700+
// CHECK-LABEL: func @make_dma_descriptor
701+
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
702+
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
703+
704+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
705+
amdgpu.make_dma_descriptor %base
706+
// CHECK-SAME: globalSize [0]
707+
globalSize [0]
708+
// CHECK-SAME: globalStride [1]
709+
globalStride [1]
710+
// CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
711+
sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
712+
713+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
714+
amdgpu.make_dma_descriptor %base
715+
// CHECK-SAME: globalSize [0]
716+
globalSize [0]
717+
// CHECK-SAME: globalStride [1]
718+
globalStride [1]
719+
// CHECK-SAME: sharedSize [0]
720+
sharedSize [0]
721+
// CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
722+
padShared(%idx every %idx)
723+
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
724+
725+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
726+
amdgpu.make_dma_descriptor %base
727+
// CHECK-SAME: globalSize [0]
728+
globalSize [0]
729+
// CHECK-SAME: globalStride [1]
730+
globalStride [1]
731+
// CHECK-SAME: sharedSize [0]
732+
sharedSize [0]
733+
// CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>)
734+
atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu.address_space<workgroup>>)
735+
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
736+
737+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
738+
amdgpu.make_dma_descriptor %base
739+
// CHECK-SAME: globalSize [0]
740+
globalSize [0]
741+
// CHECK-SAME: globalStride [1]
742+
globalStride [1]
743+
// CHECK-SAME: sharedSize [0]
744+
sharedSize [0]
745+
// CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
746+
iterate %idx, %idx, %idx
747+
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
748+
749+
func.return
750+
}

0 commit comments

Comments
 (0)