From 8a734652353bdd85b9cc7d2426e7395404372d72 Mon Sep 17 00:00:00 2001
From: Chao Chen <[email protected]>
Date: Wed, 28 Aug 2024 23:57:49 +0000
Subject: [PATCH] refine the XeGPU definition - add verification for
 scattered tensordesc regarding chunk size and total size - refine
 load_gather and store_scatter to reveal the transpose effect

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 40 +++++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp   |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp       | 44 ++++++++++++++++---
 3 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a3922bbad2b3..3e0c6f243fd4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -413,24 +413,28 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
 implying each element in the array corresponds to a work-item (SIMT lane)
 in the subgroup.
 
+ The first dimension of the result TensorDesc corresponds to work-items, so it should
+ match the number of offsets. It may also have a second dimension corresponding to
+ the chunk_size if the chunk size is larger than 1.
+
 Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
 ```mlir
 %a = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1>
+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
 ```
 
 Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
 It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
 ```mlir
 %0 = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>
+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
 ```
 
 Example 3. It is similar to Example 2, but there are some overlaps among work-items.
 It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
 ```mlir
 %0 = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>>
+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
 ```
 }];
 
@@ -500,28 +504,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
 
 let description = [{ It (aka. load) loads data for each work-item. The output
 describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
- attribute is larger than 1 in TensorDesc, the output vector will be 2D vector,
- with dim-1 correspoding to the chunk size.
+ consistent with the number of work-items in a subgroup. When the chunk size
+ is larger than 1, the output vector is a 2D vector, with dim-1 corresponding
+ to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+ Note that there is a transpose effect on the result (as compared to the TensorDesc)
+ due to the hardware implementation. Therefore, a transpose attribute is introduced
+ on purpose, making sure users are aware of this implicit transformation.
 
 The mask operand masks out memory access so that it is safe to pass out-of-boundary
 addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
 
 Example:
 ```mlir
- %2 = xegpu.load %1, %0 {transpose = [1, 0],
+ %2 = xegpu.load %1, %0 {transpose,
 l1_hint = #xegpu.cache_hint<cached>,
 l2_hint = #xegpu.cache_hint<uncached>,
 l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
- -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_scope=global>>,
+ vector<16xi1> -> vector<16xf32>
 ```
 
 }];
 
 let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
 XeGPU_MaskType: $mask,
- OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<UnitAttr>: $transpose,
 OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
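
For the chunked (rank-2) case described above, a minimal sketch of how the implicit
transpose shows up in the result type. The SSA names, shapes, and %mask value are
assumptions for illustration, and the informal `TensorDesc<..., chunk_size = ...>`
shorthand follows the create_tdesc examples earlier in this patch:

```mlir
// 4 lanes, chunk_size = 8: the TensorDesc has shape <4x8> (dim-0 = lanes),
// while the loaded value comes back transposed as vector<8x4xf32> (dim-0 = chunk).
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32>
     -> TensorDesc<4x8xf32, chunk_size = 8>
%2 = xegpu.load %1, %mask {transpose}
     : TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1> -> vector<8x4xf32>
```
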
@@ -553,11 +560,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
 let hasVerifier = 1;
 }
 
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
- AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
 let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations.
- It has similar semantic to `load_gather`.
+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+ a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+ and dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+ has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+ introduced on purpose, making sure users are aware of this implicit transformation.
 
 Example:
 ```mlir
@@ -572,6 +583,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
 XeGPU_ValueType: $value,
 XeGPU_TensorDesc: $TensorDesc,
 XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
 OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
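
Mirroring `load_gather`, a minimal sketch of the chunked store with the new transpose
attribute (again with assumed SSA names and shapes, using the same informal shorthand;
a plain 1D store of a vector<4xf32> would need no transpose):

```mlir
// Store a transposed value: dim-0 of %val is the chunk, dim-1 the lanes.
xegpu.store %val, %1, %mask {transpose}
    : vector<8x4xf32>, TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1>
```
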
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0eab601bbaac..555c232ff1f0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -57,6 +57,7 @@ ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context,
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
+
 mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
 llvm::SmallVector<int64_t> shape;
 mlir::Type elementType;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index c9e399a7149f..b35a639540aa 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -305,6 +305,26 @@ LogicalResult CreateDescOp::verify() {
 
 auto chunkSize = tdescTy.getChunkSize();
 
+ // check chunk_size
+ llvm::SmallVector<int64_t> supportedChunkSizes = {1, 2, 3, 4, 8, 16, 32, 64, 128, 256};
+ if (!llvm::is_contained(supportedChunkSizes, chunkSize))
+   return emitOpError("Invalid chunk_size. Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256.");
+
+ // check total size
+ auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth();
+ auto bitsPerLane = elemBits * chunkSize;
+ if (chunkSize > 1 && bitsPerLane % 32) {
+   // For 8-bit and 16-bit data, the hardware only supports chunk size of 1.
+   // For 32-bit data, the hardware can support larger chunk sizes, so we can
+   // bitcast 8-bit/16-bit data to 32-bit data for better performance. But this
+   // requires the total size to be 32-bit aligned to make the optimization work.
+   return emitOpError("access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned.");
+ }
+
+ auto lscConstraints = 512 * 8; // each access is up to 512 bytes.
+ if (elemBits * tdescTy.getNumElements() > lscConstraints)
+   return emitOpError("total access size (simd_lanes * chunk_size * sizeof(elemTy)) should not exceed 512 bytes.");
+
 SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
 if (chunkSize != 1)
   shape.push_back(chunkSize);
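
A worked check of the two new constraints (numbers assumed for illustration): with f32
elements and chunk_size = 8, each lane accesses 8 * 32 = 256 bits, which is 32-bit
aligned, and with 4 lanes the total is 4 * 8 * 4 = 128 bytes, under the 512-byte LSC
limit. An f16 descriptor with chunk_size = 3 passes the supported-values check but
yields 3 * 16 = 48 bits per lane, so the alignment check rejects it (%a and %b are
assumed memrefs):

```mlir
// OK: 256 bits per lane is 32-bit aligned; 128 bytes total.
%1 = xegpu.create_tdesc %a[0, 16, 32, 64] : memref<1024xf32>
     -> TensorDesc<4x8xf32, chunk_size = 8>
// Rejected: 48 bits per lane is not 32-bit aligned.
// %2 = xegpu.create_tdesc %b[0, 8, 16, 24] : memref<1024xf16>
//      -> TensorDesc<4x3xf16, chunk_size = 3>
```
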
@@ -370,14 +390,13 @@ LogicalResult LoadGatherOp::verify() {
 if (tdescShape[0] != maskShape[0])
   return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
- if (getTransposeAttr()) {
-   auto trans = getTranspose().value();
-   if (tdescShape.size() < trans.size())
-     emitWarning("Invalid transpose attr. It is ignored.");
-   else
-     transpose(trans, tdescShape);
+ if (tdescTy.getRank() == 2) {
+   if (!getTransposeAttr())
+     return emitOpError("load_gather has to be transposed.");
+   transpose({1, 0}, tdescShape);
 }
 
+
 if (valueShape != tdescShape)
   return emitOpError("Unexpected result shape")
     << "(Expected shape: " << makeString(tdescShape)
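
Conversely, a rank-2 gather without the attribute now fails verification; a hypothetical
example of what this check rejects (names and shapes assumed as before):

```mlir
// error: load_gather has to be transposed.
%2 = xegpu.load %1, %mask
     : TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1> -> vector<4x8xf32>
```
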
@@ -404,11 +423,24 @@ LogicalResult StoreScatterOp::verify() {
| 181 | + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); |
 
 auto maskTy = getMaskType();
+ auto valueTy = getValueType();
 auto maskShape = getShapeOf(maskTy);
 auto tdescShape = getShapeOf(tdescTy);
+ auto valueShape = getShapeOf(valueTy);
 if (tdescShape[0] != maskShape[0])
   return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
+ if (tdescTy.getRank() == 2) {
+   if (!getTransposeAttr())
+     return emitOpError("store_scatter has to be transposed.");
+   transpose({1, 0}, tdescShape);
+ }
+
+ if (valueShape != tdescShape)
+   return emitOpError("Unexpected value shape")
+     << "(Expected shape: " << makeString(tdescShape)
+     << ", Given shape: " << makeString(valueShape) << ").\n";
+
 return success();
 }
 //===----------------------------------------------------------------------===//
-- 
2.34.1