From 8a734652353bdd85b9cc7d2426e7395404372d72 Mon Sep 17 00:00:00 2001
From: Chao Chen <[email protected]>
Date: Wed, 28 Aug 2024 23:57:49 +0000
Subject: [PATCH] refine the XeGPU definition - add verification for
 scattered tensordesc regarding chunk size and total size - refine
 load_gather and store_scatter to reveal the transpose effect

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 40 +++++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp   |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp       | 44 ++++++++++++++++---
 3 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a3922bbad2b3..3e0c6f243fd4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -413,24 +413,28 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
 implying each element in the array corresponds to a work-item (SIMT lane)
 in the subgroup.
 
+ The first dimension of the result TensorDesc corresponds to work-items, so it should
+ match the number of offsets. It may also have a second dimension corresponding to
+ the chunk_size if the chunk size is larger than 1.
+
 Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
 ```mlir
 %a = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1>
+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
 ```
 
 Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
 It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
 ```mlir
 %0 = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>
+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
 ```
 
 Example 3. It is similar to Example 2, but there are some overlaps among work-items.
 It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
 ```mlir
 %0 = memref.alloc() : memref<1024xf32>
- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>>
+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
 ```
 }];
 
@@ -500,28 +504,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
 
 let description = [{ It (aka. load) loads data for each work-item. The output
 describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
- attribute is larger than 1 in TensorDesc, the output vector will be 2D vector,
- with dim-1 correspoding to the chunk size.
+ consistent with the number of work-items in a subgroup. When the chunk size
+ is larger than 1, the output vector is a 2D vector, with dim-1 corresponding
+ to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+ Note that there is a transpose effect on the result (as compared to the TensorDesc)
+ due to the hardware implementation. Therefore, a transpose attribute is introduced
+ on purpose, making sure users are aware of this implicit transformation.
 
 The mask operand masks out memory access so that it is safe to pass out-of-boundary
 addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
 
 Example:
 ```mlir
- %2 = xegpu.load %1, %0 {transpose = [1, 0],
+ %2 = xegpu.load %1, %0 {transpose,
 l1_hint = #xegpu.cache_hint<cached>,
 l2_hint = #xegpu.cache_hint<uncached>,
 l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
- -> vector<16xf32>
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_scope=global>>,
+ vector<16xi1> -> vector<16xf32>
 ```
 
 }];
 
 let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
 XeGPU_MaskType: $mask,
- OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<UnitAttr>: $transpose,
 OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
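
For the chunked (rank-2) case described above, a minimal sketch of how the implicit
transpose shows up in the result type. The SSA names, shapes, and %mask value are
assumptions for illustration, and the informal `TensorDesc<..., chunk_size = ...>`
shorthand follows the create_tdesc examples earlier in this patch:

```mlir
// 4 lanes, chunk_size = 8: the TensorDesc has shape <4x8> (dim-0 = lanes),
// while the loaded value comes back transposed as vector<8x4xf32> (dim-0 = chunk).
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32>
     -> TensorDesc<4x8xf32, chunk_size = 8>
%2 = xegpu.load %1, %mask {transpose}
     : TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1> -> vector<8x4xf32>
```
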
@@ -553,11 +560,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
 let hasVerifier = 1;
 }
 
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
- AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
 let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations.
- It has similar semantic to `load_gather`.
+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+ a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+ and dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+ has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+ introduced on purpose, making sure users are aware of this implicit transformation.
 
 Example:
 ```mlir
@@ -572,6 +583,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
 XeGPU_ValueType: $value,
 XeGPU_TensorDesc: $TensorDesc,
 XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
 OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
 OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
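
Mirroring `load_gather`, a minimal sketch of the chunked store with the new transpose
attribute (again with assumed SSA names and shapes, using the same informal shorthand;
a plain 1D store of a vector<4xf32> would need no transpose):

```mlir
// Store a transposed value: dim-0 of %val is the chunk, dim-1 the lanes.
xegpu.store %val, %1, %mask {transpose}
    : vector<8x4xf32>, TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1>
```
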
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0eab601bbaac..555c232ff1f0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -57,6 +57,7 @@ ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context,
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
+
 mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
 llvm::SmallVector<int64_t> shape;
 mlir::Type elementType;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index c9e399a7149f..b35a639540aa 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -305,6 +305,26 @@ LogicalResult CreateDescOp::verify() {
 
 auto chunkSize = tdescTy.getChunkSize();
 
+ // check chunk_size
+ llvm::SmallVector<int64_t> supportedChunkSizes = {1, 2, 3, 4, 8, 16, 32, 64, 128, 256};
+ if (!llvm::is_contained(supportedChunkSizes, chunkSize))
+   return emitOpError("Invalid chunk_size. Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256.");
+
+ // check total size
+ auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth();
+ auto bitsPerLane = elemBits * chunkSize;
+ if (chunkSize > 1 && bitsPerLane % 32) {
+   // For 8-bit and 16-bit data, the hardware only supports chunk size of 1.
+   // For 32-bit data, the hardware can support larger chunk sizes, so we can
+   // bitcast 8-bit/16-bit data to 32-bit data for better performance. But this
+   // requires the total size to be 32-bit aligned to make the optimization work.
+   return emitOpError("access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned.");
+ }
+
+ auto lscConstraints = 512 * 8; // each access is up to 512 bytes.
+ if (elemBits * tdescTy.getNumElements() > lscConstraints)
+   return emitOpError("total access size (simd_lanes * chunk_size * sizeof(elemTy)) should not exceed 512 bytes.");
+
 SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
 if (chunkSize != 1)
   shape.push_back(chunkSize);
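
A worked check of the two new constraints (numbers assumed for illustration): with f32
elements and chunk_size = 8, each lane accesses 8 * 32 = 256 bits, which is 32-bit
aligned, and with 4 lanes the total is 4 * 8 * 4 = 128 bytes, under the 512-byte LSC
limit. An f16 descriptor with chunk_size = 3 passes the supported-values check but
yields 3 * 16 = 48 bits per lane, so the alignment check rejects it (%a and %b are
assumed memrefs):

```mlir
// OK: 256 bits per lane is 32-bit aligned; 128 bytes total.
%1 = xegpu.create_tdesc %a[0, 16, 32, 64] : memref<1024xf32>
     -> TensorDesc<4x8xf32, chunk_size = 8>
// Rejected: 48 bits per lane is not 32-bit aligned.
// %2 = xegpu.create_tdesc %b[0, 8, 16, 24] : memref<1024xf16>
//      -> TensorDesc<4x3xf16, chunk_size = 3>
```
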
@@ -370,14 +390,13 @@ LogicalResult LoadGatherOp::verify() {
 if (tdescShape[0] != maskShape[0])
   return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
- if (getTransposeAttr()) {
-   auto trans = getTranspose().value();
-   if (tdescShape.size() < trans.size())
-     emitWarning("Invalid transpose attr. It is ignored.");
-   else
-     transpose(trans, tdescShape);
+ if (tdescTy.getRank() == 2) {
+   if (!getTransposeAttr())
+     return emitOpError("load_gather has to be transposed.");
+   transpose({1, 0}, tdescShape);
 }
 
+
 if (valueShape != tdescShape)
   return emitOpError("Unexpected result shape")
     << "(Expected shape: " << makeString(tdescShape)
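
Conversely, a rank-2 gather without the attribute now fails verification; a hypothetical
example of what this check rejects (names and shapes assumed as before):

```mlir
// error: load_gather has to be transposed.
%2 = xegpu.load %1, %mask
     : TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1> -> vector<4x8xf32>
```
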
@@ -404,11 +423,24 @@ LogicalResult StoreScatterOp::verify() {
| 181 | + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); |
 
 auto maskTy = getMaskType();
+ auto valueTy = getValueType();
 auto maskShape = getShapeOf(maskTy);
 auto tdescShape = getShapeOf(tdescTy);
+ auto valueShape = getShapeOf(valueTy);
 if (tdescShape[0] != maskShape[0])
   return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
+ if (tdescTy.getRank() == 2) {
+   if (!getTransposeAttr())
+     return emitOpError("store_scatter has to be transposed.");
+   transpose({1, 0}, tdescShape);
+ }
+
+ if (valueShape != tdescShape)
+   return emitOpError("Unexpected value shape")
+     << "(Expected shape: " << makeString(tdescShape)
+     << ", Given shape: " << makeString(valueShape) << ").\n";
+
 return success();
 }
 //===----------------------------------------------------------------------===//
-- 
2.34.1