@@ -118,8 +118,8 @@ tile_load(tile_t& tile, payload_t& payload) {
118118 static constexpr uint32_t max_load_width_in_elem =
119119 load_store_attr::max_load_width_in_bytes / sizeof (dtype);
120120
121- // static constexpr uint32_t max_trans_load_height_in_elem =
122- // load_store_attr::max_trans_load_height_in_elem;
121+ // static constexpr uint32_t max_trans_load_height_in_elem =
122+ // load_store_attr::max_trans_load_height_in_elem;
123123 static constexpr uint32_t max_load_height_in_elem =
124124 load_store_attr::max_load_height_in_elem;
125125
@@ -206,6 +206,11 @@ tile_load(tile_t& tile, payload_t& payload) {
206206#pragma unroll
207207 for (uint32_t ii = 0 ; ii < block_size_y / ld_blk_size_y; ++ii) {
208208 constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;
209+ uint32_t address_offset_x =
210+ (mem_transpose ? (offset_y + ii * ld_blk_size_y) : offset_x) /
211+ scale_factor;
212+ uint32_t address_offset_y =
213+ mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y);
209214 reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
210215 native_type_t <load_dtype>,
211216 (trans ? ld_blk_size_y : block_size_x) / scale_factor,
@@ -222,13 +227,8 @@ tile_load(tile_t& tile, payload_t& payload) {
222227 payload.surface_width ,
223228 payload.surface_height ,
224229 payload.surface_pitch ,
225- payload.offset_x +
226- (mem_transpose ? (offset_y / (int )scale_factor +
227- ii * ld_blk_size_y / (int )scale_factor)
228- : (offset_x / scale_factor)),
229-
230- payload.offset_y +
231- (mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y)));
230+ payload.offset_x + address_offset_x,
231+ payload.offset_y + address_offset_y);
232232
233233 if constexpr (reg_transpose && trans) {
234234 reg_blk.xetla_select <load_elems, 1 >(ii * load_elems)
0 commit comments