@@ -35,16 +35,18 @@ define spir_kernel void @test_local(i8 addrspace(3)* %src, i8 addrspace(3)* %ds
; CHECK-LABEL: define void @load_store_generic(
define void @load_store_generic(i8* %src, i8* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: [[PTR:%.*]] = alloca <4 x i32>
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast <4 x i32>* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_v8i8_pi32_i32(i8* [[MATPTR]], i8* %src, i32 8)
; CHECK: [[MATRIX:%.*]] = load <4 x i32>, <4 x i32>* [[PTR]]

%1 = call spir_func %intel.joint_matrix_packedA_8x8_f32_t* @__builtin_spirv_OpJointMatrixLoadINTEL_generic(i8* %src, i32 8, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: store <4 x i32> [[MATRIX]], <4 x i32>* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast <4 x i32>* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_pi64_v8i8(i8* %dst, i8* [[TMP5]], i32 8)
@@ -59,8 +61,11 @@ define void @load_store_generic(i8* %src, i8* %dst) {
; CHECK-LABEL: define void @load_store_large_generic(
define void @load_store_large_generic(i8* %src, i8* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: [[PTR:%.*]] = alloca [2 x <32 x i64>]
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_generic_v8i8_pi32_i32(i8* [[MATPTR]], i8* %src, i64 16)
; CHECK: [[HALF_PTR_0:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to <32 x i64>*
@@ -73,7 +78,6 @@ define void @load_store_large_generic(i8* %src, i8* %dst) {
%1 = call spir_func %intel.joint_matrix_acc_32x64_f32_t* @__builtin_spirv_OpJointMatrixLoadINTELacc_32x64_f32_p1i8_i64_i32_generic(i8* %src, i64 16, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: store [2 x <32 x i64>] [[MATRIX]], [2 x <32 x i64>]* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast [2 x <32 x i64>]* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_generic_pi64_v8i8(i8* %dst, i8* [[TMP5]], i64 8)
@@ -88,16 +92,18 @@ define void @load_store_large_generic(i8* %src, i8* %dst) {
; CHECK-LABEL: define void @load_store_global(
define void @load_store_global(i8 addrspace(1)* %src, i8 addrspace(1)* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: [[PTR:%.*]] = alloca <4 x i32>
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast <4 x i32>* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_PackedA_RowMajor_SG16_8x8_i32_4_global_v8i8_pi32_i32(i8* [[MATPTR]], i8 addrspace(1)* %src, i32 8)
; CHECK: [[MATRIX:%.*]] = load <4 x i32>, <4 x i32>* [[PTR]]

%1 = call spir_func %intel.joint_matrix_packedA_8x8_f32_t* @__builtin_spirv_OpJointMatrixLoadINTEL_global(i8 addrspace(1)* %src, i32 8, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: store <4 x i32> [[MATRIX]], <4 x i32>* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast <4 x i32>* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_PackedA_RowMajor_SG16_8x8_i32_4_global_pi64_v8i8(i8 addrspace(1)* %dst, i8* [[TMP5]], i32 8)
@@ -112,8 +118,11 @@ define void @load_store_global(i8 addrspace(1)* %src, i8 addrspace(1)* %dst) {
; CHECK-LABEL: define void @load_store_large_global(
define void @load_store_large_global(i8 addrspace(1)* %src, i8 addrspace(1)* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: [[PTR:%.*]] = alloca [2 x <32 x i64>]
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_global_v8i8_pi32_i32(i8* [[MATPTR]], i8 addrspace(1)* %src, i64 16)
; CHECK: [[HALF_PTR_0:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to <32 x i64>*
@@ -126,7 +135,6 @@ define void @load_store_large_global(i8 addrspace(1)* %src, i8 addrspace(1)* %ds
%1 = call spir_func %intel.joint_matrix_acc_32x64_f32_t* @__builtin_spirv_OpJointMatrixLoadINTELacc_32x64_f32_p1i8_i64_i32_global(i8 addrspace(1)* %src, i64 16, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: store [2 x <32 x i64>] [[MATRIX]], [2 x <32 x i64>]* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast [2 x <32 x i64>]* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_global_pi64_v8i8(i8 addrspace(1)* %dst, i8* [[TMP5]], i64 8)
@@ -141,16 +149,18 @@ define void @load_store_large_global(i8 addrspace(1)* %src, i8 addrspace(1)* %ds
; CHECK-LABEL: define void @load_store_local(
define void @load_store_local(i8 addrspace(3)* %src, i8 addrspace(3)* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: [[PTR:%.*]] = alloca <4 x i32>
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast <4 x i32>* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_PackedA_RowMajor_SG16_8x8_i32_4_local_v8i8_pi32_i32(i8* [[MATPTR]], i8 addrspace(3)* %src, i32 8)
; CHECK: [[MATRIX:%.*]] = load <4 x i32>, <4 x i32>* [[PTR]]

%1 = call spir_func %intel.joint_matrix_packedA_8x8_f32_t* @__builtin_spirv_OpJointMatrixLoadINTEL_local(i8 addrspace(3)* %src, i32 8, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca <4 x i32>
; CHECK: store <4 x i32> [[MATRIX]], <4 x i32>* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast <4 x i32>* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_PackedA_RowMajor_SG16_8x8_i32_4_local_pi64_v8i8(i8 addrspace(3)* %dst, i8* [[TMP5]], i32 8)
@@ -165,8 +175,11 @@ define void @load_store_local(i8 addrspace(3)* %src, i8 addrspace(3)* %dst) {
; CHECK-LABEL: define void @load_store_large_local(
define void @load_store_large_local(i8 addrspace(3)* %src, i8 addrspace(3)* %dst) {

- ; Matrix load sequence:
+ ; Allocas:
+ ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: [[PTR:%.*]] = alloca [2 x <32 x i64>]
+
+ ; Matrix load sequence:
; CHECK: [[MATPTR:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_local_v8i8_pi32_i32(i8* [[MATPTR]], i8 addrspace(3)* %src, i64 16)
; CHECK: [[HALF_PTR_0:%.*]] = bitcast [2 x <32 x i64>]* [[PTR]] to <32 x i64>*
@@ -179,7 +192,6 @@ define void @load_store_large_local(i8 addrspace(3)* %src, i8 addrspace(3)* %dst
%1 = call spir_func %intel.joint_matrix_acc_32x64_f32_t* @__builtin_spirv_OpJointMatrixLoadINTELacc_32x64_f32_p1i8_i64_i32_local(i8 addrspace(3)* %src, i64 16, i32 0)

; Matrix store sequence:
- ; CHECK: [[TMP4:%.*]] = alloca [2 x <32 x i64>]
; CHECK: store [2 x <32 x i64>] [[MATRIX]], [2 x <32 x i64>]* [[TMP4]]
; CHECK: [[TMP5:%.*]] = bitcast [2 x <32 x i64>]* [[TMP4]] to i8*
; CHECK: call void @__builtin_spriv_OpJointMatrixStoreINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_local_pi64_v8i8(i8 addrspace(3)* %dst, i8* [[TMP5]], i64 8)
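For reference, a minimal sketch of the IR these CHECK lines describe after the pass runs, written out for the small generic case. The value names (%tmp4, %matptr, %tmp5, etc.) are only placeholders for the FileCheck captures, and the declare lines and trailing ret are assumed rather than taken from the test:

declare void @__builtin_spriv_OpJointMatrixLoadINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_v8i8_pi32_i32(i8*, i8*, i32)
declare void @__builtin_spriv_OpJointMatrixStoreINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_pi64_v8i8(i8*, i8*, i32)

define void @load_store_generic(i8* %src, i8* %dst) {
  ; both allocas are emitted up front, which is what the new "; Allocas:" checks pin down
  %tmp4 = alloca <4 x i32>
  %ptr = alloca <4 x i32>

  ; load: the wrapper fills %ptr through an i8* view, then the matrix value is loaded from it
  %matptr = bitcast <4 x i32>* %ptr to i8*
  call void @__builtin_spriv_OpJointMatrixLoadINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_v8i8_pi32_i32(i8* %matptr, i8* %src, i32 8)
  %matrix = load <4 x i32>, <4 x i32>* %ptr

  ; store: the matrix value is spilled to %tmp4 and handed to the store wrapper
  store <4 x i32> %matrix, <4 x i32>* %tmp4
  %tmp5 = bitcast <4 x i32>* %tmp4 to i8*
  call void @__builtin_spriv_OpJointMatrixStoreINTEL_PackedA_RowMajor_SG16_8x8_i32_4_generic_pi64_v8i8(i8* %dst, i8* %tmp5, i32 8)
  ret void
}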