// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
module @gemm attributes {gpu.container_module} {
- func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} {
+ func.func @test(%A: memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
- %memref = gpu.alloc host_shared () : memref<8x16xf16>
- %memref_1 = gpu.alloc host_shared () : memref<16x16xf16>
- memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16>
- memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16>
+ %memref = gpu.alloc host_shared () : memref<8x16xf32>
+ memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32>
+
%memref_2 = gpu.alloc host_shared () : memref<8x16xf32>
%memref_3 = gpu.alloc host_shared () : memref<8x16xf32>
- gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>)
- gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>)
- gpu.dealloc %memref : memref<8x16xf16>
- gpu.dealloc %memref_1 : memref<16x16xf16>
+ gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+ gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_3 : memref<8x16xf32>)
+ gpu.dealloc %memref : memref<8x16xf32>
return %memref_2, %memref_3 : memref<8x16xf32>, memref<8x16xf32>
}

gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
- gpu.func @test_exp_larger_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ gpu.func @test_exp_larger_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
// load A tile
- %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- // load B tile
- %b_tile0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
- // do DPAS
- %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
// take exp
- %t6 = math.exp %val4 : vector<8x16xf32>
+ %t6 = math.exp %val0 : vector<8x16xf32>
// store
%out_tile = xegpu.create_nd_tdesc %Out[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %t6, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
gpu.module @module1 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
- gpu.func @test_exp_generic_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ gpu.func @test_exp_generic_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
// load A tile
- %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- // load B tile
- %b_tile0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
- // do DPAS
- %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
- // extract dpas out into 16xf32 vectors
- %cst1 = arith.constant dense<1.4426950408889634> : vector<128xf32>
- %v0 = vector.extract %val4[0] : vector<16xf32> from vector<8x16xf32>
- %v1 = vector.extract %val4[1] : vector<16xf32> from vector<8x16xf32>
- %v2 = vector.extract %val4[2] : vector<16xf32> from vector<8x16xf32>
- %v3 = vector.extract %val4[3] : vector<16xf32> from vector<8x16xf32>
- %v4 = vector.extract %val4[4] : vector<16xf32> from vector<8x16xf32>
- %v5 = vector.extract %val4[5] : vector<16xf32> from vector<8x16xf32>
- %v6 = vector.extract %val4[6] : vector<16xf32> from vector<8x16xf32>
- %v7 = vector.extract %val4[7] : vector<16xf32> from vector<8x16xf32>
+ %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+
+ // extract the loaded vector into 16xf32 vectors
+ %v0 = vector.extract %val0[0] : vector<16xf32> from vector<8x16xf32>
+ %v1 = vector.extract %val0[1] : vector<16xf32> from vector<8x16xf32>
+ %v2 = vector.extract %val0[2] : vector<16xf32> from vector<8x16xf32>
+ %v3 = vector.extract %val0[3] : vector<16xf32> from vector<8x16xf32>
+ %v4 = vector.extract %val0[4] : vector<16xf32> from vector<8x16xf32>
+ %v5 = vector.extract %val0[5] : vector<16xf32> from vector<8x16xf32>
+ %v6 = vector.extract %val0[6] : vector<16xf32> from vector<8x16xf32>
+ %v7 = vector.extract %val0[7] : vector<16xf32> from vector<8x16xf32>
// do generic size exp
%v0_exp = math.exp %v0 : vector<16xf32>
%v1_exp = math.exp %v1 : vector<16xf32>
@@ -104,31 +92,19 @@ module @gemm attributes {gpu.container_module} {
%rand_lower = arith.constant -1.0 : f32
%rand_upper = arith.constant 1.0 : f32
%gen_int = arith.constant 0 : i1
- %A = memref.alloc() : memref<8x16xf16>
- %B = memref.alloc() : memref<16x16xf16>
+ %A = memref.alloc() : memref<8x16xf32>
%Out_cpu = memref.alloc() : memref<8x16xf32>
- %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16>
- %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16>
- call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
- call @fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
+ %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+ call @fillResource1DRandomF32(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
// run GPU version
- %Out_gpu_large, %Out_gpu_generic = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>)
+ %Out_gpu_large, %Out_gpu_generic = call @test(%A) : (memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>)
%Out_gpu_generic_cast = memref.cast %Out_gpu_generic : memref<8x16xf32> to memref<*xf32>
%Out_gpu_large_cast = memref.cast %Out_gpu_large : memref<8x16xf32> to memref<*xf32>
// run CPU version
scf.for %i = %c0 to %c8 step %c1 {
scf.for %j = %c0 to %c16 step %c1 {
- %v0_init = arith.constant 0.0 : f32
- %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 {
- %a0 = memref.load %A[%i, %k] : memref<8x16xf16>
- %b0 = memref.load %B[%k, %j] : memref<16x16xf16>
- %a0_f32 = arith.extf %a0 : f16 to f32
- %b0_f32 = arith.extf %b0 : f16 to f32
- %t0 = arith.mulf %a0_f32, %b0_f32 : f32
- %v0_new = arith.addf %v0, %t0 : f32
- scf.yield %v0_new : f32
- }
- %vexp = math.exp %result#0 : f32
+ %a0 = memref.load %A[%i, %j] : memref<8x16xf32>
+ %vexp = math.exp %a0 : f32
memref.store %vexp, %Out_cpu[%i, %j] : memref<8x16xf32>
}
}
@@ -141,15 +117,14 @@ module @gemm attributes {gpu.container_module} {
call @printAllcloseF32(%Out_gpu_generic_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> ()
call @printAllcloseF32(%Out_gpu_large_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> ()
// dealloc
- memref.dealloc %A : memref<8x16xf16>
- memref.dealloc %B : memref<16x16xf16>
+ memref.dealloc %A : memref<8x16xf32>
memref.dealloc %Out_cpu : memref<8x16xf32>
// gpu dealloc
gpu.dealloc %Out_gpu_generic : memref<8x16xf32>
gpu.dealloc %Out_gpu_large : memref<8x16xf32>
return
}
func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
- func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
+ func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
}