// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
module @gemm attributes {gpu.container_module} {
- func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} {
+ func.func @test(%A: memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
- %memref = gpu.alloc host_shared () : memref<8x16xf16>
- %memref_1 = gpu.alloc host_shared () : memref<16x16xf16>
- memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16>
- memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16>
+ %memref = gpu.alloc host_shared () : memref<8x16xf32>
+ memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32>
+
%memref_2 = gpu.alloc host_shared () : memref<8x16xf32>
%memref_3 = gpu.alloc host_shared () : memref<8x16xf32>
- gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>)
- gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>)
- gpu.dealloc %memref : memref<8x16xf16>
- gpu.dealloc %memref_1 : memref<16x16xf16>
+ gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+ gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_3 : memref<8x16xf32>)
+ gpu.dealloc %memref : memref<8x16xf32>
return %memref_2, %memref_3 : memref<8x16xf32>, memref<8x16xf32>
}

gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
- gpu.func @test_exp_larger_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ gpu.func @test_exp_larger_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
// load A tile
- %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- // load B tile
- %b_tile0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
- // do DPAS
- %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
// take exp
- %t6 = math.exp %val4 : vector<8x16xf32>
+ %t6 = math.exp %val0 : vector<8x16xf32>
// store
%out_tile = xegpu.create_nd_tdesc %Out[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %t6, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
gpu.module @module1 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
- gpu.func @test_exp_generic_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ gpu.func @test_exp_generic_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
// load A tile
- %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- // load B tile
- %b_tile0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
- // do DPAS
- %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
- // extract dpas out into 16xf32 vectors
- %cst1 = arith.constant dense<1.4426950408889634> : vector<128xf32>
- %v0 = vector.extract %val4[0] : vector<16xf32> from vector<8x16xf32>
- %v1 = vector.extract %val4[1] : vector<16xf32> from vector<8x16xf32>
- %v2 = vector.extract %val4[2] : vector<16xf32> from vector<8x16xf32>
- %v3 = vector.extract %val4[3] : vector<16xf32> from vector<8x16xf32>
- %v4 = vector.extract %val4[4] : vector<16xf32> from vector<8x16xf32>
- %v5 = vector.extract %val4[5] : vector<16xf32> from vector<8x16xf32>
- %v6 = vector.extract %val4[6] : vector<16xf32> from vector<8x16xf32>
- %v7 = vector.extract %val4[7] : vector<16xf32> from vector<8x16xf32>
+ %a_tile0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+
+ // extract the loaded vector into 16xf32 vectors
+ %v0 = vector.extract %val0[0] : vector<16xf32> from vector<8x16xf32>
+ %v1 = vector.extract %val0[1] : vector<16xf32> from vector<8x16xf32>
+ %v2 = vector.extract %val0[2] : vector<16xf32> from vector<8x16xf32>
+ %v3 = vector.extract %val0[3] : vector<16xf32> from vector<8x16xf32>
+ %v4 = vector.extract %val0[4] : vector<16xf32> from vector<8x16xf32>
+ %v5 = vector.extract %val0[5] : vector<16xf32> from vector<8x16xf32>
+ %v6 = vector.extract %val0[6] : vector<16xf32> from vector<8x16xf32>
+ %v7 = vector.extract %val0[7] : vector<16xf32> from vector<8x16xf32>
// do generic size exp
%v0_exp = math.exp %v0 : vector<16xf32>
%v1_exp = math.exp %v1 : vector<16xf32>
@@ -104,31 +92,19 @@ module @gemm attributes {gpu.container_module} {
%rand_lower = arith.constant -1.0 : f32
%rand_upper = arith.constant 1.0 : f32
%gen_int = arith.constant 0 : i1
- %A = memref.alloc() : memref<8x16xf16>
- %B = memref.alloc() : memref<16x16xf16>
+ %A = memref.alloc() : memref<8x16xf32>
%Out_cpu = memref.alloc() : memref<8x16xf32>
- %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16>
- %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16>
- call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
- call @fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
+ %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+ call @fillResource1DRandomF32(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
// run GPU version
- %Out_gpu_large, %Out_gpu_generic = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>)
+ %Out_gpu_large, %Out_gpu_generic = call @test(%A) : (memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>)
%Out_gpu_generic_cast = memref.cast %Out_gpu_generic : memref<8x16xf32> to memref<*xf32>
%Out_gpu_large_cast = memref.cast %Out_gpu_large : memref<8x16xf32> to memref<*xf32>
// run CPU version
scf.for %i = %c0 to %c8 step %c1 {
scf.for %j = %c0 to %c16 step %c1 {
- %v0_init = arith.constant 0.0 : f32
- %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 {
- %a0 = memref.load %A[%i, %k] : memref<8x16xf16>
- %b0 = memref.load %B[%k, %j] : memref<16x16xf16>
- %a0_f32 = arith.extf %a0 : f16 to f32
- %b0_f32 = arith.extf %b0 : f16 to f32
- %t0 = arith.mulf %a0_f32, %b0_f32 : f32
- %v0_new = arith.addf %v0, %t0 : f32
- scf.yield %v0_new : f32
- }
- %vexp = math.exp %result#0 : f32
+ %a0 = memref.load %A[%i, %j] : memref<8x16xf32>
+ %vexp = math.exp %a0 : f32
memref.store %vexp, %Out_cpu[%i, %j] : memref<8x16xf32>
}
}
@@ -141,15 +117,14 @@ module @gemm attributes {gpu.container_module} {
call @printAllcloseF32(%Out_gpu_generic_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> ()
call @printAllcloseF32(%Out_gpu_large_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> ()
// dealloc
- memref.dealloc %A : memref<8x16xf16>
- memref.dealloc %B : memref<16x16xf16>
+ memref.dealloc %A : memref<8x16xf32>
memref.dealloc %Out_cpu : memref<8x16xf32>
// gpu dealloc
gpu.dealloc %Out_gpu_generic : memref<8x16xf32>
gpu.dealloc %Out_gpu_large : memref<8x16xf32>
return
}
func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
- func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
+ func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
}