33from loopy .diagnostic import LoopyError
44from loopy .target .c import CTarget
55from loopy .version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
6+ from loopy .target .c .c_execution import CCompiler
7+ from codepy .toolchain import GCCToolchain
68
79
810# {{{ blas callable
@@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
2224
2325 if vec_dtype .numpy_dtype == np .float32 :
2426 name_in_target = "cblas_sgemv"
25- elif vec_dtype . numpy_dtype == np .float64 :
27+ elif vec_dtype .numpy_dtype == np .float64 :
2628 name_in_target = "cblas_dgemv"
2729 else :
2830 raise LoopyError ("GEMV is only supported for float32 and float64 "
@@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table):
4749 assert mat_descr .shape [0 ] == res_descr .shape [0 ]
4850 assert len (vec_descr .shape ) == len (res_descr .shape ) == 1
4951 # handling only the easy case when stride == 1
50- assert vec_descr .dim_tags [0 ].stride == 1
5152 assert mat_descr .dim_tags [1 ].stride == 1
52- assert res_descr .dim_tags [0 ].stride == 1
5353
5454 return self .copy (arg_id_to_descr = arg_id_to_descr ), callables_table
5555
5656 def emit_call_insn (self , insn , target , expression_to_code_mapper ):
5757 from pymbolic import var
58+ from loopy .codegen import UnvectorizableError
5859 mat_descr = self .arg_id_to_descr [0 ]
60+ vec_descr = self .arg_id_to_descr [1 ]
61+ res_descr = self .arg_id_to_descr [- 1 ]
5962 m , n = mat_descr .shape
6063 ecm = expression_to_code_mapper
64+
65+ if ecm .codegen_state .vectorization_info is not None :
66+ raise UnvectorizableError ("cannot vectorize BLAS-gemv." )
67+
6168 mat , vec = insn .expression .parameters
6269 result , = insn .assignees
6370
6471 c_parameters = [var ("CblasRowMajor" ),
6572 var ("CblasNoTrans" ),
6673 m , n ,
67- 1 ,
74+ 1 , # alpha
6875 ecm (mat ).expr ,
69- 1 ,
76+ mat_descr . dim_tags [ 0 ]. stride , # LDA
7077 ecm (vec ).expr ,
71- 1 ,
78+ vec_descr .dim_tags [0 ].stride , # INCX
79+ 0 , # beta
7280 ecm (result ).expr ,
73- 1 ]
81+ res_descr .dim_tags [0 ].stride # INCY
82+ ]
7483 return (var (self .name_in_target )(* c_parameters ),
7584 False # cblas_gemv does not return anything
7685 )
@@ -83,17 +92,95 @@ def generate_preambles(self, target):
8392# }}}
8493
8594
86- n = 10
87-
88- knl = lp .make_kernel (
89- "{:}" ,
95+ def transform_1 (knl ):
96+ return knl
97+
98+
99+ def transform_2 (knl ):
100+ # A similar transformation is applied to kernels containing
101+ # SLATE <https://www.firedrakeproject.org/firedrake.slate.html>
102+ # callables.
103+ knl = lp .split_iname (knl , "e" , 4 , inner_iname = "e_inner" , slabs = (0 , 1 ))
104+ knl = lp .privatize_temporaries_with_inames (knl , "e_inner" )
105+ knl = lp .tag_inames (knl , {"e_inner" : "vec" })
106+ if 0 :
107+ # Easy codegen exercise, but misses vectorizing certain instructions.
108+ knl = lp .tag_array_axes (knl , "tmp3" , "c,vec" )
109+ else :
110+ knl = lp .tag_array_axes (knl , "tmp3,tmp2" , "c,vec" )
111+ return knl
112+
113+
114+ def main ():
115+
116+ compiler = CCompiler (toolchain = GCCToolchain (
117+ cc = "gcc" ,
118+ cflags = "-std=c99 -O3 -fPIC" .split (),
119+ ldflags = "-shared" .split (),
120+ libraries = ["blas" ],
121+ library_dirs = [],
122+ defines = [],
123+ undefines = [],
124+ source_suffix = "c" ,
125+ so_ext = ".so" ,
126+ o_ext = ".o" ,
127+ include_dirs = []))
128+
129+ knl = lp .make_kernel (
130+ "{[e,i1,i2]: 0<=e<n and 0<=i1,i2<4}" ,
90131 """
91- y[:] = gemv(A[:, :], x[:])
92- """ , [
93- lp .GlobalArg ("A" , dtype = np .float64 , shape = (n , n )),
94- lp .GlobalArg ("x" , dtype = np .float64 , shape = (n , )),
95- lp .GlobalArg ("y" , shape = (n , )), ...],
96- target = CTarget ())
97-
98- knl = lp .register_callable (knl , "gemv" , CBLASGEMV (name = "gemv" ))
99- print (lp .generate_code_v2 (knl ).device_code ())
132+ for e
133+ for i1
134+ tmp1[i1] = 3*x[e, i1]
135+ end
136+ tmp2[:] = matvec(A[:, :], tmp1[:])
137+ for i2
138+ <> tmp3[i2] = 2 * tmp2[i2]
139+ out[e, i2] = tmp3[i2]
140+ end
141+ end
142+ """ ,
143+ kernel_data = [
144+ lp .TemporaryVariable ("tmp1" ,
145+ shape = (4 , ),
146+ dtype = None ),
147+ lp .TemporaryVariable ("tmp2" ,
148+ shape = (4 , ),
149+ dtype = None ),
150+ lp .GlobalArg ("A" ,
151+ shape = (4 , 4 ),
152+ dtype = "float64" ),
153+ lp .GlobalArg ("x" ,
154+ shape = lp .auto ,
155+ dtype = "float64" ),
156+ ...],
157+ target = lp .ExecutableCVectorExtensionsTarget (compiler = compiler ),
158+ lang_version = (2018 , 2 ))
159+
160+ knl = lp .register_callable (knl , "matvec" , CBLASGEMV ("matvec" ))
161+
162+ for transform_func in [transform_1 , transform_2 ]:
163+ knl = transform_func (knl )
164+ print ("Generated code from '{transform_func.__name__} -----'" )
165+ print (lp .generate_code_v2 (knl ).device_code ())
166+ print (75 * "-" )
167+
168+ # {{ verify the result is correct.
169+
170+ from numpy .random import default_rng
171+
172+ rng = default_rng (seed = 0 )
173+ a = rng .random ((4 , 4 ))
174+ x = rng .random ((100 , 4 ))
175+
176+ _ , (out ,) = knl (A = a , x = x )
177+
178+ np .testing .assert_allclose (6 * np .einsum ("ij,ej->ei" ,
179+ a , x ),
180+ out )
181+
182+ # }}}
183+
184+
185+ if __name__ == "__main__" :
186+ main ()
0 commit comments