Skip to content

Commit 3bdd997

Browse files
committed
test vectorizability of blas-callables with vector inputs
1 parent 7e42b74 commit 3bdd997

File tree

2 files changed

+117
-44
lines changed

2 files changed

+117
-44
lines changed

examples/python/call-external.py

Lines changed: 107 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from loopy.diagnostic import LoopyError
44
from loopy.target.c import CTarget
55
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
6+
from loopy.target.c.c_execution import CCompiler
7+
from codepy.toolchain import GCCToolchain
68

79

810
# {{{ blas callable
@@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
2224

2325
if vec_dtype.numpy_dtype == np.float32:
2426
name_in_target = "cblas_sgemv"
25-
elif vec_dtype. numpy_dtype == np.float64:
27+
elif vec_dtype.numpy_dtype == np.float64:
2628
name_in_target = "cblas_dgemv"
2729
else:
2830
raise LoopyError("GEMV is only supported for float32 and float64 "
@@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table):
4749
assert mat_descr.shape[0] == res_descr.shape[0]
4850
assert len(vec_descr.shape) == len(res_descr.shape) == 1
4951
# handling only the easy case when stride == 1
50-
assert vec_descr.dim_tags[0].stride == 1
5152
assert mat_descr.dim_tags[1].stride == 1
52-
assert res_descr.dim_tags[0].stride == 1
5353

5454
return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
5555

5656
def emit_call_insn(self, insn, target, expression_to_code_mapper):
5757
from pymbolic import var
58+
from loopy.codegen import UnvectorizableError
5859
mat_descr = self.arg_id_to_descr[0]
60+
vec_descr = self.arg_id_to_descr[1]
61+
res_descr = self.arg_id_to_descr[-1]
5962
m, n = mat_descr.shape
6063
ecm = expression_to_code_mapper
64+
65+
if ecm.codegen_state.vectorization_info is not None:
66+
raise UnvectorizableError("cannot vectorize BLAS-gemv.")
67+
6168
mat, vec = insn.expression.parameters
6269
result, = insn.assignees
6370

6471
c_parameters = [var("CblasRowMajor"),
6572
var("CblasNoTrans"),
6673
m, n,
67-
1,
74+
1, # alpha
6875
ecm(mat).expr,
69-
1,
76+
mat_descr.dim_tags[0].stride, # LDA
7077
ecm(vec).expr,
71-
1,
78+
vec_descr.dim_tags[0].stride, # INCX
79+
0, # beta
7280
ecm(result).expr,
73-
1]
81+
res_descr.dim_tags[0].stride # INCY
82+
]
7483
return (var(self.name_in_target)(*c_parameters),
7584
False # cblas_gemv does not return anything
7685
)
@@ -83,17 +92,95 @@ def generate_preambles(self, target):
8392
# }}}
8493

8594

86-
n = 10
87-
88-
knl = lp.make_kernel(
89-
"{:}",
95+
def transform_1(knl):
96+
return knl
97+
98+
99+
def transform_2(knl):
100+
# A similar transformation is applied to kernels containing
101+
# SLATE <https://www.firedrakeproject.org/firedrake.slate.html>
102+
# callables.
103+
knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1))
104+
knl = lp.privatize_temporaries_with_inames(knl, "e_inner")
105+
knl = lp.tag_inames(knl, {"e_inner": "vec"})
106+
if 0:
107+
# Easy codegen exercise, but misses vectorizing certain instructions.
108+
knl = lp.tag_array_axes(knl, "tmp3", "c,vec")
109+
else:
110+
knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec")
111+
return knl
112+
113+
114+
def main():
115+
116+
compiler = CCompiler(toolchain=GCCToolchain(
117+
cc="gcc",
118+
cflags="-std=c99 -O3 -fPIC".split(),
119+
ldflags="-shared".split(),
120+
libraries=["blas"],
121+
library_dirs=[],
122+
defines=[],
123+
undefines=[],
124+
source_suffix="c",
125+
so_ext=".so",
126+
o_ext=".o",
127+
include_dirs=[]))
128+
129+
knl = lp.make_kernel(
130+
"{[e,i1,i2]: 0<=e<n and 0<=i1,i2<4}",
90131
"""
91-
y[:] = gemv(A[:, :], x[:])
92-
""", [
93-
lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
94-
lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
95-
lp.GlobalArg("y", shape=(n, )), ...],
96-
target=CTarget())
97-
98-
knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
99-
print(lp.generate_code_v2(knl).device_code())
132+
for e
133+
for i1
134+
tmp1[i1] = 3*x[e, i1]
135+
end
136+
tmp2[:] = matvec(A[:, :], tmp1[:])
137+
for i2
138+
<> tmp3[i2] = 2 * tmp2[i2]
139+
out[e, i2] = tmp3[i2]
140+
end
141+
end
142+
""",
143+
kernel_data=[
144+
lp.TemporaryVariable("tmp1",
145+
shape=(4, ),
146+
dtype=None),
147+
lp.TemporaryVariable("tmp2",
148+
shape=(4, ),
149+
dtype=None),
150+
lp.GlobalArg("A",
151+
shape=(4, 4),
152+
dtype="float64"),
153+
lp.GlobalArg("x",
154+
shape=lp.auto,
155+
dtype="float64"),
156+
...],
157+
target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler),
158+
lang_version=(2018, 2))
159+
160+
knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec"))
161+
162+
for transform_func in [transform_1, transform_2]:
163+
knl = transform_func(knl)
164+
print("Generated code from '{transform_func.__name__} -----'")
165+
print(lp.generate_code_v2(knl).device_code())
166+
print(75 * "-")
167+
168+
# {{ verify the result is correct.
169+
170+
from numpy.random import default_rng
171+
172+
rng = default_rng(seed=0)
173+
a = rng.random((4, 4))
174+
x = rng.random((100, 4))
175+
176+
_, (out,) = knl(A=a, x=x)
177+
178+
np.testing.assert_allclose(6*np.einsum("ij,ej->ei",
179+
a, x),
180+
out)
181+
182+
# }}}
183+
184+
185+
if __name__ == "__main__":
186+
main()

test/test_target.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -737,23 +737,6 @@ def test_c_vector_extensions():
737737
print(lp.generate_code_v2(knl).device_code())
738738

739739

740-
def test_omp_simd_tag():
741-
knl = lp.make_kernel(
742-
"{[i]: 0<=i<16}",
743-
"""
744-
y[i] = 2 * x[i]
745-
""")
746-
747-
knl = lp.add_dtypes(knl, {"x": "float64"})
748-
knl = lp.split_iname(knl, "i", 4)
749-
knl = lp.tag_inames(knl, {"i_inner": lp.OpenMPSIMDTag()})
750-
751-
code_str = lp.generate_code_v2(knl).device_code()
752-
753-
assert any(line.strip() == "#pragma omp simd"
754-
for line in code_str.split("\n"))
755-
756-
757740
def test_vec_tag_with_omp_simd_fallback():
758741
knl = lp.make_kernel(
759742
"{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}",
@@ -764,11 +747,13 @@ def test_vec_tag_with_omp_simd_fallback():
764747
""",
765748
[lp.GlobalArg("x, y", shape=lp.auto, dtype=float)],
766749
seq_dependencies=True,
767-
target=lp.ExecutableCVectorExtensionsTarget())
750+
target=lp.ExecutableCVectorExtensionsTarget(
751+
lp.VectorizationFallback.OMP_SIMD)
752+
)
768753

769-
knl = lp.tag_inames(knl, {"j1": lp.VectorizeTag(lp.OpenMPSIMDTag()),
770-
"j2": lp.VectorizeTag(lp.OpenMPSIMDTag()),
771-
"j3": lp.VectorizeTag(lp.OpenMPSIMDTag())})
754+
knl = lp.tag_inames(knl, {"j1": "vec",
755+
"j2": "vec",
756+
"j3": "vec"})
772757
knl = lp.tag_array_axes(knl, "temp1,temp2", "vec")
773758

774759
code_str = lp.generate_code_v2(knl).device_code()
@@ -794,15 +779,16 @@ def test_vec_extensions_with_multiple_loopy_body_insns():
794779
end
795780
""",
796781
seq_dependencies=True,
797-
target=lp.ExecutableCVectorExtensionsTarget())
782+
target=lp.ExecutableCVectorExtensionsTarget(
783+
lp.VectorizationFallback.OMP_SIMD)
784+
)
798785

799786
knl = lp.add_dtypes(knl, {"dat0": "float64"})
800787
knl = lp.split_iname(knl, "n", 4, slabs=(1, 1),
801788
inner_iname="n_batch")
802789
knl = lp.privatize_temporaries_with_inames(knl, "n_batch")
803790
knl = lp.tag_array_axes(knl, "tmp", "vec")
804-
knl = lp.tag_inames(knl, {
805-
"n_batch": lp.VectorizeTag(lp.OpenMPSIMDTag())})
791+
knl = lp.tag_inames(knl, {"n_batch": "vec"})
806792

807793
_, (out,) = knl(N=100)
808794
np.testing.assert_allclose(out, 2*np.ones((100, 1)))

0 commit comments

Comments
 (0)