Open
Description
We are using `@turbo` extensively in Trixi.jl.
Recently, we have started running our code on ARM-based machines and encountered the following error:
LoadError: MethodError: no method matching _vstore_unroll!(::LayoutPointers.StridedPointer{Float64, 4, 1, 0, (1, 2, 3, 4), Tuple{Static.StaticInt{8}, Static.StaticInt{8}, Static.StaticInt{40}, Static.StaticInt{200}}, NTuple{4, Static.StaticInt{0}}}, ::VectorizationBase.VecUnroll{4, 1, Float64, VectorizationBase.VecUnroll{4, 1, Float64, Float64}}, ::VectorizationBase.Unroll{2, 1, 5, 1, 1, 0x0000000000000000, 1, VectorizationBase.Unroll{4, 1, 5, 1, 1, 0x0000000000000000, 1, Static.StaticInt{0}}}, ::Static.False, ::Static.False, ::Static.False, ::Static.StaticInt{16}, ::Static.StaticInt{8})
Closest candidates are:
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::M, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS, M}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2552
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T, <:VectorizationBase.VecUnroll{<:Any, W, T, VectorizationBase.Vec{W, T}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::Static.StaticInt{SVUS}) where {W, T, A<:Static.StaticBool, S<:Static.StaticBool, NT<:Static.StaticBool, RS, D, C, SVUS, UU<:(VectorizationBase.Unroll{AUO, FO, NO, AV, W, MO, X, VectorizationBase.Unroll{AUI, FI, NI, AV, W, MI, X, I}} where {AV, X, I, AUO, FO, NO, MO, AUI, FI, NI, MI})}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2575
_vstore_unroll!(::LayoutPointers.AbstractStridedPointer{T1, D, C, B, R, X, O} where {B, R, X<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}, O<:Tuple{Vararg{Union{Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, D}}}, ::VectorizationBase.VecUnroll{<:Any, W, T2, <:VectorizationBase.VecUnroll{<:Any, W, T2, VectorizationBase.Vec{W, T2}}}, ::UU, ::A, ::S, ::NT, ::Static.StaticInt{RS}, ::SVUS) where {T1, D, C, W, T2, UU, A, S, NT, RS, SVUS}
@ VectorizationBase ~/.julia/packages/VectorizationBase/LqJbS/src/vecunroll/memory.jl:2531
...
This was caused by one of our helper functions, which basically does a specialized matrix-vector multiplication. We were able to reproduce this issue with this example:
MWE
using StaticArrays
using StrideArrays: PtrArray, StaticInt
using LoopVectorization: @turbo
# Reduced reproducer kernel: for every index (v, i, j, k) of the 4-D array
# `data_out`, contract row `k` of `matrix` with the last dimension of a scratch
# array `tmp` and store the result in `data_out[v, i, j, k]`.
#
# Arguments:
#   data_out - 4-D array that is overwritten in place.
#   matrix   - 2-D array indexed as `matrix[k, kk]`.
# Returns `nothing`.
#
# NOTE(review): do not restructure this function; it is a minimal example and
# the loop order / indexing pattern is presumably what triggers the reported
# error.
function multiply_dimensionwise!(data_out, matrix)
# Zero-filled scratch input of size
# (size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2)).
tmp = zeros(eltype(data_out), size(data_out, 1), size(matrix, 1), size(matrix, 2), size(matrix, 2))
# One @turbo loop nest over all four axes of `data_out`; `v` (first axis) is
# the innermost of the four loops.
@turbo for k in axes(data_out, 4), j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
# Accumulator with the element type of the output.
res = zero(eltype(data_out))
# Reduction over the second axis of `matrix`.
for kk in axes(matrix, 2)
res += matrix[k, kk] * tmp[v, i, j, kk]
end
# The only store inside the @turbo region; the reported MethodError is for
# `_vstore_unroll!`, so this is presumably the statement that fails to lower.
data_out[v, i, j, k] = res
end
return nothing
end
# Driver for the reproducer: builds a statically sized PtrArray and a static
# matrix, then calls `multiply_dimensionwise!` on a 4-D view of the PtrArray.
# Number of spatial dimensions; sets how many `nodes`-sized axes the array has.
dims = 3
nodes = 5 # important!
# Size of the trailing (non-static) axis.
els = 1
# Backing storage: nodes^dims * els entries, all 2.0.
test_u = fill(2.0, nodes^dims * els)
# Wrap the storage as an array of size (1, nodes, nodes, nodes, els); the first
# four sizes are `StaticInt`s, the last one (`els`) is a plain integer.
test_ptr = PtrArray(pointer(test_u), (StaticInt(1), ntuple(_ -> StaticInt(nodes), dims)..., els))
# nodes x nodes matrix of ones, then converted to a statically sized SMatrix.
test_mat = fill(1.0, nodes, nodes)
test_smat = SMatrix{nodes, nodes}(test_mat)
# Fix the last index to 1 so the kernel receives a 4-D array; per the report,
# this call raises the MethodError shown above on ARM-based machines.
multiply_dimensionwise!(view(test_ptr, :, :, :, :, 1), test_smat)
Metadata
Metadata
Assignees
Labels
No labels