Skip to content

Commit 47c59b7

Browse files
committed
WIP: use LoopVectorization
Preliminary tests suggest ~4x speedups, not too shabby
1 parent 6b51789 commit 47c59b7

File tree

5 files changed

+142
-64
lines changed

5 files changed

+142
-64
lines changed

Project.toml

+5-1
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ author = ["Tim Holy <[email protected]>", "Jan Weidner <[email protected]>"]
44
version = "0.7.0"
55

66
[deps]
7+
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
78
CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"
89
ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
910
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
1011
FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd"
1112
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
1213
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
1314
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
15+
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
1416
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1517
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
1618
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -19,16 +21,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1921
TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac"
2022

2123
[compat]
24+
ArrayInterface = "3"
2225
CatIndices = "0.2"
2326
ComputationalResources = "0.3"
2427
DataStructures = "0.17.7, 0.18"
2528
FFTViews = "0.3"
2629
FFTW = "0.3, 1"
2730
ImageCore = "0.9"
31+
LoopVectorization = "0.12.75"
2832
OffsetArrays = "1.9"
2933
Reexport = "1.1"
3034
StaticArrays = "0.10, 0.11, 0.12, 1.0"
31-
TiledIteration = "0.2, 0.3"
35+
TiledIteration = "0.4"
3236
julia = "1"
3337

3438
[extras]

src/ImageFiltering.jl

+2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
1010
using OffsetArrays: IdentityUnitRange # using the one in OffsetArrays makes this work with multiple Julia versions
1111
using SparseArrays # only needed to fix an ambiguity in borderarray
1212
using Reexport
13+
using LoopVectorization
14+
using LoopVectorization.VectorizationBase
1315

1416
@reexport using OffsetArrays: centered # this method once lived here
1517

src/imfilter.jl

+108-63
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,10 @@ end
774774
function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N}, kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad, inds::Indices=axes(out)) where {S,T,N}
775775
kern = kernel[1]
776776
iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds)
777-
tmp = tile_allocate(filter_type(A, kernel), r.settings.tilesize, kernel)
777+
TTile, f = native_eltype(filter_type(A, kernel))
778+
@assert f == 1
779+
# @show r.settings.tilesize
780+
tmp = tile_allocate(TTile, r.settings.tilesize, kernel)
778781
_imfilter_tiled!(r, out, A, kernel, border, tmp, inds)
779782
out
780783
end
@@ -834,6 +837,7 @@ end
834837

835838
# Single-threaded, pair of kernels (with only one temporary buffer required)
836839
function _imfilter_tiled!(r::CPU1, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
840+
out, A, kernel = maybe_reinterpret(out, A, kernel)
837841
k1, k2 = kernel
838842
tile = tiles[1]
839843
indsk2, indstile = axes(k2), axes(tile)
@@ -850,6 +854,7 @@ end
850854

851855
# Multithreaded, pair of kernels
852856
function _imfilter_tiled!(r::CPUThreads, out, A, kernel::Tuple{Any,Any}, border::NoPad, tiles::Vector{AA}, indsout) where AA<:AbstractArray
857+
out, A, kernel = maybe_reinterpret(out, A, kernel)
853858
k1, k2 = kernel
854859
tile = tiles[1]
855860
indsk2, indstile = axes(k2), axes(tile)
@@ -908,10 +913,8 @@ end
908913
out
909914
end
910915

911-
# The first of the pair in `tmp` has the current data. We also make
912-
# the second a plain array so there's no doubt about who's holding the
913-
# proper indices.
914-
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,Array})
916+
# The first of the pair in `tmp` has the current data.
917+
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, border, tmp::Tuple{TileBuffer,AbstractArray})
915918
tileb1, tile2 = tmp
916919
k1, kt = kernel[1], tail(kernel)
917920
parentinds = axes(tileb1)
@@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
922925
end
923926

924927
# on the last call we write to `out` instead of one of the buffers
925-
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,Array})
928+
function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any}, border, tmp::Tuple{TileBuffer,AbstractArray})
926929
tileb1 = tmp[1]
927930
k1 = kernel[1]
928931
parentinds = axes(tileb1)
@@ -1014,26 +1017,26 @@ end
10141017
# This is unfortunate, but specializing this saves an add in the inner
10151018
# loop and results in a modest performance improvement. It would be
10161019
# nice if LLVM did this automatically. (@polly?)
1017-
function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
1018-
off, k = CartesianIndex(kern.offsets), parent(kern)
1019-
o, O = safehead(off), safetail(off)
1020-
Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
1021-
Rk = CartesianIndices(axes(k))
1022-
offA, pA = CartesianIndex(A.offsets), parent(A)
1023-
oA, OA = safehead(offA), safetail(offA)
1024-
for I in safetail(Rnew)
1025-
IA = I-OA
1026-
for i in safehead(Rnew)
1027-
tmp = z
1028-
iA = i-oA
1029-
@inbounds for J in safetail(Rk), j in safehead(Rk)
1030-
tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
1031-
end
1032-
@inbounds out[i-o,I-O] = tmp
1033-
end
1034-
end
1035-
out
1036-
end
1020+
# function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
1021+
# off, k = CartesianIndex(kern.offsets), parent(kern)
1022+
# o, O = safehead(off), safetail(off)
1023+
# Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
1024+
# Rk = CartesianIndices(axes(k))
1025+
# offA, pA = CartesianIndex(A.offsets), parent(A)
1026+
# oA, OA = safehead(offA), safetail(offA)
1027+
# for I in safetail(Rnew)
1028+
# IA = I-OA
1029+
# for i in safehead(Rnew)
1030+
# tmp = z
1031+
# iA = i-oA
1032+
# @inbounds for J in safetail(Rk), j in safehead(Rk)
1033+
# tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
1034+
# end
1035+
# @inbounds out[i-o,I-O] = tmp
1036+
# end
1037+
# end
1038+
# out
1039+
# end
10371040

10381041
function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::ReshapedOneD, border::NoPad, inds)
10391042
Rpre, ind, Rpost = iterdims(inds, kern)
@@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
10431046
return out
10441047
end
10451048
p = accumfilter(A[first(R)+first(Rk)], first(k))
1046-
z = zero(typeof(p+p))
1049+
z = float(zero(eltype(A)))
1050+
# z = zero(typeof(p+p))
10471051
_imfilter_inbounds!(r, z, out, A, k, Rpre, ind, Rpost)
10481052
end
10491053

1050-
# Many of the following are unfortunate specializations
1051-
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
1052-
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
1053-
end
1054+
# # Many of the following are unfortunate specializations
1055+
# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
1056+
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
1057+
# end
10541058

1055-
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, koffset=0)
1059+
# LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
1060+
# @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
1061+
# Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
1062+
# end
1063+
# function VectorizationBase._vload_unroll(
1064+
# sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
1065+
# ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
1066+
# VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
1067+
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1068+
# VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
1069+
# VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
1070+
# VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
1071+
# VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1072+
1073+
const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N}
1074+
1075+
const args = Ref{Any}()
1076+
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
1077+
if !LoopVectorization.check_args(out, A)
1078+
@show summary(out) summary(A)
1079+
args[] = (deepcopy(out), deepcopy(A))
1080+
error("this should have worked")
1081+
end
10561082
indsk = axes(k, 1)
1083+
zout = convert(eltype(out), z)
10571084
for Ipost in Rpost
10581085
for i in ind
1059-
ik = i+koffset
1060-
for Ipre in Rpre
1061-
tmp = z
1086+
@turbo for Ipre in Rpre
1087+
tmp = zout
10621088
for j in indsk
1063-
@inbounds tmp += safe_for_prod(A[Ipre,ik+j,Ipost], tmp)*k[j]
1089+
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
10641090
end
1065-
@inbounds out[Ipre,i,Ipost] = tmp
1091+
out[Ipre,i,Ipost] = tmp
10661092
end
10671093
end
10681094
end
10691095
out
10701096
end
10711097

1072-
function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
1073-
Rpre, ind, Rpost = iterdims(inds, kern)
1074-
k = kern.data
1075-
R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
1076-
if isempty(R) || isempty(Rk)
1077-
return out
1078-
end
1079-
p = accumfilter(A[first(R)+first(Rk)], first(k))
1080-
z = zero(typeof(p+p))
1081-
Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
1082-
_imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
1083-
end
1084-
1085-
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1086-
_imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
1087-
end
1088-
1089-
function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost, koffset=0)
1098+
# No @turbo version
1099+
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
10901100
indsk = axes(k, 1)
1101+
zout = convert(eltype(out), z)
10911102
for Ipost in Rpost
1092-
IOpost = Ipost - Opost
10931103
for i in ind
1094-
io = i-o+koffset
1095-
for Ipre in Rpre
1096-
IOpre = Ipre - Opre
1097-
tmp = z
1104+
@inbounds for Ipre in Rpre
1105+
tmp = zout
10981106
for j in indsk
1099-
@inbounds tmp += safe_for_prod(A[IOpre,io+j,IOpost], tmp)*k[j]
1107+
tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
11001108
end
1101-
@inbounds out[Ipre,i,Ipost] = tmp
1109+
out[Ipre,i,Ipost] = tmp
11021110
end
11031111
end
11041112
end
11051113
out
11061114
end
1107-
# end unfortunate specializations
1115+
1116+
# function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
1117+
# Rpre, ind, Rpost = iterdims(inds, kern)
1118+
# k = kern.data
1119+
# R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
1120+
# if isempty(R) || isempty(Rk)
1121+
# return out
1122+
# end
1123+
# p = accumfilter(A[first(R)+first(Rk)], first(k))
1124+
# z = zero(typeof(p+p))
1125+
# Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
1126+
# _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
1127+
# end
1128+
1129+
# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1130+
# _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
1131+
# end
1132+
1133+
# function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1134+
# indsk = axes(k, 1)
1135+
# zout = convert(eltype(out), z)
1136+
# for Ipost in Rpost
1137+
# IOpost = Ipost - Opost
1138+
# for i in ind
1139+
# io = i-o+koffset
1140+
# @turbo for Ipre in Rpre
1141+
# IOpre = Ipre - Opre
1142+
# tmp = zout
1143+
# for j in indsk
1144+
# tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
1145+
# end
1146+
# @inbounds out[Ipre,i,Ipost] = tmp
1147+
# end
1148+
# end
1149+
# end
1150+
# out
1151+
# end
1152+
# # end unfortunate specializations
11081153

11091154
## commented out because "virtual padding" is commented out
11101155
# function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)

src/kernelfactors.jl

+3
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ for op in (:+, :-, :*, :/)
111111
end
112112
Base.BroadcastStyle(::Type{R}) where {R<:ReshapedOneD{T,N}} where {T,N} = Broadcast.DefaultArrayStyle{N}()
113113

114+
Base.:/(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data / x)
115+
Base.:*(A::ReshapedOneD{T,N,Npre}, x::Real) where {T,N,Npre} = ReshapedOneD{N,Npre}(A.data * x)
116+
114117
_reshape(A::ReshapedOneD{T,N}, ::Val{N}) where {T,N} = A
115118
_vec(A::ReshapedOneD) = A.data
116119
Base.vec(A::ReshapedOneD) = A.data # is this OK? (note indices won't nec. start with 1)

src/utils.jl

+24
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,27 @@ accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filt
131131
# In theory, the following might need to be specialized. For safety, make it a
132132
# standalone function call.
133133
safe_for_prod(x, ref) = oftype(ref, x)
134+
135+
# For LoopVectorization, the easiest path is to convert to native types
136+
native_eltype(::Type{T}) where T<:VectorizationBase.NativeTypes = T, 1
137+
native_eltype(::Type{T}) where T<:FixedPoint = FixedPointNumbers.rawtype(T), FixedPointNumbers.rawone(T)
138+
native_eltype(::Type{Gray{T}}) where T = native_eltype(T)
139+
function native_eltype(::Type{RGB{T}}) where T
140+
T′, f = native_eltype(T)
141+
return SVector{3,T′}, f
142+
end
143+
native_eltype(::Type{T}) where T = T, 1
144+
145+
function maybe_reinterpret(out::AbstractArray, A::AbstractArray, kernel::Tuple{Any,Vararg{Any}})
146+
Tout = eltype(out)
147+
Tout′, outf = native_eltype(Tout)
148+
TA = eltype(A)
149+
TA′, Af = native_eltype(TA)
150+
if Tout′ !== Tout || TA′ !== TA
151+
kernel′ = ((kernel[1]/Af)*outf, Base.tail(kernel)...)
152+
return (Tout === Tout′ ? out : reinterpret(reshape, Tout′, out),
153+
TA === TA′ ? A : reinterpret(reshape, TA′, A),
154+
kernel′)
155+
end
156+
return out, A, kernel
157+
end

0 commit comments

Comments
 (0)