774
774
function imfilter!(r::AbstractCPU{FIRTiled{N}}, out::AbstractArray{S,N}, A::AbstractArray{T,N},
                   kernel::Tuple{Any,Any,Vararg{Any}}, border::NoPad,
                   inds::Indices=axes(out)) where {S,T,N}
    # Tiled FIR filtering with a factored kernel (two or more stages).
    kern = kernel[1]
    # A copy-kernel stage contributes nothing: drop it and filter with the rest.
    iscopy(kern) && return imfilter!(r, out, A, tail(kernel), border, inds)
    # Allocate the tile buffers in the "native" element type so the tiled inner
    # kernels can be vectorized. `f` appears to be a scale factor relating the
    # native representation to the original element type — TODO confirm against
    # `native_eltype`'s contract.
    TTile, f = native_eltype(filter_type(A, kernel))
    # A non-unit factor would silently produce wrongly-scaled results, so fail
    # loudly. (An @assert is not sufficient here: asserts may be disabled at
    # higher optimization levels.)
    f == 1 || error("unsupported element type: native_eltype returned scale factor $f (expected 1)")
    tmp = tile_allocate(TTile, r.settings.tilesize, kernel)
    _imfilter_tiled!(r, out, A, kernel, border, tmp, inds)
    out
end
834
837
835
838
# Single-threaded, pair of kernels (with only one temporary buffer required)
836
839
function _imfilter_tiled! (r:: CPU1 , out, A, kernel:: Tuple{Any,Any} , border:: NoPad , tiles:: Vector{AA} , indsout) where AA<: AbstractArray
840
+ out, A, kernel = maybe_reinterpret (out, A, kernel)
837
841
k1, k2 = kernel
838
842
tile = tiles[1 ]
839
843
indsk2, indstile = axes (k2), axes (tile)
850
854
851
855
# Multithreaded, pair of kernels
852
856
function _imfilter_tiled! (r:: CPUThreads , out, A, kernel:: Tuple{Any,Any} , border:: NoPad , tiles:: Vector{AA} , indsout) where AA<: AbstractArray
857
+ out, A, kernel = maybe_reinterpret (out, A, kernel)
853
858
k1, k2 = kernel
854
859
tile = tiles[1 ]
855
860
indsk2, indstile = axes (k2), axes (tile)
908
913
out
909
914
end
910
915
911
- # The first of the pair in `tmp` has the current data. We also make
912
- # the second a plain array so there's no doubt about who's holding the
913
- # proper indices.
914
- function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any,Any,Vararg{Any}} , border, tmp:: Tuple{TileBuffer,Array} )
916
+ # The first of the pair in `tmp` has the current data.
917
+ function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any,Any,Vararg{Any}} , border, tmp:: Tuple{TileBuffer,AbstractArray} )
915
918
tileb1, tile2 = tmp
916
919
k1, kt = kernel[1 ], tail (kernel)
917
920
parentinds = axes (tileb1)
@@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
922
925
end
923
926
924
927
# on the last call we write to `out` instead of one of the buffers
925
- function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any} , border, tmp:: Tuple{TileBuffer,Array } )
928
+ function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any} , border, tmp:: Tuple{TileBuffer,AbstractArray } )
926
929
tileb1 = tmp[1 ]
927
930
k1 = kernel[1 ]
928
931
parentinds = axes (tileb1)
@@ -1014,26 +1017,26 @@ end
1014
1017
# This is unfortunate, but specializing this saves an add in the inner
1015
1018
# loop and results in a modest performance improvement. It would be
1016
1019
# nice if LLVM did this automatically. (@polly?)
1017
- function __imfilter_inbounds! (r, out, A:: OffsetArray , kern:: OffsetArray , border, R, z)
1018
- off, k = CartesianIndex (kern. offsets), parent (kern)
1019
- o, O = safehead (off), safetail (off)
1020
- Rnew = CartesianIndices (map ((x,y)-> x.+ y, R. indices, Tuple (off)))
1021
- Rk = CartesianIndices (axes (k))
1022
- offA, pA = CartesianIndex (A. offsets), parent (A)
1023
- oA, OA = safehead (offA), safetail (offA)
1024
- for I in safetail (Rnew)
1025
- IA = I- OA
1026
- for i in safehead (Rnew)
1027
- tmp = z
1028
- iA = i- oA
1029
- @inbounds for J in safetail (Rk), j in safehead (Rk)
1030
- tmp += safe_for_prod (pA[iA+ j,IA+ J], tmp)* k[j,J]
1031
- end
1032
- @inbounds out[i- o,I- O] = tmp
1033
- end
1034
- end
1035
- out
1036
- end
1020
+ # function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
1021
+ # off, k = CartesianIndex(kern.offsets), parent(kern)
1022
+ # o, O = safehead(off), safetail(off)
1023
+ # Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
1024
+ # Rk = CartesianIndices(axes(k))
1025
+ # offA, pA = CartesianIndex(A.offsets), parent(A)
1026
+ # oA, OA = safehead(offA), safetail(offA)
1027
+ # for I in safetail(Rnew)
1028
+ # IA = I-OA
1029
+ # for i in safehead(Rnew)
1030
+ # tmp = z
1031
+ # iA = i-oA
1032
+ # @inbounds for J in safetail(Rk), j in safehead(Rk)
1033
+ # tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
1034
+ # end
1035
+ # @inbounds out[i-o,I-O] = tmp
1036
+ # end
1037
+ # end
1038
+ # out
1039
+ # end
1037
1040
1038
1041
function _imfilter_inbounds! (r:: AbstractResource , out, A:: AbstractArray , kern:: ReshapedOneD , border:: NoPad , inds)
1039
1042
Rpre, ind, Rpost = iterdims (inds, kern)
@@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
1043
1046
return out
1044
1047
end
1045
1048
p = accumfilter (A[first (R)+ first (Rk)], first (k))
1046
- z = zero (typeof (p+ p))
1049
+ z = float (zero (eltype (A)))
1050
+ # z = zero(typeof(p+p))
1047
1051
_imfilter_inbounds! (r, z, out, A, k, Rpre, ind, Rpost)
1048
1052
end
1049
1053
1050
- # Many of the following are unfortunate specializations
1051
- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: OffsetVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices )
1052
- _imfilter_inbounds! (r, z, out, A, parent (k), Rpre, ind, Rpost, k. offsets[1 ])
1053
- end
1054
+ # # Many of the following are unfortunate specializations
1055
+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
1056
+ # _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
1057
+ # end
1054
1058
1055
- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , koffset= 0 )
1059
+ # LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
1060
+ # @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
1061
+ # Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
1062
+ # end
1063
+ # function VectorizationBase._vload_unroll(
1064
+ # sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
1065
+ # ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
1066
+ # VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
1067
+ # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1068
+ # VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
1069
+ # VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
1070
+ # VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
1071
+ # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1072
+
1073
# Element types the `@turbo`-accelerated methods accept: hardware-native numbers
# (VectorizationBase.NativeTypes) or StaticArrays.SVectors of them.
const LVTypes = Union{VectorizationBase.NativeTypes, SVector{N,<:VectorizationBase.NativeTypes} where N}
1074
+
1075
+ const args = Ref {Any} ()
1076
# `@turbo`-accelerated 1-D filtering along one dimension, applicable when both
# `out` and `A` have LoopVectorization-compatible element types (see `LVTypes`).
# `z` is the zero used to seed each accumulation; `Rpre`/`ind`/`Rpost` split the
# output indices around the filtered dimension.
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes},
                             k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
    if !LoopVectorization.check_args(out, A)
        # These element types were expected to be LV-compatible; report the
        # offending arguments instead of silently computing garbage.
        # (Replaces debug scaffolding that @show-ed the summaries, stashed
        # deep copies in the `args` global, and threw "this should have worked".)
        error("LoopVectorization rejected the arguments: out = ", summary(out), ", A = ", summary(A))
    end
    indsk = axes(k, 1)
    # Seed the accumulator in the output's element type for type-stable accumulation.
    zout = convert(eltype(out), z)
    for Ipost in Rpost
        for i in ind
            @turbo for Ipre in Rpre
                tmp = zout
                for j in indsk
                    tmp += safe_for_prod(A[Ipre, i+j, Ipost], z) * k[j]
                end
                out[Ipre, i, Ipost] = tmp
            end
        end
    end
    out
end
1071
1097
1072
- function _imfilter_inbounds! (r:: AbstractResource , out, A:: OffsetArray , kern:: ReshapedVector , border:: NoPad , inds)
1073
- Rpre, ind, Rpost = iterdims (inds, kern)
1074
- k = kern. data
1075
- R, Rk = CartesianIndices (inds), CartesianIndices (axes (kern))
1076
- if isempty (R) || isempty (Rk)
1077
- return out
1078
- end
1079
- p = accumfilter (A[first (R)+ first (Rk)], first (k))
1080
- z = zero (typeof (p+ p))
1081
- Opre, o, Opost = KernelFactors. indexsplit (CartesianIndex (A. offsets), kern)
1082
- _imfilter_inbounds! (r, z, out, parent (A), k, Rpre, ind, Rpost, Opre, o, Opost)
1083
- end
1084
-
1085
- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: OffsetVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , Opre, o, Opost)
1086
- _imfilter_inbounds! (r, z, out, A, parent (k), Rpre, ind, Rpost, Opre, o, Opost, k. offsets[1 ])
1087
- end
1088
-
1089
- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , Opre, o, Opost, koffset= 0 )
1098
# No @turbo version
# NOTE(review): this method's signature is IDENTICAL to the `@turbo` method
# above, so whichever definition is evaluated last silently replaces the other.
# Presumably only one of the two is meant to be active at a time — confirm
# which should be enabled, or give them distinct dispatch types.
function _imfilter_inbounds!(r::AbstractResource, z, out::AbstractArray{<:LVTypes}, A::AbstractArray{<:LVTypes}, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
    indsk = axes(k, 1)
    # Seed the accumulator in the output's element type for type-stable accumulation.
    zout = convert(eltype(out), z)
    for Ipost in Rpost
        for i in ind
            @inbounds for Ipre in Rpre
                tmp = zout
                for j in indsk
                    # `z` is passed to safe_for_prod only to select the promotion
                    # behavior; the running value is `tmp`.
                    tmp += safe_for_prod(A[Ipre,i+j,Ipost], z)*k[j]
                end
                out[Ipre,i,Ipost] = tmp
            end
        end
    end
    out
end
1107
- # end unfortunate specializations
1115
+
1116
+ # function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
1117
+ # Rpre, ind, Rpost = iterdims(inds, kern)
1118
+ # k = kern.data
1119
+ # R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
1120
+ # if isempty(R) || isempty(Rk)
1121
+ # return out
1122
+ # end
1123
+ # p = accumfilter(A[first(R)+first(Rk)], first(k))
1124
+ # z = zero(typeof(p+p))
1125
+ # Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
1126
+ # _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
1127
+ # end
1128
+
1129
+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1130
+ # _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
1131
+ # end
1132
+
1133
+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1134
+ # indsk = axes(k, 1)
1135
+ # zout = convert(eltype(out), z)
1136
+ # for Ipost in Rpost
1137
+ # IOpost = Ipost - Opost
1138
+ # for i in ind
1139
+ # io = i-o+koffset
1140
+ # @turbo for Ipre in Rpre
1141
+ # IOpre = Ipre - Opre
1142
+ # tmp = zout
1143
+ # for j in indsk
1144
+ # tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
1145
+ # end
1146
+ # @inbounds out[Ipre,i,Ipost] = tmp
1147
+ # end
1148
+ # end
1149
+ # end
1150
+ # out
1151
+ # end
1152
+ # # end unfortunate specializations
1108
1153
1109
1154
# # commented out because "virtual padding" is commented out
1110
1155
# function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)
0 commit comments