diff --git a/docs/src/interface.md b/docs/src/interface.md index 01c0a3c9..44f62405 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -13,8 +13,6 @@ all, you need to provide a type that represents your execution back-end and a wa kernels: ```@docs -GPUArrays.AbstractGPUBackend -GPUArrays.AbstractKernelContext GPUArrays.gpu_call GPUArrays.thread_block_heuristic ``` diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 3f80f90a..ccfe1f8a 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -18,9 +18,9 @@ using Reexport include("device/execution.jl") ## executed on-device include("device/abstractarray.jl") -include("device/indexing.jl") +#include("device/indexing.jl") include("device/memory.jl") -include("device/synchronization.jl") +#include("device/synchronization.jl") using KernelAbstractions # host abstractions @@ -28,7 +28,7 @@ include("host/abstractarray.jl") include("host/construction.jl") ## integrations and specialized methods include("host/base.jl") -include("host/indexing.jl") +#include("host/indexing.jl") include("host/broadcast.jl") include("host/mapreduce.jl") include("host/linalg.jl") diff --git a/src/device/indexing.jl b/src/device/indexing.jl deleted file mode 100644 index b0b9990d..00000000 --- a/src/device/indexing.jl +++ /dev/null @@ -1,83 +0,0 @@ -# indexing - -export global_index, global_size, linear_index, @linearidx, @cartesianidx - - -## hardware - -for f in (:blockidx, :blockdim, :threadidx, :griddim) - @eval $f(ctx::AbstractKernelContext)::Int = error("Not implemented") # COV_EXCL_LINE - @eval export $f -end - -""" - global_index(ctx::AbstractKernelContext) - -Query the global index of the current thread in the launch configuration (i.e. as far as the -hardware is concerned). -""" -@inline function global_index(ctx::AbstractKernelContext) - threadidx(ctx) + (blockidx(ctx) - 1) * blockdim(ctx) -end - -""" - global_size(ctx::AbstractKernelContext) - -Query the global size of the launch configuration (total number of threads launched). -""" -@inline function global_size(ctx::AbstractKernelContext) - griddim(ctx) * blockdim(ctx) -end - - -## logical - -""" - linear_index(ctx::AbstractKernelContext, grididx::Int=1) - -Return a linear index for the current kernel by querying hardware registers (similar to -`get_global_id` in OpenCL). For applying a grid stride (in terms of [`global_size`](@ref)), -specify `grididx`. - -""" -@inline function linear_index(ctx::AbstractKernelContext, grididx::Int=1) - global_index(ctx) + (grididx - 1) * global_size(ctx) -end - -""" - linearidx(A, grididx=1, ctxsym=:ctx) - -Macro form of [`linear_index`](@ref), which return from the surrouunding scope when out of -bounds: - - ```julia - function kernel(ctx::AbstractKernelContext, A) - idx = @linearidx A - # from here on it's safe to index into A with idx - @inbounds begin - A[idx] = ... - end - end - ``` -""" -macro linearidx(A, grididx=1, ctxsym=:ctx) - quote - x = $(esc(A)) - i = linear_index($(esc(ctxsym)), $(esc(grididx))) - i > length(x) && return - i - end -end - -""" - cartesianidx(A, grididx=1, ctxsym=:ctx) - -Like [`@linearidx`](@ref), but returns a N-dimensional `CartesianIndex`. -""" -macro cartesianidx(A, grididx=1, ctxsym=:ctx) - quote - x = $(esc(A)) - i = @linearidx(x, $(esc(grididx)), $(esc(ctxsym))) - @inbounds CartesianIndices(x)[i] - end -end diff --git a/src/device/synchronization.jl b/src/device/synchronization.jl deleted file mode 100644 index b16d2518..00000000 --- a/src/device/synchronization.jl +++ /dev/null @@ -1,13 +0,0 @@ -# synchronization - -export synchronize_threads - -""" - synchronize_threads(ctx::AbstractKernelContext) - -in CUDA terms `__synchronize` -in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)` -""" -function synchronize_threads(ctx::AbstractKernelContext) - error("Not implemented") # COV_EXCL_LINE -end diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl index 05e0e8d6..6387f006 100644 --- a/src/host/abstractarray.jl +++ b/src/host/abstractarray.jl @@ -147,7 +147,7 @@ end ## generalized blocks of heterogeneous memory -@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets) +@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets) I = @index(Global, Cartesian) @inbounds dest[I + dest_offsets] = src[I + src_offsets] end diff --git a/src/host/base.jl b/src/host/base.jl index 96fe3fd4..0feeb30e 100644 --- a/src/host/base.jl +++ b/src/host/base.jl @@ -4,8 +4,7 @@ import Base: _RepeatInnerOuter # Handle `out = repeat(x; inner)` by parallelizing over `out` array This can benchmark # faster if repeating elements along the first axis (i.e. `inner=(n, ones...)`), as data # access can be contiguous on write. -function repeat_inner_dst_kernel!( - ctx::AbstractKernelContext, +@kernel function repeat_inner_dst_kernel!( xs::AbstractArray{<:Any, N}, inner::NTuple{N, Int}, out::AbstractArray{<:Any, N} @@ -13,13 +12,12 @@ function repeat_inner_dst_kernel!( # Get the "stride" index in each dimension, where the size # of the stride is given by `inner`. The stride-index (sdx) then # corresponds to the index of the repeated value in `xs`. - odx = @cartesianidx out + odx = @index(Global, Cartesian) dest_inds = odx.I sdx = ntuple(N) do i @inbounds (dest_inds[i] - 1) ÷ inner[i] + 1 end @inbounds out[odx] = xs[CartesianIndex(sdx)] - return nothing end # Handle `out = repeat(x; inner)` by parallelizing over the `xs` array This tends to @@ -90,8 +88,6 @@ end end @inbounds out[CartesianIndex(odx)] = val end - - return nothing end function repeat_outer(xs::AnyGPUArray, outer) diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index 9a44fb83..0e2f5563 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -51,23 +51,22 @@ end bc′ = Broadcast.preprocess(dest, bc) # grid-stride kernel - function broadcast_kernel(ctx, dest, bc′, nelem) + @kernel function broadcast_kernel(dest, bc′, nelem) i = 0 + I = @index(Global, Linear) while i < nelem i += 1 - I = @cartesianidx(dest, i) - @inbounds dest[I] = bc′[I] + idx = CartesianIndices(dest)[(I-1)*nelem + i] + @inbounds dest[idx] = bc′[idx] end - return end elements = length(dest) elements_per_thread = typemax(Int) - heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1; - elements, elements_per_thread) - config = launch_configuration(backend(dest), heuristic; - elements, elements_per_thread) - gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread; - threads=config.threads, blocks=config.blocks) + # TODO: figure out actual arguments, 3 should be workgroupsize + config = KernelAbstractions.launch_config(broadcast_kernel, elements, + elements/elements_per_thread) + kernel! = broadcast_kernel(get_backend(dest), config.threads) + kernel!(dest, bc', nelem, ndrange = config.ndrange) return dest end diff --git a/src/host/construction.jl b/src/host/construction.jl index 482d50e9..39d3aecf 100644 --- a/src/host/construction.jl +++ b/src/host/construction.jl @@ -11,8 +11,8 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T function Base.fill!(A::AnyGPUArray{T}, x) where T length(A) == 0 && return A - @kernel fill!(a, val) - idx = @index(Linear, Global) + @kernel function fill!(a, val) + idx = @index(Global, Linear) @inbounds a[idx] = val end kernel = fill!(backend(A)) @@ -23,11 +23,12 @@ end ## identity matrices -@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T +@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T i = @index(Global, Linear) ilin = (stride * (i - 1)) + i - ilin > length(res) && return - @inbounds res[ilin] = val + if ilin < length(res) + @inbounds res[ilin] = val + end end function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U} diff --git a/src/host/indexing.jl b/src/host/indexing.jl deleted file mode 100644 index 34fc4084..00000000 --- a/src/host/indexing.jl +++ /dev/null @@ -1,182 +0,0 @@ -# host-level indexing - - -# basic indexing with integers - -Base.IndexStyle(::Type{<:AbstractGPUArray}) = Base.IndexLinear() - -function Base.getindex(xs::AbstractGPUArray{T}, I::Integer...) where T - assertscalar("getindex") - i = Base._to_linear_index(xs, I...) - x = Array{T}(undef, 1) - copyto!(x, 1, xs, i, 1) - return x[1] -end - -function Base.setindex!(xs::AbstractGPUArray{T}, v::T, I::Integer...) where T - assertscalar("setindex!") - i = Base._to_linear_index(xs, I...) - x = T[v] - copyto!(xs, i, x, 1, 1) - return xs -end - -Base.setindex!(xs::AbstractGPUArray, v, I::Integer...) = - setindex!(xs, convert(eltype(xs), v), I...) - - -# basic indexing with cartesian indices - -Base.@propagate_inbounds Base.getindex(A::AbstractGPUArray, I::Union{Integer, CartesianIndex}...) = - A[Base.to_indices(A, I)...] -Base.@propagate_inbounds Base.setindex!(A::AbstractGPUArray, v, I::Union{Integer, CartesianIndex}...) = - (A[Base.to_indices(A, I)...] = v; A) - - -# generalized multidimensional indexing - -Base.getindex(A::AbstractGPUArray, I...) = _getindex(A, to_indices(A, I)...) - -function _getindex(src::AbstractGPUArray, Is...) - shape = Base.index_shape(Is...) - dest = similar(src, shape) - any(isempty, Is) && return dest # indexing with empty array - idims = map(length, Is) - - AT = typeof(src).name.wrapper - # NOTE: we are pretty liberal here supporting non-GPU indices... - gpu_call(getindex_kernel, dest, src, idims, adapt(AT, Is)...) - return dest -end - -@generated function getindex_kernel(ctx::AbstractKernelContext, dest, src, idims, - Is::Vararg{Any,N}) where {N} - quote - i = @linearidx dest - is = @inbounds CartesianIndices(idims)[i] - @nexprs $N i -> I_i = @inbounds(Is[i][is[i]]) - val = @ncall $N getindex src i -> I_i - @inbounds dest[i] = val - return - end -end - -Base.setindex!(A::AbstractGPUArray, v, I...) = _setindex!(A, v, to_indices(A, I)...) - -function _setindex!(dest::AbstractGPUArray, src, Is...) - isempty(Is) && return dest - idims = length.(Is) - len = prod(idims) - len==0 && return dest - - AT = typeof(dest).name.wrapper - # NOTE: we are pretty liberal here supporting non-GPU sources and indices... - gpu_call(setindex_kernel, dest, adapt(AT, src), idims, len, adapt(AT, Is)...; - elements=len) - return dest -end - -@generated function setindex_kernel(ctx::AbstractKernelContext, dest, src, idims, len, - Is::Vararg{Any,N}) where {N} - quote - i = linear_index(ctx) - i > len && return - is = @inbounds CartesianIndices(idims)[i] - @nexprs $N i -> I_i = @inbounds(Is[i][is[i]]) - @ncall $N setindex! dest src[i] i -> I_i - return - end -end - - -## find* - -# simple array type that returns the index used to access an element, while -# retaining the dimensionality of the original array. this can be used to -# broadcast or reduce an array together with its indices, whereas normally -# combining e.g. a 2x2 array with its 4-element eachindex array would result -# in a 4x4 broadcast or reduction. -struct EachIndex{T,N,IS} <: AbstractArray{T,N} - dims::NTuple{N,Int} - indices::IS -end -EachIndex(xs::AbstractArray) = - EachIndex{typeof(firstindex(xs)), ndims(xs), typeof(eachindex(xs))}( - size(xs), eachindex(xs)) -Base.size(ei::EachIndex) = ei.dims -Base.getindex(ei::EachIndex, i::Int) = ei.indices[i] -Base.IndexStyle(::Type{<:EachIndex}) = Base.IndexLinear() - -function Base.findfirst(f::Function, xs::AnyGPUArray) - indices = EachIndex(xs) - dummy_index = first(indices) - - # given two pairs of (istrue, index), return the one with the smallest index - function reduction(t1, t2) - (x, i), (y, j) = t1, t2 - if i > j - t1, t2 = t2, t1 - (x, i), (y, j) = t1, t2 - end - x && return t1 - y && return t2 - return (false, dummy_index) - end - - res = mapreduce((x, y)->(f(x), y), reduction, xs, indices; - init = (false, dummy_index)) - if res[1] - # out of consistency with Base.findarray, return a CartesianIndex - # when the input is a multidimensional array - ndims(xs) == 1 && return res[2] - return CartesianIndices(xs)[res[2]] - else - return nothing - end -end - -Base.findfirst(xs::AnyGPUArray{Bool}) = findfirst(identity, xs) - -function findminmax(binop, xs::AnyGPUArray; init, dims) - indices = EachIndex(xs) - dummy_index = firstindex(xs) - - function reduction(t1, t2) - (x, i), (y, j) = t1, t2 - - binop(x, y) && return t2 - x == y && return (x, min(i, j)) - return t1 - end - - @static if VERSION < v"1.7.0-DEV.119" - # before JuliaLang/julia#35316, isless/isgreated did not order NaNs last - function reduction(t1::Tuple{<:AbstractFloat,<:Any}, t2::Tuple{<:AbstractFloat,<:Any}) - (x, i), (y, j) = t1, t2 - - isnan(x) && return t1 - isnan(y) && return t2 - - binop(x, y) && return t2 - x == y && return (x, min(i, j)) - return t1 - end - end - - if dims == Colon() - res = mapreduce(tuple, reduction, xs, indices; init = (init, dummy_index)) - - # out of consistency with Base.findarray, return a CartesianIndex - # when the input is a multidimensional array - return (res[1], ndims(xs) == 1 ? res[2] : CartesianIndices(xs)[res[2]]) - else - res = mapreduce(tuple, reduction, xs, indices; - init = (init, dummy_index), dims=dims) - vals = map(x->x[1], res) - inds = map(x->ndims(xs) == 1 ? x[2] : CartesianIndices(xs)[x[2]], res) - return (vals, inds) - end -end - -Base.findmax(a::AnyGPUArray; dims=:) = findminmax(Base.isless, a; init=typemin(eltype(a)), dims) -Base.findmin(a::AnyGPUArray; dims=:) = findminmax(Base.isgreater, a; init=typemax(eltype(a)), dims) diff --git a/src/host/linalg.jl b/src/host/linalg.jl index 64f49241..c7d44175 100644 --- a/src/host/linalg.jl +++ b/src/host/linalg.jl @@ -12,20 +12,20 @@ function LinearAlgebra.transpose!(B::AbstractGPUMatrix, A::AbstractGPUVector) end function LinearAlgebra.adjoint!(B::AbstractGPUVector, A::AbstractGPUMatrix) axes(B,1) == axes(A,2) && axes(A,1) == 1:1 || throw(DimensionMismatch("adjoint")) - gpu_call(B, A) do ctx, B, A - idx = @linearidx B + @kernel function adjoint_kernel!(B, A) + idx = @index(Global, Linear) @inbounds B[idx] = adjoint(A[1, idx]) - return end + adjoint_kernel!(backend(B))(B, A, ndrange = size(B)) B end function LinearAlgebra.adjoint!(B::AbstractGPUMatrix, A::AbstractGPUVector) axes(B,2) == axes(A,1) && axes(B,1) == 1:1 || throw(DimensionMismatch("adjoint")) - gpu_call(B, A) do ctx, B, A - idx = @linearidx A + @kernel function adjoint_kernel!(B, A) + idx = @index(Global, Linear) @inbounds B[1, idx] = adjoint(A[idx]) - return end + adjoint_kernel!(backend(A))(B, A, ndrange = size(A)) B end @@ -33,11 +33,11 @@ LinearAlgebra.transpose!(B::AbstractGPUArray, A::AbstractGPUArray) = transpose_f LinearAlgebra.adjoint!(B::AbstractGPUArray, A::AbstractGPUArray) = transpose_f!(adjoint, B, A) function transpose_f!(f, B::AbstractGPUMatrix{T}, A::AbstractGPUMatrix{T}) where T axes(B,1) == axes(A,2) && axes(B,2) == axes(A,1) || throw(DimensionMismatch(string(f))) - gpu_call(B, A) do ctx, B, A - idx = @cartesianidx A + @kernel function transpose_kernel!(B, A) + idx = @index(Global, Cartesian) @inbounds B[idx[2], idx[1]] = f(A[idx[1], idx[2]]) - return end + transpose_kernel!(backend(B))(B, A, ndrange = size(A)) B end @@ -60,47 +60,47 @@ end ## copy upper triangle to lower and vice versa function LinearAlgebra.copytri!(A::AbstractGPUMatrix{T}, uplo::AbstractChar, conjugate::Bool=false) where T - n = LinearAlgebra.checksquare(A) - if uplo == 'U' && conjugate - gpu_call(A) do ctx, _A - I = @cartesianidx _A - i, j = Tuple(I) - if j > i - _A[j,i] = conj(_A[i,j]) + n = LinearAlgebra.checksquare(A) + if uplo == 'U' && conjugate + @kernel function U_conj!(_A) + I = @index(Global, Cartesian) + i, j = Tuple(I) + if j > i + _A[j,i] = conj(_A[i,j]) + end end - return - end - elseif uplo == 'U' && !conjugate - gpu_call(A) do ctx, _A - I = @cartesianidx _A - i, j = Tuple(I) - if j > i - _A[j,i] = _A[i,j] + U_conj!(backend(_A), ndrange = size(_A)) + elseif uplo == 'U' && !conjugate + @kernel function U_noconj!(_A) + I = @index(Global, Cartesian) + i, j = Tuple(I) + if j > i + _A[j,i] = _A[i,j] + end end - return - end - elseif uplo == 'L' && conjugate - gpu_call(A) do ctx, _A - I = @cartesianidx _A - i, j = Tuple(I) - if j > i - _A[i,j] = conj(_A[j,i]) + U_noconj!(backend(_A))(_A, ndrange=size(_A)) + elseif uplo == 'L' && conjugate + @kernel function L_conj!(_A) + I = @index(Global, Cartesian) + i, j = Tuple(I) + if j > i + _A[i,j] = conj(_A[j,i]) + end end - return - end - elseif uplo == 'L' && !conjugate - gpu_call(A) do ctx, _A - I = @cartesianidx _A - i, j = Tuple(I) - if j > i - _A[i,j] = _A[j,i] + L_conj!(backend(_A))(_A, ndrange = size(_A)) + elseif uplo == 'L' && !conjugate + @kernel function L_noconj!(_A) + I = @index(Global, Cartesian) + i, j = Tuple(I) + if j > i + _A[i,j] = _A[j,i] + end end - return - end - else - throw(ArgumentError("uplo argument must be 'U' (upper) or 'L' (lower), got $uplo")) - end - A + L_noconj!(backend(_A))(_A, ndrange = size(_A)) + else + throw(ArgumentError("uplo argument must be 'U' (upper) or 'L' (lower), got $uplo")) + end + A end @@ -118,26 +118,26 @@ for T in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriang end function LinearAlgebra.tril!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T - gpu_call(A, d; name="tril!") do ctx, _A, _d - I = @cartesianidx _A + @kernel function tril_kernel!(_A, _d) + I = @index(Global, Cartesian) i, j = Tuple(I) if i < j - _d _A[i, j] = 0 end - return end + tril_kernel!(backend(_A))(_A, _d, ndrange = size(_A)) return A end function LinearAlgebra.triu!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T - gpu_call(A, d; name="triu!") do ctx, _A, _d - I = @cartesianidx _A + @kernel function triu_kernel!(_A, _d) + I = @index(Global, Cartesian) i, j = Tuple(I) if j < i + _d _A[i, j] = 0 end - return end + triu_kernel!(backend(_A))(_A, _d, ndrange = length(_A)) return A end @@ -300,8 +300,8 @@ function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::Abstrac return fill!(C, zero(R)) end - gpu_call(C, A, B; name="matmatmul!") do ctx, C, A, B - idx = @linearidx C + @kernel function matmatmul_kernel!(C, A, B) + idx = @index(Global, Linear) i, j = @inbounds Tuple(CartesianIndices(C)[idx])..., 1 @inbounds if i <= size(A,1) && j <= size(B,2) @@ -312,10 +312,8 @@ function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::Abstrac end C[i,j] = Ctmp*a + C[i,j]*b end - - return end - + matmatmul_kernel!(backend(C))(C, A, B, ndrange = size(C)) C end @@ -342,22 +340,22 @@ LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:A function generic_rmul!(X::AbstractArray, s::Number) - gpu_call(X, s; name="rmul!") do ctx, X, s - i = @linearidx X + @kernel function rmul_kernel!(X, s) + i = @index(Global, Linear) @inbounds X[i] *= s - return end + rmul_kernel!(backend(X))(X, s, ndrange = size(X)) return X end LinearAlgebra.rmul!(A::AbstractGPUArray, b::Number) = generic_rmul!(A, b) function generic_lmul!(s::Number, X::AbstractArray) - gpu_call(X, s; name="lmul!") do ctx, X, s - i = @linearidx X + @kernel function lmul_kernel!(X, s) + i = @index(Global, linear) @inbounds X[i] = s*X[i] - return end + lmul_kernel!(backend(X))(X, s, ndrange = size(X)) return X end @@ -392,15 +390,15 @@ function _permutedims!(::Type{IT}, dest::AbstractGPUArray, src::AbstractGPUArray dest_strides = ntuple(k->k==1 ? 1 : prod(i->size(dest, i), 1:k-1), N) dest_strides_perm = ntuple(i->IT(dest_strides[findfirst(==(i), perm)]), N) size_src = IT.(size(src)) - function permutedims_kernel(ctx, dest, src, size_src, dest_strides_perm) - SLI = @linearidx dest + @kernel function permutedims_kernel!(dest, src, size_src, dest_strides_perm) + SLI = @index(Global, Linear) assume(0 < SLI <= typemax(IT)) LI = IT(SLI) dest_index = permute_linearindex(size_src, LI, dest_strides_perm) @inbounds dest[dest_index] = src[LI] - return end - gpu_call(permutedims_kernel, vec(dest), vec(src), size_src, dest_strides_perm) + permutedims_kernel!(backend(dest))(dest, src, size_src, dest_strides_perm, + ndrange = size(dest)) return dest end @@ -477,28 +475,28 @@ end ## rotate function LinearAlgebra.rotate!(x::AbstractGPUArray, y::AbstractGPUArray, c::Number, s::Number) - gpu_call(x, y, c, s; name="rotate!") do ctx, x, y, c, s - i = @linearidx x + @kernel function rotate_kernel!(x, y, c, s) + i = @index(Global, Linear) @inbounds xi = x[i] @inbounds yi = y[i] @inbounds x[i] = c * xi + s * yi @inbounds y[i] = -conj(s) * xi + c * yi - return end + rotate_kernel!(backend(x))(x, y, c, s, ndrange = size(x)) return x, y end ## reflect function LinearAlgebra.reflect!(x::AbstractGPUArray, y::AbstractGPUArray, c::Number, s::Number) - gpu_call(x, y, c, s; name="reflect!") do ctx, x, y, c, s - i = @linearidx x + @kernel function reflect_kernel!(x, y, c, s) + i = @index(Global, Linear) @inbounds xi = x[i] @inbounds yi = y[i] @inbounds x[i] = c * xi + s * yi @inbounds y[i] = conj(s) * xi - c * yi - return end + reflect_kernel!(backend(x))(x, y, c, s, ndrange = size(x)) return x, y end diff --git a/src/host/math.jl b/src/host/math.jl index 8d02c97f..153593fa 100644 --- a/src/host/math.jl +++ b/src/host/math.jl @@ -1,10 +1,10 @@ # Base mathematical operations function Base.clamp!(A::AnyGPUArray, low, high) - gpu_call(A, low, high) do ctx, A, low, high - I = @cartesianidx A + @kernel function clamp_kernel!(A::AnyGPUArray, low, high) + I = @index(Global, Cartesian) A[I] = clamp(A[I], low, high) - return end + clamp_kernel!(backend(A))(A, low, high, ndrange = size(A)) return A end diff --git a/src/host/random.jl b/src/host/random.jl index bd62590f..0e7a80db 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -30,14 +30,14 @@ function next_rand(state::NTuple{4, T}) where {T <: Unsigned} return state, tmp end -function gpu_rand(::Type{T}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T +function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T threadid = GPUArrays.threadidx(ctx) stateful_rand = next_rand(randstate[threadid]) randstate[threadid] = stateful_rand[1] return make_rand_num(T, stateful_rand[2]) end -function gpu_rand(::Type{T}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer +function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer threadid = GPUArrays.threadidx(ctx) result = zero(T) if sizeof(T) >= 4 @@ -55,9 +55,9 @@ end # support for complex numbers -function gpu_rand(::Type{Complex{T}}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T - re = gpu_rand(T, ctx, randstate) - im = gpu_rand(T, ctx, randstate) +function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T + re = gpu_rand(T, threadid, randstate) + im = gpu_rand(T, threadid, randstate) return complex(re, im) end @@ -84,9 +84,9 @@ function Random.seed!(rng::RNG, seed::Vector{UInt32}) end function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number - @kernel rand!(a, randstate) + @kernel function rand!(a, randstate) idx = @index(Global, Linear) - @inbounds a[idx] = gpu_rand(T, ctx, randstates) + @inbounds a[idx] = gpu_rand(T, idx, randstates) end kernel = rand!(backend(A)) kernel(A, rng.state) @@ -96,11 +96,11 @@ end function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number threads = (length(A) - 1) ÷ 2 + 1 length(A) == 0 && return - @kernel randn!(a, randstates) + @kernel function randn!(a, randstates) i = @index(Global, Linear) idx = 2*(i - 1) + 1 - U1 = gpu_rand(T, ctx, randstates) - U2 = gpu_rand(T, ctx, randstates) + U1 = gpu_rand(T, i, randstates) + U2 = gpu_rand(T, i, randstates) Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2) Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2) @inbounds a[idx] = Z0