Summary of this patch (free text before the first "diff --git" header is
ignored by patch tools):

  * .buildkite/pipeline.yml: run the CUDA.jl, oneAPI.jl and Metal.jl steps on
    Julia "1.10" instead of 1.8. The value is quoted so that YAML keeps it as
    the string "1.10" rather than reading it as the number 1.1.
  * src/device/indexing.jl: @linearidx now returns early unless
    1 <= i <= length(x), instead of only checking i > length(x).
  * src/host/broadcast.jl: bring the Base.Broadcast names in with `using`
    instead of `import`; wrap two long copyto! one-liners; _copyto! selects a
    kernel that uses @linearidx when dest is one-dimensional or when both dest
    and the preprocessed broadcast report IndexLinear, and otherwise keeps the
    @cartesianidx kernel; map_kernel's `for` loop becomes an explicit `while`.
  * src/host/math.jl: clamp! indexes with @linearidx instead of @cartesianidx.

NOTE(review): line breaks and indentation were reconstructed from a copy in
which they had been collapsed. Hunk line counts match the @@ headers, but
confirm the exact leading whitespace against the target tree before applying.

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8f7d4f80..b7f4aa82 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -2,7 +2,7 @@ steps:
   - label: "CUDA.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -23,7 +23,7 @@ steps:
   - label: "oneAPI.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -48,7 +48,7 @@ steps:
   - label: "Metal.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
diff --git a/src/device/indexing.jl b/src/device/indexing.jl
index b0b9990d..31084fce 100644
--- a/src/device/indexing.jl
+++ b/src/device/indexing.jl
@@ -64,7 +64,9 @@ macro linearidx(A, grididx=1, ctxsym=:ctx)
     quote
         x = $(esc(A))
         i = linear_index($(esc(ctxsym)), $(esc(grididx)))
-        i > length(x) && return
+        if !(1 <= i <= length(x))
+            return
+        end
         i
     end
 end
diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index d7cec877..93532e5f 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -2,7 +2,7 @@
 
 using Base.Broadcast
 
-import Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
+using Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
 
 # but make sure we don't dispatch to the optimized copy method that directly indexes
 function Broadcast.copy(bc::Broadcasted{<:AbstractGPUArrayStyle{0}})
@@ -32,32 +32,48 @@ end
     return _copyto!(dest, instantiate(Broadcasted{Style}(bc.f, bc.args, axes(dest))))
 end
 
-@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) = _copyto!(dest, bc) # Keep it for ArrayConflict
+@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) =
+    _copyto!(dest, bc) # Keep it for ArrayConflict
 
-@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) = _copyto!(dest, bc)
+@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) =
+    _copyto!(dest, bc)
 
 @inline function _copyto!(dest::AbstractArray, bc::Broadcasted)
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     isempty(dest) && return dest
-    bc′ = Broadcast.preprocess(dest, bc)
-
-    # grid-stride kernel
-    function broadcast_kernel(ctx, dest, bc′, nelem)
-        i = 0
-        while i < nelem
-            i += 1
-            I = @cartesianidx(dest, i)
-            @inbounds dest[I] = bc′[I]
+    bc = Broadcast.preprocess(dest, bc)
+
+    broadcast_kernel = if ndims(dest) == 1 ||
+                          (isa(IndexStyle(dest), IndexLinear) &&
+                           isa(IndexStyle(bc), IndexLinear))
+        function (ctx, dest, bc, nelem)
+            i = 1
+            while i <= nelem
+                I = @linearidx(dest, i)
+                @inbounds dest[I] = bc[I]
+                i += 1
+            end
+            return
+        end
+    else
+        function (ctx, dest, bc, nelem)
+            i = 0
+            while i < nelem
+                i += 1
+                I = @cartesianidx(dest, i)
+                @inbounds dest[I] = bc[I]
+            end
+            return
         end
-        return
     end
+
     elements = length(dest)
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
+    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
+    gpu_call(broadcast_kernel, dest, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest
@@ -101,12 +117,15 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
 
     # grid-stride kernel
     function map_kernel(ctx, dest, bc, nelem)
-        for i in 1:nelem
+        i = 1
+        while i <= nelem
             j = linear_index(ctx, i)
             j > common_length && return
 
             J = CartesianIndices(axes(bc))[j]
             @inbounds dest[j] = bc[J]
+
+            i += 1
         end
         return
     end
diff --git a/src/host/math.jl b/src/host/math.jl
index 8d02c97f..cf455d31 100644
--- a/src/host/math.jl
+++ b/src/host/math.jl
@@ -2,7 +2,7 @@
 
 function Base.clamp!(A::AnyGPUArray, low, high)
     gpu_call(A, low, high) do ctx, A, low, high
-        I = @cartesianidx A
+        I = @linearidx A
         A[I] = clamp(A[I], low, high)
         return
     end