Open
Description
After removing all the KernelAbstractions parts from JuliaGPU/KernelAbstractions.jl#517, the `assume` call is there to remove an exceptional branch.
using AMDGPU
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),
Nothing, Tuple{Bool}, cond)
@inline function assume_nonzero(CI::CartesianIndices)
ntuple(Val(ndims(CI))) do I
@inline
indices = CI.indices[I]
assume(indices.stop > 0)
end
end
Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
# this causes a exception branch and a div
assume_nonzero(blocks)
assume_nonzero(workitems)
expand(blocks, workitems, blocks[groupidx], workitems[idx])
end
@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
nI = ntuple(Val(N)) do I
Base.@_inline_meta
stride = size(workitems, I)
gidx = groupidx.I[I]
(gidx - 1) * stride + idx.I[I]
end
CartesianIndex(nI)
end
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
bI = AMDGPU.blockIdx().x
tI = AMDGPU.threadIdx().x
I = @inbounds expand(blocks, workitems, bI, tI)
if I in ndrange
I = @inbounds expand(blocks, workitems, bI, tI)
(i, j, k) = I.I
sum = zero(eltype(tensor))
for p = 1:Nx + 2
for q = -Ny:Ny
sum += 1.0
end
end
@inbounds tensor[i, j, k] = sum
end
return nothing
end
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))
@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]
Metadata
Metadata
Assignees
Labels
No labels