Skip to content

Miscompilation with nested loops and control flow #672

Open
@vchuravy

Description

@vchuravy

After removing all the KernelAbstractions parts from JuliaGPU/KernelAbstractions.jl#517
The `assume` hint is there to remove an exceptional branch (and the associated integer division) from the index computation.

using AMDGPU

# nx, ny, nz: extents of the output tensor (see `AMDGPU.zeros(Float64, nx, ny, nz)` below).
# Nx, Ny: loop bounds passed to the kernel (Nz is defined but unused in this repro).
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1

"""
    assume(cond::Bool)

Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
"""
    assume(cond::Bool)

Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
# The `Bool` argument arrives in the IR as an `i8`; it is compared against 1
# to recover an `i1` before being fed to the `llvm.assume` intrinsic.
@inline assume(cond::Bool) = Base.llvmcall(("""
        declare void @llvm.assume(i1)

        define void @entry(i8) #0 {
            %cond = icmp eq i8 %0, 1
            call void @llvm.assume(i1 %cond)
            ret void
        }

        attributes #0 = { alwaysinline }""", "entry"),
    Nothing, Tuple{Bool}, cond)

@inline function assume_nonzero(CI::CartesianIndices)
    # Hint the compiler that every axis of `CI` has a positive upper bound,
    # so indexing into it needs no exceptional (empty-range) branch.
    ntuple(Val(ndims(CI))) do dim
        @inline
        axis = CI.indices[dim]
        assume(axis.stop > 0)
    end
end

Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
    # The linear -> Cartesian lookups below would otherwise emit an exception
    # branch plus an integer division; the assumes let the compiler drop both.
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    return expand(blocks, workitems, blocks[groupidx], workitems[idx])
end

@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    # Combine a (group, local) index pair into a global Cartesian index:
    # each dimension contributes (group - 1) * extent + local.
    combined = ntuple(Val(N)) do d
        Base.@_inline_meta
        extent = size(workitems, d)
        (groupidx.I[d] - 1) * extent + idx.I[d]
    end
    return CartesianIndex(combined)
end

# Minimal reproducer kernel: each in-range element should end up as
# (Nx + 2) * (2Ny + 1) * 1.0 — i.e. 9.0 for Nx = Ny = 1 (see the expected
# answer noted at the bottom of the script) — but the value-dependent
# nested loops miscompile on AMDGPU.
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x

    # NOTE(review): `expand` is evaluated twice (here and inside the branch),
    # mirroring the KernelAbstractions-generated code — presumably part of
    # the reproduction; confirm before simplifying.
    I = @inbounds expand(blocks, workitems, bI, tI)
    if I in ndrange
        I = @inbounds expand(blocks, workitems, bI, tI)
        (i, j, k) = I.I
        # `sum` shadows `Base.sum` inside this kernel; kept as-is for the repro.
        sum = zero(eltype(tensor))
        # Both loop bounds depend on runtime values (Nx, Ny) — this nesting
        # is the control flow that triggers the miscompilation.
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[i, j, k] = sum
    end
    return nothing
end

# Output buffer on the device; every element should become 9.0.
tensor = AMDGPU.zeros(Float64, nx, ny, nz)

# A single 1x1x1 block of 10x1x1 work-items covering the full 10x1x1 ndrange.
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))

@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions