Skip to content

Miscompilation with nested loops and control flow #672

Open
@vchuravy

Description

@vchuravy

After removing all the KernelAbstractions parts from JuliaGPU/KernelAbstractions.jl#517
The `assume` hint is there to remove an exceptional branch (and the associated integer division) from the index computation.

using AMDGPU

# nx, ny, nz: extents of the output tensor (see `AMDGPU.zeros(Float64, nx, ny, nz)` below).
# Nx, Ny: loop bounds passed to the kernel (Nz is defined but unused in this repro).
nx, ny, nz = 10, 1, 1
Nx, Ny, Nz = 1, 1, 1

"""
    assume(cond::Bool)

Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
"""
    assume(cond::Bool)

Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
# The `Bool` argument arrives in the IR as an `i8`; it is compared against 1
# to recover an `i1` before being fed to the `llvm.assume` intrinsic.
@inline assume(cond::Bool) = Base.llvmcall(("""
        declare void @llvm.assume(i1)

        define void @entry(i8) #0 {
            %cond = icmp eq i8 %0, 1
            call void @llvm.assume(i1 %cond)
            ret void
        }

        attributes #0 = { alwaysinline }""", "entry"),
    Nothing, Tuple{Bool}, cond)

@inline function assume_nonzero(CI::CartesianIndices)
    # Hint the compiler that every axis of `CI` has a positive upper bound,
    # so indexing into it needs no exceptional (empty-range) branch.
    ntuple(Val(ndims(CI))) do dim
        @inline
        axis = CI.indices[dim]
        assume(axis.stop > 0)
    end
end

Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
    # The linear -> Cartesian lookups below would otherwise emit an exception
    # branch plus an integer division; the assumes let the compiler drop both.
    assume_nonzero(blocks)
    assume_nonzero(workitems)
    return expand(blocks, workitems, blocks[groupidx], workitems[idx])
end

@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    # Combine a (group, local) index pair into a global Cartesian index:
    # each dimension contributes (group - 1) * extent + local.
    combined = ntuple(Val(N)) do d
        Base.@_inline_meta
        extent = size(workitems, d)
        (groupidx.I[d] - 1) * extent + idx.I[d]
    end
    return CartesianIndex(combined)
end

# Minimal reproducer kernel: each in-range element should end up as
# (Nx + 2) * (2Ny + 1) * 1.0 — i.e. 9.0 for Nx = Ny = 1 (see the expected
# answer noted at the bottom of the script) — but the value-dependent
# nested loops miscompile on AMDGPU.
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
    bI = AMDGPU.blockIdx().x
    tI = AMDGPU.threadIdx().x

    # NOTE(review): `expand` is evaluated twice (here and inside the branch),
    # mirroring the KernelAbstractions-generated code — presumably part of
    # the reproduction; confirm before simplifying.
    I = @inbounds expand(blocks, workitems, bI, tI)
    if I in ndrange
        I = @inbounds expand(blocks, workitems, bI, tI)
        (i, j, k) = I.I
        # `sum` shadows `Base.sum` inside this kernel; kept as-is for the repro.
        sum = zero(eltype(tensor))
        # Both loop bounds depend on runtime values (Nx, Ny) — this nesting
        # is the control flow that triggers the miscompilation.
        for p = 1:Nx + 2
            for q = -Ny:Ny
                sum += 1.0
            end
        end
        @inbounds tensor[i, j, k] = sum
    end
    return nothing
end

# Output buffer on the device; every element should become 9.0.
tensor = AMDGPU.zeros(Float64, nx, ny, nz)

# A single 1x1x1 block of 10x1x1 work-items covering the full 10x1x1 ndrange.
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))

@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
# expected answer [9.0, 9.0, 9.0, ...]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions