diff --git a/lib/cusparse/device.jl b/lib/cusparse/device.jl
index 45dc480032..43117a5a84 100644
--- a/lib/cusparse/device.jl
+++ b/lib/cusparse/device.jl
@@ -11,9 +11,9 @@ using SparseArrays
 export CuSparseDeviceVector, CuSparseDeviceMatrixCSC, CuSparseDeviceMatrixCSR,
        CuSparseDeviceMatrixBSR, CuSparseDeviceMatrixCOO
 
-struct CuSparseDeviceVector{Tv,Ti, A} <: AbstractSparseVector{Tv,Ti}
-    iPtr::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+struct CuSparseDeviceVector{Tv,Ti,A} <: AbstractSparseVector{Tv,Ti}
+    iPtr::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     len::Int
     nnz::Ti
 end
@@ -23,9 +23,9 @@ Base.size(g::CuSparseDeviceVector) = (g.len,)
 SparseArrays.nnz(g::CuSparseDeviceVector) = g.nnz
 
 struct CuSparseDeviceMatrixCSC{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    colPtr::CuDeviceVector{Ti, A}
-    rowVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    colPtr::CuDeviceVector{Ti,A,Ti}
+    rowVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     nnz::Ti
 end
@@ -35,10 +35,10 @@ Base.size(g::CuSparseDeviceMatrixCSC) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixCSC) = g.nnz
 
 struct CuSparseDeviceMatrixCSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowPtr::CuDeviceVector{Ti, A}
-    colVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
-    dims::NTuple{2, Int}
+    rowPtr::CuDeviceVector{Ti,A,Ti}
+    colVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
+    dims::NTuple{2,Int}
     nnz::Ti
 end
 
@@ -47,9 +47,9 @@ Base.size(g::CuSparseDeviceMatrixCSR) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixCSR) = g.nnz
 
 struct CuSparseDeviceMatrixBSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowPtr::CuDeviceVector{Ti, A}
-    colVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    rowPtr::CuDeviceVector{Ti,A,Ti}
+    colVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     blockDim::Ti
     dir::Char
@@ -61,9 +61,9 @@ Base.size(g::CuSparseDeviceMatrixBSR) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixBSR) = g.nnz
 
 struct CuSparseDeviceMatrixCOO{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowInd::CuDeviceVector{Ti, A}
-    colInd::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    rowInd::CuDeviceVector{Ti,A,Ti}
+    colInd::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     nnz::Ti
 end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 75228290b8..9b9c1d2d1c 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -19,3 +19,14 @@ BroadcastStyle(::CUDA.CuArrayStyle{N, B1},
 # allocation of output arrays
 Base.similar(bc::Broadcasted{CuArrayStyle{N,B}}, ::Type{T}, dims) where {T,N,B} =
     similar(CuArray{T,length(dims),B}, dims)
+
+# Base.Broadcast can't handle Int32 axes
+# XXX: not using a quirk, as constprop/irinterpret is crucial here
+# XXX: 1.11 uses to_index instead of CartesianIndex
+Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::CartesianIndex) = CartesianIndex(_newindex(axes(arg), I.I))
+Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::Integer) = CartesianIndex(_newindex(axes(arg), (I,)))
+Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple) = # XXX: upstream this?
+    (ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...), _newindex(Base.tail(ax), Base.tail(I))...)
+Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple) = ()
+Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple{}) = (ax[1][1], _newindex(Base.tail(ax), ())...)
+Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple{}) = ()
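The `Broadcast.newindex` overloads added above exist because the axes of a device array can now be `Int32`-sized, which Base's generic implementation does not handle; `_newindex` clamps indices along broadcast-singleton dimensions, and `promote` only serves to give both `ifelse` arms a common type. A rough CPU-side sketch of the computation, for illustration only and not part of the patch:

    ax = (Base.OneTo(Int32(1)), Base.OneTo(Int32(4)))  # axes of a hypothetical 1×4 device array
    I  = (3, 2)                                        # destination index chosen by broadcasting
    newI = (ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...),
            ifelse(length(ax[2]) == 1, promote(ax[2][1], I[2])...))
    @assert newI == (1, 2)  # the singleton first dimension is clamped to its only index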
diff --git a/src/device/array.jl b/src/device/array.jl
index a53b83324d..9b89731916 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -6,36 +6,49 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
 ## construction
 
 """
-    CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
+    CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
-pointer, where `N` is determined from the length of `dims` and `T` is determined from the
-type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
-lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
-then it must match the length of `dims`. The same applies to the element type `T`, which
-should match the type of the pointer `ptr`.
+pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
+to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
+stored in the array, and is determined automatically if not specified. `I` is the integer
+type used to store the size of the array, and is determined automatically if not specified.
 """
 CuDeviceArray
 
-# NOTE: we can't support the typical `tuple or series of integer` style construction,
-#       because we're currently requiring a trailing pointer argument.
-
-struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
+struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
-    maxsize::Int
-
-    dims::Dims{N}
-    len::Int
+    maxsize::I
+
+    dims::NTuple{N,I}
+    len::I
+
+    # determine an index type based on the size of the array.
+    # this is type unstable, so only use this constructor from the host side.
+    function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                                  maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
+        if maxsize <= typemax(Int32)
+            CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
+        else
+            CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
+        end
+    end
 
-    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
-    CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
-                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
-        new(ptr, maxsize, dims, prod(dims))
+    # fully typed, for use in device code
+    CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                           maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
+        new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
 end
 
 const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
 const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
 
+# anything that's (secretly) backed by a CuArray
+const AnyCuDeviceArray{T,N} = Union{CuDeviceArray{T,N}, WrappedArray{T,N,CuDeviceArray,CuDeviceArray{T,N}}}
+const AnyCuDeviceVector{T} = AnyCuDeviceArray{T,1}
+const AnyCuDeviceMatrix{T} = AnyCuDeviceArray{T,2}
+const AnyCuDeviceVecOrMat{T} = Union{AnyCuDeviceVector{T}, AnyCuDeviceMatrix{T}}
+
 
 ## array interface
 
@@ -224,18 +237,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
     end
 end
 
-function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
+function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
   err = GPUArrays._reinterpret_exception(T, a)
   err === nothing || throw(err)
 
   if sizeof(T) == sizeof(S) # fast case
-    return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
+    return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
   end
 
   isize = size(a)
   size1 = div(isize[1]*sizeof(S), sizeof(T))
   osize = tuple(size1, Base.tail(isize)...)
-  return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
+  return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
 end
 
 
@@ -252,7 +265,7 @@ function Base.reshape(a::CuDeviceArray{T,M,A}, dims::NTuple{N,Int}) where {T,N,M
 end
 
 # create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
-@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A}, ::Type{T},
-                                osize::Dims{N}) where {T, N, A}
-  return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
+@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A,I}, ::Type{T},
+                                osize::Dims{N}) where {T, N, A, I}
+  return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
 end
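As a point of reference, here is a rough host-side sketch of how the untyped outer constructor above picks the index type; this is illustrative only, not part of the patch, and assumes `ptr` is an `LLVMPtr{Float32,AS.Global}` obtained elsewhere:

    small = CuDeviceArray{Float32,1,AS.Global}(ptr, (1024,))  # 4 KiB <= typemax(Int32)
    # typeof(small) == CuDeviceArray{Float32,1,AS.Global,Int32}: 32-bit dims/len/maxsize
    large = CuDeviceArray{Float32,1,AS.Global}(ptr, (2^30,))  # 4 GiB of Float32 > typemax(Int32)
    # typeof(large) == CuDeviceArray{Float32,1,AS.Global,Int64}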
diff --git a/src/device/intrinsics/shared_memory.jl b/src/device/intrinsics/shared_memory.jl
index 17b204b5d0..97afe4ebf8 100644
--- a/src/device/intrinsics/shared_memory.jl
+++ b/src/device/intrinsics/shared_memory.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
     # NOTE: this relies on const-prop to forward the literal length to the generator.
     # maybe we should include the size in the type, like StaticArrays does?
     ptr = emit_shmem(T, Val(len))
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))
 
@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
         end
     end
     ptr = emit_shmem(T) + offset
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
     CuDynamicSharedArray(T, (len,), offset)
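Shared memory per block is far below 4 GB on current hardware, so the shared-memory constructors above can hard-code `Int32` indices without changing the user-facing API. A minimal kernel sketch, not part of the patch, showing that usage stays the same:

    function reverse_block(a)
        buf = CuStaticSharedArray(Float32, 256)  # now a CuDeviceArray{Float32,1,AS.Shared,Int32}
        i = threadIdx().x
        @inbounds buf[i] = a[i]
        sync_threads()
        @inbounds a[i] = buf[257 - i]
        return
    end
    # @cuda threads=256 reverse_block(CUDA.rand(Float32, 256))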
diff --git a/src/device/random.jl b/src/device/random.jl
index ab3589d6e1..bdd30dd5fb 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -22,7 +22,7 @@ import RandomNumbers
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # array with per-warp counters, incremented when generating numbers
@@ -36,7 +36,7 @@ end
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # initialization function, called automatically at the start of each kernel because
@@ -204,7 +204,7 @@ end
 for var in [:ki, :wi, :fi, :ke, :we, :fe]
     val = getfield(Random, var)
     gpu_var = Symbol("gpu_$var")
-    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
+    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
     @eval @inline @generated function $gpu_var()
         ptr = emit_constant_array($(QuoteNode(var)), $val)
         Expr(:call, $arr_typ, ptr, $(size(val)))
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 9ca772d0f8..7f63f630d6 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -153,7 +153,7 @@ end
         return
     end
 
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @test !occursin(".local", asm)
 end
diff --git a/test/core/device/intrinsics/cooperative_groups.jl b/test/core/device/intrinsics/cooperative_groups.jl
index f8e161c67f..640ee43e84 100644
--- a/test/core/device/intrinsics/cooperative_groups.jl
+++ b/test/core/device/intrinsics/cooperative_groups.jl
@@ -45,7 +45,7 @@ if capability(device()) >= v"6.0" && attribute(device(), CUDA.DEVICE_ATTRIBUTE_C
     # (the occupancy API could be used to calculate how many blocks can fit per SM,
     #  but that doesn't matter for the tests, so we assume a single block per SM.)
     maxBlocks = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
-    kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global}})
+    kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global,Int}})
    maxThreads = CUDA.maxthreads(kernel)
 
     a = rand(Float32, maxBlocks, maxThreads)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index d9f868a132..176f72e021 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
             @inbounds b[], c[] = @fastmath sincos(a[])
             return
         end
-        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
         @assert contains(asm, "sin.approx.f32")
         @assert contains(asm, "cos.approx.f32")
         @assert !contains(asm, "__nv") # from libdevice
diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl
index cc7db1c0bb..97c177fd07 100644
--- a/test/core/device/intrinsics/wmma.jl
+++ b/test/core/device/intrinsics/wmma.jl
@@ -342,7 +342,7 @@ end
            return
         end
 
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
+        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))
 
        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
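The tests above spell out the new index parameter by hand. Where that gets brittle, the device-side type can instead be derived from a host array; a sketch, assuming `cudaconvert` continues to return the exact `CuDeviceArray` counterpart of a `CuArray` (with the index width chosen by this patch):

    a = CUDA.zeros(Float32, 1024)
    devT = typeof(cudaconvert(a))  # CuDeviceArray{Float32,1,AS.Global,Int32} for an array this small
    # asm = sprint(io->CUDA.code_ptx(io, kernel, Tuple{devT,devT}))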