diff --git a/lib/cusparse/device.jl b/lib/cusparse/device.jl
index 45dc480032..43117a5a84 100644
--- a/lib/cusparse/device.jl
+++ b/lib/cusparse/device.jl
@@ -11,9 +11,9 @@ using SparseArrays
 export CuSparseDeviceVector, CuSparseDeviceMatrixCSC, CuSparseDeviceMatrixCSR,
        CuSparseDeviceMatrixBSR, CuSparseDeviceMatrixCOO
 
-struct CuSparseDeviceVector{Tv,Ti, A} <: AbstractSparseVector{Tv,Ti}
-    iPtr::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+struct CuSparseDeviceVector{Tv,Ti,A} <: AbstractSparseVector{Tv,Ti}
+    iPtr::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     len::Int
     nnz::Ti
 end
@@ -23,9 +23,9 @@ Base.size(g::CuSparseDeviceVector) = (g.len,)
 SparseArrays.nnz(g::CuSparseDeviceVector) = g.nnz
 
 struct CuSparseDeviceMatrixCSC{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    colPtr::CuDeviceVector{Ti, A}
-    rowVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    colPtr::CuDeviceVector{Ti,A,Ti}
+    rowVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     nnz::Ti
 end
@@ -35,10 +35,10 @@ Base.size(g::CuSparseDeviceMatrixCSC) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixCSC) = g.nnz
 
 struct CuSparseDeviceMatrixCSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowPtr::CuDeviceVector{Ti, A}
-    colVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
-    dims::NTuple{2, Int}
+    rowPtr::CuDeviceVector{Ti,A,Ti}
+    colVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
+    dims::NTuple{2,Int}
     nnz::Ti
 end
 
@@ -47,9 +47,9 @@ Base.size(g::CuSparseDeviceMatrixCSR) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixCSR) = g.nnz
 
 struct CuSparseDeviceMatrixBSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowPtr::CuDeviceVector{Ti, A}
-    colVal::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    rowPtr::CuDeviceVector{Ti,A,Ti}
+    colVal::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     blockDim::Ti
     dir::Char
@@ -61,9 +61,9 @@ Base.size(g::CuSparseDeviceMatrixBSR) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixBSR) = g.nnz
 
 struct CuSparseDeviceMatrixCOO{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
-    rowInd::CuDeviceVector{Ti, A}
-    colInd::CuDeviceVector{Ti, A}
-    nzVal::CuDeviceVector{Tv, A}
+    rowInd::CuDeviceVector{Ti,A,Ti}
+    colInd::CuDeviceVector{Ti,A,Ti}
+    nzVal::CuDeviceVector{Tv,A,Ti}
     dims::NTuple{2,Int}
     nnz::Ti
 end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 75228290b8..9b9c1d2d1c 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -19,3 +19,14 @@ BroadcastStyle(::CUDA.CuArrayStyle{N, B1},
 # allocation of output arrays
 Base.similar(bc::Broadcasted{CuArrayStyle{N,B}}, ::Type{T}, dims) where {T,N,B} =
     similar(CuArray{T,length(dims),B}, dims)
+
+# Base.Broadcast can't handle Int32 axes
+# XXX: not using a quirk, as constprop/irinterpret is crucial here
+# XXX: 1.11 uses to_index instead of CartesianIndex
+Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::CartesianIndex) = CartesianIndex(_newindex(axes(arg), I.I))
+Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::Integer) = CartesianIndex(_newindex(axes(arg), (I,)))
+Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple) = # XXX: upstream this?
+    (ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...), _newindex(Base.tail(ax), Base.tail(I))...)
+Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple) = ()
+Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple{}) = (ax[1][1], _newindex(Base.tail(ax), ())...)
+Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple{}) = ()
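The `Broadcast.newindex` overloads added above exist because the axes of a device array can now be `Int32`-sized, which Base's generic implementation does not handle; `_newindex` clamps indices along broadcast-singleton dimensions, and `promote` only serves to give both `ifelse` arms a common type. A rough CPU-side sketch of the computation, for illustration only and not part of the patch:

    ax = (Base.OneTo(Int32(1)), Base.OneTo(Int32(4)))  # axes of a hypothetical 1×4 device array
    I  = (3, 2)                                        # destination index chosen by broadcasting
    newI = (ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...),
            ifelse(length(ax[2]) == 1, promote(ax[2][1], I[2])...))
    @assert newI == (1, 2)  # the singleton first dimension is clamped to its only index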
diff --git a/src/device/array.jl b/src/device/array.jl
index a53b83324d..9b89731916 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -6,36 +6,49 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
 ## construction
 
 """
-    CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
+    CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
-pointer, where `N` is determined from the length of `dims` and `T` is determined from the
-type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
-lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
-then it must match the length of `dims`. The same applies to the element type `T`, which
-should match the type of the pointer `ptr`.
+pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
+to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
+stored in the array, and is determined automatically if not specified. `I` is the integer
+type used to store the size of the array, and is determined automatically if not specified.
 """
 CuDeviceArray
 
-# NOTE: we can't support the typical `tuple or series of integer` style construction,
-#       because we're currently requiring a trailing pointer argument.
-
-struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
+struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
-    maxsize::Int
-
-    dims::Dims{N}
-    len::Int
+    maxsize::I
+
+    dims::NTuple{N,I}
+    len::I
+
+    # determine an index type based on the size of the array.
+    # this is type unstable, so only use this constructor from the host side.
+    function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                                  maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
+        if maxsize <= typemax(Int32)
+            CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
+        else
+            CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
+        end
+    end
 
-    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
-    CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
-                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
-        new(ptr, maxsize, dims, prod(dims))
+    # fully typed, for use in device code
+    CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                           maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
+        new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
 end
 
 const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
 const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
 
+# anything that's (secretly) backed by a CuArray
+const AnyCuDeviceArray{T,N} = Union{CuDeviceArray{T,N}, WrappedArray{T,N,CuDeviceArray,CuDeviceArray{T,N}}}
+const AnyCuDeviceVector{T} = AnyCuDeviceArray{T,1}
+const AnyCuDeviceMatrix{T} = AnyCuDeviceArray{T,2}
+const AnyCuDeviceVecOrMat{T} = Union{AnyCuDeviceVector{T}, AnyCuDeviceMatrix{T}}
+
 
 ## array interface
 
@@ -224,18 +237,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
     end
 end
 
-function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
+function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
   err = GPUArrays._reinterpret_exception(T, a)
   err === nothing || throw(err)
 
   if sizeof(T) == sizeof(S) # fast case
-    return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
+    return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
   end
 
   isize = size(a)
   size1 = div(isize[1]*sizeof(S), sizeof(T))
   osize = tuple(size1, Base.tail(isize)...)
-  return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
+  return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
 end
 
 
@@ -252,7 +265,7 @@ function Base.reshape(a::CuDeviceArray{T,M,A}, dims::NTuple{N,Int}) where {T,N,M
 end
 
 # create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
-@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A}, ::Type{T},
-                                osize::Dims{N}) where {T, N, A}
-  return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
+@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A,I}, ::Type{T},
+                                osize::Dims{N}) where {T, N, A, I}
+  return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
 end
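As a point of reference, here is a rough host-side sketch of how the untyped outer constructor above picks the index type; this is illustrative only, not part of the patch, and assumes `ptr` is an `LLVMPtr{Float32,AS.Global}` obtained elsewhere:

    small = CuDeviceArray{Float32,1,AS.Global}(ptr, (1024,))  # 4 KiB <= typemax(Int32)
    # typeof(small) == CuDeviceArray{Float32,1,AS.Global,Int32}: 32-bit dims/len/maxsize
    large = CuDeviceArray{Float32,1,AS.Global}(ptr, (2^30,))  # 4 GiB of Float32 > typemax(Int32)
    # typeof(large) == CuDeviceArray{Float32,1,AS.Global,Int64}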
diff --git a/src/device/intrinsics/shared_memory.jl b/src/device/intrinsics/shared_memory.jl
index 17b204b5d0..97afe4ebf8 100644
--- a/src/device/intrinsics/shared_memory.jl
+++ b/src/device/intrinsics/shared_memory.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
     # NOTE: this relies on const-prop to forward the literal length to the generator.
     # maybe we should include the size in the type, like StaticArrays does?
     ptr = emit_shmem(T, Val(len))
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))
 
@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
         end
     end
     ptr = emit_shmem(T) + offset
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
     CuDynamicSharedArray(T, (len,), offset)
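Shared memory per block is far below 4 GB on current hardware, so the shared-memory constructors above can hard-code `Int32` indices without changing the user-facing API. A minimal kernel sketch, not part of the patch, showing that usage stays the same:

    function reverse_block(a)
        buf = CuStaticSharedArray(Float32, 256)  # now a CuDeviceArray{Float32,1,AS.Shared,Int32}
        i = threadIdx().x
        @inbounds buf[i] = a[i]
        sync_threads()
        @inbounds a[i] = buf[257 - i]
        return
    end
    # @cuda threads=256 reverse_block(CUDA.rand(Float32, 256))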
diff --git a/src/device/random.jl b/src/device/random.jl
index ab3589d6e1..bdd30dd5fb 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -22,7 +22,7 @@ import RandomNumbers
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # array with per-warp counters, incremented when generating numbers
@@ -36,7 +36,7 @@ end
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # initialization function, called automatically at the start of each kernel because
@@ -204,7 +204,7 @@ end
 for var in [:ki, :wi, :fi, :ke, :we, :fe]
     val = getfield(Random, var)
     gpu_var = Symbol("gpu_$var")
-    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
+    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
     @eval @inline @generated function $gpu_var()
         ptr = emit_constant_array($(QuoteNode(var)), $val)
         Expr(:call, $arr_typ, ptr, $(size(val)))
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 9ca772d0f8..7f63f630d6 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -153,7 +153,7 @@ end
         return
     end
 
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @test !occursin(".local", asm)
 end
diff --git a/test/core/device/intrinsics/cooperative_groups.jl b/test/core/device/intrinsics/cooperative_groups.jl
index f8e161c67f..640ee43e84 100644
--- a/test/core/device/intrinsics/cooperative_groups.jl
+++ b/test/core/device/intrinsics/cooperative_groups.jl
@@ -45,7 +45,7 @@ if capability(device()) >= v"6.0" && attribute(device(), CUDA.DEVICE_ATTRIBUTE_C
     # (the occupancy API could be used to calculate how many blocks can fit per SM,
     #  but that doesn't matter for the tests, so we assume a single block per SM.)
     maxBlocks = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
-    kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global}})
+    kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global,Int}})
    maxThreads = CUDA.maxthreads(kernel)
 
     a = rand(Float32, maxBlocks, maxThreads)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index d9f868a132..176f72e021 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
             @inbounds b[], c[] = @fastmath sincos(a[])
             return
         end
-        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
         @assert contains(asm, "sin.approx.f32")
         @assert contains(asm, "cos.approx.f32")
         @assert !contains(asm, "__nv") # from libdevice
diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl
index cc7db1c0bb..97c177fd07 100644
--- a/test/core/device/intrinsics/wmma.jl
+++ b/test/core/device/intrinsics/wmma.jl
@@ -342,7 +342,7 @@ end
            return
         end
 
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
+        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))
 
        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
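The tests above spell out the new index parameter by hand. Where that gets brittle, the device-side type can instead be derived from a host array; a sketch, assuming `cudaconvert` continues to return the exact `CuDeviceArray` counterpart of a `CuArray` (with the index width chosen by this patch):

    a = CUDA.zeros(Float32, 1024)
    devT = typeof(cudaconvert(a))  # CuDeviceArray{Float32,1,AS.Global,Int32} for an array this small
    # asm = sprint(io->CUDA.code_ptx(io, kernel, Tuple{devT,devT}))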