diff --git a/docs/src/interface.md b/docs/src/interface.md
index 01c0a3c9..44f62405 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -13,8 +13,6 @@ all, you need to provide a type that represents your execution back-end and a wa
 kernels:
 
 ```@docs
-GPUArrays.AbstractGPUBackend
-GPUArrays.AbstractKernelContext
 GPUArrays.gpu_call
 GPUArrays.thread_block_heuristic
 ```
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 3f80f90a..ccfe1f8a 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -18,9 +18,9 @@ using Reexport
 include("device/execution.jl")
 ## executed on-device
 include("device/abstractarray.jl")
-include("device/indexing.jl")
+#include("device/indexing.jl")
 include("device/memory.jl")
-include("device/synchronization.jl")
+#include("device/synchronization.jl")
 
 using KernelAbstractions
 # host abstractions
@@ -28,7 +28,7 @@ include("host/abstractarray.jl")
 include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
-include("host/indexing.jl")
+#include("host/indexing.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")
diff --git a/src/device/indexing.jl b/src/device/indexing.jl
deleted file mode 100644
index b0b9990d..00000000
--- a/src/device/indexing.jl
+++ /dev/null
@@ -1,83 +0,0 @@
-# indexing
-
-export global_index, global_size, linear_index, @linearidx, @cartesianidx
-
-
-## hardware
-
-for f in (:blockidx, :blockdim, :threadidx, :griddim)
-    @eval $f(ctx::AbstractKernelContext)::Int = error("Not implemented") # COV_EXCL_LINE
-    @eval export $f
-end
-
-"""
-    global_index(ctx::AbstractKernelContext)
-
-Query the global index of the current thread in the launch configuration (i.e. as far as the
-hardware is concerned).
-"""
-@inline function global_index(ctx::AbstractKernelContext)
-    threadidx(ctx) + (blockidx(ctx) - 1) * blockdim(ctx)
-end
-
-"""
-    global_size(ctx::AbstractKernelContext)
-
-Query the global size of the launch configuration (total number of threads launched).
-"""
-@inline function global_size(ctx::AbstractKernelContext)
-    griddim(ctx) * blockdim(ctx)
-end
-
-
-## logical
-
-"""
-    linear_index(ctx::AbstractKernelContext, grididx::Int=1)
-
-Return a linear index for the current kernel by querying hardware registers (similar to
-`get_global_id` in OpenCL). For applying a grid stride (in terms of [`global_size`](@ref)),
-specify `grididx`.
-
-"""
-@inline function linear_index(ctx::AbstractKernelContext, grididx::Int=1)
-    global_index(ctx) + (grididx - 1) * global_size(ctx)
-end
-
-"""
-    linearidx(A, grididx=1, ctxsym=:ctx)
-
-Macro form of [`linear_index`](@ref), which return from the surrouunding scope when out of
-bounds:
-
-    ```julia
-    function kernel(ctx::AbstractKernelContext, A)
-        idx = @linearidx A
-        # from here on it's safe to index into A with idx
-        @inbounds begin
-            A[idx] = ...
-        end
-    end
-    ```
-"""
-macro linearidx(A, grididx=1, ctxsym=:ctx)
-    quote
-        x = $(esc(A))
-        i = linear_index($(esc(ctxsym)), $(esc(grididx)))
-        i > length(x) && return
-        i
-    end
-end
-
-"""
-    cartesianidx(A, grididx=1, ctxsym=:ctx)
-
-Like [`@linearidx`](@ref), but returns a N-dimensional `CartesianIndex`.
-"""
-macro cartesianidx(A, grididx=1, ctxsym=:ctx)
-    quote
-        x = $(esc(A))
-        i = @linearidx(x, $(esc(grididx)), $(esc(ctxsym)))
-        @inbounds CartesianIndices(x)[i]
-    end
-end
diff --git a/src/device/synchronization.jl b/src/device/synchronization.jl
deleted file mode 100644
index b16d2518..00000000
--- a/src/device/synchronization.jl
+++ /dev/null
@@ -1,13 +0,0 @@
-# synchronization
-
-export synchronize_threads
-
-"""
-     synchronize_threads(ctx::AbstractKernelContext)
-
-in CUDA terms `__synchronize`
-in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
-"""
-function synchronize_threads(ctx::AbstractKernelContext)
-    error("Not implemented") # COV_EXCL_LINE
-end
diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl
index 05e0e8d6..6387f006 100644
--- a/src/host/abstractarray.jl
+++ b/src/host/abstractarray.jl
@@ -147,7 +147,7 @@ end
 
 ## generalized blocks of heterogeneous memory
 
-@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets)
+@kernel function cartesian_copy_kernel!(dest, dest_offsets, src, src_offsets)
     I = @index(Global, Cartesian)
     @inbounds dest[I + dest_offsets] = src[I + src_offsets]
 end
diff --git a/src/host/base.jl b/src/host/base.jl
index 96fe3fd4..0feeb30e 100644
--- a/src/host/base.jl
+++ b/src/host/base.jl
@@ -4,8 +4,7 @@ import Base: _RepeatInnerOuter
 # Handle `out = repeat(x; inner)` by parallelizing over `out` array This can benchmark
 # faster if repeating elements along the first axis (i.e. `inner=(n, ones...)`), as data
 # access can be contiguous on write.
-function repeat_inner_dst_kernel!(
-    ctx::AbstractKernelContext,
+@kernel function repeat_inner_dst_kernel!(
     xs::AbstractArray{<:Any, N},
     inner::NTuple{N, Int},
     out::AbstractArray{<:Any, N}
@@ -13,13 +12,12 @@ function repeat_inner_dst_kernel!(
     # Get the "stride" index in each dimension, where the size
     # of the stride is given by `inner`. The stride-index (sdx) then
     # corresponds to the index of the repeated value in `xs`.
-    odx = @cartesianidx out
+    odx = @index(Global, Cartesian)
     dest_inds = odx.I
     sdx = ntuple(N) do i
         @inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
     end
     @inbounds out[odx] = xs[CartesianIndex(sdx)]
-    return nothing
 end
 
 # Handle `out = repeat(x; inner)` by parallelizing over the `xs` array This tends to
@@ -90,8 +88,6 @@ end
         end
         @inbounds out[CartesianIndex(odx)] = val
     end
-
-    return nothing
 end
 
 function repeat_outer(xs::AnyGPUArray, outer)
diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index 9a44fb83..0e2f5563 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -51,23 +51,22 @@ end
     bc′ = Broadcast.preprocess(dest, bc)
 
     # grid-stride kernel
-    function broadcast_kernel(ctx, dest, bc′, nelem)
+    @kernel function broadcast_kernel(dest, bc′, nelem)
         i = 0
+        I = @index(Global, Linear)
         while i < nelem
             i += 1
-            I = @cartesianidx(dest, i)
-            @inbounds dest[I] = bc′[I]
+            idx = CartesianIndices(dest)[(I-1)*nelem + i]
+            @inbounds dest[idx] = bc′[idx]
         end
-        return
     end
     elements = length(dest)
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
-                                 elements, elements_per_thread)
-    config = launch_configuration(backend(dest), heuristic;
-                                  elements, elements_per_thread)
-    gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
-             threads=config.threads, blocks=config.blocks)
+    # TODO: figure out actual arguments, 3 should be workgroupsize
+    config = KernelAbstractions.launch_config(broadcast_kernel, elements,
+                                              elements/elements_per_thread)
+    kernel! = broadcast_kernel(get_backend(dest), config.threads)
+    kernel!(dest, bc', nelem, ndrange = config.ndrange)
 
     return dest
 end
diff --git a/src/host/construction.jl b/src/host/construction.jl
index 482d50e9..39d3aecf 100644
--- a/src/host/construction.jl
+++ b/src/host/construction.jl
@@ -11,8 +11,8 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T
 
 function Base.fill!(A::AnyGPUArray{T}, x) where T
     length(A) == 0 && return A
-    @kernel fill!(a, val)
-        idx = @index(Linear, Global)
+    @kernel function fill!(a, val)
+        idx = @index(Global, Linear)
         @inbounds a[idx] = val
     end
     kernel = fill!(backend(A))
@@ -23,11 +23,12 @@ end
 
 ## identity matrices
 
-@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
+@kernel function identity_kernel(res::AbstractArray{T}, stride, val) where T
     i = @index(Global, Linear)
     ilin = (stride * (i - 1)) + i
-    ilin > length(res) && return
-    @inbounds res[ilin] = val
+    if ilin < length(res)
+        @inbounds res[ilin] = val
+    end
 end
 
 function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
diff --git a/src/host/indexing.jl b/src/host/indexing.jl
deleted file mode 100644
index 34fc4084..00000000
--- a/src/host/indexing.jl
+++ /dev/null
@@ -1,182 +0,0 @@
-# host-level indexing
-
-
-# basic indexing with integers
-
-Base.IndexStyle(::Type{<:AbstractGPUArray}) = Base.IndexLinear()
-
-function Base.getindex(xs::AbstractGPUArray{T}, I::Integer...) where T
-    assertscalar("getindex")
-    i = Base._to_linear_index(xs, I...)
-    x = Array{T}(undef, 1)
-    copyto!(x, 1, xs, i, 1)
-    return x[1]
-end
-
-function Base.setindex!(xs::AbstractGPUArray{T}, v::T, I::Integer...) where T
-    assertscalar("setindex!")
-    i = Base._to_linear_index(xs, I...)
-    x = T[v]
-    copyto!(xs, i, x, 1, 1)
-    return xs
-end
-
-Base.setindex!(xs::AbstractGPUArray, v, I::Integer...) =
-    setindex!(xs, convert(eltype(xs), v), I...)
-
-
-# basic indexing with cartesian indices
-
-Base.@propagate_inbounds Base.getindex(A::AbstractGPUArray, I::Union{Integer, CartesianIndex}...) =
-    A[Base.to_indices(A, I)...]
-Base.@propagate_inbounds Base.setindex!(A::AbstractGPUArray, v, I::Union{Integer, CartesianIndex}...) =
-    (A[Base.to_indices(A, I)...] = v; A)
-
-
-# generalized multidimensional indexing
-
-Base.getindex(A::AbstractGPUArray, I...) = _getindex(A, to_indices(A, I)...)
-
-function _getindex(src::AbstractGPUArray, Is...)
-    shape = Base.index_shape(Is...)
-    dest = similar(src, shape)
-    any(isempty, Is) && return dest # indexing with empty array
-    idims = map(length, Is)
-
-    AT = typeof(src).name.wrapper
-    # NOTE: we are pretty liberal here supporting non-GPU indices...
-    gpu_call(getindex_kernel, dest, src, idims, adapt(AT, Is)...)
-    return dest
-end
-
-@generated function getindex_kernel(ctx::AbstractKernelContext, dest, src, idims,
-                                    Is::Vararg{Any,N}) where {N}
-    quote
-        i = @linearidx dest
-        is = @inbounds CartesianIndices(idims)[i]
-        @nexprs $N i -> I_i = @inbounds(Is[i][is[i]])
-        val = @ncall $N getindex src i -> I_i
-        @inbounds dest[i] = val
-        return
-    end
-end
-
-Base.setindex!(A::AbstractGPUArray, v, I...) = _setindex!(A, v, to_indices(A, I)...)
-
-function _setindex!(dest::AbstractGPUArray, src, Is...)
-    isempty(Is) && return dest
-    idims = length.(Is)
-    len = prod(idims)
-    len==0 && return dest
-
-    AT = typeof(dest).name.wrapper
-    # NOTE: we are pretty liberal here supporting non-GPU sources and indices...
-    gpu_call(setindex_kernel, dest, adapt(AT, src), idims, len, adapt(AT, Is)...;
-             elements=len)
-    return dest
-end
-
-@generated function setindex_kernel(ctx::AbstractKernelContext, dest, src, idims, len,
-                                    Is::Vararg{Any,N}) where {N}
-    quote
-        i = linear_index(ctx)
-        i > len && return
-        is = @inbounds CartesianIndices(idims)[i]
-        @nexprs $N i -> I_i = @inbounds(Is[i][is[i]])
-        @ncall $N setindex! dest src[i] i -> I_i
-        return
-    end
-end
-
-
-## find*
-
-# simple array type that returns the index used to access an element, while
-# retaining the dimensionality of the original array. this can be used to
-# broadcast or reduce an array together with its indices, whereas normally
-# combining e.g. a 2x2 array with its 4-element eachindex array would result
-# in a 4x4 broadcast or reduction.
-struct EachIndex{T,N,IS} <: AbstractArray{T,N}
-    dims::NTuple{N,Int}
-    indices::IS
-end
-EachIndex(xs::AbstractArray) =
-    EachIndex{typeof(firstindex(xs)), ndims(xs), typeof(eachindex(xs))}(
-              size(xs), eachindex(xs))
-Base.size(ei::EachIndex) = ei.dims
-Base.getindex(ei::EachIndex, i::Int) = ei.indices[i]
-Base.IndexStyle(::Type{<:EachIndex}) = Base.IndexLinear()
-
-function Base.findfirst(f::Function, xs::AnyGPUArray)
-    indices = EachIndex(xs)
-    dummy_index = first(indices)
-
-    # given two pairs of (istrue, index), return the one with the smallest index
-    function reduction(t1, t2)
-        (x, i), (y, j) = t1, t2
-        if i > j
-            t1, t2 = t2, t1
-            (x, i), (y, j) = t1, t2
-        end
-        x && return t1
-        y && return t2
-        return (false, dummy_index)
-    end
-
-    res = mapreduce((x, y)->(f(x), y), reduction, xs, indices;
-                    init = (false, dummy_index))
-    if res[1]
-        # out of consistency with Base.findarray, return a CartesianIndex
-        # when the input is a multidimensional array
-        ndims(xs) == 1 && return res[2]
-        return CartesianIndices(xs)[res[2]]
-    else
-        return nothing
-    end
-end
-
-Base.findfirst(xs::AnyGPUArray{Bool}) = findfirst(identity, xs)
-
-function findminmax(binop, xs::AnyGPUArray; init, dims)
-    indices = EachIndex(xs)
-    dummy_index = firstindex(xs)
-
-    function reduction(t1, t2)
-        (x, i), (y, j) = t1, t2
-
-        binop(x, y) && return t2
-        x == y && return (x, min(i, j))
-        return t1
-    end
-
-    @static if VERSION < v"1.7.0-DEV.119"
-    # before JuliaLang/julia#35316, isless/isgreated did not order NaNs last
-    function reduction(t1::Tuple{<:AbstractFloat,<:Any}, t2::Tuple{<:AbstractFloat,<:Any})
-        (x, i), (y, j) = t1, t2
-
-        isnan(x) && return t1
-        isnan(y) && return t2
-
-        binop(x, y) && return t2
-        x == y && return (x, min(i, j))
-        return t1
-    end
-    end
-
-    if dims == Colon()
-        res = mapreduce(tuple, reduction, xs, indices; init = (init, dummy_index))
-
-        # out of consistency with Base.findarray, return a CartesianIndex
-        # when the input is a multidimensional array
-        return (res[1], ndims(xs) == 1 ? res[2] : CartesianIndices(xs)[res[2]])
-    else
-        res = mapreduce(tuple, reduction, xs, indices;
-                        init = (init, dummy_index), dims=dims)
-        vals = map(x->x[1], res)
-        inds = map(x->ndims(xs) == 1 ? x[2] : CartesianIndices(xs)[x[2]], res)
-        return (vals, inds)
-    end
-end
-
-Base.findmax(a::AnyGPUArray; dims=:) = findminmax(Base.isless, a; init=typemin(eltype(a)), dims)
-Base.findmin(a::AnyGPUArray; dims=:) = findminmax(Base.isgreater, a; init=typemax(eltype(a)), dims)
diff --git a/src/host/linalg.jl b/src/host/linalg.jl
index 64f49241..c7d44175 100644
--- a/src/host/linalg.jl
+++ b/src/host/linalg.jl
@@ -12,20 +12,20 @@ function LinearAlgebra.transpose!(B::AbstractGPUMatrix, A::AbstractGPUVector)
 end
 function LinearAlgebra.adjoint!(B::AbstractGPUVector, A::AbstractGPUMatrix)
     axes(B,1) == axes(A,2) && axes(A,1) == 1:1 || throw(DimensionMismatch("adjoint"))
-    gpu_call(B, A) do ctx, B, A
-        idx = @linearidx B
+    @kernel function adjoint_kernel!(B, A)
+        idx = @index(Global, Linear)
         @inbounds B[idx] = adjoint(A[1, idx])
-        return
     end
+    adjoint_kernel!(backend(B))(B, A, ndrange = size(B))
     B
 end
 function LinearAlgebra.adjoint!(B::AbstractGPUMatrix, A::AbstractGPUVector)
     axes(B,2) == axes(A,1) && axes(B,1) == 1:1 || throw(DimensionMismatch("adjoint"))
-    gpu_call(B, A) do ctx, B, A
-        idx = @linearidx A
+    @kernel function adjoint_kernel!(B, A)
+        idx = @index(Global, Linear)
         @inbounds B[1, idx] = adjoint(A[idx])
-        return
     end
+    adjoint_kernel!(backend(A))(B, A, ndrange = size(A))
     B
 end
 
@@ -33,11 +33,11 @@ LinearAlgebra.transpose!(B::AbstractGPUArray, A::AbstractGPUArray) = transpose_f
 LinearAlgebra.adjoint!(B::AbstractGPUArray, A::AbstractGPUArray) = transpose_f!(adjoint, B, A)
 function transpose_f!(f, B::AbstractGPUMatrix{T}, A::AbstractGPUMatrix{T}) where T
     axes(B,1) == axes(A,2) && axes(B,2) == axes(A,1) || throw(DimensionMismatch(string(f)))
-    gpu_call(B, A) do ctx, B, A
-        idx = @cartesianidx A
+    @kernel function transpose_kernel!(B, A)
+        idx = @index(Global, Cartesian)
         @inbounds B[idx[2], idx[1]] = f(A[idx[1], idx[2]])
-        return
     end
+    transpose_kernel!(backend(B))(B, A, ndrange = size(A))
     B
 end
 
@@ -60,47 +60,47 @@ end
 ## copy upper triangle to lower and vice versa
 
 function LinearAlgebra.copytri!(A::AbstractGPUMatrix{T}, uplo::AbstractChar, conjugate::Bool=false) where T
-  n = LinearAlgebra.checksquare(A)
-  if uplo == 'U' && conjugate
-      gpu_call(A) do ctx, _A
-        I = @cartesianidx _A
-        i, j = Tuple(I)
-        if j > i
-          _A[j,i] = conj(_A[i,j])
+    n = LinearAlgebra.checksquare(A)
+    if uplo == 'U' && conjugate
+        @kernel function U_conj!(_A)
+            I = @index(Global, Cartesian)
+            i, j = Tuple(I)
+            if j > i
+              _A[j,i] = conj(_A[i,j])
+            end
         end
-        return
-      end
-  elseif uplo == 'U' && !conjugate
-      gpu_call(A) do ctx, _A
-        I = @cartesianidx _A
-        i, j = Tuple(I)
-        if j > i
-          _A[j,i] = _A[i,j]
+        U_conj!(backend(_A), ndrange = size(_A))
+    elseif uplo == 'U' && !conjugate
+        @kernel function U_noconj!(_A)
+            I = @index(Global, Cartesian)
+            i, j = Tuple(I)
+            if j > i
+              _A[j,i] = _A[i,j]
+            end
         end
-        return
-      end
-  elseif uplo == 'L' && conjugate
-      gpu_call(A) do ctx, _A
-        I = @cartesianidx _A
-        i, j = Tuple(I)
-        if j > i
-          _A[i,j] = conj(_A[j,i])
+        U_noconj!(backend(_A))(_A, ndrange=size(_A))
+    elseif uplo == 'L' && conjugate
+        @kernel function L_conj!(_A)
+            I = @index(Global, Cartesian)
+            i, j = Tuple(I)
+            if j > i
+              _A[i,j] = conj(_A[j,i])
+            end
         end
-        return
-      end
-  elseif uplo == 'L' && !conjugate
-      gpu_call(A) do ctx, _A
-        I = @cartesianidx _A
-        i, j = Tuple(I)
-        if j > i
-          _A[i,j] = _A[j,i]
+        L_conj!(backend(_A))(_A, ndrange = size(_A))
+    elseif uplo == 'L' && !conjugate
+        @kernel function L_noconj!(_A)
+            I = @index(Global, Cartesian)
+            i, j = Tuple(I)
+            if j > i
+              _A[i,j] = _A[j,i]
+            end
         end
-        return
-      end
-  else
-      throw(ArgumentError("uplo argument must be 'U' (upper) or 'L' (lower), got $uplo"))
-  end
-  A
+        L_noconj!(backend(_A))(_A, ndrange = size(_A))
+    else
+        throw(ArgumentError("uplo argument must be 'U' (upper) or 'L' (lower), got $uplo"))
+    end
+    A
 end
 
 
@@ -118,26 +118,26 @@ for T in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriang
 end
 
 function LinearAlgebra.tril!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
-  gpu_call(A, d; name="tril!") do ctx, _A, _d
-    I = @cartesianidx _A
+  @kernel function tril_kernel!(_A, _d)
+    I = @index(Global, Cartesian)
     i, j = Tuple(I)
     if i < j - _d
       _A[i, j] = 0
     end
-    return
   end
+  tril_kernel!(backend(_A))(_A, _d, ndrange = size(_A))
   return A
 end
 
 function LinearAlgebra.triu!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
-  gpu_call(A, d; name="triu!") do ctx, _A, _d
-    I = @cartesianidx _A
+  @kernel function triu_kernel!(_A, _d)
+    I = @index(Global, Cartesian)
     i, j = Tuple(I)
     if j < i + _d
       _A[i, j] = 0
     end
-    return
   end
+  triu_kernel!(backend(_A))(_A, _d, ndrange = length(_A))
   return A
 end
 
@@ -300,8 +300,8 @@ function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::Abstrac
         return fill!(C, zero(R))
     end
 
-    gpu_call(C, A, B; name="matmatmul!") do ctx, C, A, B
-        idx = @linearidx C
+    @kernel function matmatmul_kernel!(C, A, B)
+        idx = @index(Global, Linear)
         i, j = @inbounds Tuple(CartesianIndices(C)[idx])..., 1
 
         @inbounds if i <= size(A,1) && j <= size(B,2)
@@ -312,10 +312,8 @@ function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::Abstrac
             end
             C[i,j] = Ctmp*a + C[i,j]*b
         end
-
-        return
     end
-
+    matmatmul_kernel!(backend(C))(C, A, B, ndrange = size(C))
     C
 end
 
@@ -342,22 +340,22 @@ LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:A
 
 
 function generic_rmul!(X::AbstractArray, s::Number)
-    gpu_call(X, s; name="rmul!") do ctx, X, s
-        i = @linearidx X
+    @kernel function rmul_kernel!(X, s)
+        i = @index(Global, Linear)
         @inbounds X[i] *= s
-        return
     end
+    rmul_kernel!(backend(X))(X, s, ndrange = size(X))
     return X
 end
 
 LinearAlgebra.rmul!(A::AbstractGPUArray, b::Number) = generic_rmul!(A, b)
 
 function generic_lmul!(s::Number, X::AbstractArray)
-    gpu_call(X, s; name="lmul!") do ctx, X, s
-        i = @linearidx X
+    @kernel function lmul_kernel!(X, s)
+        i = @index(Global, linear)
         @inbounds X[i] = s*X[i]
-        return
     end
+    lmul_kernel!(backend(X))(X, s, ndrange = size(X))
     return X
 end
 
@@ -392,15 +390,15 @@ function _permutedims!(::Type{IT}, dest::AbstractGPUArray, src::AbstractGPUArray
     dest_strides = ntuple(k->k==1 ? 1 : prod(i->size(dest, i), 1:k-1), N)
     dest_strides_perm = ntuple(i->IT(dest_strides[findfirst(==(i), perm)]), N)
     size_src = IT.(size(src))
-    function permutedims_kernel(ctx, dest, src, size_src, dest_strides_perm)
-        SLI = @linearidx dest
+    @kernel function permutedims_kernel!(dest, src, size_src, dest_strides_perm)
+        SLI = @index(Global, Linear)
         assume(0 < SLI <= typemax(IT))
         LI = IT(SLI)
         dest_index = permute_linearindex(size_src, LI, dest_strides_perm)
         @inbounds dest[dest_index] = src[LI]
-        return
     end
-    gpu_call(permutedims_kernel, vec(dest), vec(src), size_src, dest_strides_perm)
+    permutedims_kernel!(backend(dest))(dest, src, size_src, dest_strides_perm,
+                        ndrange = size(dest))
     return dest
 end
 
@@ -477,28 +475,28 @@ end
 ## rotate
 
 function LinearAlgebra.rotate!(x::AbstractGPUArray, y::AbstractGPUArray, c::Number, s::Number)
-    gpu_call(x, y, c, s; name="rotate!") do ctx, x, y, c, s
-        i = @linearidx x
+    @kernel function rotate_kernel!(x, y, c, s)
+        i = @index(Global, Linear)
         @inbounds xi = x[i]
         @inbounds yi = y[i]
         @inbounds x[i] =       c  * xi + s * yi
         @inbounds y[i] = -conj(s) * xi + c * yi
-        return
     end
+    rotate_kernel!(backend(x))(x, y, c, s, ndrange = size(x))
     return x, y
 end
 
 ## reflect
 
 function LinearAlgebra.reflect!(x::AbstractGPUArray, y::AbstractGPUArray, c::Number, s::Number)
-    gpu_call(x, y, c, s; name="reflect!") do ctx, x, y, c, s
-        i = @linearidx x
+    @kernel function  reflect_kernel!(x, y, c, s)
+        i = @index(Global, Linear)
         @inbounds xi = x[i]
         @inbounds yi = y[i]
         @inbounds x[i] =      c  * xi + s * yi
         @inbounds y[i] = conj(s) * xi - c * yi
-        return
     end
+    reflect_kernel!(backend(x))(x, y, c, s, ndrange = size(x))
     return x, y
 end
 
diff --git a/src/host/math.jl b/src/host/math.jl
index 8d02c97f..153593fa 100644
--- a/src/host/math.jl
+++ b/src/host/math.jl
@@ -1,10 +1,10 @@
 # Base mathematical operations
 
 function Base.clamp!(A::AnyGPUArray, low, high)
-    gpu_call(A, low, high) do ctx, A, low, high
-        I = @cartesianidx A
+    @kernel function clamp_kernel!(A::AnyGPUArray, low, high)
+        I = @index(Global, Cartesian)
         A[I] = clamp(A[I], low, high)
-        return
     end
+    clamp_kernel!(backend(A))(A, low, high, ndrange = size(A))
     return A
 end
diff --git a/src/host/random.jl b/src/host/random.jl
index bd62590f..0e7a80db 100644
--- a/src/host/random.jl
+++ b/src/host/random.jl
@@ -30,14 +30,14 @@ function next_rand(state::NTuple{4, T}) where {T <: Unsigned}
     return state, tmp
 end
 
-function gpu_rand(::Type{T}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T
+function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
     threadid = GPUArrays.threadidx(ctx)
     stateful_rand = next_rand(randstate[threadid])
     randstate[threadid] = stateful_rand[1]
     return make_rand_num(T, stateful_rand[2])
 end
 
-function gpu_rand(::Type{T}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer
+function gpu_rand(::Type{T}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T <: Integer
     threadid = GPUArrays.threadidx(ctx)
     result = zero(T)
     if sizeof(T) >= 4
@@ -55,9 +55,9 @@ end
 
 # support for complex numbers
 
-function gpu_rand(::Type{Complex{T}}, ctx::AbstractKernelContext, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    re = gpu_rand(T, ctx, randstate)
-    im = gpu_rand(T, ctx, randstate)
+function gpu_rand(::Type{Complex{T}}, threadid, randstate::AbstractVector{NTuple{4, UInt32}}) where T
+    re = gpu_rand(T, threadid, randstate)
+    im = gpu_rand(T, threadid, randstate)
     return complex(re, im)
 end
 
@@ -84,9 +84,9 @@ function Random.seed!(rng::RNG, seed::Vector{UInt32})
 end
 
 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
-    @kernel rand!(a, randstate)
+    @kernel function rand!(a, randstate)
         idx = @index(Global, Linear)
-        @inbounds a[idx] = gpu_rand(T, ctx, randstates)
+        @inbounds a[idx] = gpu_rand(T, idx, randstates)
     end
     kernel = rand!(backend(A))
     kernel(A, rng.state)
@@ -96,11 +96,11 @@ end
 function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     threads = (length(A) - 1) ÷ 2 + 1
     length(A) == 0 && return
-    @kernel randn!(a, randstates)
+    @kernel function randn!(a, randstates)
         i = @index(Global, Linear) 
         idx = 2*(i - 1) + 1
-        U1 = gpu_rand(T, ctx, randstates)
-        U2 = gpu_rand(T, ctx, randstates)
+        U1 = gpu_rand(T, i, randstates)
+        U2 = gpu_rand(T, i, randstates)
         Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2)
         Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2)
         @inbounds a[idx] = Z0