diff --git a/.gitignore b/.gitignore
index a60ac7da48..9f7d2843b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,18 @@
 *.o
+*.d
 *.f90
 *.mod
 *.a
 *~
 tests/*_test
+tests/*_ctest
+make.inc
 milc_interface/*
 *#*
 *.pyc
 tunecache.tsv
 profile.tsv
+profile_*.tsv
 config.log
 CMakeCache.txt
 CMakeFiles
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 803f5dba41..2ffbd8994c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,11 +72,11 @@ else()
   set(DEFTARGET "CUDA")
 endif()
 
-set(VALID_TARGET_TYPES CUDA HIP SYCL)
+set(VALID_TARGET_TYPES CUDA HIP SYCL OMPTARGET)
 set(QUDA_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
-set_property(CACHE QUDA_TARGET_TYPE PROPERTY STRINGS CUDA HIP SYCL)
+set_property(CACHE QUDA_TARGET_TYPE PROPERTY STRINGS CUDA HIP SYCL OMPTARGET)
 
 string(TOUPPER ${QUDA_TARGET_TYPE} CHECK_TARGET_TYPE)
 list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
@@ -240,10 +240,10 @@ option(QUDA_ALTERNATIVE_I_TO_F "enable using alternative integer-to-float conver
 option(QUDA_OPENMP "enable OpenMP" OFF)
 
 set(QUDA_CXX_STANDARD
-    17
-    CACHE STRING "set the CXX Standard (14 or 17)")
+    20
+    CACHE STRING "set the CXX Standard (14, 17, 20)")
 
-set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 14 17)
+set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 14 17 20)
 
 option(QUDA_BACKWARDS "Enable stacktrace generation using backwards-cpp")
diff --git a/include/array.h b/include/array.h
index 3005087c85..e243043317 100644
--- a/include/array.h
+++ b/include/array.h
@@ -17,13 +17,6 @@ namespace quda
     constexpr T &operator[](int i) { return data[i]; }
     constexpr const T &operator[](int i) const { return data[i]; }
     constexpr int size() const { return n; }
-
-    array() = default;
-    array(const array &) = default;
-    array(array &&) = default;
-
-    array &operator=(const array &) = default;
-    array &operator=(array &&) = default;
   };
 
   template std::ostream &operator<<(std::ostream &output, const array &a)
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 4341cf58d9..8c3567308b 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -1,5 +1,6 @@
 #pragma once
 #include
+#include
 #include
 #include
 #include
diff --git a/include/communicator_quda.h b/include/communicator_quda.h
index aec02b8c2a..3365097027 100644
--- a/include/communicator_quda.h
+++ b/include/communicator_quda.h
@@ -544,7 +544,7 @@ namespace quda
     if (gpuid < 0) {
       int device_count = device::get_device_count();
-      if (device_count == 0) { errorQuda("No devices found"); }
+      if (device_count == 0) { warningQuda("No devices found"); }
       // We initialize gpuid if it's still negative.
       gpuid = 0;
@@ -558,7 +558,7 @@ namespace quda
         gpuid = gpuid % device_count;
         printf("MPS enabled, rank=%3d -> gpu=%d\n", comm_rank(), gpuid);
       } else {
-        errorQuda("Too few GPUs available on %s", comm_hostname());
+        warningQuda("Too few GPUs available on %s", comm_hostname());
       }
     }
   } // -ve gpuid
diff --git a/include/complex_quda.h b/include/complex_quda.h
index 18da63def5..369598ea52 100644
--- a/include/complex_quda.h
+++ b/include/complex_quda.h
@@ -27,6 +27,8 @@
 #include
 #include // for double2 / float2
+#include
+
 namespace quda {
   namespace gauge {
     template struct fieldorder_wrapper;
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index df176549fc..26bbdfb82b 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -12,7 +12,7 @@
 #include
 #include
-#if defined(_NVHPC_CUDA)
+#if defined(_NVHPC_CUDA) || defined(QUDA_TARGET_OMPTARGET)
 #include
 constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE;
 #else
diff --git a/include/kernel_helper.h b/include/kernel_helper.h
index dcb33baba0..2183c1c28d 100644
--- a/include/kernel_helper.h
+++ b/include/kernel_helper.h
@@ -5,6 +5,17 @@
 namespace quda
 {
+  enum ThreadsSync {
+    ThreadsSyncNo = 0,
+    ThreadsSyncX = 1,
+    ThreadsSyncY = 2,
+    ThreadsSyncXY = 3,
+    ThreadsSyncZ = 4,
+    ThreadsSyncXZ = 5,
+    ThreadsSyncYZ = 6,
+    ThreadsSyncAll = 7
+  };
+
   struct kernel_t {
     const void *func;
     const std::string name;
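The ThreadsSync values added to kernel_helper.h above form a bit mask over the block dimensions (ThreadsSyncXY == ThreadsSyncX | ThreadsSyncY, ThreadsSyncAll sets all three bits), and the kernel argument structs touched in the hunks below advertise their synchronization needs through a static requires_threads_sync member. This diff does not show how the OpenMP-target launcher consumes the flag; the following is only a minimal sketch, with a hypothetical needs_sync_x trait that is not part of the patch, of how such a mask could be queried at compile time.

#include <type_traits>

// Hypothetical trait (illustration only): true when an Arg type declares
// requires_threads_sync and the x-dimension bit is set.
template <typename Arg, typename = void> struct needs_sync_x : std::false_type { };

template <typename Arg>
struct needs_sync_x<Arg, std::void_t<decltype(Arg::requires_threads_sync)>>
  : std::bool_constant<(Arg::requires_threads_sync & quda::ThreadsSyncX) != 0> { };

// A launcher could then branch on needs_sync_x<Arg>::value to decide whether
// the OpenMP threads of a team must be kept in lockstep along x.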
diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index 03f9d4b75e..f9bbc7247f 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1029,6 +1029,7 @@ namespace quda {
   template __device__ __host__ inline int virtualThreadIdx(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int warp_id = threadIdx.x / device::warp_size();
     int warp_lane = threadIdx.x % device::warp_size();
     int tx = warp_id * (device::warp_size() / arg.aggregates_per_block) + warp_lane / arg.aggregates_per_block;
@@ -1037,12 +1038,14 @@
   template __device__ __host__ inline int virtualBlockDim(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int block_dim_x = blockDim.x / arg.aggregates_per_block;
     return block_dim_x;
   }
 
   template __device__ __host__ inline int coarseIndex(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int warp_lane = threadIdx.x % device::warp_size();
     int x_coarse = (arg.coarse_color_wave ? blockIdx.y : blockIdx.x) * arg.aggregates_per_block + warp_lane % arg.aggregates_per_block;
     return x_coarse;
@@ -1396,6 +1399,7 @@
     template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, int parity, const Pack &pack, const Arg &arg)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
@@ -1638,6 +1642,7 @@
     template __device__ inline void operator()(int &parity_coarse, int &x_coarse_cb, int &parity, int &x_cb, int &parity_c_row, int &c_row, int &c_col, const Arg &arg)
     {
+      QUDA_RT_CONSTS;
       if (arg.coarse_color_wave) {
         int parity_c_row_block_idx_z = blockDim.y*blockIdx.x + threadIdx.y;
         int c_row_block_idx_z = arg.parity_flip ? (parity_c_row_block_idx_z % arg.coarse_color_grid_z ) : (parity_c_row_block_idx_z / 2); // coarse color row index
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index b3637e1652..b182219a1b 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -64,6 +64,7 @@ namespace quda {
   template struct PackGhostArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncX;
     static constexpr bool block_float = sizeof(store_t) == QUDA_SINGLE_PRECISION && isFixed::value;
 
     // ensure we only compile supported block-float kernels
diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 18dd1da1a5..f2a9847625 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -31,6 +31,8 @@ namespace quda {
   template struct DslashCoarseArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
+
     static constexpr bool dslash = dslash_;
     static constexpr bool clover = clover_;
     static constexpr bool dagger = dagger_;
@@ -364,11 +366,18 @@
       if (doBulk() && Arg::clover && dir==0 && dim==0) applyClover(out, arg, x_cb, src_idx, parity, s, color_block, color_offset);
 
+#ifdef QUDA_TARGET_OMPTARGET
+      // reduce down to the first group of column-split threads, do it for every thread in OpenMP.
+      out = warp_combine(out);
+#endif
+
       if (dir==0 && dim==0) {
         const int my_spinor_parity = (arg.nParity == 2) ? parity : 0;
 
+#ifndef QUDA_TARGET_OMPTARGET
         // reduce down to the first group of column-split threads
         out = warp_combine(out);
+#endif
 
 #pragma unroll
         for (int color_local=0; color_local
 __device__ __host__ inline Vector d5(const Arg &arg, const Vector &in, int parity, int x_cb, int s, int src_idx)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       constexpr bool is_variable = true;
@@ -378,6 +379,7 @@
     __device__ __host__ inline Vector constantInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       const auto k = arg.kappa;
       const auto inv = arg.inv;
@@ -436,6 +438,7 @@
     __device__ __host__ inline Vector variableInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx)
     {
+      QUDA_RT_CONSTS;
       constexpr int nSpin = 4;
       using real = typename Arg::real;
       typedef ColorSpinor HalfVector;
diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh
index c46ffa1d62..7106fa70ec 100644
--- a/include/kernels/dslash_mobius_eofa.cuh
+++ b/include/kernels/dslash_mobius_eofa.cuh
@@ -106,6 +106,7 @@
     __device__ __host__ inline void operator()(int x_cb, int src_s, int parity)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       typedef ColorSpinor Vector;
@@ -186,6 +187,7 @@
     __device__ __host__ inline void operator()(int x_cb, int src_s, int parity)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       typedef ColorSpinor Vector;
diff --git a/include/kernels/gauge_fix_ovr.cuh b/include/kernels/gauge_fix_ovr.cuh
index c43649c15e..4707275586 100644
--- a/include/kernels/gauge_fix_ovr.cuh
+++ b/include/kernels/gauge_fix_ovr.cuh
@@ -101,6 +101,7 @@ namespace quda {
    */
   template struct GaugeFixArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
     using real = typename mapper::type;
     static constexpr int gauge_dir = gauge_dir_;
     static constexpr bool halo = halo_;
diff --git a/include/kernels/momentum.cuh b/include/kernels/momentum.cuh
index 41f8bc929c..be3f79bbd5 100644
--- a/include/kernels/momentum.cuh
+++ b/include/kernels/momentum.cuh
@@ -90,6 +90,17 @@ namespace quda {
     constexpr MomUpdate(const Arg &arg) : arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
+#ifdef QUDA_TARGET_OMPTARGET
+    static reduce_t reduce_omp(const reduce_t &a, const reduce_t &b)
+    {
+      auto c = a;
+      if (b[0] > a[0]) c[0] = b[0];
+      if (b[1] > a[1]) c[1] = b[1];
+      return c;
+    }
+    static reduce_t init_omp() { return reduce_t(); } // see UpdateMomArg::init().
+#endif
+
     // calculate the momentum contribution to the action. This uses the
     // MILC convention where we subtract 4.0 from each matrix norm in
     // order to increase stability
diff --git a/include/kernels/multi_blas_core.cuh b/include/kernels/multi_blas_core.cuh
index bc49a2335b..e05402b46a 100644
--- a/include/kernels/multi_blas_core.cuh
+++ b/include/kernels/multi_blas_core.cuh
@@ -35,6 +35,7 @@ namespace quda
   struct MultiBlasArg : kernel_param<>, SpinorXZ, SpinorYW(), store_t, N, y_store_t, Ny, Functor_::use_w> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
     using real = real_;
     using Functor = Functor_;
     static constexpr int warp_split = warp_split_;
diff --git a/include/kernels/random_init.cuh b/include/kernels/random_init.cuh
index ef0eb93d47..13a37afd2e 100644
--- a/include/kernels/random_init.cuh
+++ b/include/kernels/random_init.cuh
@@ -9,6 +9,7 @@ namespace quda {
   struct rngArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncNo;
     int commCoord[QUDA_MAX_DIM];
     int X[QUDA_MAX_DIM];
     int X_global[QUDA_MAX_DIM];
diff --git a/include/kernels/reduce_init.cuh b/include/kernels/reduce_init.cuh
index f5a130a385..1915e24188 100644
--- a/include/kernels/reduce_init.cuh
+++ b/include/kernels/reduce_init.cuh
@@ -6,6 +6,7 @@ namespace quda {
   namespace reducer {
     template struct init_arg : kernel_param<> {
+      static constexpr ThreadsSync requires_threads_sync = ThreadsSyncNo;
       using T = T_;
       T *count;
       init_arg(T *count, int n_reduce) :
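The static reduce_omp/init_omp hooks added to MomUpdate in momentum.cuh above provide a combiner and an identity value that an OpenMP-target reduction can plug in where the CUDA path uses warp/block reduction primitives. The wiring inside QUDA's launchers is not part of this hunk; the snippet below is only a self-contained sketch of the general pattern, using a plain std::array and a made-up reduction identifier max2, of how such hooks map onto #pragma omp declare reduction.

#include <array>

using reduce_t = std::array<double, 2>;

// Element-wise max combiner, mirroring the shape of MomUpdate::reduce_omp above.
static reduce_t reduce_max2(const reduce_t &a, const reduce_t &b)
{
  reduce_t c = a;
  if (b[0] > a[0]) c[0] = b[0];
  if (b[1] > a[1]) c[1] = b[1];
  return c;
}

#pragma omp declare reduction(max2 : reduce_t : omp_out = reduce_max2(omp_out, omp_in)) \
    initializer(omp_priv = reduce_t{})

// Example use: per-thread partial maxima are combined with the custom reduction.
double max_of_pairs(const double *p, int n)
{
  reduce_t r{};
  #pragma omp parallel for reduction(max2 : r)
  for (int i = 0; i < n; i++) {
    if (p[2 * i] > r[0]) r[0] = p[2 * i];
    if (p[2 * i + 1] > r[1]) r[1] = p[2 * i + 1];
  }
  return r[0] > r[1] ? r[0] : r[1];
}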
diff --git a/include/quda_api.h b/include/quda_api.h
index e1ec69bbe1..fb14992513 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -5,6 +5,13 @@
 #include
 #include
 
+/* We have to override some CUDA-isms here even for the public interface,
+   otherwise we can't compile the tests.
+ */
+#ifdef QUDA_TARGET_OMPTARGET
+#include "targets/omptarget/quda_api.h"
+#endif
+
 /**
    @file quda_api.h
diff --git a/include/quda_arch.h b/include/quda_arch.h
index 45a8ed34e4..f1558b6c5e 100644
--- a/include/quda_arch.h
+++ b/include/quda_arch.h
@@ -14,6 +14,9 @@
 #elif defined(QUDA_TARGET_SYCL)
 #include
+
+#elif defined(QUDA_TARGET_OMPTARGET)
+#include
 #endif
 
 #ifdef QUDA_OPENMP
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index ea6657120f..390517d073 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -238,6 +238,12 @@ static_assert(QUDA_ORDER_FP_MG == 2 || QUDA_ORDER_FP_MG == 4 || QUDA_ORDER_FP_MG
  */
 #cmakedefine QUDA_TARGET_SYCL @QUDA_TARGET_SYCL@
 
-#if !defined(QUDA_TARGET_CUDA) && !defined(QUDA_TARGET_HIP) && !defined(QUDA_TARGET_SYCL)
+/**
+ * @def QUDA_TARGET_OMPTARGET
+ * @brief This macro is set by CMake if the OMPTARGET Build Target is selected
+ */
+#cmakedefine QUDA_TARGET_OMPTARGET @QUDA_TARGET_OMPTARGET@
+
+#if !defined(QUDA_TARGET_CUDA) && !defined(QUDA_TARGET_HIP) && !defined(QUDA_TARGET_SYCL) && !defined(QUDA_TARGET_OMPTARGET)
 #error "No QUDA_TARGET selected"
 #endif
diff --git a/include/quda_ptr.h b/include/quda_ptr.h
index aab76f6b89..7ee91bf717 100644
--- a/include/quda_ptr.h
+++ b/include/quda_ptr.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include
 #include "malloc_quda.h"
 
 namespace quda
diff --git a/include/targets/omptarget/FFT_Plans.h b/include/targets/omptarget/FFT_Plans.h
new file mode 100644
index 0000000000..baed179446
--- /dev/null
+++ b/include/targets/omptarget/FFT_Plans.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include
+#include
+
+using FFTPlanHandle = int;
+/*
+#include
+
+using FFTPlanHandle = cufftHandle;
+#define FFT_FORWARD CUFFT_FORWARD
+#define FFT_INVERSE CUFFT_INVERSE
+
+#ifndef GPU_GAUGE_ALG
+*/
+#ifdef QUDA_TARGET_OMPTARGET
+#define CUFFT_SAFE_CALL(call)
+
+inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int)
+{
+  errorQuda("unimplemented");
+}
+
+inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, int)
+{
+  errorQuda("unimplemented");
+}
+
+inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision)
+{
+  errorQuda("unimplemented");
+}
+
+inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision)
+{
+  errorQuda("unimplemented");
+}
+
+inline void FFTDestroyPlan(FFTPlanHandle &)
+{
+  errorQuda("unimplemented");
+}
+#else
+
+/*-------------------------------------------------------------------------------*/
+#define CUFFT_SAFE_CALL( call) {                                        \
+    cufftResult err = call;                                             \
+    if ( CUFFT_SUCCESS != err ) {                                       \
+      fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",         \
+              __FILE__, __LINE__);                                      \
+      exit(EXIT_FAILURE);                                               \
+    } }
+/*-------------------------------------------------------------------------------*/
+
+/**
+ * @brief Call CUFFT to perform a single-precision complex-to-complex
+ * transform plan in the transform direction as specified by direction
+ * parameter
+ * @param[in] CUFFT plan
+ * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform
+ * @param[out] data_out, pointer to the complex output data (in GPU memory)
+ * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE
+ */
+inline void ApplyFFT(FFTPlanHandle &plan, float2 *data_in, float2 *data_out, int direction){
+  CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)data_in, (cufftComplex *)data_out, direction));
+}
+
+/**
+ * @brief Call CUFFT to perform a double-precision complex-to-complex transform plan in the transform direction
+as specified by direction parameter
+ * @param[in] CUFFT plan
+ * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform
+ * @param[out] data_out, pointer to the complex output data (in GPU memory)
+ * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE
+ */
+inline void ApplyFFT(FFTPlanHandle &plan, double2 *data_in, double2 *data_out, int direction){
+  CUFFT_SAFE_CALL(cufftExecZ2Z(plan, (cufftDoubleComplex *)data_in, (cufftDoubleComplex *)data_out, direction));
+}
+
+/**
+ * @brief Creates a CUFFT plan supporting 4D (1D+3D) data layouts for complex-to-complex
+ * @param[out] plan, CUFFT plan
+ * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt)
+ * @param[in] dim, 1 for 1D plan along the temporal direction with batch size Nx*Ny*Nz, 3 for 3D plan along Nx, Ny and Nz with batch size Nt
+ * @param[in] precision The precision of the computation
+ */
+
+inline void SetPlanFFTMany(FFTPlanHandle &plan, int4 size, int dim, QudaPrecision precision)
+{
+  auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C;
+  switch (dim) {
+  case 1:
+  {
+    int n[1] = { size.w };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 1, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y * size.z));
+  }
+  break;
+  case 3:
+  {
+    int n[3] = { size.x, size.y, size.z };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 3, n, NULL, 1, 0, NULL, 1, 0, type, size.w));
+  }
+  break;
+  }
+}
+
+/**
+ * @brief Creates a CUFFT plan supporting 4D (2D+2D) data layouts for complex-to-complex
+ * @param[out] plan, CUFFT plan
+ * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt)
+ * @param[in] dim, 0 for 2D plan in Z-T planes with batch size Nx*Ny, 1 for 2D plan in X-Y planes with batch size Nz*Nt
+ * @param[in] precision The precision of the computation
+ */
+inline void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, QudaPrecision precision)
+{
+  auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C;
+  switch (dim) {
+  case 0:
+  {
+    int n[2] = { size.w, size.z };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y));
+  }
+  break;
+  case 1:
+  {
+    int n[2] = { size.x, size.y };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.z * size.w));
+  }
+  break;
+  }
+}
+
+inline void FFTDestroyPlan( FFTPlanHandle &plan) {
+  CUFFT_SAFE_CALL(cufftDestroy(plan));
+}
+
+#endif
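For orientation, the cuFFT-backed branch above is used roughly as in the sketch below. This is a hedged usage example assuming the CUDA path with cufft.h available (on the OMPTARGET branch every helper simply calls errorQuda); Nx, Ny, Nz, Nt and the device buffers d_in/d_out are placeholders, not names from this patch.

// Sketch only: batched 2D FFTs over the X-Y planes of an Nx*Ny*Nz*Nt lattice.
FFTPlanHandle plan;
int4 size = make_int4(Nx, Ny, Nz, Nt);                   // lattice extents (placeholders)
SetPlanFFT2DMany(plan, size, 1, QUDA_SINGLE_PRECISION);  // dim==1: X-Y planes, batch Nz*Nt
ApplyFFT(plan, d_in, d_out, CUFFT_FORWARD);              // d_in/d_out: device float2 buffers
FFTDestroyPlan(plan);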
diff --git a/include/targets/omptarget/atomic_helper.h b/include/targets/omptarget/atomic_helper.h
new file mode 100644
index 0000000000..94ee26ba77
--- /dev/null
+++ b/include/targets/omptarget/atomic_helper.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include
+
+/**
+   @file atomic_helper.h
+
+   @section Provides definitions of atomic functions that are used in QUDA.
+ */
+
+namespace quda
+{
+
+  /**
+     @brief atomic_fetch_add function performs similarly to atomic_ref::fetch_add
+     @param[in,out] addr The memory address of the variable we are
+     updating atomically
+     @param[in] val The value we are summing to the value at addr
+   */
+  template __device__ __host__ inline void atomic_fetch_add(T *addr, T val)
+  {
+#pragma omp atomic update
+    *addr += val;
+  }
+
+  template __device__ __host__ inline void atomic_fetch_add(complex *addr, complex val)
+  {
+    atomic_fetch_add(reinterpret_cast(addr) + 0, val.real());
+    atomic_fetch_add(reinterpret_cast(addr) + 1, val.imag());
+  }
+
+  template __device__ __host__ inline void atomic_fetch_add(array *addr, array val)
+  {
+    for (int i = 0; i < n; i++) atomic_fetch_add(&(*addr)[i], val[i]);
+  }
+
+  /**
+     @brief atomic_fetch_max function that does an atomic max.
+     @param[in,out] addr The memory address of the variable we are
+     updating atomically
+     @param[in] val The value we are comparing against. Must be
+     positive valued else result is undefined.
+   */
+  __device__ __host__ inline void atomic_fetch_abs_max(float *addr, float val)
+  {
+#pragma omp atomic compare
+    if(*addr
+  inline T atomic_read(T &x)
+  {
+    T v;
+    #pragma omp atomic read
+    v = x;
+    return v;
+  }
+  template
+  inline array atomic_read(array &x)
+  {
+    array v;
+    for (int i = 0; i < N; ++i)
+      v[i] = atomic_read(x[i]);
+    return v;
+  }
+  template
+  inline complex atomic_read(complex &x)
+  {
+    complex v (atomic_read(x.x), atomic_read(x.y));
+    return v;
+  }
+  template
+  inline deviation_t atomic_read(deviation_t &x)
+  {
+    deviation_t v;
+    v.diff = atomic_read(x.diff);
+    v.ref = atomic_read(x.ref);
+    return v;
+  }
+} // namespace quda
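The atomic_fetch_abs_max helper above leans on the OpenMP 5.1 atomic compare construct. Below is a minimal, self-contained sketch of that conditional-update pattern; it mirrors, but is not verbatim, the helper in this file, and it assumes a compiler with OpenMP 5.1 support.

#include <cmath>

// OpenMP 5.1 atomic compare: update *addr only when val is larger.
inline void atomic_max_sketch(float *addr, float val)
{
#pragma omp atomic compare
  if (*addr < val) *addr = val;
}

// Typical use: many threads race to record a running maximum of |x[i]|.
void abs_max(const float *x, int n, float *m)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++) atomic_max_sketch(m, std::fabs(x[i]));
}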
diff --git a/include/targets/omptarget/block_reduce_helper.h b/include/targets/omptarget/block_reduce_helper.h
new file mode 100644
index 0000000000..02134f0c2e
--- /dev/null
+++ b/include/targets/omptarget/block_reduce_helper.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include
+#include
+
+/**
+   @file block_reduce_helper.h
+
+   @section This file contains the OpenMP target specializations for
+   warp- and block-level reductions
+ */
+
+using namespace quda;
+
+namespace quda
+{
+
+  namespace target
+  {
+    template
+    constexpr bool enough_shared_mem(void)
+    {
+      constexpr auto max_nthr = device::max_block_size();
+      return max_nthr*sizeof(T) <= device::max_shared_memory_size()-sizeof(device::get_shared_cache()[0])*128; // FIXME arbitrary, the number is arbitrary, offset 128 below & in reduce_helper.h:/reduce
+    }
+    /**
+       @brief OpenMP reduction over a group of consecutive threads smaller than omp_num_threads()
+     */
+    template
+    inline T any_reduce_impl(const reducer_t &r, const T &value_, const int batch, const int block_size, const bool all, const bool async)
+    {
+      static_assert(enough_shared_mem(), "Shared cache not large enough for tempStorage");
+      T *storage = (T*)&device::get_shared_cache()[128]; // FIXME arbitrary
+      const int tid = omp_get_thread_num();
+      const auto& v0 = r.init();
+#if 1
+      const auto batch_begin = block_size*batch;
+      const auto batch_end = batch_begin+block_size;
+      auto value = value_;
+      for(int offset=1;offset1 || !async){ // only synchronize if we are not pipelining
+          #pragma omp barrier
+        }
+        storage[tid] = value;
+        const auto j = tid+offset;
+        #pragma omp barrier
+        if(j
+    inline T any_reduce(const R &r, const T &value_, const int batch, const int block_size, const bool all, const bool async)
+    {
+      if constexpr (enough_shared_mem())
+        return any_reduce_impl(r, value_, batch, block_size, all, async);
+      else{
+        using V = typename T::value_type;
+        constexpr auto N = T::N;
+        if constexpr (
+          std::is_same_v> &&
+          std::is_same_v &&
+          std::is_same_v>){
+          // make sure the implementation is still compatible: ../../array.h
+          constexpr auto N0 = N/2;
+          constexpr auto N1 = N-N0;
+          using T0 = array;
+          using T1 = array;
+          const constexpr plus r0 {};
+          const constexpr plus r1 {};
+          auto value = value_;
+          // recurse to myself
+          reinterpret_cast(value[0]) = any_reduce(r0, reinterpret_cast(value_[0]), batch, block_size, all, async);
+          // recurse with async==false
+          reinterpret_cast(value[N0]) = any_reduce(r1, reinterpret_cast(value_[N0]), batch, block_size, all, false);
+          return value;
+        }else
+          static_assert(sizeof(T)==0, "unimplemented reduction"); // let me fail at compile time
+      }
+    }
+  }
+
+  // pre-declaration of warp_reduce that we wish to specialize
+  template struct warp_reduce;
+
+  /**
+     @brief OpenMP target specialization of warp_reduce
+   */
+  template <> struct warp_reduce {
+
+    /**
+       @brief Perform a warp-wide reduction
+       @param[in] value_ thread-local value to be reduced
+       @param[in] all Whether we want all threads to have visibility
+       to the result (all = true) or just the first thread in the
+       warp (all = false)
+       @param[in] r The reduction operation we want to apply
+       @return The warp-wide reduced value
+     */
+    template
+    __device__ inline T operator()(const T &value_, bool all, const reducer_t &r, const param_t &)
+    {
+      constexpr int block_size = device::warp_size();
+      const int batch = omp_get_thread_num() / block_size;
+      return target::any_reduce(r, value_, batch, block_size, all, false);
+    }
+  };
+
+  // pre-declaration of block_reduce that we wish to specialize
+  template struct block_reduce;
+
+  /**
+     @brief OpenMP target specialization of block_reduce
+   */
+  template <> struct block_reduce {
+
+    /**
+       @brief Perform a block-wide reduction
+       @param[in] value_ thread-local value to be reduced
+       @param[in] async Whether this reduction will be performed
+       asynchronously with respect to the calling threads
+       @param[in] batch The batch index of the reduction
+       @param[in] all Whether we want all threads to have visibility
+       to the result (all = true) or just the first thread in the
+       block (all = false)
+       @param[in] r The reduction operation we want to apply
+       @return The block-wide reduced value
+     */
+    template
+    __device__ inline T operator()(const T &value_, bool async, int batch, bool all, const reducer_t &r, const param_t &)
+    {
+      const auto block_size = target::block_size();
+      return target::any_reduce(r, value_, batch, block_size, all, async);
+    }
+  };
+
+} // namespace quda
+
+#include "../generic/block_reduce_helper.h"
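any_reduce_impl above realizes a barrier-based tree reduction over a contiguous batch of OpenMP threads, staging partial values in a shared scratch array. The sketch below shows the same idea stripped of the QUDA type machinery; it assumes it is called from inside a parallel region by every thread of the team, that the team size is a power of two, and that storage is a shared buffer with one slot per thread.

#include <omp.h>

// Tree-reduce one value per OpenMP thread; thread 0 ends up with the full sum.
double team_sum(double value, double *storage)
{
  const int tid = omp_get_thread_num();
  const int nthreads = omp_get_num_threads();
  for (int offset = 1; offset < nthreads; offset *= 2) {
    storage[tid] = value;
    #pragma omp barrier                 // publish this round's partials
    if (tid + offset < nthreads) value += storage[tid + offset];
    #pragma omp barrier                 // keep storage stable until everyone has read
  }
  return value;
}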
diff --git a/include/targets/omptarget/block_reduction_kernel.h b/include/targets/omptarget/block_reduction_kernel.h
new file mode 100644
index 0000000000..d0a8c7f0e1
--- /dev/null
+++ b/include/targets/omptarget/block_reduction_kernel.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace quda
+{
+
+  /**
+     @brief This helper function swizzles the block index through
+     mapping the block index onto a matrix and transposing it. This is
+     done to potentially increase the cache utilization. Requires
+     that the argument class has a member parameter "swizzle" which
+     determines if we are swizzling and a parameter "swizzle_factor"
+     which is the effective matrix dimension that we are transposing in
+     this mapping.
+
+     Specifically, the thread block id is remapped by
+     transposing its coordinates: if the original order can be
+     parameterized by
+
+     blockIdx.x = j * swizzle + i,
+
+     then the new order is
+
+     block_idx = i * (gridDim.x / swizzle) + j
+
+     We need to factor out any remainder and leave this in original
+     ordering.
+
+     @param arg Kernel argument struct
+     @return Swizzled block index
+   */
+  template __device__ constexpr int virtual_block_idx(const Arg &arg)
+  {
+    QUDA_RT_CONSTS;
+    auto block_idx = blockIdx.x;
+    if (arg.swizzle) {
+      // the portion of the grid that is exactly divisible by the number of SMs
+      const auto gridp = gridDim.x - gridDim.x % arg.swizzle_factor;
+
+      if (block_idx < gridp) {
+        // this is the portion of the block that we are going to transpose
+        const int i = blockIdx.x % arg.swizzle_factor;
+        const int j = blockIdx.x / arg.swizzle_factor;
+
+        // transpose the coordinates
+        block_idx = i * (gridp / arg.swizzle_factor) + j;
+      }
+    }
+    return block_idx;
+  }
+
+  /**
+     @brief This class is derived from the arg class that the functor
+     creates and curries in the block size. This allows the block
+     size to be set statically at launch time in the actual argument
+     class that is passed to the kernel.
+
+     @tparam block_size x-dimension block-size
+     @param[in] arg Kernel argument
+   */
+  template struct BlockKernelArg : Arg_ {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncYZ;
+    using Arg = Arg_;
+    static constexpr unsigned int block_size = block_size_;
+    BlockKernelArg(const Arg &arg) : Arg(arg) { }
+  };
+
+  /**
+     @brief BlockKernel2D_impl is the implementation of the Generic
+     block kernel. Here, we split the block (CTA) and thread indices
+     and pass them separately to the transform functor. The x thread
+     dimension is templated (Arg::block_size), e.g., for efficient
+     reductions.
+
+     @tparam Functor Kernel functor that defines the kernel
+     @tparam Arg Kernel argument struct that sets any required meta
+     data for the kernel
+     @param[in] arg Kernel argument
+   */
+  template