diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3fa2a949c..7e93a258de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,9 +166,12 @@ option(QUDA_ARPACK_LOGGING "enable ARPACK logging (not available for NG)" OFF)
 # OpenBLAS
 option(QUDA_OPENBLAS "enable OpenBLAS" OFF)
 
+option(QUDA_QCD_PLUS_QED "Enable QCD+QED features (builds QUDA_RECONSTRUCT_9/13 instead of 8/12 for Wilson-type operators)" OFF)
+
 # Interface options
 option(QUDA_INTERFACE_QDP "build qdp interface" ON)
 option(QUDA_INTERFACE_MILC "build milc interface" ON)
+option(QUDA_INTERFACE_OPENQCD "build OpenQCD interface (for QCD+QED support, see QUDA_QCD_PLUS_QED)" OFF)
 option(QUDA_INTERFACE_CPS "build cps interface" OFF)
 option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF)
 option(QUDA_INTERFACE_BQCD "build bqcd interface" OFF)
@@ -306,6 +309,13 @@ if(QUDA_MPI AND QUDA_QMP)
     "Specifying QUDA_QMP and QUDA_MPI might result in undefined behavior. If you intend to use QMP set QUDA_MPI=OFF.")
 endif()
 
+if(QUDA_INTERFACE_OPENQCD AND QUDA_QMP)
+  message(SEND_ERROR "OpenQCD does not support QMP comms.")
+endif()
+
+if(QUDA_INTERFACE_OPENQCD AND NOT (QUDA_MPI))
+  message(SEND_ERROR "OpenQCD requires QUDA built with MPI comms.")
+endif()
 
 if(QUDA_NVSHMEM AND NOT (QUDA_QMP OR QUDA_MPI))
   message(SEND_ERROR "Specifying QUDA_NVSHMEM requires either QUDA_QMP or QUDA_MPI.")
diff --git a/README.md b/README.md
index 274f708b3e..3e5e9bb62f 100644
--- a/README.md
+++ b/README.md
@@ -265,6 +265,7 @@ Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]].
 *  Joel Giedt (Rensselaer Polytechnic Institute) 
 *  Steven Gottlieb (Indiana University) 
 *  Anthony Grebe (Fermilab)
+*  Roman Gruber (ETH)
 *  Kyriakos Hadjiyiannakou (Cyprus)
 *  Ben Hoerz (Intel)
 *  Leon Hostetler (Indiana University)
diff --git a/include/clover_field.h b/include/clover_field.h
index 32fffd2973..f50a5ae253 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -166,6 +166,8 @@ namespace quda {
       inverse(param.inverse),
       clover(param.clover),
       cloverInv(param.cloverInv),
+      csw(param.csw),
+      coeff(param.coeff),
       twist_flavor(param.twist_flavor),
       mu2(param.mu2),
       epsilon2(param.epsilon2),
diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index af1a8d73c8..dc9a92084a 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -11,6 +11,7 @@
 #include <convert.h>
 #include <clover_field.h>
 #include <complex_quda.h>
+#include <index_helper.cuh>
 #include <quda_matrix.h>
 #include <color_spinor.h>
 #include <load_store.h>
@@ -614,7 +615,11 @@ namespace quda {
             errorQuda("Accessor reconstruct = %d does not match field reconstruct %d", enable_reconstruct,
                       clover.Reconstruct());
           if (clover.max_element(is_inverse) == 0.0 && isFixed<Float>::value)
+#ifdef BUILD_OPENQCD_INTERFACE
+            warningQuda("%p max_element(%d) appears unset", &clover, is_inverse); /* ignore if the SW-field is zero */
+#else
             errorQuda("%p max_element(%d) appears unset", &clover, is_inverse);
+#endif
           if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover);
           this->clover = clover_ ? clover_ : clover.data<Float *>(is_inverse);
         }
@@ -983,6 +988,97 @@ namespace quda {
         size_t Bytes() const { return length*sizeof(Float); }
       };
 
+      /**
+       * OpenQCD ordering for clover fields
+       */
+      template <typename Float, int length = 72> struct OpenQCDOrder {
+        static constexpr bool enable_reconstruct = false;
+        typedef typename mapper<Float>::type RegType;
+        Float *clover;
+        const int volumeCB;
+        const QudaTwistFlavorType twist_flavor;
+        const Float mu2;
+        const Float epsilon2;
+        const double coeff;
+        const double csw;
+        const double kappa;
+        const int dim[4]; // xyzt convention
+        const int L[4];   // txyz convention
+
+        OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) :
+          volumeCB(clover.Stride()),
+          twist_flavor(clover.TwistFlavor()),
+          mu2(clover.Mu2()),
+          epsilon2(clover.Epsilon2()),
+          coeff(clover.Coeff()),
+          csw(clover.Csw()),
+          kappa(clover.Coeff() / clover.Csw()),
+          dim {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt
+          L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]}    // *local* lattice dimensions, txyz
+        {
+          if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) {
+            errorQuda("Invalid clover order %d for this accessor", clover.Order());
+          }
+          this->clover = clover_ ? clover_ : clover.data<Float *>(inverse);
+          if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { errorQuda("Neither coeff nor csw may be zero!"); }
+        }
+
+        QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
+        Float Mu2() const { return mu2; }
+        Float Epsilon2() const { return epsilon2; }
+
+        /**
+         * @brief      Gets the offset in Floats from the openQCD base pointer to
+         *             the spinor field.
+         *
+         * @param[in]  x_cb    Checkerboard index coming from quda
+         * @param[in]  parity  The parity coming from quda
+         *
+         * @return     The offset.
+         */
+        __device__ __host__ inline int getCloverOffset(int x_cb, int parity) const
+        {
+          int x_quda[4], x[4];
+          getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates
+          openqcd::rotate_coords(x_quda, x);    // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate
+          return openqcd::ipt(x, L) * length;
+        }
+
+        /**
+         * @brief      Load a clover field at lattice point x_cb
+         *
+         * @param      v       The output clover matrix in QUDA order
+         * @param      x_cb    The checkerboarded lattice site
+         * @param      parity  The parity of the lattice site
+         */
+        __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const
+        {
+          int sign[36] = {-1, -1, -1, -1, -1, -1,                 // diagonals (idx 0-5)
+                          -1, +1, -1, +1, -1, -1, -1, -1, -1, -1, // column 0  (idx 6-15)
+                          -1, +1, -1, -1, -1, -1, -1, -1,         // column 1  (idx 16-23)
+                          -1, -1, -1, -1, -1, -1,                 // column 2  (idx 24-29)
+                          -1, +1, -1, +1,                         // column 3  (idx 30-33)
+                          -1, +1};                                // column 4  (idx 34-35)
+          int map[36] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 18, 19, 24, 25, 16, 17,
+                         12, 13, 20, 21, 26, 27, 14, 15, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35};
+          const int M = length / 2;
+          int offset = getCloverOffset(x_cb, parity);
+          auto Ap = &clover[offset];     // A_+
+          auto Am = &clover[offset + M]; // A_-
+
+#pragma unroll
+          for (int i = 0; i < M; i++) {
+            v[i] = sign[i] * (kappa * Am[map[i]] - (i < 6));
+            v[M + i] = sign[i] * (kappa * Ap[map[i]] - (i < 6));
+          }
+        }
+
+        // FIXME implement the save routine for OpenQCD ordered fields
+        __device__ __host__ inline void save(RegType[length], int, int) const { }
+
+        size_t Bytes() const { return length * sizeof(Float); }
+      };
+
   } // namespace clover
 
   // Use traits to reduce the template explosion
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 6362b4677b..afca84c105 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -207,6 +207,9 @@ namespace quda
       } else if (inv_param.dirac_order == QUDA_TIFR_PADDED_DIRAC_ORDER) {
         fieldOrder = QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER;
         siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+      } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) {
+        fieldOrder = QUDA_OPENQCD_FIELD_ORDER;
+        siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
       } else {
         errorQuda("Dirac order %d not supported", inv_param.dirac_order);
       }
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 46ad849079..2c46c23ea9 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1759,6 +1759,85 @@ namespace quda
       size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
     };
 
+    /**
+     * struct to define order of spinor fields in OpenQCD
+     *
+     * @tparam     Float  Underlying type of data (precision)
+     * @tparam     Ns     Number of spin degrees of freedom
+     * @tparam     Nc     Number of color degrees of freedom
+     */
+    template <typename Float, int Ns, int Nc> struct OpenQCDDiracOrder {
+      using Accessor = OpenQCDDiracOrder<Float, Ns, Nc>;
+      using real = typename mapper<Float>::type;
+      using complex = complex<real>;
+
+      static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field
+      Float *field;
+      size_t offset;
+      Float *ghost[8];
+      int volumeCB;
+      int faceVolumeCB[4];
+      int nParity;
+      const int dim[4]; // xyzt convention
+      const int L[4];   // txyz convention
+
+      OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) :
+        field(field_ ? field_ : a.data<Float *>()),
+        offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for??
+        volumeCB(a.VolumeCB()),
+        nParity(a.SiteSubset()),
+        dim {a.X(0), a.X(1), a.X(2), a.X(3)}, // *local* lattice dimensions, xyzt
+        L {a.X(3), a.X(0), a.X(1), a.X(2)}    // *local* lattice dimensions, txyz
+      {
+        if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); }
+      }
+
+      /**
+       * @brief      Gets the offset in Floats from the openQCD base pointer to
+       *             the spinor field.
+       *
+       * @param[in]  x       Checkerboard index coming from quda
+       * @param[in]  parity  The parity coming from quda
+       *
+       * @return     The offset.
+       */
+      __device__ __host__ inline int getSpinorOffset(int x_cb, int parity) const
+      {
+        int x_quda[4], x[4];
+        getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates
+        openqcd::rotate_coords(x_quda, x);    // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate
+        return openqcd::ipt(x, L) * length;
+      }
+
+      __device__ __host__ inline void load(complex v[length / 2], int x_cb, int parity = 0) const
+      {
+        auto in = &field[getSpinorOffset(x_cb, parity)];
+        block_load<complex, length / 2>(v, reinterpret_cast<const complex *>(in));
+      }
+
+      __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int parity = 0) const
+      {
+        auto out = &field[getSpinorOffset(x_cb, parity)];
+        block_store<complex, length / 2>(reinterpret_cast<complex *>(out), v);
+      }
+
+      /**
+         @brief This accessor routine returns a colorspinor_wrapper to this object,
+         allowing us to overload various operators for manipulating at
+         the site level interms of matrix operations.
+         @param[in] x_cb Checkerboarded space-time index we are requesting
+         @param[in] parity Parity we are requesting
+         @return Instance of a colorspinor_wrapper that curries in access to
+         this field at the above coordinates.
+      */
+      __device__ __host__ inline auto operator()(int x_cb, int parity) const
+      {
+        return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
+      }
+
+      size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); }
+    }; // openQCDDiracOrder
+
   } // namespace colorspinor
 
   template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false>
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 8496efab5a..d33e949bdf 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -49,6 +49,13 @@ namespace quda
   */
   int comm_dim(int dim);
 
+  /**
+     Return whether the dimension dim is a C* dimension or not
+     @param dim Dimension which we are querying
+     @return C* dimension or nor
+  */
+  bool comm_dim_cstar(int dim);
+
   /**
      Return the global number of processes in the dimension dim
      @param dim Dimension which we are querying
diff --git a/include/communicator_quda.h b/include/communicator_quda.h
index bf0e9ffba5..c5ddbe1f75 100644
--- a/include/communicator_quda.h
+++ b/include/communicator_quda.h
@@ -41,6 +41,7 @@ namespace quda
     int (*coords)[QUDA_MAX_DIM];
     int my_rank;
     int my_coords[QUDA_MAX_DIM];
+    int cstar; // number of C* direction as per openQxD convention
     // It might be worth adding communicators to allow for efficient reductions:
     //   #if defined(MPI_COMMS)
     //     MPI_Comm comm;
@@ -126,9 +127,26 @@ namespace quda
   inline int comm_rank_displaced(const Topology *topo, const int displacement[])
   {
     int coords[QUDA_MAX_DIM];
-
-    for (int i = 0; i < QUDA_MAX_DIM; i++) {
-      coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0;
+    int shift_integer;
+
+    int Nx_displacement = 0;
+    for (int i = QUDA_MAX_DIM - 1; i >= 0; i--) {
+      // cstar shift[x] shift[y] shift[z] shift[t]
+      // 0     0        0        0        0
+      // 1     0        0        0        0
+      // 2     0        1        0        0
+      // 3     0        1        1        0
+      if (i < topo->ndim && ((i == 1 && topo->cstar >= 2) || (i == 2 && topo->cstar >= 3))) {
+        // if we go over the boundary and have a shifted boundary condition,
+        // we shift Nx/2 ranks in x-direction:
+        // shift_integer       in { 0, 1, 2}
+        // (shift_integer - 1) in {-1, 0, 1}
+        shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i];
+        Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0] / 2);
+      }
+      coords[i] = (i < topo->ndim) ?
+        mod(comm_coords(topo)[i] + displacement[i] + (i == 0 ? Nx_displacement : 0), comm_dims(topo)[i]) :
+        0;
     }
 
     return comm_rank_from_coords(topo, coords);
@@ -390,6 +408,12 @@ namespace quda
       return comm_dims(topo)[dim];
     }
 
+    bool comm_dim_cstar(int dim)
+    {
+      Topology *topo = comm_default_topology();
+      return (topo->cstar >= 2 && dim == 1) || (topo->cstar >= 3 && dim == 2);
+    }
+
     int comm_coord(int dim)
     {
       Topology *topo = comm_default_topology();
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index 2ceb4f5b70..7372c956e3 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -1133,6 +1133,15 @@ namespace quda {
     virtual QudaDiracType getDiracType() const override { return QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC; }
   };
 
+  /**
+   * @brief      Applies gamma matrices to spinor fields
+   *
+   * @param[out] out   Output field
+   * @param[in]  in    Input field
+   * @param[in]  dir   Direction index of gamma matrix
+   */
+  void ApplyGamma(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, QudaGammaDirection_s dir);
+
   void gamma5(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in);
 
   /**
diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h
index 4f6d16511e..6ef66e17bf 100644
--- a/include/eigensolve_quda.h
+++ b/include/eigensolve_quda.h
@@ -255,10 +255,11 @@ namespace quda
 
     /**
        @brief Computes Left/Right SVD from pre computed Right/Left
-       @param[in] evecs Computed eigenvectors of NormOp
-       @param[in] evals Computed eigenvalues of NormOp
+       @param[in,out] evecs Computed eigenvectors of NormOp
+       @param[in,out] evals Computed eigenvalues of NormOp
+       @param[in] dagger Whether NormOp was MdagM (false) or MMdag (true)
     */
-    void computeSVD(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals);
+    void computeSVD(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, bool dagger = false);
 
     /**
        @brief Compute eigenvalues and their residiua
diff --git a/include/enum_quda.h b/include/enum_quda.h
index 48d79de463..1de9c3be35 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -48,6 +48,7 @@ typedef enum QudaGaugeFieldOrder_s {
   QUDA_BQCD_GAUGE_ORDER,        // expect *gauge, mu, even-odd, spacetime+halos, column-row order
   QUDA_TIFR_GAUGE_ORDER,        // expect *gauge, mu, even-odd, spacetime, column-row order
   QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order
+  QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only
   QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM
 } QudaGaugeFieldOrder;
 
@@ -252,14 +253,16 @@ typedef enum QudaDiracFieldOrder_s {
   QUDA_CPS_WILSON_DIRAC_ORDER,  // odd-even, color inside spin
   QUDA_LEX_DIRAC_ORDER,         // lexicographical order, color inside spin
   QUDA_TIFR_PADDED_DIRAC_ORDER, // padded z dimension for TIFR RHMC code
+  QUDA_OPENQCD_DIRAC_ORDER,     // openqcd
   QUDA_INVALID_DIRAC_ORDER = QUDA_INVALID_ENUM
 } QudaDiracFieldOrder;
 
 typedef enum QudaCloverFieldOrder_s {
-  QUDA_NATIVE_CLOVER_ORDER, // even-odd, Fload-N ordering
-  QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed
-  QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime
-  QUDA_BQCD_CLOVER_ORDER,   // even-odd, super-diagonal packed and reordered
+  QUDA_NATIVE_CLOVER_ORDER,  // even-odd, Fload-N ordering
+  QUDA_PACKED_CLOVER_ORDER,  // even-odd, QDP packed
+  QUDA_QDPJIT_CLOVER_ORDER,  // (diagonal / off-diagonal)-chirality-spacetime
+  QUDA_BQCD_CLOVER_ORDER,    // even-odd, super-diagonal packed and reordered
+  QUDA_OPENQCD_CLOVER_ORDER, // openqcd
   QUDA_INVALID_CLOVER_ORDER = QUDA_INVALID_ENUM
 } QudaCloverFieldOrder;
 
@@ -350,6 +353,7 @@ typedef enum QudaFieldOrder_s {
   QUDA_QDPJIT_FIELD_ORDER,                  // QDP field ordering (complex-color-spin-spacetime)
   QUDA_QOP_DOMAIN_WALL_FIELD_ORDER,         // QOP domain-wall ordering
   QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering
+  QUDA_OPENQCD_FIELD_ORDER,                 // OPENQCD ordering
   QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM
 } QudaFieldOrder;
 
@@ -363,16 +367,26 @@ typedef enum QudaFieldCreate_s {
 } QudaFieldCreate;
 
 typedef enum QudaGammaBasis_s { // gamj=((top 2 rows)(bottom 2 rows))  s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity
-  QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0))  gam5=((1,0)(0,-1))
-  QUDA_UKQCD_GAMMA_BASIS,         // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,1)(1,0))
-  QUDA_CHIRAL_GAMMA_BASIS,        // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((-1,0)(0,1))
-  QUDA_DIRAC_PAULI_GAMMA_BASIS,   // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0))
+  QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0))        gam5=((1,0)(0,-1))
+  QUDA_UKQCD_GAMMA_BASIS,         // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1))       gam5=((0,1)(1,0))
+  QUDA_CHIRAL_GAMMA_BASIS,        // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))      gam5=((-1,0)(0,1))
+  QUDA_OPENQCD_GAMMA_BASIS,       // gam1=((0,-i*s3)(i*s3,0)) gam2=((0,-1)(-1,0))      gam3=((0,-s1)(s1,0))     gam4=((0,-i*s2)(i*s2,0)) gam5=((1,0)(0,-1))
+  QUDA_DIRAC_PAULI_GAMMA_BASIS,   // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1))       gam5=((0,-1)(-1,0))
   QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM //  gam5=gam1*gam2*gam3*gam4
 } QudaGammaBasis;
 //  Dirac-Pauli -> DeGrand-Rossi   T = i/sqrt(2)*((s2,-s2)(s2,s2))     field_DR = T * field_DP
 //  UKQCD       -> DeGrand-Rossi   T = i/sqrt(2)*((-s2,-s2)(-s2,s2))   field_DR = T * field_UK
 //  Chiral      -> DeGrand-Rossi   T = i*((0,-s2)(s2,0))               field_DR = T * field_chiral
 
+typedef enum QudaGammaDirection_s {
+  QUDA_GAMMA_X,
+  QUDA_GAMMA_Y,
+  QUDA_GAMMA_Z,
+  QUDA_GAMMA_T,
+  QUDA_GAMMA_5,
+  QUDA_INVALID_GAMMA_INDEX = QUDA_INVALID_ENUM
+} QudaGammaDirection;
+
 typedef enum QudaSourceType_s {
   QUDA_POINT_SOURCE,
   QUDA_RANDOM_SOURCE,
@@ -653,4 +667,3 @@ typedef enum QudaUpdateSplitGauge_s {
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 832c2a31a2..33fa5a9ad8 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -42,6 +42,7 @@
 #define QUDA_BQCD_GAUGE_ORDER 6       // expect *gauge mu even-odd spacetime+halos row-column order
 #define QUDA_TIFR_GAUGE_ORDER 7
 #define QUDA_TIFR_PADDED_GAUGE_ORDER 8
+#define QUDA_OPENQCD_GAUGE_ORDER 9
 #define QUDA_INVALID_GAUGE_ORDER QUDA_INVALID_ENUM
 
 #define QudaTboundary integer(4)
@@ -224,13 +225,15 @@
 #define QUDA_CPS_WILSON_DIRAC_ORDER 4  // odd-even color inside spin
 #define QUDA_LEX_DIRAC_ORDER 5         // lexicographical order color inside spin
 #define QUDA_TIFR_PADDED_DIRAC_ORDER 6
+#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd
 #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM
 
 #define QudaCloverFieldOrder integer(4)
-#define QUDA_NATIVE_CLOVER_ORDER 0 // even-odd FloatN ordering
-#define QUDA_PACKED_CLOVER_ORDER 1 // even-odd packed
-#define QUDA_QDPJIT_CLOVER_ORDER 2 // lexicographical order packed
-#define QUDA_BQCD_CLOVER_ORDER 3   // BQCD order which is a packed super-diagonal form
+#define QUDA_NATIVE_CLOVER_ORDER 0  // even-odd FloatN ordering
+#define QUDA_PACKED_CLOVER_ORDER 1  // even-odd packed
+#define QUDA_QDPJIT_CLOVER_ORDER 2  // lexicographical order packed
+#define QUDA_BQCD_CLOVER_ORDER 3    // BQCD order which is a packed super-diagonal form
+#define QUDA_OPENQCD_CLOVER_ORDER 4 // openqcd
 #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM
 
 #define QudaVerbosity integer(4)
@@ -313,6 +316,7 @@
 #define QUDA_QDPJIT_FIELD_ORDER 3                  // QDP field ordering (complex-color-spin-spacetime)
 #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 4         // QOP domain-wall ordering
 #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 5 // TIFR RHMC ordering
+#define QUDA_OPENQCD_FIELD_ORDER 6                 // openQCD ordering
 #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM
   
 #define QudaFieldCreate integer(4)
@@ -330,6 +334,14 @@
 #define QUDA_DIRAC_PAULI_GAMMA_BASIS 3
 #define QUDA_INVALID_GAMMA_BASIS QUDA_INVALID_ENUM
 
+#define QudaGammaDirection integer(4)
+#define QUDA_GAMMA_X 0
+#define QUDA_GAMMA_Y 1
+#define QUDA_GAMMA_Z 2
+#define QUDA_GAMMA_T 3
+#define QUDA_GAMMA_5 4
+#define QUDA_INVALID_GAMMA_DIRECTION QUDA_INVALID_ENUM
+
 #define QudaSourceType integer(4)
 #define QUDA_POINT_SOURCE 0
 #define QUDA_RANDOM_SOURCE 1
diff --git a/include/gamma.cuh b/include/gamma.cuh
index ed5474711f..bad953f631 100644
--- a/include/gamma.cuh
+++ b/include/gamma.cuh
@@ -26,10 +26,10 @@ namespace quda {
     Gamma(const Gamma &g) = default;
 
     __device__ __host__ inline int getcol(int row) const {
-      if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
-	switch(dir) {
-	case 0:
-	case 1:
+      if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) {
+        switch (dir) {
+        case 0:
+        case 1:
 	  switch(row) {
 	  case 0: return 3;
 	  case 1: return 2;
@@ -54,11 +54,11 @@ namespace quda {
 	  case 3: return 3;
 	  }
 	  break;
-	}
+        }
       } else {
-	switch(dir) {
-	case 0:
-	case 1:
+        switch (dir) {
+        case 0:
+        case 1:
 	  switch(row) {
 	  case 0: return 3;
 	  case 1: return 2;
@@ -90,7 +90,7 @@ namespace quda {
 	  case 3: return 1;
 	  }
 	  break;
-	}
+        }
       }
       return 0;
     }
@@ -201,7 +201,51 @@ namespace quda {
 	  }
 	  break;
 	}
+      } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) {
+        switch (dir) {
+        case 0: /* corresponds to gamma1 in OpenQCD convention */
+          switch (row) {
+          case 0:
+          case 1: return -I;
+          case 2:
+          case 3: return I;
+          }
+          break;
+        case 1: /* gamma2 in openQCD */
+          switch (row) {
+          case 0:
+          case 3: return -1;
+          case 1:
+          case 2: return 1;
+          }
+          break;
+        case 2: /* gamma3 in openQCD */
+          switch (row) {
+          case 0:
+          case 3: return -I;
+          case 1:
+          case 2: return I;
+          }
+          break;
+        case 3: /* gamma0 in openQCD */
+          switch (row) {
+          case 0:
+          case 1:
+          case 2:
+          case 3: return -1;
+          }
+          break;
+        case 4: /* gamma5 in openQCD */
+          switch (row) {
+          case 0:
+          case 1: return 1;
+          case 2:
+          case 3: return -1;
+          }
+          break;
+        }
       }
+
       return 0;
     }
 
@@ -283,7 +327,51 @@ namespace quda {
 	  }
 	  break;
 	}
+      } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) {
+        switch (dir) {
+        case 0: /* gamma1 in openQCD convention */
+          switch (row) {
+          case 0:
+          case 1: return complex<ValueType>(a.imag(), -a.real()); //  I
+          case 2:
+          case 3: return complex<ValueType>(-a.imag(), a.real()); // -I
+          }
+          break;
+        case 1: /* gamma2 in openQCD */
+          switch (row) {
+          case 0:
+          case 3: return -a;
+          case 1:
+          case 2: return a;
+          }
+          break;
+        case 2: /* gamma3 in openQCD */
+          switch (row) {
+          case 0:
+          case 3: return complex<ValueType>(a.imag(), -a.real()); //  I
+          case 1:
+          case 2: return complex<ValueType>(-a.imag(), a.real()); // -I
+          }
+          break;
+        case 3: /* gamma0 in openQCD */
+          switch (row) {
+          case 0:
+          case 1:
+          case 2:
+          case 3: return -a;
+          }
+          break;
+        case 4: /* gamma5 in openQCD */
+          switch (row) {
+          case 0:
+          case 1: return a;
+          case 2:
+          case 3: return -a;
+          }
+          break;
+        }
       }
+
       return a;
     }
 
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 4561f1f21f..827dde5bbf 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -2433,6 +2433,99 @@ namespace quda {
       size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
     };
 
+    /**
+     * struct to define order of gauge fields in OpenQCD
+     */
+    template <typename Float, int length> struct OpenQCDOrder : LegacyOrder<Float, length> {
+
+      using Accessor = OpenQCDOrder<Float, length>;
+      using real = typename mapper<Float>::type;
+      using complex = complex<real>;
+
+      Float *gauge;
+      const int volumeCB;
+      static constexpr int Nc = 3;
+      const int dim[4]; // xyzt convention
+      const int L[4];   // txyz convention
+      const int nproc[4];
+
+      OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+        LegacyOrder<Float, length>(u, ghost_),
+        gauge(gauge_ ? gauge_ : (Float *)u.data()), // pointer to the gauge field on CPU
+        volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice
+        dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},              // *local* lattice dimensions, xyzt
+        L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]},                // *local* lattice dimensions, txyz
+        nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)} // txyz
+      {
+        if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); }
+      }
+
+      /**
+       * @brief      Obtains the offset in Floats from the openQCD base pointer
+       *             to the gauge fields.
+       *
+       * @param[in]  x_cb    Checkerboard index coming from quda
+       * @param[in]  dir     The direction coming from quda
+       * @param[in]  parity  The parity coming from quda
+       *
+       * @return     The offset.
+       */
+      __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const
+      {
+        int quda_x[4], x[4];
+        getCoords(quda_x, x_cb, dim, parity); // x_quda = quda local lattice coordinates
+        openqcd::rotate_coords(quda_x, x);    // x = openQCD local lattice coordinates
+
+        int mu = (dir + 1) % 4; // mu = openQCD direction
+        int ix = openqcd::ipt(x, L);
+        int iz = openqcd::iup(x, mu, L, nproc);
+        int ofs = 0;
+        int volume = openqcd::vol(L);
+
+        if (ix < volume / 2) { // ix even -> iz odd
+          if (iz < volume) {   // iz in interior
+            ofs = 8 * (iz - volume / 2) + 2 * mu + 1;
+          } else {
+            int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc) / 2; // iz in exterior
+            ofs = 4 * volume + openqcd::face_offset(L, nproc, mu) + ib;
+          }
+        } else if (volume / 2 <= ix && ix < volume) { // ix odd
+          ofs = 8 * (ix - volume / 2) + 2 * mu;
+        }
+
+        return ofs * length;
+      }
+
+      __device__ __host__ inline void load(complex v[length / 2], int x_cb, int dir, int parity, Float = 1.0) const
+      {
+        auto in = &gauge[getGaugeOffset(x_cb, dir, parity)];
+        block_load<complex, length / 2>(v, reinterpret_cast<complex *>(in));
+      }
+
+      __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int dir, int parity) const
+      {
+        auto out = &gauge[getGaugeOffset(x_cb, dir, parity)];
+        block_store<complex, length / 2>(reinterpret_cast<complex *>(out), v);
+      }
+
+      /**
+         @brief This accessor routine returns a gauge_wrapper to this object,
+         allowing us to overload various operators for manipulating at
+         the site level interms of matrix operations.
+         @param[in] dim Which dimension are we requesting
+         @param[in] x_cb Checkerboarded space-time index we are requesting
+         @param[in] parity Parity we are requesting
+         @return Instance of a gauge_wrapper that curries in access to
+         this field at the above coordinates.
+      */
+      __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const
+      {
+        return gauge_wrapper<real, Accessor>(const_cast<Accessor &>(*this), dim, x_cb, parity);
+      }
+
+      size_t Bytes() const { return 2 * Nc * Nc * sizeof(Float); }
+    }; // class OpenQCDOrder
+
   } // namespace gauge
 
   template <typename real_out_t, typename store_out_t, typename real_in_t, typename store_in_t, bool block_float, typename norm_t>
@@ -2477,5 +2570,8 @@ namespace quda {
   template <typename T, int Nc> struct gauge_order_mapper<T, QUDA_NATIVE_GAUGE_ORDER, Nc> {
     typedef gauge::FloatNOrder<T, 2 * Nc * Nc, QUDA_RECONSTRUCT_NO> type;
   };
+  template <typename T, int Nc> struct gauge_order_mapper<T, QUDA_OPENQCD_GAUGE_ORDER, Nc> {
+    typedef gauge::OpenQCDOrder<T, 2 * Nc * Nc> type;
+  };
 
 } // namespace quda
diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index db58c0daed..35ec4bd0e5 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -1117,6 +1117,295 @@ namespace quda {
     return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
   }
 
+  /**
+   * These are index helper functions used in the order classes of openQCD, i.e.
+   *
+   * - OpenQCDOrder       in quda:include/gauge_field_order.h
+   * - OpenQCDDiracOrder  in quda:include/color_spinor_field_order.h
+   * - OpenQCDOrder       in quda:include/clover_field_order.h.
+   *
+   * The main helper functions are ipt() and iup(), giving pure function
+   * implementations of the ipt[] and iup[][] arrays (see
+   * https://doi.org/10.22323/1.466.0280) that are needed to calculate the correct offsets
+   * of the fields base pointers.
+   */
+  namespace openqcd
+  {
+
+    /**
+     * @brief      Returns the surface in direction mu
+     *
+     * @param      X     Extent in all 4 directions
+     * @param[in]  mu    Direction
+     *
+     * @return     Surface
+     */
+    __device__ __host__ inline int surface(const int X[4], const int mu)
+    {
+      if (mu == 0) {
+        return X[1] * X[2] * X[3];
+      } else if (mu == 1) {
+        return X[0] * X[2] * X[3];
+      } else if (mu == 2) {
+        return X[0] * X[1] * X[3];
+      }
+      return X[0] * X[1] * X[2];
+    }
+
+    /**
+     * @brief      Return BNDRY (see openqcd:include/global.h)
+     *
+     * @param[in]  L      Local lattice extent L0-L3 in txyz convention
+     * @param[in]  nproc  NPROC0-NPROC3 from openqcd
+     *
+     * @return     BNDRY
+     */
+    __device__ __host__ inline int bndry(const int L[4], const int nproc[4])
+    {
+      return 2
+        * (((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1))
+           + ((1 - (nproc[2] % 2)) * surface(L, 2)) + ((1 - (nproc[3] % 2)) * surface(L, 3)));
+    }
+
+    /**
+     * @brief      Calculate the offset needed for boundary points in openQCD.
+     *
+     * @param[in]  L      Local lattice extent L0-L3 in txyz convention
+     * @param[in]  nproc  NPROC0-NPROC3 from openqcd
+     * @param[in]  mu     Direction in txyz
+     *
+     * @return     The offset
+     */
+    __device__ __host__ inline int ifc(const int L[4], const int nproc[4], const int mu)
+    {
+      if (mu == 0) {
+        return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2;
+      } else if (mu == 1) {
+        return ((1 - (nproc[0] % 2)) * surface(L, 0)) + (((1 - (nproc[1] % 2)) * surface(L, 1)) / 2);
+      } else if (mu == 2) {
+        return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1))
+          + (((1 - (nproc[2] % 2)) * surface(L, 2)) / 2);
+      }
+      return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1))
+        + ((1 - (nproc[2] % 2)) * surface(L, 2)) + (((1 - (nproc[3] % 2)) * surface(L, 3)) / 2);
+    }
+
+    /**
+     * @brief      Calculate the offset of the faces in openQCD.
+     *
+     * @param[in]  L      Local lattice extent L0-L3 in txyz convention
+     * @param[in]  nproc  NPROC0-NPROC3 from openqcd
+     * @param[in]  mu     Direction in txyz
+     *
+     * @return     The offset
+     */
+    __device__ __host__ inline int face_offset(const int L[4], const int nproc[4], const int mu)
+    {
+      if (mu == 0) {
+        return 0;
+      } else if (mu == 1) {
+        return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2;
+      } else if (mu == 2) {
+        return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2;
+      }
+      return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2
+        + ((1 - (nproc[2] % 2)) * surface(L, 2)) / 2;
+    }
+
+    /**
+     * @brief      Rotate coordinates (xyzt -> txyz)
+     *
+     * @param[in]  x_quda     Cartesian local lattice coordinates in quda
+     *                        convention (xyzt)
+     * @param[out] x_openQCD  Cartesian local lattice coordinates in openQCD
+     *                        convention (txyz)
+     */
+    __device__ __host__ inline void rotate_coords(const int x_quda[4], int x_openQCD[4])
+    {
+      x_openQCD[1] = x_quda[0];
+      x_openQCD[2] = x_quda[1];
+      x_openQCD[3] = x_quda[2];
+      x_openQCD[0] = x_quda[3];
+    }
+
+    /**
+     * @brief      Generate a lexicographical index with x[Ndims-1] running
+     *             fastest, for example if Ndims=4:
+     *             ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3.
+     *
+     * @param[in]  x      Integer array of dimension Ndims with coordinates
+     * @param[in]  X      Integer array of dimension Ndims with extents
+     * @param[in]  Ndims  The number of dimensions
+     *
+     * @return     Lexicographical index
+     */
+    __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims)
+    {
+      int i, ix = x[0];
+
+#pragma unroll
+      for (i = 1; i < Ndims; i++) { ix = (X[i] * ix + x[i]); }
+      return ix;
+    }
+
+    /**
+     * @brief      Return the volume
+     *
+     * @param[in]  X     Integer array of 4 dimensions
+     *
+     * @return     Volume
+     */
+    __device__ __host__ inline int vol(const int X[4]) { return X[0] * X[1] * X[2] * X[3]; }
+
+    /**
+     * @brief      Return cbs[]. This is the cache block size in openQCD, which
+     *             the local lattice is divided into.
+     *
+     * @param[in]  mu    Direction
+     * @param[in]  X     Extents
+     *
+     * @return     cbs
+     */
+    __device__ __host__ inline int setup_cbs(const int mu, const int X[4])
+    {
+      if (mu == 0) {
+        return X[0];
+      } else if ((X[mu] % 4) == 0) {
+        return 4;
+      } else if ((X[mu] % 3) == 0) {
+        return 3;
+      } else if ((X[mu] % 2) == 0) {
+        return 2;
+      } else {
+        return 1;
+      }
+    }
+
+    /**
+     * @brief      Pure function to return ipt[iy], where
+     *             iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the
+     *             ipt-array, but calculating the index on the fly. Notice that xi
+     *             and Li are in openQCD (txyz) convention. If they come from
+     *             QUDA, you have to rotate them first (see rotate_coords).
+     *
+     * @param[in]  x     Carthesian local lattice corrdinates, 0 <= x[i] < L[i]
+     * @param[in]  L     Local lattice extents
+     *
+     * @return     ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of
+     *             openQCD
+     */
+    __device__ __host__ inline int ipt(const int x[4], const int L[4])
+    {
+      int xb[4], xn[4];
+
+      // openQCDs cache block size in txyz convention
+      int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)};
+
+      // openQCDs cache block grid in txyz convention
+      int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]};
+
+      xb[0] = x[0] % cbs[0];
+      xb[1] = x[1] % cbs[1];
+      xb[2] = x[2] % cbs[2];
+      xb[3] = x[3] % cbs[3];
+
+      xn[0] = x[0] / cbs[0];
+      xn[1] = x[1] / cbs[1];
+      xn[2] = x[2] / cbs[2];
+      xn[3] = x[3] / cbs[3];
+
+      return (lexi(xb, cbs, 4) / 2 + vol(cbs) * lexi(xn, cbn, 4) / 2
+              + ((x[0] + x[1] + x[2] + x[3]) % 2 != 0) * (vol(L) / 2) // odd -> +VOLUME/2
+      );
+    }
+
+    /**
+     * @brief      Determines the number of boundary points in direction mu prior to
+     *             the Carthesian index x with dimensions X
+     *
+     * @param[in]  mu    Direction
+     * @param[in]  x     Lattice point
+     * @param[in]  X     Dimensions of the lattice/block
+     *
+     * @return     Number of prior boundary points
+     */
+    __device__ __host__ inline int boundary_pts(const int mu, const int x[4], const int X[4])
+    {
+      int ret = 0;
+
+      if (mu == 3) {
+        ret = lexi(x, X, 3); // lexi without x[3]
+      } else if (mu == 2) {
+        ret = X[3] * lexi(x, X, 2);
+        if (x[2] == (X[2] - 1)) {
+          ret += x[3]; // lexi without x[2]
+        }
+      } else if (mu == 1) {
+        if (x[1] == (X[1] - 1)) {
+          ret = X[2] * X[3] * x[0] + X[3] * x[2] + x[3]; // lexi without x[1]
+        } else {
+          ret = surface(X, 1);
+        }
+      } else if (mu == 0) {
+        if (x[0] == (X[0] - 1)) {
+          ret = lexi(x + 1, X + 1, 3); // lexi without x[0]
+        } else {
+          ret = surface(X, 0);
+        }
+      }
+
+      return ret;
+    }
+
+    /**
+     * @brief      Pure implementation of iup[ix][mu]. Returns neighbouring
+     *             point of ix in positive mu direction.
+     *
+     * @param[in]  x      Cartesian local lattice corrdinates, 0 <= x[i] < Li,
+     *                    length 4 in txyz convention
+     * @param[in]  mu     Direction in txyz convention
+     * @param[in]  L      Local lattice extents, length 4 in txyz convention
+     * @param[in]  nproc  NPROC0-NPROC3 from openqcd
+     *
+     * @return     iup[ix][mu]
+     */
+    __device__ __host__ inline int iup(const int x[4], const int mu, const int L[4], const int nproc[4])
+    {
+      int i, ret, xb[4], xn[4];
+
+      if ((x[mu] == (L[mu] - 1)) && (nproc[mu] > 1)) {
+
+        int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)};
+        int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]};
+
+        xb[0] = x[0] % cbs[0];
+        xb[1] = x[1] % cbs[1];
+        xb[2] = x[2] % cbs[2];
+        xb[3] = x[3] % cbs[3];
+
+        xn[0] = x[0] / cbs[0];
+        xn[1] = x[1] / cbs[1];
+        xn[2] = x[2] / cbs[2];
+        xn[3] = x[3] / cbs[3];
+
+        ret = vol(L) + ifc(L, nproc, mu);
+        if ((x[0] + x[1] + x[2] + x[3]) % 2 == 0) { ret += bndry(L, nproc) / 2; }
+
+        ret += surface(cbs, mu) * boundary_pts(mu, xn, cbn) / 2;
+        ret += boundary_pts(mu, xb, cbs) / 2;
+        return ret;
+
+      } else {
+#pragma unroll
+        for (i = 0; i < 4; i++) { xb[i] = x[i]; }
+
+        xb[mu] = (xb[mu] + 1) % (L[mu] * nproc[mu]);
+        return ipt(xb, L);
+      }
+    }
+
+  } // namespace openqcd
+
   /**
      @brief Compute the flattened 4-d index from a separate T and
      flattened 3-d XYZ index.
diff --git a/include/instantiate.h b/include/instantiate.h
index 9b40ede127..2b12b184e4 100644
--- a/include/instantiate.h
+++ b/include/instantiate.h
@@ -68,6 +68,11 @@ namespace quda
   template <> constexpr bool is_enabled<QUDA_TIFR_GAUGE_ORDER>() { return false; }
   template <> constexpr bool is_enabled<QUDA_TIFR_PADDED_GAUGE_ORDER>() { return false; }
 #endif
+#ifdef BUILD_OPENQCD_INTERFACE
+  template <> constexpr bool is_enabled<QUDA_OPENQCD_GAUGE_ORDER>() { return true; }
+#else
+  template <> constexpr bool is_enabled<QUDA_OPENQCD_GAUGE_ORDER>() { return false; }
+#endif
 
   /**
      @brief Helper function for returning if a given precision is enabled
@@ -140,8 +145,13 @@ namespace quda
   };
 
   struct ReconstructWilson {
+#ifdef BUILD_QCD_PLUS_QED
+    static constexpr std::array<QudaReconstructType, 3> recon
+      = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9};
+#else
     static constexpr std::array<QudaReconstructType, 3> recon
       = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8};
+#endif
   };
 
   struct ReconstructStaggered {
@@ -610,6 +620,21 @@ namespace quda
   template <> constexpr bool is_enabled<QUDA_COVDEV_DSLASH>() { return true; }
 #endif
 
+  struct WilsonReconstruct {
+#ifdef BUILD_QCD_PLUS_QED
+    static constexpr std::array<QudaReconstructType, 3> recon
+      = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9};
+#else
+    static constexpr std::array<QudaReconstructType, 3> recon
+      = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8};
+#endif
+  };
+
+  struct StaggeredReconstruct {
+    static constexpr std::array<QudaReconstructType, 3> recon
+      = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9};
+  };
+
 #ifdef GPU_DISTANCE_PRECONDITIONING
   constexpr bool is_enabled_distance_precondition() { return true; }
 #else
diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh
index 505163cc93..c0fadbb376 100644
--- a/include/kernels/copy_color_spinor.cuh
+++ b/include/kernels/copy_color_spinor.cuh
@@ -220,6 +220,60 @@ namespace quda
     }
   };
 
+  /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis):
+   *    gamma_ukqcd = U gamma_openqcd U^dagger with
+   *     U = [-1  0  1  0]
+             [ 0 -1  0  1]
+             [ 1  0  1  0]
+             [ 0  1  0  1] / sqrt(2),
+   * see https://github.com/JeffersonLab/chroma/blob/master/docs/notes/gamma_conventions.tex
+   for further notes. */
+  template <int Ns, int Nc> struct OpenqcdToNonRelBasis {
+    template <typename FloatOut, typename FloatIn>
+    __device__ __host__ inline void operator()(complex<FloatOut> out[Ns * Nc], const complex<FloatIn> in[Ns * Nc]) const
+    {
+      int s1[4] = {0, 1, 0, 1};
+      int s2[4] = {2, 3, 2, 3};
+      FloatOut K1[4]
+        = {static_cast<FloatOut>(-kP), static_cast<FloatOut>(-kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP)};
+      FloatOut K2[4]
+        = {static_cast<FloatOut>(kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP)};
+      for (int s = 0; s < Ns; s++) {
+        for (int c = 0; c < Nc; c++) {
+          out[s * Nc + c] = K1[s] * static_cast<complex<FloatOut>>(in[s1[s] * Nc + c])
+            + K2[s] * static_cast<complex<FloatOut>>(in[s2[s] * Nc + c]);
+        }
+      }
+    }
+  };
+
+  /** Transform from non-relativistic (aka ukqcd) into openqcd basis:
+   * gamma_ukqcd = U gamma_openqcd U^dagger with
+   * U = [-1  0 1 0]
+   *     [ 0 -1 0 1]
+   *     [ 1  0 1 0]
+   *     [ 0  1 0 1] / sqrt(2)
+   */
+  template <int Ns, int Nc> struct NonRelToOpenqcdBasis {
+    template <typename FloatOut, typename FloatIn>
+    __device__ __host__ inline void operator()(complex<FloatOut> out[Ns * Nc], const complex<FloatIn> in[Ns * Nc]) const
+    {
+      int s1[4] = {0, 1, 0, 1};
+      int s2[4] = {2, 3, 2, 3};
+
+      FloatOut K1[4]
+        = {static_cast<FloatOut>(-kU), static_cast<FloatOut>(-kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU)};
+      FloatOut K2[4]
+        = {static_cast<FloatOut>(kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU)};
+      for (int s = 0; s < Ns; s++) {
+        for (int c = 0; c < Nc; c++) {
+          out[s * Nc + c] = K1[s] * static_cast<complex<FloatOut>>(in[s1[s] * Nc + c])
+            + K2[s] * static_cast<complex<FloatOut>>(in[s2[s] * Nc + c]);
+        }
+      }
+    }
+  };
+
   template <typename Arg> struct CopyColorSpinor_ {
     const Arg &arg;
     constexpr CopyColorSpinor_(const Arg &arg) : arg(arg) { }
diff --git a/include/quda.h b/include/quda.h
index 6cbfee5f04..0bd715c5f4 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -70,10 +70,7 @@ extern "C" {
     int staple_pad;   /**< Used by link fattening */
     int llfat_ga_pad; /**< Used by link fattening */
     int mom_ga_pad;   /**< Used by the gauge and fermion forces */
-    union {
-      bool use_split_gauge_bkup; /**< Used by gauge split buffers (default=true keep split gauge after usage)*/
-      int pad;                   /**< Forces 4-byte alignment */
-    };
+    int use_split_gauge_bkup; /**< Used by gauge split buffers (default=true keep split gauge after usage)*/
 
     QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */
     int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */
@@ -468,6 +465,9 @@ extern "C" {
     /** The t0 parameter for distance preconditioning, the timeslice where the source is located */
     int distance_pc_t0;
 
+    /** Additional user-defined properties */
+    void *additional_prop;
+
   } QudaInvertParam;
 
   // Parameter set for solving eigenvalue problems.
@@ -1015,7 +1015,7 @@ extern "C" {
    * initQuda.  Calling initQudaMemory requires that the user has
    * previously called initQudaDevice.
    */
-  void initQudaMemory();
+  void initQudaMemory(void);
 
   /**
    * Initialize the library.  This function is actually a wrapper
@@ -1038,7 +1038,7 @@ extern "C" {
    * @details This should only be needed for automated testing when
    * different partitioning is applied within a single run.
    */
-  void updateR();
+  void updateR(void);
 
   /**
    * A new QudaGaugeParam should always be initialized immediately
@@ -1860,7 +1860,7 @@ extern "C" {
   void flushPoolQuda(QudaMemoryType type);
 
   void setMPICommHandleQuda(void *mycomm);
-  
+
   // Parameter set for quark smearing operations
   typedef struct QudaQuarkSmearParam_s {
     //-------------------------------------------------
diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h
new file mode 100644
index 0000000000..6d3257da8b
--- /dev/null
+++ b/include/quda_openqcd_interface.h
@@ -0,0 +1,522 @@
+#pragma once
+
+#include <mpi.h>
+#include <pthread.h>
+
+#define OPENQCD_MAX_INVERTERS 32
+#define OPENQCD_MAX_EIGENSOLVERS 32
+
+/**
+ * The macro battle below is to trick quda.h to think that double_complex is
+ * defined to be the struct below. For this we need to set the __CUDACC_RTC__,
+ * which makes double_complex to be defined as double2 (see quda.h), which we
+ * redefine below as openqcd_complex_dble. The original definitions of
+ * __CUDACC_RTC__ and double2 are recovered below. We do this to be able to
+ * include this header file into a openQxD program and compile with flags
+ * "-std=C89 -pedantic -Werror". Else the compiler trows an
+ * "ISO C90 does not support complex types" error because of the
+ * "double _Complex" data types exposed in quda.h.
+ */
+
+typedef struct {
+  double re, im;
+} openqcd_complex_dble;
+
+#ifdef __CUDACC_RTC__
+#define __CUDACC_RTC_ORIGINAL__ __CUDACC_RTC__
+#endif
+
+#ifdef double2
+#define double2_ORIGINAL double2
+#endif
+
+#define __CUDACC_RTC__
+#define double2 openqcd_complex_dble
+#include <quda.h>
+#undef double2
+#undef __CUDACC_RTC__
+
+#ifdef double2_ORIGINAL
+#define double2 double2_ORIGINAL
+#undef double2_ORIGINAL
+#endif
+
+#ifdef __CUDACC_RTC_ORIGINAL__
+#define __CUDACC_RTC__ __CUDACC_RTC_ORIGINAL__
+#undef __CUDACC_RTC_ORIGINAL__
+#endif
+
+/**
+ * @file    quda_openqcd_interface.h
+ *
+ * @section Description
+ *
+ * The header file defines the interface to enable easy
+ * interfacing between QUDA and the OpenQCD software.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * #############################################################################
+ * OpenQCD immitations of structs as documented in main/README.global (see
+ * openQCD or openQxD official repositories). These structs have the same
+ * members as the ones from openQCD and are (mostly) alignment compatible, but
+ * they don't *have* to be.
+ * #############################################################################
+ */
+typedef struct {
+  int type, cstar;
+  double phi3[2][3], phi1[2];
+} openQCD_bc_parms_t;
+
+typedef struct {
+  int qhat;
+  double m0, su3csw, u1csw, cF[2], theta[3];
+} openQCD_dirac_parms_t;
+
+typedef struct {
+  int gauge, nfl;
+} openQCD_flds_parms_t;
+/**
+ * #############################################################################
+ */
+
+typedef enum OpenQCDGaugeGroup_s {
+  OPENQCD_GAUGE_SU3 = 1,
+  OPENQCD_GAUGE_U1 = 2,
+  OPENQCD_GAUGE_SU3xU1 = 3,
+  OPENQCD_GAUGE_INVALID = QUDA_INVALID_ENUM
+} OpenQCDGaugeGroup;
+
+typedef enum OpenQCDFieldType_s {
+  OPENQCD_FIELD_SPINOR = 1,
+  OPENQCD_FIELD_GAUGE = 2,
+  OPENQCD_FIELD_CLOVER = 3,
+  OPENQCD_FIELD_INVALID = QUDA_INVALID_ENUM
+} OpenQCDFieldType;
+
+/**
+ * Parameters related to problem size and machine topology. They should hold the
+ * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] =
+ * L2, ...
+ */
+typedef struct {
+  MPI_Comm quda_comm;                       /** QUDA MPI communicator */
+  MPI_Comm world_comm;                      /** World MPI communicator */
+  int L[4];                                 /** Local lattice dimensions L1, L2, L3, L0 */
+  int nproc[4];                             /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/
+  int nproc_blk[4];                         /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK,
+                                                is assumed to be [1, 1, 1, 1] */
+  int N[4];                                 /** Glocal lattice dimensions N1, N2, N3, N3 */
+  int device;                               /** GPU device number */
+  int cstar;                                /** number of cstar directions, equals bc_cstar() */
+  int *data;                                /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0:
+                                                data[0] = cstar;
+                                                data[1+i] = nproc[i] for 0 <= i < 4
+                                                data[5+lex(ix,iy,iz,it)] returns rank number in
+                                                openQCD, where lex stands for lexicographical
+                                                indexing (in QUDA order (xyzt)) */
+  openQCD_bc_parms_t (*bc_parms)(void);       /** @see bc_parms() */
+  openQCD_flds_parms_t (*flds_parms)(void);   /** @see flds_parms() */
+  openQCD_dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */
+  void *(*h_gauge)(void);                   /** function to return a pointer to the gauge field */
+  void *(*h_sw)(void);                      /** function to return a pointer to the updated Clover field */
+  int (*openqcd2quda)(OpenQCDFieldType type, void *in, void *out);        /** spinor gather function */
+  void (*quda2openqcd)(OpenQCDFieldType type, void *in, void *out);       /** spinor scatter function */
+  void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */
+} openQCD_QudaLayout_t;
+
+/**
+ * Parameters used to create a QUDA context.
+ */
+typedef struct {
+  QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */
+  FILE *logfile;           /** log file handler */
+  void *gauge;             /** base pointer to the gauge fields */
+  int volume;              /** VOLUME */
+  int bndry;               /** BNDRY */
+  int two_grids_equal;     /** Whether the QUDA and the openqxd process grids are equal or not */
+  void *(*buffer_field)(MPI_Comm comm, int idx, void *field);        /** obtain buffer field */
+} openQCD_QudaInitArgs_t;
+
+
+typedef struct {
+  double kappa;  /* kappa: hopping parameter */
+  double mu;     /* mu: twisted mass */
+  double su3csw; /* su3csw: csw coefficient for SU(3) fields */
+  double u1csw;  /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */
+  int qhat;      /* qhat: quda doesn't respect that parameter (yet) */
+} openQCD_QudaDiracParam_t;
+
+/**
+ * Initialize the QUDA context.
+ *
+ * @param[in]  init    Meta data for the QUDA context
+ * @param[in]  layout  Layout struct
+ * @param      infile  Input file
+ */
+void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile);
+
+/**
+ * Destroy the QUDA context and deallocate all solvers.
+ */
+void openQCD_qudaFinalize(void);
+
+/**
+ * Copy a spinor to GPU and back to CPU.
+ *
+ * @param[in]   h_in  Spinor input field (from openQCD)
+ * @param[out]  h_out Spinor output field
+ */
+void openQCD_back_and_forth(void *h_in, void *h_out);
+
+/**
+ * @brief      Wrapper around openqcd::ipt
+ *
+ * @param[in]  x     Euclidean corrdinate in txyz convention
+ *
+ * @return     ipt[x]
+ *
+ * @see        openqcd::ipt()
+ */
+int openQCD_qudaIndexIpt(const int *x);
+int openQCD_qudaIndexIptLexi(const int iy);
+
+/**
+ * @brief      Wrapper around openqcd::iup
+ *
+ * @param[in]  x     Euclidean corrdinate in txyz convention
+ * @param[in]  mu    Direction
+ *
+ * @return     iup[x][mu]
+ *
+ * @see        openqcd::iup()
+ */
+int openQCD_qudaIndexIup(const int *x, const int mu);
+
+/**
+ * @brief      Norm square in QUDA.
+ *
+ * @param[in]  h_in  Spinor input field (from openQCD)
+ *
+ * @return     The norm
+ */
+double openQCD_qudaNorm(void *h_in);
+
+/**
+ * @brief      Prototype function for the norm-square in QUDA without
+ *             transfering the field. Should serve as an example.
+ *
+ * @param[in]  d_in  Spinor input field (device pointer)
+ *
+ * @return     The norm
+ */
+double openQCD_qudaNorm_NoLoads(void *d_in);
+
+/**
+ * @brief      Applies Dirac matrix to spinor.
+ *
+ *             openQCD_out = gamma[dir] * openQCD_in
+ *
+ * @param[in]  dir          Dirac index, 0 <= dir <= 5, notice that dir is in
+ *                          openQCD convention, ie. (0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5)
+ * @param[in]  openQCD_in   of type spinor_dble[NSPIN]
+ * @param[out] openQCD_out  of type spinor_dble[NSPIN]
+ */
+void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out);
+
+/**
+ * @brief      Explicit transfer of an openQCD field from host to device
+ *
+ * @param      openQCD_field  Input openQCD spinor (host pointer)
+ *
+ * @return     Device pointer
+ */
+void *openQCD_qudaH2D(void *openQCD_field);
+
+/**
+ * @brief      Explicit transfer of a QUDA field from device to host
+ *
+ * @param      quda_field     Input quda spinor field (device pointer)
+ * @param      openQCD_field  Output openQCD spinor (host pointer)
+ */
+void openQCD_qudaD2H(void *quda_field, void *openQCD_field);
+
+/**
+ * @brief      Free a device field allocated by openQCD_qudaH2D()
+ *
+ * @param      quda_field  Pointer to device pointer
+ */
+void openQCD_qudaSpinorFree(void **quda_field);
+
+/**
+ * @brief      Apply the Wilson-Clover Dirac operator to a field. All fields
+ *             passed and returned are host (CPU) fields in openQCD order.
+ *
+ * @param[in]  src   Source spinor field
+ * @param[out] dst   Destination spinor field
+ * @param[in]  p     Dirac parameter struct
+ *
+ * @deprecated  Replaced by openQCD_qudaDw()
+ */
+void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p);
+
+/**
+ * @brief      Apply the Dirac operator that corresponds to the current openQxD
+ *             setup to a field. All fields passed and returned are host (CPU)
+ *             fields in openQCD order.
+ *
+ * @param[in]  mu    Twisted mass parameter
+ * @param[in]  in    Input spinor (host pointer)
+ * @param[out] out   Output spinor (host pointer)
+ */
+void openQCD_qudaDw(double mu, void *in, void *out);
+
+/**
+ * @brief      Apply the Dirac operator that corresponds to the current openQxD
+ *             setup to a field. All fields passed and returned are devicde
+ *             (GPU) fields returned by openQCD_qudaH2D().
+ *
+ * @param[in]  mu     Twisted mass parameter
+ * @param[in]  d_in   Input spinor (device pointer)
+ * @param[out] d_out  Output spinor (device pointer)
+ */
+void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out);
+
+/**
+ * Setup the solver interface to quda.  This function parses the file given by
+ * [infile] as an openQCD ini file.  The solver section given by the [id]
+ * parameter must have a key-value pair like solver = QUDA and may contain every
+ * member of the struct [QudaInvertParam].  If one sets inv_type_precondition =
+ * QUDA_MG_INVERTER, one can additionally use all the members from the struct
+ * [QudaMultigridParam] in a section called "Solver {id} Multigrid", where {id}
+ * is replaced by [id].  For every level given by n_level in the above section,
+ * one has to provide a subsection called "Solver {id} Multigrid Level {level}",
+ * where {level} runs from 0 to n_level-1. All these subsections may have keys
+ * given by all the array-valued members of QudaMultigridParam, for example
+ * smoother_tol may appear in all subsections. This function must be called on
+ * all ranks simulaneously.
+ *
+ * @param[in]  id    The identifier of the solver section, i.e. "Solver #". The
+ *                   input file is taken from the arguments of quda_init(). If
+ *                   id is -1, then the section called "Lattice parameters" is
+ *                   parsed in the same way.
+ *
+ * @return     Pointer to the solver context
+ */
+
+void *openQCD_qudaSolverGetHandle(int id);
+
+/**
+ * @brief      Return a hash from a subset of the settings in the
+ *             QudaInvertParam struct. Return 0 if the struct is not initialized
+ *             yet.
+ *
+ * @param[in]  id    The solver identifier
+ *
+ * @return     Hash value
+ */
+int openQCD_qudaSolverGetHash(int id);
+
+/**
+ * @brief      Print solver information about the QUDA solver. Print
+ *             "Solver is not initialized yet" is the solver struct is nul
+ *             initialized yet.
+ *
+ * @param[in]  id    The solver identifier
+ */
+void openQCD_qudaSolverPrintSetup(int id);
+
+/**
+ * @brief      Solve Ax=b for an Clover Wilson operator with a QUDA solver.
+ *             All fields passed and returned are host (CPU) field in openQCD
+ *             order.
+ *
+ * @param[in]  id        The solver identifier in the input file, i.e.
+ *                       "Solver #". The input file is the one given by
+ *                       quda_init
+ * @param[in]  mu        Twisted mass parameter
+ * @param[in]  source    The source
+ * @param[out] solution  The solution
+ * @param[out] status    If the function is able to solve the Dirac equation to
+ *                       the desired accuracy (invert_param->tol), status
+ *                       reports the total number of iteration steps. -1
+ *                       indicates that the inversion failed.
+ *
+ * @return     Residual
+ */
+double openQCD_qudaInvert(int id, double mu, void* source, void* solution, int *status);
+
+
+/**
+ * @brief      Solve Ax=b for an Clover Wilson operator with a QUDA solver. All
+ *             fields passed and returned are host (CPU) field in openQCD order.
+ *             The source and solution fields can be multiple fields, indicated
+ *             by num_src (see QudaInvertParam)
+ *
+ * @param[in]  id         The solver identifier in the input file, i.e.
+ *                        "Solver #". The input file is the one given by
+ *                        quda_init
+ * @param[in]  mu         Twisted mass parameter
+ * @param[in]  sources    The source(s)
+ * @param[out] solutions  The solution(s)
+ * @param[out] status     If the function is able to solve the Dirac equation to
+ *                        the desired accuracy (invert_param->tol), status[i]
+ *                        reports the total number of iteration steps for source
+ *                        i, where i = 0, ..., param->num_src-1. -1 indicates
+ *                        that the inversion failed.
+ * @param      residual   The num_src residuals
+ */
+void openQCD_qudaInvertMultiSrc(int id, double mu, void** sources, void** solutions, int *status, double *residual);
+
+/**
+ * @brief      Set up a async solve. See [[openQCD_qudaInvert]] for details
+ *             about the parameters.
+ */
+void openQCD_qudaInvertAsyncSetup(int id, double mu);
+
+/**
+ * @brief      Register a solve to be started with [[openQCD_qudaInvertAsyncStart]].
+ *             The parameters of this function are exactly the same as for
+ *             [[openQCD_qudaInvert]].
+ */
+void openQCD_qudaInvertAsyncDispatch(void *source, void *solution, int *status);
+
+/**
+ * @brief      Fire up the solves registered with [[openQCD_qudaInvertDispatch]]
+ *             in order of their registration asynchronously. This spawns a
+ *             thread with pthread_create that calls [[openQCD_qudaInvert]].
+ *
+ * @return     The MPI communicator with which the main threads should
+ *             communicate among ewach other until
+ *             [[openQCD_qudaInvertAsyncWait]] has returned.
+ */
+MPI_Comm openQCD_qudaInvertAsyncStart(void);
+
+/**
+ * @brief      Wait for the thread started with [[openQCD_qudaInvertAsyncStart]]
+ *             using pthread_join to finish and collect its results.
+ *
+ * @param      residual  The residuals of the dispatched solves.
+ */
+void openQCD_qudaInvertAsyncWait(double *residual);
+
+/**
+ * @brief      Destroys an existing solver context and frees all involed
+ *             structs.
+ *
+ * @param[in]  id    The solver identifier
+ */
+void openQCD_qudaSolverDestroy(int id);
+
+/**
+ * Setup the eigen-solver interface to quda.  This function parses the file
+ * given by [infile] as an openQCD ini file.  The solver section given by the
+ * [inv_section] parameter must have a key-value pair like solver = QUDA and may
+ * contain every member of the struct [QudaInvertParam].  See
+ * [openQCD_qudaSolverSetup] for more details about the solver. The eigen-solver
+ * section given by the [section] parameter may contain every member of the
+ * struct [QudaEigParam].
+ *
+ * @param[in]  id         The section id of the eigensolver
+ * @param[in]  solver_id  The section id of the solver. If -1, the section is
+ *                        not read in.
+ *
+ * @return     Pointer to the eigen-solver context
+ */
+void *openQCD_qudaEigensolverGetHandle(int id, int solver_id);
+
+/**
+ * @brief      Print the eigensolver setup.
+ *
+ * @param[in]  id         The eigensolver identifier
+ * @param[in]  solver_id  The solver identifier
+ */
+void openQCD_qudaEigensolverPrintSetup(int id, int solver_id);
+
+/**
+ * @brief        Solve Ax=b for an Clover Wilson operator with a multigrid
+ *               solver. All fields are fields passed and returned are host
+ *               (CPU) field in openQCD order.  This function requires an
+ *               existing solver context created with openQCD_qudaSolverSetup().
+ *
+ * @param[in]    id         The eigensolver section identifier
+ * @param[in]    solver_id  The solver section identifier
+ * @param[inout] h_evecs    Allocated array of void-pointers to param->n_conf
+ *                          fields
+ * @param[out]   h_evals    Allocated array of param->n_conf complex_dbles
+ */
+void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals);
+
+/**
+ * @brief      Destroys an existing eigen-solver context and frees all involed
+ *             structs.
+ *
+ * @param[in]  id    The section id of the eigensolver
+ */
+void openQCD_qudaEigensolverDestroy(int id);
+
+/**
+ * @brief      Whether QUDA was built with QCD+QED or not.
+ */
+bool openQCD_qudaQCDPlusQEDEnabled(void);
+
+/**
+ * @brief      Wrapper for the plaquette. We could call plaqQuda() directly in
+ *             openQCD, but we have to make sure manually that the gauge field
+ *             is loaded
+ *
+ * @return     Plaquette value
+ * @see        https://github.com/lattice/quda/wiki/Gauge-Measurements#wilson-plaquette-action
+ */
+double openQCD_qudaPlaquette(void);
+
+/**
+ * @brief      Load the gauge fields from host to quda. Notice that the boundary
+ *             fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud()
+ *             before pass fields into this function.
+ *
+ * @param[in]  gauge       The gauge fields (in openqcd order)
+ * @param[in]  prec        Precision of the incoming gauge field
+ * @param[in]  rec         How the field should be stored internally in QUDA
+ * @param[in]  t_boundary  Time boundary condition
+ */
+void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary);
+
+/**
+ * @brief      Save the gauge fields from quda to host.
+ *
+ * @param[out] gauge       The gauge fields (will be stored in openqcd order)
+ * @param[in]  prec        Precision of the outgoing gauge field
+ * @param[in]  rec         How the field should be stored internally in QUDA
+ * @param[in]  t_boundary  Time boundary condition
+ */
+void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary);
+
+/**
+ * @brief      Free the gauge field allocated in quda.
+ */
+void openQCD_qudaGaugeFree(void);
+
+/**
+ * @brief      Load the clover fields from host to quda.
+ *
+ * @param[in]  clover  The clover fields (in openqcd order)
+ * @param[in]  kappa   The kappa (we need this, because quda has its clover
+ *                     field multiplied by kappa and we have to reverse this
+ *                     when loading ours)
+ * @param[in]  csw     The csw coefficient of the clover field
+ */
+void openQCD_qudaCloverLoad(void *clover, double kappa, double csw);
+
+/**
+ * @brief      Free the clover field allocated in quda.
+ */
+void openQCD_qudaCloverFree(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 65f33a6772..b0f4be1336 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -101,7 +101,7 @@ set (QUDA_OBJS
   copy_gauge_offset.cu copy_color_spinor_offset.cu copy_clover_offset.cu
   staggered_oprod.cu clover_trace_quda.cu
   hisq_paths_force_quda.cu
-  unitarize_force_quda.cu unitarize_links_quda.cu milc_interface.cpp milc_interface_internal.cpp
+  unitarize_force_quda.cu unitarize_links_quda.cu openqcd_interface.cpp milc_interface.cpp milc_interface_internal.cpp
   tune.cpp
   device_vector.cu
   inv_gmresdr_quda.cpp
@@ -601,6 +601,10 @@ if(QUDA_BLOCKSOLVER)
   target_compile_definitions(quda PRIVATE BLOCKSOLVER)
 endif()
 
+if(QUDA_QCD_PLUS_QED)
+  target_compile_definitions(quda PUBLIC BUILD_QCD_PLUS_QED)
+endif(QUDA_QCD_PLUS_QED)
+
 if(QUDA_INTERFACE_QDP OR QUDA_INTERFACE_ALL)
   target_compile_definitions(quda PUBLIC BUILD_QDP_INTERFACE)
 endif(QUDA_INTERFACE_QDP OR QUDA_INTERFACE_ALL)
@@ -625,6 +629,10 @@ if(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL)
   target_compile_definitions(quda PUBLIC BUILD_TIFR_INTERFACE)
 endif(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL)
 
+if(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL)
+  target_compile_definitions(quda PUBLIC BUILD_OPENQCD_INTERFACE)
+endif(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL)
+
 # MULTI GPU AND USQCD
 if(QUDA_MPI OR QUDA_QMP)
   target_compile_definitions(quda PUBLIC MULTI_GPU)
diff --git a/lib/check_params.h b/lib/check_params.h
index 15b9c524e7..6dbdc91dfd 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -111,13 +111,13 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
   P(staggered_phase_applied, 0);
   P(i_mu, 0.0);
   P(overlap, 0);
-  P(use_split_gauge_bkup, true);
+  P(use_split_gauge_bkup, QUDA_BOOLEAN_TRUE);
 #else
   P(staggered_phase_type, QUDA_STAGGERED_PHASE_INVALID);
   P(staggered_phase_applied, INVALID_INT);
   P(i_mu, INVALID_DOUBLE);
   P(overlap, INVALID_INT);
-  P(use_split_gauge_bkup, false);
+  P(use_split_gauge_bkup, QUDA_BOOLEAN_FALSE);
 #endif
 
 #if defined INIT_PARAM
@@ -214,6 +214,7 @@ void printQudaEigParam(QudaEigParam *param) {
   P(use_norm_op, QUDA_BOOLEAN_INVALID);
   P(compute_svd, QUDA_BOOLEAN_INVALID);
   P(require_convergence, QUDA_BOOLEAN_INVALID);
+  P(spectrum, QUDA_SPECTRUM_INVALID);
   P(n_ev, INVALID_INT);
   P(n_kr, INVALID_INT);
   P(n_conv, INVALID_INT);
@@ -768,6 +769,7 @@ void printQudaInvertParam(QudaInvertParam *param) {
 #endif
 
 #ifdef INIT_PARAM
+  P(additional_prop, 0);
   P(distance_pc_alpha0, 0.0);
   P(distance_pc_t0, -1);
 #else
diff --git a/lib/checksum.cu b/lib/checksum.cu
index b759afde8d..af8374e723 100644
--- a/lib/checksum.cu
+++ b/lib/checksum.cu
@@ -51,9 +51,12 @@ namespace quda {
     } else if (u.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
       ChecksumArg<T,QUDA_TIFR_PADDED_GAUGE_ORDER,Nc> arg(u,mini);
       checksum = ChecksumCPU(arg);
+    } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+      ChecksumArg<T, QUDA_OPENQCD_GAUGE_ORDER, Nc> arg(u, mini);
+      checksum = ChecksumCPU(arg);
     } else {
       errorQuda("Checksum not implemented");
-    }    
+    }
 
     return checksum;
   }
diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp
index 2c581886da..11b55e6085 100644
--- a/lib/comm_common.cpp
+++ b/lib/comm_common.cpp
@@ -102,6 +102,12 @@ namespace quda
 
     Topology *topo = new Topology;
 
+#ifdef BUILD_OPENQCD_INTERFACE
+    topo->cstar = (map_data == nullptr) ? 0 : static_cast<int *>(map_data)[0];
+#else
+    topo->cstar = 0;
+#endif
+
     topo->ndim = ndim;
 
     int nodes = 1;
diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp
index ae36f5b2d0..cd34b9cefe 100644
--- a/lib/communicator_stack.cpp
+++ b/lib/communicator_stack.cpp
@@ -113,6 +113,8 @@ namespace quda
 
   int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); }
 
+  bool comm_dim_cstar(int dim) { return get_current_communicator().comm_dim_cstar(dim); }
+
   int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); }
 
   int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); }
diff --git a/lib/copy_clover.cu b/lib/copy_clover.cu
index 7a8273d2f3..fb27eb9416 100644
--- a/lib/copy_clover.cu
+++ b/lib/copy_clover.cu
@@ -129,6 +129,12 @@ namespace quda {
         copyClover<BQCDOrder<FloatIn>, FloatOut, FloatIn>(out, in, inverse, location, Out, In);
 #else
         errorQuda("BQCD interface has not been built\n");
+#endif
+      } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) {
+#ifdef BUILD_OPENQCD_INTERFACE
+        copyClover<OpenQCDOrder<FloatIn>, FloatOut, FloatIn>(out, in, inverse, location, Out, In);
+#else
+        errorQuda("OpenQCD interface has not been built\n");
 #endif
       } else {
         errorQuda("Clover field %d order not supported", in.Order());
diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu
index f29e663c14..7bec58484c 100644
--- a/lib/copy_clover_offset.cu
+++ b/lib/copy_clover_offset.cu
@@ -45,6 +45,16 @@ namespace quda
         CopyFieldOffset<decltype(arg)> copier(arg, in);
 #else
         errorQuda("BQCD interface has not been built\n");
+#endif
+      } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) {
+#ifdef BUILD_OPENQCD_INTERFACE
+        using C = OpenQCDOrder<Float, length>;
+        C out_accessor(out, inverse);
+        C in_accessor(in, inverse);
+        CopyFieldOffsetArg<Field, Element, C> arg(out_accessor, out, in_accessor, in, offset);
+        CopyFieldOffset<decltype(arg)> copier(arg, in);
+#else
+        errorQuda("OpenQCD interface has not been built\n");
 #endif
       } else {
         errorQuda("Clover field %d order not supported", in.Order());
diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh
index 0c28f5849d..7ed4059ae1 100644
--- a/lib/copy_color_spinor.cuh
+++ b/lib/copy_color_spinor.cuh
@@ -27,6 +27,7 @@ namespace quda
       case QUDA_DEGRAND_ROSSI_GAMMA_BASIS: return "degrand_rossi";
       case QUDA_UKQCD_GAMMA_BASIS: return "ukqcd";
       case QUDA_CHIRAL_GAMMA_BASIS: return "chiral";
+      case QUDA_OPENQCD_GAMMA_BASIS: return "openqcd";
       case QUDA_DIRAC_PAULI_GAMMA_BASIS: return "dirac_pauli";
       default: errorQuda("Unknown gamma basis %d", basis);
       }
@@ -68,6 +69,10 @@ namespace quda
           launch<CopyColorSpinor_, enable_host>(tp, stream, Arg<ChiralToNonRelBasis>(out, in, Out_, In_));
         } else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) {
           launch<CopyColorSpinor_, enable_host>(tp, stream, Arg<NonRelToChiralBasis>(out, in, Out_, In_));
+        } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) {
+          launch<CopyColorSpinor_, enable_host>(tp, stream, Arg<OpenqcdToNonRelBasis>(out, in, Out_, In_));
+        } else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) {
+          launch<CopyColorSpinor_, enable_host>(tp, stream, Arg<NonRelToOpenqcdBasis>(out, in, Out_, In_));
         } else if (out.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
           launch<CopyColorSpinor_, enable_host>(tp, stream, Arg<DegrandRossiToDiracPaulBasis>(out, in, Out_, In_));
         } else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS) {
@@ -123,6 +128,12 @@ namespace quda
         CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);
       else
         errorQuda("QDPJIT interface has not been built");
+    } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) {
+      using O = OpenQCDDiracOrder<FloatOut, Ns, Nc>;
+      if constexpr (is_enabled<QUDA_OPENQCD_GAUGE_ORDER>())
+        CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);
+      else
+        errorQuda("OpenQCD interface has not been built");
     } else {
       errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision());
     }
@@ -154,6 +165,12 @@ namespace quda
         genericCopyColorSpinor<Ns, Nc, ColorSpinor>(param);
       else
         errorQuda("QDPJIT interface has not been built");
+    } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) {
+      using ColorSpinor = OpenQCDDiracOrder<FloatIn, Ns, Nc>;
+      if constexpr (is_enabled<QUDA_OPENQCD_GAUGE_ORDER>())
+        genericCopyColorSpinor<Ns, Nc, ColorSpinor>(param);
+      else
+        errorQuda("OpenQCD interface has not been built");
     } else {
       errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision());
     }
diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu
index b0ac43f221..843d21b0de 100644
--- a/lib/copy_gauge_extended.cu
+++ b/lib/copy_gauge_extended.cu
@@ -65,7 +65,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
 #endif
-#ifdef GPU_STAGGERED_DIRAC
+#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
 #if QUDA_RECONSTRUCT & 2
         typedef typename gauge_mapper<FloatOut,QUDA_RECONSTRUCT_13>::type G;
@@ -80,7 +80,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
 #endif
-#endif // GPU_STAGGERED_DIRAC
+#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else {
 	errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
       }
@@ -111,10 +111,18 @@ namespace quda {
       errorQuda("TIFR interface has not been built\n");
 #endif
 
+    } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+
+#ifdef BUILD_OPENQCD_INTERFACE
+      using G = OpenQCDOrder<FloatOut, length>;
+      CopyGaugeEx<FloatOut, FloatIn, length, G, InOrder>(out, in, location, Out, In);
+#else
+      errorQuda("OPENQCD interface has not been built");
+#endif
+
     } else {
       errorQuda("Gauge field %d order not supported", out.Order());
     }
-
   }
 
   template <typename FloatOut, typename FloatIn, int length>
@@ -139,7 +147,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
 #endif
-#ifdef GPU_STAGGERED_DIRAC
+#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
 #if QUDA_RECONSTRUCT & 2
         typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_13>::type G;
@@ -154,7 +162,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
 #endif
-#endif // GPU_STAGGERED_DIRAC
+#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else {
 	errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
       }
@@ -185,10 +193,17 @@ namespace quda {
       errorQuda("TIFR interface has not been built\n");
 #endif
 
+    } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+#ifdef BUILD_OPENQCD_INTERFACE
+      using G = OpenQCDOrder<FloatIn, length>;
+      copyGaugeEx<FloatOut, FloatIn, length, G>(out, in, location, Out, In);
+#else
+      errorQuda("OpenQCD interface has not been built\n");
+#endif
+
     } else {
       errorQuda("Gauge field %d order not supported", in.Order());
     }
-
   }
 
   template <typename FloatOut, typename FloatIn>
diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu
index 9eeb36e655..f7d8289591 100644
--- a/lib/copy_gauge_inc.cu
+++ b/lib/copy_gauge_inc.cu
@@ -27,7 +27,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
 #endif
-#ifdef GPU_STAGGERED_DIRAC
+#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) {
 #if QUDA_RECONSTRUCT & 2
         typedef typename gauge_mapper<FloatOut, QUDA_RECONSTRUCT_13>::type G;
@@ -56,7 +56,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
 #endif
-#endif // GPU_STAGGERED_DIRAC
+#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else {
         errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order());
       }
@@ -131,10 +131,18 @@ namespace quda {
       errorQuda("TIFR interface has not been built\n");
 #endif
 
+    } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+
+#ifdef BUILD_OPENQCD_INTERFACE
+      copyGauge<FloatOut, FloatIn, length, fine_grain()>(OpenQCDOrder<FloatOut, length>(out, Out, outGhost), inOrder,
+                                                         out, in, location, type);
+#else
+      errorQuda("OPENQCD interface has not been built\n");
+#endif
+
     } else {
       errorQuda("Gauge field %d order not supported", out.Order());
     }
-
   }
 
   template <typename FloatOut, typename FloatIn, int length>
@@ -161,7 +169,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
 #endif
-#ifdef GPU_STAGGERED_DIRAC
+#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
 #if QUDA_RECONSTRUCT & 2
         typedef typename gauge_mapper<FloatIn,QUDA_RECONSTRUCT_13>::type G;
@@ -183,7 +191,7 @@ namespace quda {
 #else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
 #endif
-#endif // GPU_STAGGERED_DIRAC
+#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED)
       } else {
 	errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order());
       }
@@ -259,6 +267,15 @@ namespace quda {
       errorQuda("TIFR interface has not been built\n");
 #endif
 
+    } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+
+#ifdef BUILD_OPENQCD_INTERFACE
+      copyGauge<FloatOut, FloatIn, length>(OpenQCDOrder<FloatIn, length>(in, In, inGhost), out, in, location, Out,
+                                           outGhost, type);
+#else
+      errorQuda("OPENQCD interface has not been built\n");
+#endif
+
     } else {
       errorQuda("Gauge field order %d not supported", in.Order());
     }
diff --git a/lib/copy_gauge_offset.cu b/lib/copy_gauge_offset.cu
index 9a8494c3a2..6f8bc2cb01 100644
--- a/lib/copy_gauge_offset.cu
+++ b/lib/copy_gauge_offset.cu
@@ -100,6 +100,13 @@ namespace quda
         copy_gauge_offset<Field, Element, G>(out, in, offset);
 #else
         errorQuda("TIFR interface has not been built\n");
+#endif
+      } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+#ifdef BUILD_OPENQCD_INTERFACE
+        using G = typename gauge_order_mapper<Float, QUDA_OPENQCD_GAUGE_ORDER, nColor>::type;
+        copy_gauge_offset<Field, Element, G>(out, in, offset);
+#else
+        errorQuda("OpenQCD interface has not been built\n");
 #endif
       } else {
         errorQuda("Gauge field %d order not supported", in.Order());
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 9d53ffbd32..8af1ac2224 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -42,6 +42,18 @@ namespace quda {
     instantiate_recurse2<GammaApply>(out, in, d);
   }
 
+  void ApplyGamma(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, QudaGammaDirection_s dir)
+  {
+    switch (dir) {
+      case QUDA_GAMMA_X: ApplyGamma(out, in, 0); break;
+      case QUDA_GAMMA_Y: ApplyGamma(out, in, 1); break;
+      case QUDA_GAMMA_Z: ApplyGamma(out, in, 2); break;
+      case QUDA_GAMMA_T: ApplyGamma(out, in, 3); break;
+      case QUDA_GAMMA_5: ApplyGamma(out, in, 4); break;
+      default: errorQuda("Unknown gamma: %d\n", dir);
+    }
+  }
+
   // Applies out(x) = 1/2 * [(1 +/- gamma5) * in] + out
   void ApplyChiralProj(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const int proj)
   {
diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 08222a67cb..7aa60cbd3b 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -204,7 +204,7 @@ namespace quda
 
       // Compute eigenvalues
       computeEvals(kSpace, evals);
-      if (compute_svd) computeSVD(kSpace, evals);
+      if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger);
     }
 
     // Local clean-up
diff --git a/lib/eig_iram.cpp b/lib/eig_iram.cpp
index aad8682076..0e59021b35 100644
--- a/lib/eig_iram.cpp
+++ b/lib/eig_iram.cpp
@@ -515,7 +515,7 @@ namespace quda
         // Compute the eigen/singular values.
         getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
         computeEvals(kSpace, evals);
-        if (compute_svd) computeSVD(kSpace, evals);
+        if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger);
         converged = true;
 
       } else {
diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp
index 9511810c7f..74920aa4e6 100644
--- a/lib/eig_trlm.cpp
+++ b/lib/eig_trlm.cpp
@@ -26,7 +26,8 @@ namespace quda
     beta.resize(n_kr, 0.0);
 
     // Thick restart specific checks
-    if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6);
+    if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than or equal to n_ev+6=%d\n", n_kr, n_ev + 6);
+    if (n_kr < n_conv + 12) errorQuda("n_kr=%d must be greater than or equal to n_conv+12=%d\n", n_kr, n_conv + 12);
 
     if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) {
       errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver");
@@ -180,7 +181,7 @@ namespace quda
       // Compute eigenvalues/singular values
 
       computeEvals(kSpace, evals);
-      if (compute_svd) computeSVD(kSpace, evals);
+      if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger);
     }
 
     // Local clean-up
diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp
index 156969a191..9d1123fb3d 100644
--- a/lib/eigensolve_quda.cpp
+++ b/lib/eigensolve_quda.cpp
@@ -498,7 +498,7 @@ namespace quda
     }
   }
 
-  void EigenSolver::computeSVD(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals)
+  void EigenSolver::computeSVD(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, bool dagger)
   {
     logQuda(QUDA_SUMMARIZE, "Computing SVD of M\n");
 
@@ -523,9 +523,15 @@ namespace quda
 
       // Lambda already contains the square root of the eigenvalue of the norm op.
 
-      // M*Rev_i = M*Rsv_i = sigma_i Lsv_i
-      mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper},
-                      {evecs.begin() + lower, evecs.begin() + upper});
+      if (dagger) {
+        // Mdag*Lev_i = Mdag*Lsv_i = sigma_i Rsv_i
+        mat.Expose()->Mdag({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper},
+                           {evecs.begin() + lower, evecs.begin() + upper});
+      } else {
+        // M*Rev_i = M*Rsv_i = sigma_i Lsv_i
+        mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper},
+                        {evecs.begin() + lower, evecs.begin() + upper});
+      }
 
       // sigma_i = sqrt(sigma_i (Lsv_i)^dag * sigma_i * Lsv_i )
       auto sigma = blas::norm2({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper});
diff --git a/lib/extract_gauge_ghost.in.cu b/lib/extract_gauge_ghost.in.cu
index 878de6118b..0639d2eaae 100644
--- a/lib/extract_gauge_ghost.in.cu
+++ b/lib/extract_gauge_ghost.in.cu
@@ -111,10 +111,17 @@ namespace quda {
           errorQuda("TIFR interface has not been built");
         }
 
+      } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+
+        if constexpr (is_enabled<QUDA_OPENQCD_GAUGE_ORDER>()) {
+          ExtractGhost<Float, nColor, OpenQCDOrder<Float, length>>(u, Ghost, extract, offset);
+        } else {
+          errorQuda("OpenQCD interface has not been built");
+        }
+
       } else {
         errorQuda("Gauge field %d order not supported", u.Order());
       }
-
     }
   };
 
diff --git a/lib/extract_gauge_ghost_extended.cu b/lib/extract_gauge_ghost_extended.cu
index 37c4f7235a..11b2661e6f 100644
--- a/lib/extract_gauge_ghost_extended.cu
+++ b/lib/extract_gauge_ghost_extended.cu
@@ -142,10 +142,17 @@ namespace quda {
           errorQuda("TIFR interface has not been built");
         }
 
+      } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) {
+
+        if constexpr (is_enabled<QUDA_OPENQCD_GAUGE_ORDER>()) {
+          ExtractGhostEx<OpenQCDOrder<Float, length>>(u, dim, R, ghost, extract);
+        } else {
+          errorQuda("OpenQCD interface has not been built");
+        }
+
       } else {
         errorQuda("Gauge field %d order not supported", u.Order());
       }
-
     }
   };
 
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index fadba50fa8..96e7a51d11 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -152,6 +152,25 @@ namespace quda {
         bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision;
       } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
         bytes = volume * site_size;
+      } else if (order == QUDA_OPENQCD_GAUGE_ORDER) {
+        /**
+         * With an openQCD gauge field, we need all the links of even lattice
+         * points in positive direction. These are links that lie in the buffer
+         * space that spans 7*BNDRY/4 gauge fields. These boundary fields are
+         * located at base_ptr + 4*VOLUME. Therefore, we need to transfer more
+         * than 4*VOLUME matrices.
+         */
+
+        /* analogue to BNDRY in openQCD:include/global.h */
+        long int bndry = 0;
+        bndry += (1 - (comm_dim(0) % 2)) * x[1] * x[2] * x[3];
+        bndry += (1 - (comm_dim(1) % 2)) * x[0] * x[2] * x[3];
+        bndry += (1 - (comm_dim(2) % 2)) * x[0] * x[1] * x[3];
+        bndry += (1 - (comm_dim(3) % 2)) * x[0] * x[1] * x[2];
+        bndry *= 2;
+
+        length += 18 * 7 * bndry / 4;
+        bytes = length * precision;
       } else {
         bytes = length * precision;
       }
@@ -191,7 +210,7 @@ namespace quda {
 
     } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER
                || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER
-               || order == QUDA_MILC_SITE_GAUGE_ORDER) {
+               || order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_OPENQCD_GAUGE_ORDER) {
       // does not support device
 
       if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) {
@@ -1391,7 +1410,8 @@ namespace quda {
         }
       } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                  || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER
+                 || Order() == QUDA_OPENQCD_GAUGE_ORDER) {
         std::memcpy(buffer, data(), Bytes());
       } else {
         errorQuda("Unsupported order = %d", Order());
@@ -1411,7 +1431,8 @@ namespace quda {
         }
       } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                  || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER
+                 || Order() == QUDA_OPENQCD_GAUGE_ORDER) {
         std::memcpy(data(), buffer, Bytes());
       } else {
         errorQuda("Unsupported order = %d", Order());
diff --git a/lib/instantiate.cpp b/lib/instantiate.cpp
index 15d0b85148..c209a75398 100644
--- a/lib/instantiate.cpp
+++ b/lib/instantiate.cpp
@@ -1,7 +1,7 @@
 #include <instantiate.h>
 
 /**
-   This file contains deinitions required when compiling with C++14.
+   This file contains definitions required when compiling with C++14.
    Without these, we can end up with undefined references at link
    time.  We can remove this file when we jump to C++17 and declare
    these are inline variables in instantiate.h.
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 47290a14e8..67ec31436c 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -520,7 +520,7 @@ void initQudaDevice(int dev)
 /*
  * Any persistent memory allocations that QUDA uses are done here.
  */
-void initQudaMemory()
+void initQudaMemory(void)
 {
   auto profile = pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
@@ -546,7 +546,7 @@ void initQudaMemory()
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
 }
 
-void updateR()
+void updateR(void)
 {
   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
 }
@@ -694,7 +694,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   // set update_split_gauge to reuse backup or not and free the buf if needed
   // always update the flag even gauge reuse with checksum
   // better way would be do the checks more consistently along with clover/stagger
-  if (param->use_split_gauge_bkup == true) {
+  if (param->use_split_gauge_bkup) {
     update_split_gauge = QUDA_UPDATE_SPLIT_GAUGE_TRUE;
   } else {
     update_split_gauge = QUDA_UPDATE_SPLIT_GAUGE_OFF;
diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp
new file mode 100644
index 0000000000..c6034f518b
--- /dev/null
+++ b/lib/openqcd_interface.cpp
@@ -0,0 +1,2271 @@
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <fstream>
+#include <unordered_map>
+#include <string>
+#include <sstream>
+#include <regex>
+#include <type_traits>
+#include <unistd.h>
+#include <pthread.h>
+#include <list>
+
+#include <quda_openqcd_interface.h>
+#include <quda.h>
+#include <quda_internal.h>
+#include <color_spinor_field.h>
+#include <dslash_quda.h>
+#include <invert_quda.h>
+#include <index_helper.cuh>
+#include <multigrid.h>
+#include <mpi.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define WITH_COMM(expr) do { if (in_comm()) { expr; } } while (0)
+
+typedef struct {
+  bool ready;
+  bool created;
+  pthread_t thread;
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+  MPI_Comm main_comm;   /** MPI communicator holding the main threads */
+  MPI_Comm worker_comm; /** MPI communicator holding the worker threads */
+} openQCD_QudaThread_t;
+
+typedef struct {
+  void *source;
+  void *solution;
+  int *status;
+  double retval;
+} openQCD_qudaInvert_args_t;
+
+typedef struct {
+  int initialized;   /** Whether openQCD_qudaInit() was called or not */
+  int ud_rev;        /** Revision of ud field from openqxd */
+  int ad_rev;        /** Revision of ad field from openqxd */
+  int swd_ud_rev;    /** Revision of ud field used to calc/transfer the SW field from openqxd */
+  int swd_ad_rev;    /** Revision of ad field used to calc/transfer the SW field from openqxd */
+  double swd_kappa;  /** kappa corresponding to the current SW field in QUDA */
+  double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */
+  double swd_u1csw;  /** U(1) csw coefficient corresponding to the current SW field in QUDA */
+  int swd_qhat;      /** qhat coefficient corresponding to the current SW field in QUDA */
+  openQCD_QudaInitArgs_t init;
+  openQCD_QudaLayout_t layout;
+  openQCD_QudaThread_t thread;
+  std::vector<openQCD_qudaInvert_args_t> inv_args;
+  QudaInvertParam async_params;
+  void *dirac_handle;                       /** void-pointer to QudaInvertParam struct for the Dirac operator.
+                                             * Notice that this void pointer HAS to be directly before
+                                             * inv_handles[32], because it's possible to call
+                                             * openQCD_qudaSolverGetHandle with -1. */
+  void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */
+  void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */
+  char infile[1024];                           /** Path to the input file (if given to quda_init()) */
+} openQCD_QudaState_t;
+
+typedef struct openQCD_QudaSolver_s {
+  char infile[1024];            /** Path to the input file (if given to quda_init()) */
+  int id;                       /** Solver section identifier in the input file */
+  QudaMultigridParam *mg_param; /** Pointer to the multigrid param struct */
+  double u1csw;                 /** u1csw property */
+  int qhat;                     /** qhat property */
+  int mg_ud_rev;                /** Revision of ud field from openqxd */
+  int mg_ad_rev;                /** Revision of ad field from openqxd */
+  double mg_kappa;              /** kappa corresponding to the current mg-instance in QUDA */
+  double mg_su3csw;             /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */
+  double mg_u1csw;              /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */
+  int mg_qhat;                  /** qhat corresponding to the current mg-instance in QUDA */
+} openQCD_QudaSolver;
+
+static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, 0, {}, {}, { false, false, 1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, MPI_COMM_NULL, MPI_COMM_NULL }, {}, {}, nullptr, {}, {}, ""};
+
+using namespace quda;
+
+/**
+ * code for NVTX taken from Jiri Kraus' blog post:
+ * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
+ */
+
+#ifdef INTERFACE_NVTX
+
+#if QUDA_NVTX_VERSION == 3
+#include "nvtx3/nvToolsExt.h"
+#else
+#include "nvToolsExt.h"
+#endif
+
+static const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
+static const int num_colors = sizeof(colors) / sizeof(uint32_t);
+
+#define PUSH_RANGE(name, cid)                                                                                          \
+  {                                                                                                                    \
+    int color_id = cid;                                                                                                \
+    color_id = color_id % num_colors;                                                                                  \
+    nvtxEventAttributes_t eventAttrib = {0};                                                                           \
+    eventAttrib.version = NVTX_VERSION;                                                                                \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                                                                  \
+    eventAttrib.colorType = NVTX_COLOR_ARGB;                                                                           \
+    eventAttrib.color = colors[color_id];                                                                              \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                                                                 \
+    eventAttrib.message.ascii = name;                                                                                  \
+    nvtxRangePushEx(&eventAttrib);                                                                                     \
+  }
+#define POP_RANGE nvtxRangePop();
+#else
+#define PUSH_RANGE(name, cid)
+#define POP_RANGE
+#endif
+
+#define QUDA_OPENQCD_VERBOSE 1
+
+template <bool start> void inline qudaopenqcd_called(const char *func, QudaVerbosity verb)
+{
+  /* add NVTX markup if enabled */
+  if (start) {
+    PUSH_RANGE(func, 1);
+  } else {
+    POP_RANGE;
+  }
+
+#ifdef QUDA_OPENQCD_VERBOSE
+  if (verb >= QUDA_VERBOSE) {
+    if (start) {
+      printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func);
+    } else {
+      printfQuda("QUDA_OPENQCD_INTERFACE: %s (return) \n", func);
+    }
+  }
+#endif
+}
+
+template <bool start> void inline qudaopenqcd_called(const char *func)
+{
+  qudaopenqcd_called<start>(func, getVerbosity());
+}
+
+/**
+ * Mapping of enums to their actual values. We have this mapping such that we
+ * can use the named parameters in our input files rather than the number. this
+ * makes reading and writing the configuration more understandable.
+ */
+std::unordered_map<std::string, std::string> enum_map
+  = {{"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)},
+     {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)},
+     {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)},
+     {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)},
+     {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)},
+     {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)},
+     {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)},
+     {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)},
+     {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)},
+     {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)},
+     {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)},
+     {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)},
+     {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)},
+     {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)},
+     {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)},
+     {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)},
+     {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)},
+     {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)},
+     {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)},
+     {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)},
+     {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)},
+     {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)},
+     {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)},
+     {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)},
+     {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)},
+     {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)},
+     {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)},
+     {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)},
+     {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)},
+     {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)},
+     {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)},
+     {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)},
+     {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)},
+     {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)},
+     {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)},
+     {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)},
+     {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)},
+     {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)},
+     {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)},
+     {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)},
+     {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)},
+     {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)},
+     {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)},
+     {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)},
+     {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)},
+     {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)},
+     {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)},
+     {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)},
+     {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)},
+     {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)},
+     {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)},
+     {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)},
+     {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"false", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"no", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"n", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"off", std::to_string(QUDA_BOOLEAN_FALSE)},
+     {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"true", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"yes", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"y", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"on", std::to_string(QUDA_BOOLEAN_TRUE)},
+     {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)},
+     {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)},
+     {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)},
+     {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)},
+     {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)},
+     {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)},
+     {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)},
+     {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)},
+     {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)},
+     {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)},
+     {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)},
+     {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)},
+     {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)},
+     {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)},
+     {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)},
+     {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)},
+     {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)},
+     {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)},
+     {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)},
+     {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)},
+     {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)},
+     {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)},
+     {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)},
+     {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)},
+     {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)},
+     {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)},
+     {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)},
+     {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)},
+     {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)},
+     {"QUDA_SILENT", std::to_string(QUDA_SILENT)},
+     {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)},
+     {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)},
+     {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)},
+     {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)},
+     {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)},
+     {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)},
+     {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)},
+     {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)},
+     {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)},
+     {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)},
+     {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)},
+     {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)},
+     {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)},
+     {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)},
+     {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)},
+     {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)},
+     {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)},
+     {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)},
+     {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)},
+     {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)},
+     {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)},
+     {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)},
+     {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)},
+     {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)},
+     {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)},
+     {"QUDA_EIG_BLK_TR_LANCZOS", std::to_string(QUDA_EIG_BLK_TR_LANCZOS)},
+     {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)},
+     {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)},
+     {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)},
+     {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)},
+     {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)},
+     {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)},
+     {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)},
+     {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)},
+     {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)},
+     {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)},
+     {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)},
+     {"QUDA_MEMORY_DEVICE_PINNED", std::to_string(QUDA_MEMORY_DEVICE_PINNED)},
+     {"QUDA_MEMORY_HOST", std::to_string(QUDA_MEMORY_HOST)},
+     {"QUDA_MEMORY_HOST_PINNED", std::to_string(QUDA_MEMORY_HOST_PINNED)},
+     {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)},
+     {"QUDA_MEMORY_MANAGED", std::to_string(QUDA_MEMORY_MANAGED)},
+     {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)},
+     {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)},
+     {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)},
+     {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)}};
+
+/**
+ * @brief      Just a simple key-value store
+ */
+class KeyValueStore
+{
+private:
+  std::unordered_map<std::string, std::unordered_map<std::string, std::tuple<std::string, std::string>>> store;
+  std::unordered_map<std::string, std::string> *map = nullptr;
+  std::string filename = "<not set>";
+
+public:
+  /**
+   * @brief      Sets a key value pair
+   *
+   * @param[in]  section  The section
+   * @param[in]  key      The key
+   * @param[in]  value    The value
+   */
+  void set(const std::string &section, const std::string &key, const std::string &value)
+  {
+    if (map != nullptr) {
+      auto mvalue = map->find(value);
+      if (mvalue != map->end()) {
+        std::get<0>(store[section][key]) = mvalue->second;
+        std::get<1>(store[section][key]) = value;
+        return;
+      }
+    }
+    std::get<0>(store[section][key]) = value;
+    std::get<1>(store[section][key]) = value;
+  }
+
+  void set_map(std::unordered_map<std::string, std::string> *_map) { map = _map; }
+
+  bool section_exists(const std::string &section) { return store.find(section) != store.end(); }
+
+  /**
+   * @brief      Gets the specified key.
+   *
+   * @param[in]  section        The section
+   * @param[in]  key            The key
+   * @param[in]  default_value  The default value if section/key is absent
+   *
+   * @tparam     T              Desired return type
+   *
+   * @return     The corresponding value
+   */
+  template <typename T>
+  T get(const std::string &section, const std::string &key, T default_value = T(), bool fail = false)
+  {
+    int idx;
+    std::string rkey;
+    std::smatch match;
+    std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */
+    auto sec = store.find(section);
+
+    if (sec != store.end()) {
+      if (std::regex_search(key, match, p_key)) {
+        rkey = match[1];
+        idx = std::stoi(match[2]);
+      } else {
+        rkey = key;
+        idx = 0;
+      }
+
+      auto item = sec->second.find(rkey);
+      if (item != sec->second.end()) {
+        std::stringstream ss(std::get<0>(item->second));
+        if constexpr (std::is_enum_v<T>) {
+          typename std::underlying_type<T>::type result, dummy;
+          for (int i = 0; i < idx; i++) { ss >> dummy; }
+          if (ss >> result) { return static_cast<T>(result); }
+        } else {
+          T result, dummy;
+          for (int i = 0; i < idx; i++) { ss >> dummy; }
+          if (ss >> result) { return result; }
+        }
+      }
+    }
+    if (fail) {
+      errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", key.c_str(), section.c_str(),
+                filename.c_str());
+    }
+    return default_value; /* Return default value for non-existent keys */
+  }
+
+  /**
+   * @brief      Fill the store with entries from an ini-file
+   *
+   * @param[in]  fname  The fname
+   */
+  void load(const std::string &fname)
+  {
+    std::string line, section;
+    std::smatch match;
+    filename = fname;
+    std::ifstream file(filename.c_str());
+
+    std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$");      /* [section] */
+    std::regex p_comment("^[^#]*(\\s*#.*)$");               /* line # comment */
+    std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */
+
+    if (file.is_open()) {
+
+      while (std::getline(file, line)) {
+
+        /* remove all comments */
+        if (std::regex_search(line, match, p_comment)) { line.erase(match.position(1)); }
+
+        if (std::regex_search(line, match, p_section)) {
+          section = match[1];
+        } else if (std::regex_search(line, match, p_key_val)) {
+          std::string key = match[1];
+          std::string val = match[2];
+          this->set(section, key, val);
+        }
+      }
+
+      file.close();
+    } else {
+      std::cerr << "Error opening file: " << filename << std::endl;
+    }
+  }
+
+  /**
+   * @brief      Dumps all entries in the store.
+   */
+  void dump(std::string _section = "")
+  {
+    for (const auto &section : store) {
+      if (_section == "" || _section == section.first) {
+        std::cout << "[" << section.first << "]" << std::endl;
+        for (const auto &pair : section.second) {
+          std::cout << "  " << pair.first << " = " << std::get<1>(pair.second);
+          if (std::get<0>(pair.second) != std::get<1>(pair.second)) { std::cout << " # " << std::get<0>(pair.second); }
+          std::cout << std::endl;
+        }
+      }
+    }
+  }
+};
+
+/**
+ * @brief      Returns the local lattice dimensions as lat_dim_t
+ *
+ * @return     The local dimensions.
+ */
+static lat_dim_t get_local_dims(int *fill = nullptr)
+{
+  lat_dim_t X;
+
+  for (int i = 0; i < 4; i++) {
+    if (fill) {
+      fill[i] = qudaState.layout.L[i];
+    } else {
+      X[i] = qudaState.layout.L[i];
+    }
+  }
+
+  return X;
+}
+
+/**
+ * @brief      Calculate the rank from coordinates.
+ *
+ * @param[in]  coords  coords is the 4D cartesian coordinate of a rank
+ * @param[in]  fdata   should point to an instance of qudaLayout.ranks,
+ *                     @see struct openQCD_QudaLayout_t in
+ *                     @file include/quda_openqcd_interface.h
+ *
+ * @return     rank
+ */
+int rankFromCoords(const int *coords, void *fdata)
+{
+  int *base = static_cast<int *>(fdata);
+  int *NPROC = base + 1;
+  int *ranks = base + 5;
+  int i;
+
+  i = coords[3] + NPROC[3] * (coords[2] + NPROC[2] * (coords[1] + NPROC[1] * (coords[0])));
+  return ranks[i];
+}
+
+/**
+ * Set set the local dimensions and machine topology for QUDA to use
+ *
+ * @param      layout  Struct defining local dimensions and machine topology
+ * @param      infile  Input file
+ */
+void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile)
+{
+  int my_rank;
+  char prefix[20];
+
+  for (int dir = 0; dir < 4; ++dir) {
+    if (layout.N[dir] % 2 != 0) {
+      errorQuda("Error: Odd lattice dimensions are not supported\n");
+      exit(1);
+    }
+  }
+
+  setMPICommHandleQuda(&layout.quda_comm);
+
+#ifdef MULTI_GPU
+  initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.data));
+  static int device = -1; /* enable a default allocation of devices to processes */
+#else
+  static int device = layout.device;
+#endif
+
+  /* must happen *after* communication initialization */
+  MPI_Comm_rank(layout.quda_comm, &my_rank);
+  sprintf(prefix, "QUDA (rank=%d): ", my_rank);
+
+  strcpy(qudaState.infile, infile);
+  if (my_rank == 0) {
+    KeyValueStore kv;
+    kv.set_map(&enum_map);
+    kv.load(qudaState.infile);
+    qudaState.init.verbosity = kv.get<QudaVerbosity>("QUDA", "verbosity", qudaState.init.verbosity);
+  }
+
+  MPI_Bcast((void *)&qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, layout.quda_comm);
+  setVerbosityQuda(qudaState.init.verbosity, prefix, my_rank == 0 ? qudaState.init.logfile : stdout);
+  initQuda(device);
+}
+
+static int getLinkPadding(const int dim[4])
+{
+  int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2);
+  padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2);
+  padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2);
+  return padding;
+}
+
+/**
+ * @brief      Creates a new quda parameter struct
+ *
+ * @return     The quda parameter struct.
+ */
+static QudaInvertParam newOpenQCDParam(void)
+{
+  static const QudaVerbosity verbosity = getVerbosity();
+
+  QudaInvertParam param = newQudaInvertParam();
+
+  param.verbosity = verbosity;
+
+  param.cpu_prec = QUDA_DOUBLE_PRECISION;              /* The precision used by the input fermion fields */
+  param.cuda_prec = QUDA_DOUBLE_PRECISION;             /* The precision used by the QUDA solver */
+  param.cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA eigensolver */
+
+  param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;     /* The precision used by the QUDA solver */
+  param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */
+
+  /**
+   * The order of the input and output fermion fields. Imposes fieldOrder =
+   * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and
+   * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder.
+   */
+  param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER;
+
+  /**
+   * Gamma basis of the input and output host fields. Specifies the basis change
+   * into QUDAs internal gamma basis. Note that QUDA applies the basis change U
+   * to a spinor field when uploading and U^dagger when downloading.
+   */
+  param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS;
+
+  return param;
+}
+
+/**
+ * @brief      Initialize quda gauge param struct
+ *
+ * @param[in]  prec        Precision
+ * @param[in]  rec         QUDA internal gauge field format
+ * @param[in]  t_boundary  Time boundary condition
+ *
+ * @return     The quda gauge parameter struct.
+ */
+static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary)
+{
+  QudaGaugeParam param = newQudaGaugeParam();
+
+  get_local_dims(param.X);
+  param.cuda_prec_sloppy = param.cpu_prec = param.cuda_prec = prec;
+  param.type = QUDA_SU3_LINKS;
+
+  param.reconstruct_sloppy = param.reconstruct = rec;
+
+  /* This makes quda to instantiate OpenQCDOrder */
+  param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER;
+
+  param.t_boundary = t_boundary;
+  param.gauge_fix = QUDA_GAUGE_FIXED_NO;
+  param.scale = 1.0;
+  param.anisotropy = 1.0;                 /* 1.0 means not anisotropic */
+  param.ga_pad = getLinkPadding(param.X); /* Why this? */
+
+  return param;
+}
+
+void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile)
+{
+  if (qudaState.initialized) return;
+  qudaState.init = init;
+  qudaState.layout = layout;
+
+  if (layout.quda_comm == MPI_COMM_NULL) return;
+
+  qudaopenqcd_called<true>(__func__);
+  openQCD_qudaSetLayout(qudaState.layout, infile);
+  qudaopenqcd_called<false>(__func__);
+  qudaState.initialized = true;
+}
+
+static int in_comm(void)
+{
+   return qudaState.layout.quda_comm != MPI_COMM_NULL;
+}
+
+void openQCD_qudaFinalize(void)
+{
+  if (!in_comm()) return;
+
+  for (int id = 0; id < OPENQCD_MAX_INVERTERS; ++id) {
+    if (qudaState.inv_handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); }
+  }
+
+  for (int id = 0; id < OPENQCD_MAX_EIGENSOLVERS; ++id) {
+    if (qudaState.eig_handles[id] != nullptr) { openQCD_qudaEigensolverDestroy(id); }
+  }
+
+  qudaState.initialized = false;
+  endQuda();
+}
+
+double openQCD_qudaPlaquette(void)
+{
+  double plaq[3];
+
+  WITH_COMM(plaqQuda(plaq));
+
+  if (!qudaState.init.two_grids_equal)
+    MPI_Bcast(&plaq[0], 1, MPI_DOUBLE, 0, qudaState.layout.world_comm);
+
+  /* Note different Nc normalization wrt openQCD! */
+  return 3.0 * plaq[0];
+}
+
+void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary)
+{
+  void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, gauge);
+  if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_GAUGE, gauge, buf)) {
+    QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary);
+    loadGaugeQuda(buf, &param);
+  }
+}
+
+void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary)
+{
+  void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, gauge);
+  if (in_comm()) {
+    QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary);
+    saveGaugeQuda(buf, &param);
+  }
+  qudaState.layout.quda2openqcd(OPENQCD_FIELD_GAUGE, buf, gauge);
+}
+
+void openQCD_qudaGaugeFree(void)
+{
+  WITH_COMM(freeGaugeQuda());
+  qudaState.ud_rev = -1;
+  qudaState.ad_rev = -1;
+  qudaState.swd_ud_rev = -1;
+  qudaState.swd_ad_rev = -1;
+}
+
+void openQCD_qudaCloverLoad(void *clover, double kappa, double csw)
+{
+  QudaInvertParam param = newOpenQCDParam();
+  param.clover_order = QUDA_OPENQCD_CLOVER_ORDER;
+  param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+  param.clover_cpu_prec = QUDA_DOUBLE_PRECISION;
+  param.clover_cuda_prec = QUDA_DOUBLE_PRECISION;
+  param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION;
+
+  param.kappa = kappa;
+  param.clover_csw = csw;
+  param.clover_coeff = 0.0;
+
+  void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, clover);
+  if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_CLOVER, clover, buf)) {
+    loadCloverQuda(buf, NULL, &param);
+  }
+}
+
+void openQCD_qudaCloverFree(void)
+{
+  WITH_COMM(freeCloverQuda());
+  qudaState.swd_ud_rev = 0;
+  qudaState.swd_ad_rev = 0;
+  qudaState.swd_kappa = 0.0;
+  qudaState.swd_su3csw = 0.0;
+  qudaState.swd_u1csw = 0.0;
+  qudaState.swd_qhat = 0;
+}
+
+/**
+ * @brief      Set the su3csw corfficient and all related properties.
+ *
+ * @param      param   The parameter struct
+ * @param[in]  su3csw  The su3csw coefficient
+ */
+inline void set_su3csw(QudaInvertParam *param, double su3csw)
+{
+  param->clover_csw = su3csw;
+  if (su3csw != 0.0) {
+    param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */
+    param->clover_cpu_prec = QUDA_DOUBLE_PRECISION;
+    param->clover_cuda_prec = QUDA_DOUBLE_PRECISION;
+    param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION;
+
+    param->clover_coeff = 0.0;
+
+    /* Set to Wilson Dirac operator with Clover term */
+    param->dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+
+    if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) {
+      param->clover_order = QUDA_NATIVE_CLOVER_ORDER; /* what implication has this? */
+      param->compute_clover = true;
+    } else {
+      param->clover_order = QUDA_OPENQCD_CLOVER_ORDER;
+    }
+  }
+}
+
+/**
+ * @brief      Creates a new quda Dirac parameter struct
+ *
+ * @param[in]  p     OpenQCD Dirac parameter struct
+ *
+ * @return     The quda Dirac parameter struct.
+ */
+static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p)
+{
+  QudaInvertParam param = newOpenQCDParam();
+
+  param.dslash_type = QUDA_WILSON_DSLASH;
+  param.kappa = p.kappa;
+  param.mu = p.mu;
+  param.dagger = QUDA_DAG_NO;
+  param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+
+  set_su3csw(&param, p.su3csw);
+
+  param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */
+
+  /* What is the difference? only works with QUDA_MASS_NORMALIZATION */
+  param.mass_normalization = QUDA_MASS_NORMALIZATION;
+
+  /* Extent of the 5th dimension (for domain wall) */
+  param.Ls = 1;
+
+  return param;
+}
+
+
+static ColorSpinorField *openQCD_qudaSpinorAlloc(void)
+{
+  if (in_comm()) {
+    QudaInvertParam param = newOpenQCDParam();
+    ColorSpinorParam cpuParam(nullptr, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION);
+    ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION);
+    cudaParam.create = QUDA_NULL_FIELD_CREATE;
+    cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
+    ColorSpinorField *out_d = new ColorSpinorField(cudaParam);
+    return out_d;
+  } else {
+    return nullptr;
+  }
+}
+
+
+void openQCD_qudaD2H(void *quda_field, void *openQCD_field)
+{
+  void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, openQCD_field);
+
+  if (in_comm()) {
+    /* sets up the necessary parameters */
+    QudaInvertParam param = newOpenQCDParam();
+
+    /* creates a field on the CPU */
+    ColorSpinorParam cpuParam(out, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION);
+    ColorSpinorField out_h(cpuParam);
+
+    /* transfer the GPU field to CPU */
+    out_h = *reinterpret_cast<ColorSpinorField *>(quda_field);
+  }
+
+  qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, openQCD_field);
+}
+
+void *openQCD_qudaH2D(void *openQCD_field)
+{
+  void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, openQCD_field);
+
+  if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, openQCD_field, in)) {
+
+    /* sets up the necessary parameters */
+    QudaInvertParam param = newOpenQCDParam();
+
+    /* creates a field on the CPU */
+    ColorSpinorParam cpuParam(in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION);
+    ColorSpinorField in_h(cpuParam);
+
+    /* creates a field on the GPU with the same parameter set as the CPU field */
+    ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION);
+    ColorSpinorField *in_d = new ColorSpinorField(cudaParam);
+
+    *in_d = in_h; /* transfer the CPU field to GPU */
+    return in_d;
+  }
+
+  return nullptr;
+}
+
+
+void openQCD_back_and_forth(void *h_in, void *h_out)
+{
+  ColorSpinorField *in = reinterpret_cast<ColorSpinorField *>(openQCD_qudaH2D(h_in));
+  ColorSpinorField *out = openQCD_qudaSpinorAlloc();
+  WITH_COMM(*out = *in);
+  openQCD_qudaD2H(out, h_out);
+  openQCD_qudaSpinorFree((void**) &in);
+  openQCD_qudaSpinorFree((void**) &out);
+}
+
+
+int openQCD_qudaIndexIptLexi(const int iy)
+{
+  int x[4];
+  int L_openqcd[4];
+  openqcd::rotate_coords(qudaState.layout.L, L_openqcd);
+
+  x[3] = iy % L_openqcd[3];
+  int c = (iy - x[3])/L_openqcd[3];
+  x[2] = c % L_openqcd[2];
+  c = (c - x[2])/L_openqcd[2];
+  x[1] = c % L_openqcd[1];
+  x[0] = (c - x[1])/L_openqcd[1];
+
+  return openqcd::ipt(x, L_openqcd);
+}
+
+int openQCD_qudaIndexIpt(const int *x)
+{
+  int L_openqcd[4];
+  openqcd::rotate_coords(qudaState.layout.L, L_openqcd);
+  return openqcd::ipt(x, L_openqcd);
+}
+
+int openQCD_qudaIndexIup(const int *x, const int mu)
+{
+  int L_openqcd[4], nproc_openqcd[4];
+  openqcd::rotate_coords(qudaState.layout.L, L_openqcd);
+  openqcd::rotate_coords(qudaState.layout.nproc, nproc_openqcd);
+  return openqcd::iup(x, mu, L_openqcd, nproc_openqcd);
+}
+
+double openQCD_qudaNorm(void *h_in)
+{
+  ColorSpinorField *in = reinterpret_cast<ColorSpinorField *>(openQCD_qudaH2D(h_in));
+  double norm2 = in_comm() ? blas::norm2(*in) : 0.0;
+  openQCD_qudaSpinorFree((void**) &in);
+  if (!qudaState.init.two_grids_equal) {
+    MPI_Bcast(&norm2, 1, MPI_DOUBLE, 0, qudaState.layout.world_comm);
+  }
+  return norm2;
+}
+
+double openQCD_qudaNorm_NoLoads(void *d_in) {
+  double norm2 = in_comm() ? blas::norm2(*reinterpret_cast<ColorSpinorField *>(d_in)) : 0.0;
+  if (!qudaState.init.two_grids_equal) {
+    MPI_Bcast(&norm2, 1, MPI_DOUBLE, 0, qudaState.layout.world_comm);
+  }
+  return norm2;
+}
+
+void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out)
+{
+  ColorSpinorField *in = reinterpret_cast<ColorSpinorField *>(openQCD_qudaH2D(openQCD_in));
+  ColorSpinorField *out = openQCD_qudaSpinorAlloc();
+
+  /* gamma_i run within QUDA using QUDA fields */
+  if (in_comm()) {
+    switch (dir) {
+      case 0: ApplyGamma(*out, *in, QUDA_GAMMA_T); break;
+      case 1: ApplyGamma(*out, *in, QUDA_GAMMA_X); break;
+      case 2: ApplyGamma(*out, *in, QUDA_GAMMA_Y); break;
+      case 3: ApplyGamma(*out, *in, QUDA_GAMMA_Z); break;
+      case 4:
+      case 5:
+        ApplyGamma(*out, *in, QUDA_GAMMA_5);
+        /* UKQCD uses a different convention for Gamma matrices:
+         * gamma5_ukqcd = gammax gammay gammaz gammat,
+         * gamma5_openqcd = gammat gammax gammay gammaz,
+         * and thus
+         * gamma5_openqcd = -1 * U gamma5_ukqcd U^dagger,
+         * with U the transformation matrix from OpenQCD to UKQCD. */
+        blas::ax(-1.0, *out);
+        break;
+      default: errorQuda("Unknown gamma: %d\n", dir);
+    }
+  }
+
+  openQCD_qudaD2H(out, openQCD_out);
+  openQCD_qudaSpinorFree((void**) &in);
+  openQCD_qudaSpinorFree((void**) &out);
+}
+
+
+void openQCD_qudaSpinorFree(void **quda_field)
+{
+  delete reinterpret_cast<ColorSpinorField *>(*quda_field);
+  *quda_field = nullptr;
+}
+
+
+/**
+ * @brief      Check whether the gauge field from openQCD is in sync with the
+ *             one from QUDA.
+ *
+ * @return     true/false
+ */
+inline bool gauge_field_get_up2date(void)
+{
+  int ud_rev = -2, ad_rev = -2;
+
+  /* get current residing gauge field revision (residing in openqxd) */
+  qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev);
+  return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev;
+}
+
+/**
+ * @brief      Check whether the gauge field is not (yet) set in openQCD.
+ *
+ * @return     true/false
+ */
+inline bool gauge_field_get_unset(void)
+{
+  int ud_rev = -2, ad_rev = -2;
+  qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev);
+  return ud_rev == 0 && ad_rev == 0;
+}
+
+/**
+ * @brief      Check if the current SW field needs to update wrt the parameters from openQCD.
+ *
+ * @return     true/false
+ */
+inline bool clover_field_get_up2date(void)
+{
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+  bool test = gauge_field_get_up2date();
+  return (test && qudaState.swd_ud_rev == qudaState.ud_rev && qudaState.swd_ad_rev == qudaState.ad_rev
+          && qudaState.swd_kappa == 1.0 / (2.0 * (dp.m0 + 4.0)) && qudaState.swd_su3csw == dp.su3csw
+          && qudaState.swd_u1csw == dp.u1csw && qudaState.swd_qhat == dp.qhat);
+}
+
+/**
+ * @brief      Check whether the multigrid instance associated to the parameter
+ *             struct is up to date with the global gauge field revision,
+ *             parameters are in sync, and clover/gauge fields are up to date.
+ *
+ * @param      param  The parameter struct
+ *
+ * @return     true/false
+ */
+inline bool mg_get_up2date(QudaInvertParam *param)
+{
+  openQCD_QudaSolver *additional_prop = static_cast<openQCD_QudaSolver *>(param->additional_prop);
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+  int test = param->preconditioner != nullptr;
+
+  MPI_Bcast(&test, 1, MPI_INT, 0, qudaState.layout.world_comm);
+  return (test && gauge_field_get_up2date() && clover_field_get_up2date()
+          && additional_prop->mg_ud_rev == qudaState.ud_rev && additional_prop->mg_ad_rev == qudaState.ad_rev
+          && additional_prop->mg_kappa == 1.0 / (2.0 * (dp.m0 + 4.0)) && additional_prop->mg_su3csw == dp.su3csw
+          && additional_prop->mg_u1csw == dp.u1csw && additional_prop->mg_qhat == dp.qhat);
+}
+
+/**
+ * @brief      Sets the multigrid instance associated to the parameter struct to
+ *             be in sync with openQxD.
+ *
+ * @param      param  The parameter struct
+ */
+inline void mg_set_revision(QudaInvertParam *param)
+{
+  openQCD_QudaSolver *additional_prop = static_cast<openQCD_QudaSolver *>(param->additional_prop);
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+  additional_prop->mg_ud_rev = qudaState.ud_rev;
+  additional_prop->mg_ad_rev = qudaState.ad_rev;
+  additional_prop->mg_kappa = 1.0 / (2.0 * (dp.m0 + 4.0));
+  additional_prop->mg_su3csw = dp.su3csw;
+  additional_prop->mg_u1csw = dp.u1csw;
+  additional_prop->mg_qhat = dp.qhat;
+}
+
+/**
+ * @brief      Set the global revisions numners for the SW field.
+ */
+inline void clover_field_set_revision(void)
+{
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+  qudaState.swd_ud_rev = qudaState.ud_rev;
+  qudaState.swd_ad_rev = qudaState.ad_rev;
+  qudaState.swd_kappa = 1.0 / (2.0 * (dp.m0 + 4.0));
+  qudaState.swd_su3csw = dp.su3csw;
+  qudaState.swd_u1csw = dp.u1csw;
+  qudaState.swd_qhat = dp.qhat;
+}
+
+/**
+ * @brief      Set the global revisions numners for the gauge field.
+ */
+inline void gauge_field_set_revision(void) { qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); }
+
+
+/**
+ * @brief      Check if the solver parameters are in sync with the parameters
+ *             from openQCD.
+ *
+ * @param      param_  The parameter struct
+ *
+ * @return     Whether parameters are in sync or not
+ */
+static int openQCD_qudaInvertParamCheck(void *param_)
+{
+  QudaInvertParam *param = static_cast<QudaInvertParam *>(param_);
+  openQCD_QudaSolver *additional_prop = static_cast<openQCD_QudaSolver *>(param->additional_prop);
+  bool ret = true;
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+
+  if (param->kappa != (1.0 / (2.0 * (dp.m0 + 4.0)))) {
+    WITH_COMM(logQuda(
+      QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n",
+      (1.0 / (2.0 * (dp.m0 + 4.0))), param->kappa));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  => need params update\n"));
+    ret = false;
+  }
+
+  if (additional_prop->u1csw != dp.u1csw) {
+    WITH_COMM(logQuda(
+      QUDA_VERBOSE,
+      "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n",
+      dp.u1csw, additional_prop->u1csw));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  => need clover field and params update\n"));
+    ret = false;
+  }
+
+  if (additional_prop->qhat != dp.qhat) {
+    WITH_COMM(logQuda(QUDA_VERBOSE,
+            "Property qhat does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %d, QUDA: %d)\n",
+            dp.qhat, additional_prop->qhat));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  => need gauge, clover field and params update\n"));
+    ret = false;
+  }
+
+  if (param->clover_csw != dp.su3csw) {
+    WITH_COMM(logQuda(
+      QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n",
+      dp.su3csw, param->clover_csw));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  => need clover field and params update\n"));
+    ret = false;
+  }
+
+  return ret;
+}
+
+/**
+ * @brief      Check if the solver identifier is in bounds
+ *
+ * @param[in]  id    The identifier
+ */
+void inline check_solver_id(int id)
+{
+  if (id < -1 || id > OPENQCD_MAX_INVERTERS - 1) {
+    WITH_COMM(errorQuda("Solver id %d is out of range [%d, %d).", id, -1, OPENQCD_MAX_INVERTERS));
+  }
+}
+
+/**
+ * @brief      Check if the eigen-solver identifier is in bounds
+ *
+ * @param[in]  id    The identifier
+ */
+void inline check_eigensolver_id(int id)
+{
+  if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS - 1) {
+    WITH_COMM(errorQuda("Eigensolver id %d is out of range [%d, %d).", id, 0, OPENQCD_MAX_EIGENSOLVERS));
+  }
+}
+
+
+/**
+ * @brief      Transfer the gauge field if the gauge field was updated in
+ *             openQxD. (Re-)calculate or transfer the clover field if
+ *             parameters have changed or gauge field was updated. Update the
+ *             settings kappa, qhat, su3csw and u1csw in the QudaInvertParam
+ *             struct such that they are in sync with openQxD. Set up or update
+ *             the multigrid instance if set in QudaInvertParam and if gauge- or
+ *             clover-fields or parameters have changed.
+ *
+ * @param      param_  The parameter struct, where in param->additional_prop a
+ *                     pointer to the QudaMultigridParam struct was placed.
+ */
+static void openQCD_qudaSolverUpdate(void *param_)
+{
+  if (param_ == nullptr) { WITH_COMM(errorQuda("Solver handle is NULL.")); }
+
+  QudaInvertParam *param = static_cast<QudaInvertParam *>(param_);
+  openQCD_QudaSolver *additional_prop = static_cast<openQCD_QudaSolver *>(param->additional_prop);
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+
+  bool do_param_update = !openQCD_qudaInvertParamCheck(param_);
+  bool do_gauge_transfer = (!gauge_field_get_up2date() && !gauge_field_get_unset())
+    || additional_prop->qhat != dp.qhat;
+  bool do_clover_update = !clover_field_get_up2date() && !gauge_field_get_unset();
+  bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER
+    && !mg_get_up2date(param) && !gauge_field_get_unset();
+  bool do_multigrid_fat_update = do_multigrid_update
+    && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev
+        || additional_prop->mg_ad_rev != qudaState.ad_rev);
+
+  if (do_gauge_transfer) {
+    if (qudaState.layout.h_gauge == nullptr) { WITH_COMM(errorQuda("qudaState.layout.h_gauge is not set.")); }
+    WITH_COMM(logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"));
+    void *h_gauge = qudaState.layout.h_gauge();
+    PUSH_RANGE("openQCD_qudaGaugeLoad", 3);
+
+#ifdef BUILD_QCD_PLUS_QED
+    QudaReconstructType rec = QUDA_RECONSTRUCT_9;
+    if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) {
+      WITH_COMM(warningQuda("QUDA was built with QCD+QED support, but gauge group does not require it (set QUDA_QCD_PLUS_QED=OFF)"));
+    }
+#else
+    QudaReconstructType rec = QUDA_RECONSTRUCT_8;
+    if (qudaState.layout.flds_parms().gauge != OPENQCD_GAUGE_SU3) {
+      WITH_COMM(errorQuda("QUDA was not built with QCD+QED support, but gauge group requires it (set QUDA_QCD_PLUS_QED=ON)"));
+    }
+#endif
+
+    /**
+     * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that
+     * tells QUDA the current state of the residing gauge field, that is the
+     * same state as the one we transfer from openqxd. In openqxd the hdfld
+     * exhibits phases of -1 for the temporal time boundaries, meaning that the
+     * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see
+     * chs_hd0() in hflds.c. The QUDA_ANTI_PERIODIC_T flag says that these
+     * phases are incorporated into the field and that QUDA has to add these
+     * phases on the t=0 time slice when reconstructing the field from
+     * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of
+     * QUDA_RECONSTRUCT_NO the value if t_boundary has no effect.
+     *
+     * @see        https://github.com/lattice/quda/issues/1315
+     * @see        Reconstruct#Unpack() in gauge_field_order.h
+     * @see        Reconstruct<8,...>#Unpack() in gauge_field_order.h
+     * @see        Reconstruct<12,...>#Unpack() in gauge_field_order.h
+     */
+    openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T);
+    gauge_field_set_revision();
+    POP_RANGE;
+  }
+
+  if (do_param_update) {
+    WITH_COMM(logQuda(QUDA_VERBOSE, "Syncing kappa, qhat, su3csw, u1csw values from openQCD ...\n"));
+    param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0));
+    additional_prop->u1csw = dp.u1csw;
+    additional_prop->qhat = dp.qhat;
+    set_su3csw(param, dp.su3csw);
+
+    QudaInvertParam *mg_inv_param = additional_prop->mg_param->invert_param;
+    mg_inv_param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0));
+    set_su3csw(mg_inv_param, dp.su3csw);
+  }
+
+  if (do_clover_update) {
+    if (param->clover_csw == 0.0) {
+      WITH_COMM(logQuda(QUDA_VERBOSE, "Deallocating Clover field in QUDA ...\n"));
+      openQCD_qudaCloverFree();
+      qudaState.swd_ud_rev = 0;
+      qudaState.swd_ad_rev = 0;
+      qudaState.swd_kappa = 0.0;
+      qudaState.swd_su3csw = 0.0;
+      qudaState.swd_u1csw = 0.0;
+      qudaState.swd_qhat = 0;
+    } else {
+      if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) {
+        /**
+         * SU3 case:
+         * Leaving both h_clover = h_clovinv = NULL allocates the clover field on
+         * the GPU and finally calls @createCloverQuda to calculate the clover
+         * field.
+         */
+        WITH_COMM(logQuda(QUDA_VERBOSE, "Generating Clover field in QUDA ...\n"));
+        PUSH_RANGE("loadCloverQuda", 3);
+        WITH_COMM(loadCloverQuda(NULL, NULL, param));
+        POP_RANGE;
+        clover_field_set_revision();
+      } else {
+        /**
+         * U3 case: Transfer the SW-field from openQCD.
+         */
+
+        if (qudaState.layout.h_sw == nullptr) { WITH_COMM(errorQuda("qudaState.layout.h_sw is not set.")); }
+
+        WITH_COMM(logQuda(QUDA_VERBOSE, "Loading Clover field from openQCD ...\n"));
+        void *h_sw = qudaState.layout.h_sw();
+        PUSH_RANGE("openQCD_qudaCloverLoad", 3);
+        openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw);
+        POP_RANGE;
+        clover_field_set_revision();
+
+        /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/
+        /* TODO: The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why?
+        QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS
+         (timer.h:82 in start())
+         (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const
+        char*, const char*, const char*, bool)()) QUDA:        last kernel called was
+        (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/
+      }
+    }
+  }
+
+  /* setup/update the multigrid instance or do nothing */
+  if (do_multigrid_update) {
+    QudaMultigridParam *mg_param = additional_prop->mg_param;
+
+    if (mg_param == nullptr) { WITH_COMM(errorQuda("No multigrid parameter struct set.")); }
+
+    if (do_multigrid_fat_update && param->preconditioner != nullptr) {
+      WITH_COMM(logQuda(QUDA_VERBOSE, "Destroying existing multigrid instance ...\n"));
+      PUSH_RANGE("destroyMultigridQuda", 4);
+      WITH_COMM(destroyMultigridQuda(param->preconditioner));
+      param->preconditioner = nullptr;
+      POP_RANGE;
+
+      additional_prop->mg_ud_rev = 0;
+      additional_prop->mg_ad_rev = 0;
+      additional_prop->mg_kappa = 0.0;
+      additional_prop->mg_su3csw = 0.0;
+      additional_prop->mg_u1csw = 0.0;
+      additional_prop->mg_qhat = 0;
+    }
+
+    if (in_comm()) {
+      if (param->preconditioner == nullptr) {
+        logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n");
+        PUSH_RANGE("newMultigridQuda", 4);
+        param->preconditioner = newMultigridQuda(mg_param);
+        POP_RANGE;
+      } else {
+        logQuda(QUDA_VERBOSE, "Updating existing multigrid instance ...\n");
+        PUSH_RANGE("updateMultigridQuda", 4);
+        updateMultigridQuda(param->preconditioner, mg_param);
+        POP_RANGE;
+      }
+    }
+    mg_set_revision(param);
+  }
+}
+
+
+static void *openQCD_qudaSolverReadIn(int id)
+{
+  int my_rank;
+  openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms();
+
+  MPI_Comm_rank(qudaState.layout.world_comm, &my_rank);
+
+  /* Allocate on the heap */
+  QudaInvertParam *param = new QudaInvertParam(newQudaInvertParam());
+  QudaInvertParam *invert_param_mg = new QudaInvertParam(newQudaInvertParam());
+  QudaMultigridParam *multigrid_param = new QudaMultigridParam(newQudaMultigridParam());
+  std::string section = "Solver " + std::to_string(id);
+
+  /* Some default settings */
+  /* Some of them should not be changed */
+  param->verbosity = QUDA_SUMMARIZE;
+  param->cpu_prec = QUDA_DOUBLE_PRECISION;
+  param->cuda_prec = QUDA_DOUBLE_PRECISION;
+  param->cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION;
+  param->cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
+  param->cuda_prec_precondition = QUDA_HALF_PRECISION;
+  param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER;
+  param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS;
+  param->dslash_type = QUDA_WILSON_DSLASH;
+  param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0));
+  param->mu = 0.0;
+  param->dagger = QUDA_DAG_NO;
+  param->solution_type = QUDA_MAT_SOLUTION;
+  param->solve_type = QUDA_DIRECT_SOLVE;
+  param->matpc_type = QUDA_MATPC_EVEN_EVEN;
+  param->solver_normalization = QUDA_DEFAULT_NORMALIZATION;
+  param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning   */
+  param->mass_normalization = QUDA_MASS_NORMALIZATION;
+
+  set_su3csw(param, dp.su3csw);
+
+  if (my_rank == 0 && id != -1) {
+
+    KeyValueStore kv;
+    kv.set_map(&enum_map);
+    kv.load(qudaState.infile);
+
+    if (!kv.section_exists(section)) {
+      WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", section.c_str(), qudaState.infile));
+    }
+
+    param->verbosity = kv.get<QudaVerbosity>(section, "verbosity", param->verbosity);
+
+    if (param->verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); }
+
+    if (kv.get<std::string>(section, "solver") != "QUDA") {
+      WITH_COMM(errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", section.c_str(),
+                qudaState.infile, kv.get<std::string>(section, "solver").c_str()));
+    }
+
+    /* both fields reside on the CPU */
+    param->input_location = kv.get<QudaFieldLocation>(section, "input_location", QUDA_CPU_FIELD_LOCATION);
+    param->output_location = kv.get<QudaFieldLocation>(section, "output_location", QUDA_CPU_FIELD_LOCATION);
+
+    param->inv_type = kv.get<QudaInverterType>(section, "inv_type", param->inv_type);
+    param->kappa = kv.get<double>(section, "kappa", param->kappa);
+    param->mu = kv.get<double>(section, "mu", param->mu);
+    param->tm_rho = kv.get<double>(section, "tm_rho", param->tm_rho);
+    param->epsilon = kv.get<double>(section, "epsilon", param->epsilon);
+    param->twist_flavor = kv.get<QudaTwistFlavorType>(section, "twist_flavor", param->twist_flavor);
+    param->laplace3D = kv.get<int>(section, "laplace3D", param->laplace3D);
+
+    /* Solver settings */
+    param->tol = kv.get<double>(section, "tol", param->tol);
+    param->tol_restart = kv.get<double>(section, "tol_restart", param->tol_restart);
+    param->tol_hq = kv.get<double>(section, "tol_hq", param->tol_hq);
+
+    param->compute_true_res = kv.get<int>(section, "compute_true_res", param->compute_true_res);
+    param->true_res[0] = kv.get<double>(section, "true_res", param->true_res[0]); /* TODO: mrhs */
+    param->true_res_hq[0] = kv.get<double>(section, "true_res_hq", param->true_res_hq[0]);
+    param->maxiter = kv.get<int>(section, "maxiter", param->maxiter);
+    param->reliable_delta = kv.get<double>(section, "reliable_delta", param->reliable_delta);
+    param->reliable_delta_refinement
+      = kv.get<double>(section, "reliable_delta_refinement", param->reliable_delta_refinement);
+    param->use_alternative_reliable = kv.get<int>(section, "use_alternative_reliable", param->use_alternative_reliable);
+    param->use_sloppy_partial_accumulator
+      = kv.get<int>(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator);
+
+    param->solution_accumulator_pipeline
+      = kv.get<int>(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline);
+
+    param->max_res_increase = kv.get<int>(section, "max_res_increase", param->max_res_increase);
+    param->max_res_increase_total = kv.get<int>(section, "max_res_increase_total", param->max_res_increase_total);
+    param->max_hq_res_increase = kv.get<int>(section, "max_hq_res_increase", param->max_hq_res_increase);
+    param->max_hq_res_restart_total = kv.get<int>(section, "max_hq_res_restart_total", param->max_hq_res_restart_total);
+
+    param->heavy_quark_check = kv.get<int>(section, "heavy_quark_check", param->heavy_quark_check);
+
+    param->pipeline = kv.get<int>(section, "pipeline", param->pipeline);
+    param->num_offset = kv.get<int>(section, "num_offset", param->num_offset);
+    param->num_src = kv.get<int>(section, "num_src", param->num_src);
+    param->num_src_per_sub_partition
+      = kv.get<int>(section, "num_src_per_sub_partition", param->num_src_per_sub_partition);
+
+    param->split_grid[0] = kv.get<int>(section, "split_grid[1]", param->split_grid[0]);
+    param->split_grid[1] = kv.get<int>(section, "split_grid[2]", param->split_grid[1]);
+    param->split_grid[2] = kv.get<int>(section, "split_grid[3]", param->split_grid[2]);
+    param->split_grid[3] = kv.get<int>(section, "split_grid[0]", param->split_grid[3]);
+
+    param->overlap = kv.get<int>(section, "overlap", param->overlap);
+
+    for (int i = 0; i < param->num_offset; i++) {
+      std::string sub_key = "offset[" + std::to_string(i) + "]";
+      param->offset[i] = kv.get<double>(section, sub_key, param->offset[i]);
+      sub_key = "tol_offset[" + std::to_string(i) + "]";
+      param->tol_offset[i] = kv.get<double>(section, sub_key, param->tol_offset[i]);
+      sub_key = "tol_hq_offset[" + std::to_string(i) + "]";
+      param->tol_hq_offset[i] = kv.get<double>(section, sub_key, param->tol_hq_offset[i]);
+    }
+
+    param->compute_action = kv.get<int>(section, "compute_action", param->compute_action);
+
+    param->solution_type = kv.get<QudaSolutionType>(section, "solution_type", param->solution_type);
+    param->solve_type = kv.get<QudaSolveType>(section, "solve_type", param->solve_type);
+    param->matpc_type = kv.get<QudaMatPCType>(section, "matpc_type", param->matpc_type);
+    param->dagger = kv.get<QudaDagType>(section, "dagger", param->dagger);
+    param->mass_normalization = kv.get<QudaMassNormalization>(section, "mass_normalization", param->mass_normalization);
+    param->solver_normalization
+      = kv.get<QudaSolverNormalization>(section, "solver_normalization", param->solver_normalization);
+
+    param->preserve_source = kv.get<QudaPreserveSource>(section, "preserve_source", param->preserve_source);
+
+    param->cpu_prec = kv.get<QudaPrecision>(section, "cpu_prec", param->cpu_prec);
+    param->cuda_prec = kv.get<QudaPrecision>(section, "cuda_prec", param->cuda_prec);
+    param->cuda_prec_sloppy = kv.get<QudaPrecision>(section, "cuda_prec_sloppy", param->cuda_prec_sloppy);
+    param->cuda_prec_refinement_sloppy
+      = kv.get<QudaPrecision>(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy);
+    param->cuda_prec_precondition
+      = kv.get<QudaPrecision>(section, "cuda_prec_precondition", param->cuda_prec_precondition);
+    param->cuda_prec_eigensolver = kv.get<QudaPrecision>(section, "cuda_prec_eigensolver", param->cuda_prec_eigensolver);
+
+    param->clover_location = kv.get<QudaFieldLocation>(section, "clover_location", param->clover_location);
+    param->clover_cpu_prec = kv.get<QudaPrecision>(section, "clover_cpu_prec", param->clover_cpu_prec);
+    param->clover_cuda_prec = kv.get<QudaPrecision>(section, "clover_cuda_prec", param->clover_cuda_prec);
+    param->clover_cuda_prec_sloppy
+      = kv.get<QudaPrecision>(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy);
+    param->clover_cuda_prec_refinement_sloppy
+      = kv.get<QudaPrecision>(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy);
+    param->clover_cuda_prec_precondition
+      = kv.get<QudaPrecision>(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition);
+    param->clover_cuda_prec_eigensolver
+      = kv.get<QudaPrecision>(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver);
+
+    param->use_init_guess = kv.get<QudaUseInitGuess>(section, "use_init_guess", param->use_init_guess);
+
+    param->clover_csw = kv.get<double>(section, "clover_csw", param->clover_csw);
+    param->clover_coeff = kv.get<double>(section, "clover_coeff", param->clover_coeff);
+    param->clover_rho = kv.get<double>(section, "clover_rho", param->clover_rho);
+    param->compute_clover_trlog = kv.get<int>(section, "compute_clover_trlog", param->compute_clover_trlog);
+    param->Nsteps = kv.get<int>(section, "Nsteps", param->Nsteps);
+    param->gcrNkrylov = kv.get<int>(section, "gcrNkrylov", param->gcrNkrylov);
+
+    param->inv_type_precondition
+      = kv.get<QudaInverterType>(section, "inv_type_precondition", param->inv_type_precondition);
+    param->deflate = kv.get<QudaBoolean>(section, "deflate", param->deflate);
+    param->verbosity_precondition
+      = kv.get<QudaVerbosity>(section, "verbosity_precondition", param->verbosity_precondition);
+    param->tol_precondition = kv.get<double>(section, "tol_precondition", param->tol_precondition);
+    param->maxiter_precondition = kv.get<int>(section, "maxiter_precondition", param->maxiter_precondition);
+    param->omega = kv.get<double>(section, "omega", param->omega);
+    param->ca_basis = kv.get<QudaCABasis>(section, "ca_basis", param->ca_basis);
+    param->ca_lambda_min = kv.get<double>(section, "ca_lambda_min", param->ca_lambda_min);
+    param->ca_lambda_max = kv.get<double>(section, "ca_lambda_max", param->ca_lambda_max);
+    param->ca_basis_precondition = kv.get<QudaCABasis>(section, "ca_basis_precondition", param->ca_basis_precondition);
+    param->ca_lambda_min_precondition
+      = kv.get<double>(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition);
+    param->ca_lambda_max_precondition
+      = kv.get<double>(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition);
+    param->precondition_cycle = kv.get<int>(section, "precondition_cycle", param->precondition_cycle);
+    param->schwarz_type = kv.get<QudaSchwarzType>(section, "schwarz_type", param->schwarz_type);
+    param->accelerator_type_precondition
+      = kv.get<QudaAcceleratorType>(section, "accelerator_type_precondition", param->accelerator_type_precondition);
+
+    param->madwf_diagonal_suppressor
+      = kv.get<double>(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor);
+    param->madwf_ls = kv.get<int>(section, "madwf_ls", param->madwf_ls);
+    param->madwf_null_miniter = kv.get<int>(section, "madwf_null_miniter", param->madwf_null_miniter);
+    param->madwf_null_tol = kv.get<double>(section, "madwf_null_tol", param->madwf_null_tol);
+    param->madwf_train_maxiter = kv.get<int>(section, "madwf_train_maxiter", param->madwf_train_maxiter);
+
+    param->madwf_param_load = kv.get<QudaBoolean>(section, "madwf_param_load", param->madwf_param_load);
+    param->madwf_param_save = kv.get<QudaBoolean>(section, "madwf_param_save", param->madwf_param_save);
+    strcpy(param->madwf_param_infile,
+           kv.get<std::string>(section, "madwf_param_infile", param->madwf_param_infile).c_str());
+    strcpy(param->madwf_param_outfile,
+           kv.get<std::string>(section, "madwf_param_outfile", param->madwf_param_outfile).c_str());
+
+    param->residual_type = kv.get<QudaResidualType>(section, "residual_type", param->residual_type);
+
+    if (param->inv_type_precondition == QUDA_MG_INVERTER) {
+
+      std::string mg_section = section + " Multigrid";
+
+      if (!kv.section_exists(mg_section)) {
+        WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", mg_section.c_str(), qudaState.infile));
+      }
+
+      /* (shallow) copy the struct */
+      *invert_param_mg = *param;
+
+      /* these have to be fixed, and cannot be overwritten by the input file */
+      invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+      invert_param_mg->dirac_order = QUDA_DIRAC_ORDER;
+
+      multigrid_param->n_level = kv.get<int>(mg_section, "n_level", multigrid_param->n_level, true);
+      multigrid_param->setup_type = kv.get<QudaSetupType>(mg_section, "setup_type", multigrid_param->setup_type);
+      multigrid_param->pre_orthonormalize
+        = kv.get<QudaBoolean>(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize);
+      multigrid_param->post_orthonormalize
+        = kv.get<QudaBoolean>(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize);
+      multigrid_param->compute_null_vector
+        = kv.get<QudaComputeNullVector>(mg_section, "compute_null_vector", multigrid_param->compute_null_vector);
+      multigrid_param->generate_all_levels
+        = kv.get<QudaBoolean>(mg_section, "generate_all_levels", multigrid_param->generate_all_levels);
+      multigrid_param->run_verify = kv.get<QudaBoolean>(mg_section, "run_verify", multigrid_param->run_verify);
+      multigrid_param->run_low_mode_check
+        = kv.get<QudaBoolean>(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check);
+      multigrid_param->run_oblique_proj_check
+        = kv.get<QudaBoolean>(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check);
+      multigrid_param->coarse_guess = kv.get<QudaBoolean>(mg_section, "coarse_guess", multigrid_param->coarse_guess);
+      multigrid_param->preserve_deflation
+        = kv.get<QudaBoolean>(mg_section, "preserve_deflation", multigrid_param->preserve_deflation);
+      multigrid_param->allow_truncation
+        = kv.get<QudaBoolean>(mg_section, "allow_truncation", multigrid_param->allow_truncation);
+      multigrid_param->staggered_kd_dagger_approximation = kv.get<QudaBoolean>(
+        mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation);
+      multigrid_param->thin_update_only
+        = kv.get<QudaBoolean>(mg_section, "thin_update_only", multigrid_param->thin_update_only);
+
+      for (int i = 0; i < multigrid_param->n_level; i++) {
+        std::string subsection = section + " Multigrid Level " + std::to_string(i);
+
+        if (!kv.section_exists(subsection)) {
+          WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", subsection.c_str(), qudaState.infile));
+        }
+
+        multigrid_param->geo_block_size[i][0]
+          = kv.get<int>(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]);
+        multigrid_param->geo_block_size[i][1]
+          = kv.get<int>(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]);
+        multigrid_param->geo_block_size[i][2]
+          = kv.get<int>(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]);
+        multigrid_param->geo_block_size[i][3]
+          = kv.get<int>(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]);
+        multigrid_param->spin_block_size[i]
+          = kv.get<int>(subsection, "spin_block_size", multigrid_param->spin_block_size[i]);
+        multigrid_param->n_vec[i] = kv.get<int>(subsection, "n_vec", multigrid_param->n_vec[i]);
+        multigrid_param->n_vec_batch[i] = kv.get<int>(subsection, "n_vec_batch", 1);
+        multigrid_param->precision_null[i]
+          = kv.get<QudaPrecision>(subsection, "precision_null", multigrid_param->precision_null[i]);
+        multigrid_param->n_block_ortho[i] = kv.get<int>(subsection, "n_block_ortho", multigrid_param->n_block_ortho[i]);
+        multigrid_param->block_ortho_two_pass[i]
+          = kv.get<QudaBoolean>(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]);
+        multigrid_param->verbosity[i] = kv.get<QudaVerbosity>(subsection, "verbosity", multigrid_param->verbosity[i]);
+        multigrid_param->setup_inv_type[i]
+          = kv.get<QudaInverterType>(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]);
+        multigrid_param->setup_use_mma[i]
+          = kv.get<QudaBoolean>(subsection, "setup_use_mma", multigrid_param->setup_use_mma[i]);
+        multigrid_param->dslash_use_mma[i]
+          = kv.get<QudaBoolean>(subsection, "dslash_use_mma", multigrid_param->dslash_use_mma[i]);
+        multigrid_param->num_setup_iter[i]
+          = kv.get<int>(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]);
+        multigrid_param->setup_tol[i] = kv.get<double>(subsection, "setup_tol", multigrid_param->setup_tol[i]);
+        multigrid_param->setup_maxiter[i] = kv.get<int>(subsection, "setup_maxiter", multigrid_param->setup_maxiter[i]);
+        multigrid_param->setup_maxiter_refresh[i]
+          = kv.get<int>(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]);
+        multigrid_param->setup_ca_basis[i]
+          = kv.get<QudaCABasis>(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]);
+        multigrid_param->setup_ca_basis_size[i]
+          = kv.get<int>(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]);
+        multigrid_param->setup_ca_lambda_min[i]
+          = kv.get<double>(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]);
+        multigrid_param->setup_ca_lambda_max[i]
+          = kv.get<double>(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]);
+
+        multigrid_param->coarse_solver[i]
+          = kv.get<QudaInverterType>(subsection, "coarse_solver", multigrid_param->coarse_solver[i]);
+        multigrid_param->coarse_solver_tol[i]
+          = kv.get<double>(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]);
+        multigrid_param->coarse_solver_maxiter[i]
+          = kv.get<int>(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]);
+        multigrid_param->coarse_solver_ca_basis[i]
+          = kv.get<QudaCABasis>(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]);
+        multigrid_param->coarse_solver_ca_basis_size[i]
+          = kv.get<int>(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]);
+        multigrid_param->coarse_solver_ca_lambda_min[i]
+          = kv.get<double>(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]);
+        multigrid_param->coarse_solver_ca_lambda_max[i]
+          = kv.get<double>(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]);
+        multigrid_param->smoother[i] = kv.get<QudaInverterType>(subsection, "smoother", multigrid_param->smoother[i]);
+        multigrid_param->smoother_tol[i] = kv.get<double>(subsection, "smoother_tol", multigrid_param->smoother_tol[i]);
+        multigrid_param->nu_pre[i] = kv.get<int>(subsection, "nu_pre", multigrid_param->nu_pre[i]);
+        multigrid_param->nu_post[i] = kv.get<int>(subsection, "nu_post", multigrid_param->nu_post[i]);
+        multigrid_param->smoother_solver_ca_basis[i]
+          = kv.get<QudaCABasis>(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]);
+        multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get<double>(
+          subsection, "smoother_solver_ca_lambda_min", multigrid_param->smoother_solver_ca_lambda_min[i]);
+        multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get<double>(
+          subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]);
+        multigrid_param->omega[i] = kv.get<double>(subsection, "omega", multigrid_param->omega[i]);
+        multigrid_param->smoother_halo_precision[i]
+          = kv.get<QudaPrecision>(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]);
+        multigrid_param->smoother_schwarz_type[i]
+          = kv.get<QudaSchwarzType>(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]);
+        multigrid_param->smoother_schwarz_cycle[i]
+          = kv.get<int>(subsection, "smoother_schwarz_cycle", multigrid_param->smoother_schwarz_cycle[i]);
+        multigrid_param->coarse_grid_solution_type[i] = kv.get<QudaSolutionType>(
+          subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]);
+        multigrid_param->smoother_solve_type[i]
+          = kv.get<QudaSolveType>(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]);
+        multigrid_param->cycle_type[i]
+          = kv.get<QudaMultigridCycleType>(subsection, "cycle_type", multigrid_param->cycle_type[i]);
+        multigrid_param->global_reduction[i]
+          = kv.get<QudaBoolean>(subsection, "global_reduction", multigrid_param->global_reduction[i]);
+        multigrid_param->location[i] = kv.get<QudaFieldLocation>(subsection, "location", multigrid_param->location[i]);
+        multigrid_param->setup_location[i]
+          = kv.get<QudaFieldLocation>(subsection, "setup_location", multigrid_param->setup_location[i]);
+        multigrid_param->use_eig_solver[i]
+          = kv.get<QudaBoolean>(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]);
+
+        multigrid_param->vec_load[i] = kv.get<QudaBoolean>(subsection, "vec_load", multigrid_param->vec_load[i]);
+        multigrid_param->vec_store[i] = kv.get<QudaBoolean>(subsection, "vec_store", multigrid_param->vec_store[i]);
+        /*strcpy(multigrid_param->vec_infile[i], kv.get<std::string>(subsection, "vec_infile",
+        multigrid_param->vec_infile[i]).c_str()); strcpy(multigrid_param->vec_outfile[i],
+        kv.get<std::string>(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/
+
+        multigrid_param->mu_factor[i] = kv.get<double>(subsection, "mu_factor", multigrid_param->mu_factor[i]);
+        multigrid_param->transfer_type[i]
+          = kv.get<QudaTransferType>(subsection, "transfer_type", multigrid_param->transfer_type[i]);
+      }
+    }
+  }
+
+  /* transfer of the struct to all processes */
+  MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, qudaState.layout.world_comm);
+  MPI_Bcast((void *)invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, qudaState.layout.world_comm);
+  MPI_Bcast((void *)multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, qudaState.layout.world_comm);
+  multigrid_param->invert_param = invert_param_mg;
+
+  /**
+   * We need a void* to store the multigrid_param (QudaMultigridParam) struct,
+   * such that we can access it and setup multigrid in a later stage, for
+   * instance right before calling invertQuda() if multigrid was not
+   * instantiated until then.
+   */
+  openQCD_QudaSolver *additional_prop = new openQCD_QudaSolver();
+  sprintf(additional_prop->infile, "%s", qudaState.infile);
+  additional_prop->id = id;
+  additional_prop->mg_param = multigrid_param;
+  additional_prop->u1csw = 0.0;
+  additional_prop->qhat = 0.0;
+  param->additional_prop = reinterpret_cast<void *>(additional_prop);
+
+  return (void *)param;
+}
+
+void *openQCD_qudaSolverGetHandle(int id)
+{
+  check_solver_id(id);
+  if (qudaState.inv_handles[id] == nullptr) {
+    if (id != -1) {
+      WITH_COMM(logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id));
+    }
+    qudaState.inv_handles[id] = openQCD_qudaSolverReadIn(id);
+  }
+
+  openQCD_qudaSolverUpdate(qudaState.inv_handles[id]);
+  return qudaState.inv_handles[id];
+}
+
+void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p)
+{
+  void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, src);
+  void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 1, dst);
+  qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, src, in);
+
+  if (in_comm()) {
+
+    QudaInvertParam param = newOpenQCDDiracParam(p);
+
+    /* both fields reside on the CPU */
+    param.input_location = QUDA_CPU_FIELD_LOCATION;
+    param.output_location = QUDA_CPU_FIELD_LOCATION;
+
+    MatQuda(static_cast<char *>(out), static_cast<char *>(in), &param);
+
+    logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n");
+    logQuda(QUDA_DEBUG_VERBOSE, "  gflops      = %.2e\n", param.gflops);
+    logQuda(QUDA_DEBUG_VERBOSE, "  secs        = %.2e\n", param.secs);
+  }
+
+  qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, dst);
+}
+
+void openQCD_qudaDw(double mu, void *src, void *dst)
+{
+  if (gauge_field_get_unset()) {
+    WITH_COMM(errorQuda("Gauge field not populated in openQxD."));
+  }
+
+  QudaInvertParam *param = static_cast<QudaInvertParam *>(openQCD_qudaSolverGetHandle(-1));
+  param->mu = mu;
+
+  if (!openQCD_qudaInvertParamCheck(param)) {
+    WITH_COMM(errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."));
+  }
+
+  /* both fields reside on the CPU */
+  param->input_location = QUDA_CPU_FIELD_LOCATION;
+  param->output_location = QUDA_CPU_FIELD_LOCATION;
+
+  void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, src);
+  void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 1, dst);
+
+  qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, src, in);
+  WITH_COMM(MatQuda(static_cast<char *>(out), static_cast<char *>(in), param));
+  qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, dst);
+}
+
+void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out)
+{
+  if (gauge_field_get_unset()) {
+    WITH_COMM(errorQuda("Gauge field not populated in openQxD."));
+  }
+
+  QudaInvertParam *param = static_cast<QudaInvertParam *>(openQCD_qudaSolverGetHandle(-1));
+  param->mu = mu;
+
+  if (!openQCD_qudaInvertParamCheck(param)) {
+    WITH_COMM(errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."));
+  }
+
+  /* both fields reside on the GPU */
+  param->input_location = QUDA_CUDA_FIELD_LOCATION;
+  param->output_location = QUDA_CUDA_FIELD_LOCATION;
+
+  ColorSpinorField *in = reinterpret_cast<ColorSpinorField *>(d_in);
+  ColorSpinorField *out = reinterpret_cast<ColorSpinorField *>(d_out);
+
+  /* truncated version of what MatQuda does */
+  if (in_comm()) {
+    pushVerbosity(param->verbosity);
+    bool pc = (param->solution_type == QUDA_MATPC_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
+
+    DiracParam diracParam;
+    setDiracParam(diracParam, param, pc);
+
+    Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
+    dirac->M(*out, *in);                      // apply the operator
+    delete dirac;                             // clean up
+
+    if (pc) {
+      if (param->mass_normalization == QUDA_MASS_NORMALIZATION) {
+        blas::ax(0.25 / (param->kappa * param->kappa), *out);
+      } else if (param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
+        blas::ax(0.5 / param->kappa, *out);
+      }
+    } else {
+      if (param->mass_normalization == QUDA_MASS_NORMALIZATION
+          || param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
+        blas::ax(0.5 / param->kappa, *out);
+      }
+    }
+    popVerbosity();
+  }
+}
+
+/**
+ * @brief      Take the string-hash over a struct using std::hash.
+ *
+ * @param[in]  in    Input struct
+ *
+ * @tparam     T     Type of input struct
+ *
+ * @return     Hash value
+ */
+template <typename T> int hash_struct(T *in)
+{
+  int hash = 0;
+  char *cstruct = reinterpret_cast<char *>(in);
+
+  for (char *c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) {
+    if (strlen(c) != 0) { hash ^= (std::hash<std::string> {}(std::string(c)) << 1); }
+  }
+
+  return hash;
+}
+
+int openQCD_qudaSolverGetHash(int id)
+{
+  check_solver_id(id);
+  if (qudaState.inv_handles[id] != nullptr) {
+    QudaInvertParam *param = reinterpret_cast<QudaInvertParam *>(qudaState.inv_handles[id]);
+    QudaInvertParam hparam = newQudaInvertParam();
+    memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */
+
+    /* Set some properties we want to take the hash over */
+    hparam.inv_type = param->inv_type;
+    hparam.tol = param->tol;
+    hparam.tol_restart = param->tol_restart;
+    hparam.tol_hq = param->tol_hq;
+    hparam.maxiter = param->maxiter;
+    hparam.reliable_delta = param->reliable_delta;
+    hparam.solution_type = param->solution_type;
+    hparam.solve_type = param->solve_type;
+    hparam.matpc_type = param->matpc_type;
+    hparam.dagger = param->dagger;
+    hparam.mass_normalization = param->mass_normalization;
+    hparam.solver_normalization = param->solver_normalization;
+    hparam.cpu_prec = param->cpu_prec;
+    hparam.cuda_prec = param->cuda_prec;
+    hparam.use_init_guess = param->use_init_guess;
+    hparam.gcrNkrylov = param->gcrNkrylov;
+
+    return hash_struct<QudaInvertParam>(&hparam);
+  } else {
+    return 0;
+  }
+}
+
+void openQCD_qudaSolverPrintSetup(int id)
+{
+  check_solver_id(id);
+  if (!in_comm()) { return; }
+  if (qudaState.inv_handles[id] != nullptr) {
+    QudaInvertParam *param = static_cast<QudaInvertParam *>(qudaState.inv_handles[id]);
+    openQCD_QudaSolver *additional_prop = static_cast<openQCD_QudaSolver *>(param->additional_prop);
+
+    printQudaInvertParam(param);
+    printfQuda("additional_prop->infile = %s\n", additional_prop->infile);
+    printfQuda("additional_prop->id = %d\n", additional_prop->id);
+    printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param);
+    printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw);
+    printfQuda("additional_prop->qhat = %d\n", additional_prop->qhat);
+    printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev);
+    printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev);
+    printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa);
+    printfQuda("additional_prop->mg_su3csw = %.6e\n", additional_prop->mg_su3csw);
+    printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw);
+    printfQuda("additional_prop->mg_qhat = %d\n", additional_prop->mg_qhat);
+    printfQuda("handle = %p\n", param);
+    printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id));
+
+    printfQuda("inv_type_precondition = %d\n", param->inv_type_precondition);
+
+    if (param->inv_type_precondition == QUDA_MG_INVERTER) { printQudaMultigridParam(additional_prop->mg_param); }
+  } else {
+    printfQuda("<Solver is not initialized yet>\n");
+  }
+}
+
+double openQCD_qudaInvert(int id, double mu, void* source, void* solution, int *status)
+{
+  double residual;
+  openQCD_qudaInvertMultiSrc(id, mu, &source, &solution, status, &residual);
+  return residual;
+}
+
+void openQCD_qudaInvertMultiSrc(int id, double mu, void** sources, void** solutions, int *status, double *residual)
+{
+  if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); }
+
+  /**
+   * This is to make sure we behave in the same way as openQCDs solvers, we call
+   * h_sw() which in turn calls sw_term(). We have to make sure that the SW-term
+   * in openQxD is setup and in sync with QUDAs.
+   */
+  if (qudaState.layout.h_sw != nullptr) {
+    qudaState.layout.h_sw();
+  } else {
+    WITH_COMM(errorQuda("qudaState.layout.h_sw is not set."));
+  }
+
+  QudaInvertParam *param = static_cast<QudaInvertParam *>(openQCD_qudaSolverGetHandle(id));
+  param->mu = mu;
+
+  if (!openQCD_qudaInvertParamCheck(param)) {
+    WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."));
+  }
+
+  void **h_sources = (void**) malloc(param->num_src*sizeof(void*));
+  void **h_solutions = (void**) malloc(param->num_src*sizeof(void*));
+
+  for (int i = 0; i < param->num_src; ++i) {
+    h_sources[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i, sources[i]);
+    h_solutions[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i+param->num_src, solutions[i]);
+    qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, sources[i], h_sources[i]);
+    if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) {
+      qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, solutions[i], h_solutions[i]);
+    }
+  }
+
+  WITH_COMM(logQuda(QUDA_VERBOSE, "Calling invertMultiSrcQuda() ...\n"));
+  PUSH_RANGE("invertMultiSrcQuda", 5);
+  WITH_COMM(invertMultiSrcQuda(h_solutions, h_sources, param));
+  POP_RANGE;
+
+  for (int i = 0; i < param->num_src; ++i) {
+    qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, h_solutions[i], solutions[i]);
+    status[i] = param->true_res[i] <= param->tol ? param->iter : -1;
+    residual[i] = param->true_res[i];
+  }
+
+  if (!qudaState.init.two_grids_equal) {
+    MPI_Bcast(status, param->num_src, MPI_INT, 0, qudaState.layout.world_comm);
+    MPI_Bcast(residual, param->num_src, MPI_DOUBLE, 0, qudaState.layout.world_comm);
+  }
+
+  WITH_COMM(logQuda(QUDA_VERBOSE, "openQCD_qudaInvertMultiSrc()\n"));
+  WITH_COMM(logQuda(QUDA_VERBOSE, "  iter           = %d\n", param->iter));
+  WITH_COMM(logQuda(QUDA_VERBOSE, "  gflops         = %.2e\n", param->gflops));
+  WITH_COMM(logQuda(QUDA_VERBOSE, "  secs           = %.2e\n", param->secs));
+  for (int i = 0; i < param->num_src; ++i) {
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  true_res[%d]    = %.2e\n", i, param->true_res[i]));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  true_res_hq[%d] = %.2e\n", i, param->true_res_hq[i]));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  status[%d]      = %d\n", i, status[i]));
+  }
+
+  free(h_sources);
+  free(h_solutions);
+}
+
+static void *openQCD_qudaInvertAsyncWrapper(void*)
+{
+  /* enable the thread to use QUDA */
+  WITH_COMM(device::init_thread());
+
+  /* split the world_comm communicator, then signal the main threads to continue */
+  pthread_mutex_lock(&qudaState.thread.mutex);
+  MPI_Comm_split(qudaState.layout.world_comm, 0, 0, &qudaState.thread.worker_comm);
+  qudaState.thread.ready = true;
+  pthread_cond_signal(&qudaState.thread.cond);
+  pthread_mutex_unlock(&qudaState.thread.mutex);
+
+  int Ns = qudaState.async_params.num_src;  /* num sources */
+  int Nd = qudaState.inv_args.size();       /* num dispatched */
+  int Nc = Nd > Ns ? ((Ns+Nd-1)/Ns) : 1;    /* num calls */
+
+  void **h_sources = (void**) malloc(Ns*sizeof(void*));
+  void **h_solutions = (void**) malloc(Ns*sizeof(void*));
+
+  for (int c = 0; c < Nc; ++c) {
+    qudaState.async_params.num_src = (c+1==Nc && Nc>1 && Nd % Ns != 0) ? (Nd % Ns) : min(Nd, Ns);
+
+    for (int s = 0; s < qudaState.async_params.num_src; ++s) {
+      int i = c*Ns + s;
+      h_sources[s] = qudaState.init.buffer_field(qudaState.thread.worker_comm, s, qudaState.inv_args[i].source);
+      h_solutions[s] = qudaState.init.buffer_field(qudaState.thread.worker_comm, s+qudaState.async_params.num_src, qudaState.inv_args[i].solution);
+      qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, qudaState.inv_args[i].source, h_sources[s]);
+      if (qudaState.async_params.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
+        qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, qudaState.inv_args[i].solution, h_solutions[s]);
+      }
+    }
+
+    WITH_COMM(logQuda(QUDA_VERBOSE, "Calling invertMultiSrcQuda() ...\n"));
+    PUSH_RANGE("invertMultiSrcQuda", 5);
+    WITH_COMM(invertMultiSrcQuda(h_solutions, h_sources, &qudaState.async_params));
+    POP_RANGE;
+
+    for (int s = 0; s < qudaState.async_params.num_src; ++s) {
+      int i = c*Ns + s;
+      qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, h_solutions[s], qudaState.inv_args[i].solution);
+
+      if (in_comm()) {
+        *(qudaState.inv_args[i].status) = qudaState.async_params.true_res[s] <= qudaState.async_params.tol ? qudaState.async_params.iter : -1;
+        qudaState.inv_args[i].retval = qudaState.async_params.true_res[s];
+      }
+    }
+
+    WITH_COMM(logQuda(QUDA_VERBOSE, "openQCD_qudaInvertAsync...()\n"));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  iter           = %d\n", qudaState.async_params.iter));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  gflops         = %.2e\n", qudaState.async_params.gflops));
+    WITH_COMM(logQuda(QUDA_VERBOSE, "  secs           = %.2e\n", qudaState.async_params.secs));
+    for (int s = 0; s < qudaState.async_params.num_src; ++s) {
+      int i = c*Ns + s;
+      WITH_COMM(logQuda(QUDA_VERBOSE, "  true_res[%d]    = %.2e\n", s, qudaState.async_params.true_res[s]));
+      WITH_COMM(logQuda(QUDA_VERBOSE, "  true_res_hq[%d] = %.2e\n", s, qudaState.async_params.true_res_hq[s]));
+      WITH_COMM(logQuda(QUDA_VERBOSE, "  status[%d]      = %d\n", s, *(qudaState.inv_args[i].status)));
+
+      MPI_Bcast(qudaState.inv_args[i].status, 1, MPI_INT, 0, qudaState.thread.worker_comm);
+      MPI_Bcast((void *)&qudaState.inv_args[i].retval, 1, MPI_DOUBLE, 0, qudaState.thread.worker_comm);
+    }
+  }
+  free(h_sources);
+  free(h_solutions);
+  MPI_Comm_free(&qudaState.thread.worker_comm);
+
+  return nullptr;
+}
+
+void check_mpi_init(void)
+{
+  int flag;
+  MPI_Query_thread(&flag);
+  if (flag < MPI_THREAD_MULTIPLE)
+    WITH_COMM(errorQuda("MPI was not initialized with thread support. "
+      "Initialize MPI with quda_mpi_init instead of MPI_Init."));
+}
+
+void openQCD_qudaInvertAsyncSetup(int id, double mu)
+{
+  check_mpi_init();
+  if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); }
+
+  if (qudaState.layout.h_sw != nullptr) {
+    qudaState.layout.h_sw();
+  } else {
+    WITH_COMM(errorQuda("qudaState.layout.h_sw is not set."));
+  }
+
+  qudaState.async_params = *static_cast<QudaInvertParam *>(openQCD_qudaSolverGetHandle(id));
+  qudaState.async_params.mu = mu;
+
+  if (!openQCD_qudaInvertParamCheck(&qudaState.async_params)) {
+    WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."));
+  }
+}
+
+void openQCD_qudaInvertAsyncDispatch(void *source, void *solution, int *status)
+{
+  check_mpi_init();
+
+  openQCD_qudaInvert_args_t args;
+  args.source = source;
+  args.solution = solution;
+  args.status = status;
+
+  qudaState.inv_args.push_back(args);
+}
+
+MPI_Comm openQCD_qudaInvertAsyncStart(void)
+{
+  check_mpi_init();
+
+  if (qudaState.thread.created == true)
+    WITH_COMM(errorQuda("Thread already created"));
+
+  MPI_Comm_split(qudaState.layout.world_comm, 1, 0, &qudaState.thread.main_comm);
+
+  int rc = pthread_create(&qudaState.thread.thread, nullptr, openQCD_qudaInvertAsyncWrapper, nullptr);
+  if (rc != 0) {
+    perror("Error in openQCD_qudaInvertStart");
+    WITH_COMM(errorQuda("pthread_create failed"));
+  }
+
+  /* wait for the worker threads to split the world_comm communicator properly */
+  pthread_mutex_lock(&qudaState.thread.mutex);
+  while (!qudaState.thread.ready) {
+    pthread_cond_wait(&qudaState.thread.cond, &qudaState.thread.mutex);
+  }
+  pthread_mutex_unlock(&qudaState.thread.mutex);
+
+  qudaState.thread.created = true;
+  return qudaState.thread.main_comm;
+}
+
+void openQCD_qudaInvertAsyncWait(double *residual)
+{
+  check_mpi_init();
+
+  if (!qudaState.thread.created)
+    WITH_COMM(errorQuda("openQCD_qudaInvertStart not called"));
+
+  int rc = pthread_join(qudaState.thread.thread, nullptr);
+  if (rc != 0) {
+    perror("Error in openQCD_qudaInvertWait");
+    WITH_COMM(errorQuda("pthread_join failed"));
+  }
+
+  /* reset the thread state properly */
+  MPI_Comm_free(&qudaState.thread.main_comm);
+  qudaState.thread.created = false;
+  qudaState.thread.ready = false;
+  qudaState.thread.cond = PTHREAD_COND_INITIALIZER;
+  qudaState.thread.mutex = PTHREAD_MUTEX_INITIALIZER;
+  qudaState.thread.worker_comm = MPI_COMM_NULL;
+  qudaState.thread.main_comm = MPI_COMM_NULL;
+
+  if (residual != nullptr) {
+    int i = 0;
+    for (openQCD_qudaInvert_args_t args : qudaState.inv_args) {
+      residual[i] = args.retval;
+      i++;
+    }
+  }
+
+  qudaState.inv_args.clear();
+}
+
+
+void openQCD_qudaSolverDestroy(int id)
+{
+  check_solver_id(id);
+  if (qudaState.inv_handles[id] != nullptr) {
+    QudaInvertParam *param = static_cast<QudaInvertParam *>(qudaState.inv_handles[id]);
+
+    if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); }
+
+    delete static_cast<openQCD_QudaSolver *>(param->additional_prop)->mg_param;
+    delete static_cast<openQCD_QudaSolver *>(param->additional_prop);
+    delete param;
+    qudaState.inv_handles[id] = nullptr;
+  }
+}
+
+void *openQCD_qudaEigensolverReadIn(int id, int solver_id)
+{
+  int my_rank;
+  QudaEigParam *param;
+
+  MPI_Comm_rank(qudaState.layout.world_comm, &my_rank);
+  QudaVerbosity verbosity = QUDA_SUMMARIZE;
+
+  /* Allocate on the heap */
+  if (qudaState.eig_handles[id] == nullptr) {
+    param = new QudaEigParam(newQudaEigParam());
+  } else {
+    param = static_cast<QudaEigParam *>(qudaState.eig_handles[id]);
+  }
+
+  if (my_rank == 0) {
+
+    KeyValueStore kv;
+    kv.set_map(&enum_map);
+    kv.load(qudaState.infile);
+
+    std::string section = "Eigensolver " + std::to_string(id);
+
+    verbosity = kv.get<QudaVerbosity>(section, "verbosity", verbosity);
+
+    if (verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); }
+
+    if (kv.get<std::string>(section, "solver") != "QUDA") {
+      WITH_COMM(errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n",
+                section.c_str(), qudaState.infile, kv.get<std::string>(section, "solver").c_str()));
+    }
+
+    param->eig_type = kv.get<QudaEigType>(section, "eig_type", param->eig_type);
+    param->use_poly_acc = kv.get<QudaBoolean>(section, "use_poly_acc", param->use_poly_acc);
+    param->poly_deg = kv.get<int>(section, "poly_deg", param->poly_deg);
+    param->a_min = kv.get<double>(section, "a_min", param->a_min);
+    param->a_max = kv.get<double>(section, "a_max", param->a_max);
+    param->preserve_deflation = kv.get<QudaBoolean>(section, "preserve_deflation", param->preserve_deflation);
+    /*param->*preserve_deflation_space = kv.get<void>(section, *"*preserve_deflation_space", param->preserve_deflation_space);*/
+    param->preserve_evals = kv.get<QudaBoolean>(section, "preserve_evals", param->preserve_evals);
+    param->use_dagger = kv.get<QudaBoolean>(section, "use_dagger", param->use_dagger);
+    param->use_norm_op = kv.get<QudaBoolean>(section, "use_norm_op", param->use_norm_op);
+    param->use_pc = kv.get<QudaBoolean>(section, "use_pc", param->use_pc);
+    param->use_eigen_qr = kv.get<QudaBoolean>(section, "use_eigen_qr", param->use_eigen_qr);
+    param->compute_svd = kv.get<QudaBoolean>(section, "compute_svd", param->compute_svd);
+    param->compute_gamma5 = kv.get<QudaBoolean>(section, "compute_gamma5", param->compute_gamma5);
+    param->require_convergence = kv.get<QudaBoolean>(section, "require_convergence", param->require_convergence);
+    param->spectrum = kv.get<QudaEigSpectrumType>(section, "spectrum", param->spectrum);
+    param->n_ev = kv.get<int>(section, "n_ev", param->n_ev);
+    param->n_kr = kv.get<int>(section, "n_kr", param->n_kr);
+    param->n_conv = kv.get<int>(section, "n_conv", param->n_conv);
+    param->n_ev_deflate = kv.get<int>(section, "n_ev_deflate", param->n_ev_deflate);
+    param->tol = kv.get<double>(section, "tol", param->tol);
+    param->qr_tol = kv.get<double>(section, "qr_tol", param->qr_tol);
+    param->check_interval = kv.get<int>(section, "check_interval", param->check_interval);
+    param->max_restarts = kv.get<int>(section, "max_restarts", param->max_restarts);
+    param->batched_rotate = kv.get<int>(section, "batched_rotate", param->batched_rotate);
+    param->block_size = kv.get<int>(section, "block_size", param->block_size);
+    param->arpack_check = kv.get<QudaBoolean>(section, "arpack_check", param->arpack_check);
+    strcpy(param->QUDA_logfile, kv.get<std::string>(section, "QUDA_logfile", param->QUDA_logfile).c_str());
+    strcpy(param->arpack_logfile, kv.get<std::string>(section, "arpack_logfile", param->arpack_logfile).c_str());
+
+    param->nk = kv.get<int>(section, "nk", param->nk);
+    param->np = kv.get<int>(section, "np", param->np);
+    param->import_vectors = kv.get<QudaBoolean>(section, "import_vectors", param->import_vectors);
+    param->cuda_prec_ritz = kv.get<QudaPrecision>(section, "cuda_prec_ritz", param->cuda_prec_ritz);
+    param->mem_type_ritz = kv.get<QudaMemoryType>(section, "mem_type_ritz", param->mem_type_ritz);
+    param->location = kv.get<QudaFieldLocation>(section, "location", param->location);
+    param->run_verify = kv.get<QudaBoolean>(section, "run_verify", param->run_verify);
+    /*strcpy(param->vec_infile, kv.get<std::string>(section, "vec_infile", param->vec_infile).c_str());*/
+    /*strcpy(param->vec_outfile, kv.get<std::string>(section, "vec_outfile", param->vec_outfile).c_str());*/
+    param->vec_outfile[0] = '\0';
+    param->vec_infile[0] = '\0';
+    param->save_prec = kv.get<QudaPrecision>(section, "save_prec", param->save_prec);
+    param->io_parity_inflate = kv.get<QudaBoolean>(section, "io_parity_inflate", param->io_parity_inflate);
+    param->extlib_type = kv.get<QudaExtLibType>(section, "extlib_type", param->extlib_type);
+  }
+
+  /* transfer of the struct to all the processes */
+  MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, qudaState.layout.world_comm);
+
+  void *inv_param = openQCD_qudaSolverGetHandle(solver_id);
+  param->invert_param = static_cast<QudaInvertParam *>(inv_param);
+
+  param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity);
+
+  if (solver_id != -1 && param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) {
+    WITH_COMM(printQudaInvertParam(param->invert_param));
+  }
+
+  if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { WITH_COMM(printQudaEigParam(param)); }
+
+  return (void *)param;
+}
+
+void *openQCD_qudaEigensolverGetHandle(int id, int solver_id)
+{
+  check_eigensolver_id(id);
+  check_solver_id(solver_id);
+
+  if (qudaState.eig_handles[id] == nullptr) {
+    WITH_COMM(logQuda(QUDA_VERBOSE, "Read in eigensolver parameters from file %s for eigensolver (id=%d)\n", qudaState.infile, id));
+    qudaState.eig_handles[id] = openQCD_qudaEigensolverReadIn(id, solver_id);
+  }
+
+  openQCD_qudaSolverUpdate(static_cast<QudaEigParam *>(qudaState.eig_handles[id])->invert_param);
+  return qudaState.eig_handles[id];
+}
+
+void openQCD_qudaEigensolverPrintSetup(int id, int solver_id)
+{
+  check_eigensolver_id(id);
+  check_solver_id(solver_id);
+  if (!in_comm()) { return; }
+
+  if (qudaState.eig_handles[id] != nullptr) {
+    QudaEigParam *param = static_cast<QudaEigParam *>(qudaState.eig_handles[id]);
+    printQudaEigParam(param);
+    printfQuda("\n");
+    openQCD_qudaSolverPrintSetup(solver_id);
+  } else {
+    printfQuda("<Eigensolver is not initialized yet>\n");
+  }
+}
+
+void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals)
+{
+  if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); }
+
+  if (qudaState.layout.h_sw != nullptr) {
+    qudaState.layout.h_sw();
+  } else {
+    WITH_COMM(errorQuda("qudaState.layout.h_sw is not set."));
+  }
+
+  QudaEigParam *eig_param = static_cast<QudaEigParam *>(openQCD_qudaEigensolverGetHandle(id, solver_id));
+
+  if (!openQCD_qudaInvertParamCheck(eig_param->invert_param)) {
+    WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."));
+  }
+
+  int N = eig_param->n_conv*(eig_param->compute_svd ? 2 : 1);
+  void **evecs = (void**) malloc(N*sizeof(void*));
+  for (int i = 0; i < N; ++i) {
+    evecs[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i, h_evecs[i]);
+    qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, h_evecs[i], evecs[i]);
+  }
+
+  WITH_COMM(logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"));
+  PUSH_RANGE("eigensolveQuda", 6);
+  WITH_COMM(eigensolveQuda(evecs, static_cast<openqcd_complex_dble *>(h_evals), eig_param));
+  POP_RANGE;
+
+  WITH_COMM(logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"));
+  WITH_COMM(logQuda(QUDA_SUMMARIZE, "  gflops      = %.2e\n", eig_param->invert_param->gflops));
+  WITH_COMM(logQuda(QUDA_SUMMARIZE, "  secs        = %.2e\n", eig_param->invert_param->secs));
+  WITH_COMM(logQuda(QUDA_SUMMARIZE, "  iter        = %d\n", eig_param->invert_param->iter));
+
+  if (!qudaState.init.two_grids_equal)
+    MPI_Bcast(h_evals, 2*N, MPI_DOUBLE, 0, qudaState.layout.world_comm);
+
+  for (int i = 0; i < N; ++i) {
+    qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, evecs[i], h_evecs[i]);
+  }
+
+  free(evecs);
+}
+
+void openQCD_qudaEigensolverDestroy(int id)
+{
+  check_eigensolver_id(id);
+
+  if (qudaState.eig_handles[id] != nullptr) {
+    QudaEigParam *eig_param = static_cast<QudaEigParam *>(qudaState.eig_handles[id]);
+
+    delete eig_param;
+    qudaState.eig_handles[id] = nullptr;
+  }
+}
+
+bool openQCD_qudaQCDPlusQEDEnabled(void) {
+#ifdef BUILD_QCD_PLUS_QED
+  return true;
+#else
+  return false;
+#endif
+}
+
+#undef WITH_COMM
diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90
index 2f52d56aae..494d77f136 100644
--- a/lib/quda_fortran.F90
+++ b/lib/quda_fortran.F90
@@ -368,6 +368,9 @@ module quda_fortran
      ! The t0 parameter for distance preconditioning, the timeslice where the source is located
      integer(4) distance_pc_t0
 
+     ! Additional user-defined properties
+     integer(8) :: additional_prop
+
   end type quda_invert_param
 
 end module quda_fortran
diff --git a/lib/targets/cuda/comm_target.cpp b/lib/targets/cuda/comm_target.cpp
index bd9fa1fd3e..89bc7f2a38 100644
--- a/lib/targets/cuda/comm_target.cpp
+++ b/lib/targets/cuda/comm_target.cpp
@@ -75,8 +75,11 @@ namespace quda
     // TODO: We maybe can force loopback comms to use the IB path here
     if (comm_dim(dim) == 1) continue;
 #endif
-    // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this
-    const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2;
+    // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this
+    const int num_dir
+      = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ?
+      1 :
+      2;
     for (int dir = 0; dir < num_dir; dir++) {
       remote[dim][dir] = nullptr;
 #ifndef NVSHMEM_COMMS
diff --git a/lib/targets/hip/comm_target.cpp b/lib/targets/hip/comm_target.cpp
index cbb4725253..30812d4440 100644
--- a/lib/targets/hip/comm_target.cpp
+++ b/lib/targets/hip/comm_target.cpp
@@ -69,9 +69,11 @@ namespace quda
     // open the remote memory handles and set the send ghost pointers
     for (int dim = 0; dim < 4; ++dim) {
       if (comm_dim(dim) == 1) continue;
-      // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this
-      const int num_dir
-        = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2;
+      // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this
+      const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim)
+                           && comm_peer2peer_enabled(1, dim)) ?
+        1 :
+        2;
       for (int dir = 0; dir < num_dir; ++dir) {
         remote[dim][dir] = nullptr;
         if (!comm_peer2peer_enabled(dir, dim)) continue;
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 3f04cfcd9c..7e2bbd8078 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -121,11 +121,7 @@ void init(int argc, char **argv)
 {
   // Set QUDA's internal parameters
   gauge_param = newQudaGaugeParam();
-  if (use_split_gauge_bkup == 1) {
-    gauge_param.use_split_gauge_bkup = true;
-  } else {
-    gauge_param.use_split_gauge_bkup = false;
-  }
+  gauge_param.use_split_gauge_bkup = use_split_gauge_bkup == 1;
   setWilsonGaugeParam(gauge_param);
 
   inv_param = newQudaInvertParam();
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 1f5c7c726e..7e10bc8b6f 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -140,11 +140,7 @@ void init()
 {
   // Set QUDA internal parameters
   gauge_param = newQudaGaugeParam();
-  if (use_split_gauge_bkup == 1) {
-    gauge_param.use_split_gauge_bkup = true;
-  } else {
-    gauge_param.use_split_gauge_bkup = false;
-  }
+  gauge_param.use_split_gauge_bkup = use_split_gauge_bkup == 1;
   setStaggeredGaugeParam(gauge_param);
   QudaGaugeSmearParam smear_param;
   if (gauge_smear) {