diff --git a/CMakeLists.txt b/CMakeLists.txt index c3fa2a949c..7e93a258de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,9 +166,12 @@ option(QUDA_ARPACK_LOGGING "enable ARPACK logging (not available for NG)" OFF) # OpenBLAS option(QUDA_OPENBLAS "enable OpenBLAS" OFF) +option(QUDA_QCD_PLUS_QED "Enable QCD+QED features (builds QUDA_RECONSTRUCT_9/13 instead of 8/12 for Wilson-type operators)" OFF) + # Interface options option(QUDA_INTERFACE_QDP "build qdp interface" ON) option(QUDA_INTERFACE_MILC "build milc interface" ON) +option(QUDA_INTERFACE_OPENQCD "build OpenQCD interface (for QCD+QED support, see QUDA_QCD_PLUS_QED)" OFF) option(QUDA_INTERFACE_CPS "build cps interface" OFF) option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF) option(QUDA_INTERFACE_BQCD "build bqcd interface" OFF) @@ -306,6 +309,13 @@ if(QUDA_MPI AND QUDA_QMP) "Specifying QUDA_QMP and QUDA_MPI might result in undefined behavior. If you intend to use QMP set QUDA_MPI=OFF.") endif() +if(QUDA_INTERFACE_OPENQCD AND QUDA_QMP) + message(SEND_ERROR "OpenQCD does not support QMP comms.") +endif() + +if(QUDA_INTERFACE_OPENQCD AND NOT (QUDA_MPI)) + message(SEND_ERROR "OpenQCD requires QUDA built with MPI comms.") +endif() if(QUDA_NVSHMEM AND NOT (QUDA_QMP OR QUDA_MPI)) message(SEND_ERROR "Specifying QUDA_NVSHMEM requires either QUDA_QMP or QUDA_MPI.") diff --git a/README.md b/README.md index 274f708b3e..3e5e9bb62f 100644 --- a/README.md +++ b/README.md @@ -265,6 +265,7 @@ Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]]. * Joel Giedt (Rensselaer Polytechnic Institute) * Steven Gottlieb (Indiana University) * Anthony Grebe (Fermilab) +* Roman Gruber (ETH) * Kyriakos Hadjiyiannakou (Cyprus) * Ben Hoerz (Intel) * Leon Hostetler (Indiana University) diff --git a/include/clover_field.h b/include/clover_field.h index 32fffd2973..f50a5ae253 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -166,6 +166,8 @@ namespace quda { inverse(param.inverse), clover(param.clover), cloverInv(param.cloverInv), + csw(param.csw), + coeff(param.coeff), twist_flavor(param.twist_flavor), mu2(param.mu2), epsilon2(param.epsilon2), diff --git a/include/clover_field_order.h b/include/clover_field_order.h index af1a8d73c8..dc9a92084a 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -614,7 +615,11 @@ namespace quda { errorQuda("Accessor reconstruct = %d does not match field reconstruct %d", enable_reconstruct, clover.Reconstruct()); if (clover.max_element(is_inverse) == 0.0 && isFixed::value) +#ifdef BUILD_OPENQCD_INTERFACE + warningQuda("%p max_element(%d) appears unset", &clover, is_inverse); /* ignore if the SW-field is zero */ +#else errorQuda("%p max_element(%d) appears unset", &clover, is_inverse); +#endif if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover); this->clover = clover_ ? clover_ : clover.data(is_inverse); } @@ -983,6 +988,97 @@ namespace quda { size_t Bytes() const { return length*sizeof(Float); } }; + /** + * OpenQCD ordering for clover fields + */ + template struct OpenQCDOrder { + static constexpr bool enable_reconstruct = false; + typedef typename mapper::type RegType; + Float *clover; + const int volumeCB; + const QudaTwistFlavorType twist_flavor; + const Float mu2; + const Float epsilon2; + const double coeff; + const double csw; + const double kappa; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + + OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) : + volumeCB(clover.Stride()), + twist_flavor(clover.TwistFlavor()), + mu2(clover.Mu2()), + epsilon2(clover.Epsilon2()), + coeff(clover.Coeff()), + csw(clover.Csw()), + kappa(clover.Coeff() / clover.Csw()), + dim {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt + L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]} // *local* lattice dimensions, txyz + { + if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { + errorQuda("Invalid clover order %d for this accessor", clover.Order()); + } + this->clover = clover_ ? clover_ : clover.data(inverse); + if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { errorQuda("Neither coeff nor csw may be zero!"); } + } + + QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } + Float Mu2() const { return mu2; } + Float Epsilon2() const { return epsilon2; } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x_cb Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getCloverOffset(int x_cb, int parity) const + { + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; + } + + /** + * @brief Load a clover field at lattice point x_cb + * + * @param v The output clover matrix in QUDA order + * @param x_cb The checkerboarded lattice site + * @param parity The parity of the lattice site + */ + __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const + { + int sign[36] = {-1, -1, -1, -1, -1, -1, // diagonals (idx 0-5) + -1, +1, -1, +1, -1, -1, -1, -1, -1, -1, // column 0 (idx 6-15) + -1, +1, -1, -1, -1, -1, -1, -1, // column 1 (idx 16-23) + -1, -1, -1, -1, -1, -1, // column 2 (idx 24-29) + -1, +1, -1, +1, // column 3 (idx 30-33) + -1, +1}; // column 4 (idx 34-35) + int map[36] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 24, 25, 16, 17, + 12, 13, 20, 21, 26, 27, 14, 15, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35}; + const int M = length / 2; + int offset = getCloverOffset(x_cb, parity); + auto Ap = &clover[offset]; // A_+ + auto Am = &clover[offset + M]; // A_- + +#pragma unroll + for (int i = 0; i < M; i++) { + v[i] = sign[i] * (kappa * Am[map[i]] - (i < 6)); + v[M + i] = sign[i] * (kappa * Ap[map[i]] - (i < 6)); + } + } + + // FIXME implement the save routine for OpenQCD ordered fields + __device__ __host__ inline void save(RegType[length], int, int) const { } + + size_t Bytes() const { return length * sizeof(Float); } + }; + } // namespace clover // Use traits to reduce the template explosion diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 6362b4677b..afca84c105 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -207,6 +207,9 @@ namespace quda } else if (inv_param.dirac_order == QUDA_TIFR_PADDED_DIRAC_ORDER) { fieldOrder = QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER; siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { + fieldOrder = QUDA_OPENQCD_FIELD_ORDER; + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 46ad849079..2c46c23ea9 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1759,6 +1759,85 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; + /** + * struct to define order of spinor fields in OpenQCD + * + * @tparam Float Underlying type of data (precision) + * @tparam Ns Number of spin degrees of freedom + * @tparam Nc Number of color degrees of freedom + */ + template struct OpenQCDDiracOrder { + using Accessor = OpenQCDDiracOrder; + using real = typename mapper::type; + using complex = complex; + + static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field + Float *field; + size_t offset; + Float *ghost[8]; + int volumeCB; + int faceVolumeCB[4]; + int nParity; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + + OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : + field(field_ ? field_ : a.data()), + offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? + volumeCB(a.VolumeCB()), + nParity(a.SiteSubset()), + dim {a.X(0), a.X(1), a.X(2), a.X(3)}, // *local* lattice dimensions, xyzt + L {a.X(3), a.X(0), a.X(1), a.X(2)} // *local* lattice dimensions, txyz + { + if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } + } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getSpinorOffset(int x_cb, int parity) const + { + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; + } + + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int parity = 0) const + { + auto in = &field[getSpinorOffset(x_cb, parity)]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int parity = 0) const + { + auto out = &field[getSpinorOffset(x_cb, parity)]; + block_store(reinterpret_cast(out), v); + } + + /** + @brief This accessor routine returns a colorspinor_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a colorspinor_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int x_cb, int parity) const + { + return colorspinor_wrapper(*this, x_cb, parity); + } + + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + }; // openQCDDiracOrder + } // namespace colorspinor template diff --git a/include/comm_quda.h b/include/comm_quda.h index 8496efab5a..d33e949bdf 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -49,6 +49,13 @@ namespace quda */ int comm_dim(int dim); + /** + Return whether the dimension dim is a C* dimension or not + @param dim Dimension which we are querying + @return C* dimension or nor + */ + bool comm_dim_cstar(int dim); + /** Return the global number of processes in the dimension dim @param dim Dimension which we are querying diff --git a/include/communicator_quda.h b/include/communicator_quda.h index bf0e9ffba5..c5ddbe1f75 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -41,6 +41,7 @@ namespace quda int (*coords)[QUDA_MAX_DIM]; int my_rank; int my_coords[QUDA_MAX_DIM]; + int cstar; // number of C* direction as per openQxD convention // It might be worth adding communicators to allow for efficient reductions: // #if defined(MPI_COMMS) // MPI_Comm comm; @@ -126,9 +127,26 @@ namespace quda inline int comm_rank_displaced(const Topology *topo, const int displacement[]) { int coords[QUDA_MAX_DIM]; - - for (int i = 0; i < QUDA_MAX_DIM; i++) { - coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0; + int shift_integer; + + int Nx_displacement = 0; + for (int i = QUDA_MAX_DIM - 1; i >= 0; i--) { + // cstar shift[x] shift[y] shift[z] shift[t] + // 0 0 0 0 0 + // 1 0 0 0 0 + // 2 0 1 0 0 + // 3 0 1 1 0 + if (i < topo->ndim && ((i == 1 && topo->cstar >= 2) || (i == 2 && topo->cstar >= 3))) { + // if we go over the boundary and have a shifted boundary condition, + // we shift Nx/2 ranks in x-direction: + // shift_integer in { 0, 1, 2} + // (shift_integer - 1) in {-1, 0, 1} + shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i]; + Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0] / 2); + } + coords[i] = (i < topo->ndim) ? + mod(comm_coords(topo)[i] + displacement[i] + (i == 0 ? Nx_displacement : 0), comm_dims(topo)[i]) : + 0; } return comm_rank_from_coords(topo, coords); @@ -390,6 +408,12 @@ namespace quda return comm_dims(topo)[dim]; } + bool comm_dim_cstar(int dim) + { + Topology *topo = comm_default_topology(); + return (topo->cstar >= 2 && dim == 1) || (topo->cstar >= 3 && dim == 2); + } + int comm_coord(int dim) { Topology *topo = comm_default_topology(); diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 2ceb4f5b70..7372c956e3 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -1133,6 +1133,15 @@ namespace quda { virtual QudaDiracType getDiracType() const override { return QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC; } }; + /** + * @brief Applies gamma matrices to spinor fields + * + * @param[out] out Output field + * @param[in] in Input field + * @param[in] dir Direction index of gamma matrix + */ + void ApplyGamma(cvector_ref &out, cvector_ref &in, QudaGammaDirection_s dir); + void gamma5(cvector_ref &out, cvector_ref &in); /** diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index 4f6d16511e..6ef66e17bf 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -255,10 +255,11 @@ namespace quda /** @brief Computes Left/Right SVD from pre computed Right/Left - @param[in] evecs Computed eigenvectors of NormOp - @param[in] evals Computed eigenvalues of NormOp + @param[in,out] evecs Computed eigenvectors of NormOp + @param[in,out] evals Computed eigenvalues of NormOp + @param[in] dagger Whether NormOp was MdagM (false) or MMdag (true) */ - void computeSVD(std::vector &evecs, std::vector &evals); + void computeSVD(std::vector &evecs, std::vector &evals, bool dagger = false); /** @brief Compute eigenvalues and their residiua diff --git a/include/enum_quda.h b/include/enum_quda.h index 48d79de463..1de9c3be35 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -48,6 +48,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order + QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; @@ -252,14 +253,16 @@ typedef enum QudaDiracFieldOrder_s { QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, color inside spin QUDA_LEX_DIRAC_ORDER, // lexicographical order, color inside spin QUDA_TIFR_PADDED_DIRAC_ORDER, // padded z dimension for TIFR RHMC code + QUDA_OPENQCD_DIRAC_ORDER, // openqcd QUDA_INVALID_DIRAC_ORDER = QUDA_INVALID_ENUM } QudaDiracFieldOrder; typedef enum QudaCloverFieldOrder_s { - QUDA_NATIVE_CLOVER_ORDER, // even-odd, Fload-N ordering - QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed - QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime - QUDA_BQCD_CLOVER_ORDER, // even-odd, super-diagonal packed and reordered + QUDA_NATIVE_CLOVER_ORDER, // even-odd, Fload-N ordering + QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed + QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime + QUDA_BQCD_CLOVER_ORDER, // even-odd, super-diagonal packed and reordered + QUDA_OPENQCD_CLOVER_ORDER, // openqcd QUDA_INVALID_CLOVER_ORDER = QUDA_INVALID_ENUM } QudaCloverFieldOrder; @@ -350,6 +353,7 @@ typedef enum QudaFieldOrder_s { QUDA_QDPJIT_FIELD_ORDER, // QDP field ordering (complex-color-spin-spacetime) QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, // QOP domain-wall ordering QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering + QUDA_OPENQCD_FIELD_ORDER, // OPENQCD ordering QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM } QudaFieldOrder; @@ -363,16 +367,26 @@ typedef enum QudaFieldCreate_s { } QudaFieldCreate; typedef enum QudaGammaBasis_s { // gamj=((top 2 rows)(bottom 2 rows)) s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity - QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0)) gam5=((1,0)(0,-1)) - QUDA_UKQCD_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,1)(1,0)) - QUDA_CHIRAL_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((-1,0)(0,1)) - QUDA_DIRAC_PAULI_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0)) + QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0)) gam5=((1,0)(0,-1)) + QUDA_UKQCD_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,1)(1,0)) + QUDA_CHIRAL_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0)) gam5=((-1,0)(0,1)) + QUDA_OPENQCD_GAMMA_BASIS, // gam1=((0,-i*s3)(i*s3,0)) gam2=((0,-1)(-1,0)) gam3=((0,-s1)(s1,0)) gam4=((0,-i*s2)(i*s2,0)) gam5=((1,0)(0,-1)) + QUDA_DIRAC_PAULI_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0)) QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM // gam5=gam1*gam2*gam3*gam4 } QudaGammaBasis; // Dirac-Pauli -> DeGrand-Rossi T = i/sqrt(2)*((s2,-s2)(s2,s2)) field_DR = T * field_DP // UKQCD -> DeGrand-Rossi T = i/sqrt(2)*((-s2,-s2)(-s2,s2)) field_DR = T * field_UK // Chiral -> DeGrand-Rossi T = i*((0,-s2)(s2,0)) field_DR = T * field_chiral +typedef enum QudaGammaDirection_s { + QUDA_GAMMA_X, + QUDA_GAMMA_Y, + QUDA_GAMMA_Z, + QUDA_GAMMA_T, + QUDA_GAMMA_5, + QUDA_INVALID_GAMMA_INDEX = QUDA_INVALID_ENUM +} QudaGammaDirection; + typedef enum QudaSourceType_s { QUDA_POINT_SOURCE, QUDA_RANDOM_SOURCE, @@ -653,4 +667,3 @@ typedef enum QudaUpdateSplitGauge_s { #ifdef __cplusplus } #endif - diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 832c2a31a2..33fa5a9ad8 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -42,6 +42,7 @@ #define QUDA_BQCD_GAUGE_ORDER 6 // expect *gauge mu even-odd spacetime+halos row-column order #define QUDA_TIFR_GAUGE_ORDER 7 #define QUDA_TIFR_PADDED_GAUGE_ORDER 8 +#define QUDA_OPENQCD_GAUGE_ORDER 9 #define QUDA_INVALID_GAUGE_ORDER QUDA_INVALID_ENUM #define QudaTboundary integer(4) @@ -224,13 +225,15 @@ #define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin #define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) -#define QUDA_NATIVE_CLOVER_ORDER 0 // even-odd FloatN ordering -#define QUDA_PACKED_CLOVER_ORDER 1 // even-odd packed -#define QUDA_QDPJIT_CLOVER_ORDER 2 // lexicographical order packed -#define QUDA_BQCD_CLOVER_ORDER 3 // BQCD order which is a packed super-diagonal form +#define QUDA_NATIVE_CLOVER_ORDER 0 // even-odd FloatN ordering +#define QUDA_PACKED_CLOVER_ORDER 1 // even-odd packed +#define QUDA_QDPJIT_CLOVER_ORDER 2 // lexicographical order packed +#define QUDA_BQCD_CLOVER_ORDER 3 // BQCD order which is a packed super-diagonal form +#define QUDA_OPENQCD_CLOVER_ORDER 4 // openqcd #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM #define QudaVerbosity integer(4) @@ -313,6 +316,7 @@ #define QUDA_QDPJIT_FIELD_ORDER 3 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 4 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 5 // TIFR RHMC ordering +#define QUDA_OPENQCD_FIELD_ORDER 6 // openQCD ordering #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM #define QudaFieldCreate integer(4) @@ -330,6 +334,14 @@ #define QUDA_DIRAC_PAULI_GAMMA_BASIS 3 #define QUDA_INVALID_GAMMA_BASIS QUDA_INVALID_ENUM +#define QudaGammaDirection integer(4) +#define QUDA_GAMMA_X 0 +#define QUDA_GAMMA_Y 1 +#define QUDA_GAMMA_Z 2 +#define QUDA_GAMMA_T 3 +#define QUDA_GAMMA_5 4 +#define QUDA_INVALID_GAMMA_DIRECTION QUDA_INVALID_ENUM + #define QudaSourceType integer(4) #define QUDA_POINT_SOURCE 0 #define QUDA_RANDOM_SOURCE 1 diff --git a/include/gamma.cuh b/include/gamma.cuh index ed5474711f..bad953f631 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -26,10 +26,10 @@ namespace quda { Gamma(const Gamma &g) = default; __device__ __host__ inline int getcol(int row) const { - if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) { - switch(dir) { - case 0: - case 1: + if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: + case 1: switch(row) { case 0: return 3; case 1: return 2; @@ -54,11 +54,11 @@ namespace quda { case 3: return 3; } break; - } + } } else { - switch(dir) { - case 0: - case 1: + switch (dir) { + case 0: + case 1: switch(row) { case 0: return 3; case 1: return 2; @@ -90,7 +90,7 @@ namespace quda { case 3: return 1; } break; - } + } } return 0; } @@ -201,7 +201,51 @@ namespace quda { } break; } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: /* corresponds to gamma1 in OpenQCD convention */ + switch (row) { + case 0: + case 1: return -I; + case 2: + case 3: return I; + } + break; + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -1; + case 1: + case 2: return 1; + } + break; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return -I; + case 1: + case 2: return I; + } + break; + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -1; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return 1; + case 2: + case 3: return -1; + } + break; + } } + return 0; } @@ -283,7 +327,51 @@ namespace quda { } break; } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: /* gamma1 in openQCD convention */ + switch (row) { + case 0: + case 1: return complex(a.imag(), -a.real()); // I + case 2: + case 3: return complex(-a.imag(), a.real()); // -I + } + break; + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -a; + case 1: + case 2: return a; + } + break; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return complex(a.imag(), -a.real()); // I + case 1: + case 2: return complex(-a.imag(), a.real()); // -I + } + break; + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -a; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return a; + case 2: + case 3: return -a; + } + break; + } } + return a; } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 4561f1f21f..827dde5bbf 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2433,6 +2433,99 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; + /** + * struct to define order of gauge fields in OpenQCD + */ + template struct OpenQCDOrder : LegacyOrder { + + using Accessor = OpenQCDOrder; + using real = typename mapper::type; + using complex = complex; + + Float *gauge; + const int volumeCB; + static constexpr int Nc = 3; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + const int nproc[4]; + + OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : (Float *)u.data()), // pointer to the gauge field on CPU + volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt + L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz + nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)} // txyz + { + if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); } + } + + /** + * @brief Obtains the offset in Floats from the openQCD base pointer + * to the gauge fields. + * + * @param[in] x_cb Checkerboard index coming from quda + * @param[in] dir The direction coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const + { + int quda_x[4], x[4]; + getCoords(quda_x, x_cb, dim, parity); // x_quda = quda local lattice coordinates + openqcd::rotate_coords(quda_x, x); // x = openQCD local lattice coordinates + + int mu = (dir + 1) % 4; // mu = openQCD direction + int ix = openqcd::ipt(x, L); + int iz = openqcd::iup(x, mu, L, nproc); + int ofs = 0; + int volume = openqcd::vol(L); + + if (ix < volume / 2) { // ix even -> iz odd + if (iz < volume) { // iz in interior + ofs = 8 * (iz - volume / 2) + 2 * mu + 1; + } else { + int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc) / 2; // iz in exterior + ofs = 4 * volume + openqcd::face_offset(L, nproc, mu) + ib; + } + } else if (volume / 2 <= ix && ix < volume) { // ix odd + ofs = 8 * (ix - volume / 2) + 2 * mu; + } + + return ofs * length; + } + + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int dir, int parity, Float = 1.0) const + { + auto in = &gauge[getGaugeOffset(x_cb, dir, parity)]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int dir, int parity) const + { + auto out = &gauge[getGaugeOffset(x_cb, dir, parity)]; + block_store(reinterpret_cast(out), v); + } + + /** + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dim Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } + + size_t Bytes() const { return 2 * Nc * Nc * sizeof(Float); } + }; // class OpenQCDOrder + } // namespace gauge template @@ -2477,5 +2570,8 @@ namespace quda { template struct gauge_order_mapper { typedef gauge::FloatNOrder type; }; + template struct gauge_order_mapper { + typedef gauge::OpenQCDOrder type; + }; } // namespace quda diff --git a/include/index_helper.cuh b/include/index_helper.cuh index db58c0daed..35ec4bd0e5 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1117,6 +1117,295 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + * These are index helper functions used in the order classes of openQCD, i.e. + * + * - OpenQCDOrder in quda:include/gauge_field_order.h + * - OpenQCDDiracOrder in quda:include/color_spinor_field_order.h + * - OpenQCDOrder in quda:include/clover_field_order.h. + * + * The main helper functions are ipt() and iup(), giving pure function + * implementations of the ipt[] and iup[][] arrays (see + * https://doi.org/10.22323/1.466.0280) that are needed to calculate the correct offsets + * of the fields base pointers. + */ + namespace openqcd + { + + /** + * @brief Returns the surface in direction mu + * + * @param X Extent in all 4 directions + * @param[in] mu Direction + * + * @return Surface + */ + __device__ __host__ inline int surface(const int X[4], const int mu) + { + if (mu == 0) { + return X[1] * X[2] * X[3]; + } else if (mu == 1) { + return X[0] * X[2] * X[3]; + } else if (mu == 2) { + return X[0] * X[1] * X[3]; + } + return X[0] * X[1] * X[2]; + } + + /** + * @brief Return BNDRY (see openqcd:include/global.h) + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * + * @return BNDRY + */ + __device__ __host__ inline int bndry(const int L[4], const int nproc[4]) + { + return 2 + * (((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + ((1 - (nproc[3] % 2)) * surface(L, 3))); + } + + /** + * @brief Calculate the offset needed for boundary points in openQCD. + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ + __device__ __host__ inline int ifc(const int L[4], const int nproc[4], const int mu) + { + if (mu == 0) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + (((1 - (nproc[1] % 2)) * surface(L, 1)) / 2); + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + (((1 - (nproc[2] % 2)) * surface(L, 2)) / 2); + } + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + (((1 - (nproc[3] % 2)) * surface(L, 3)) / 2); + } + + /** + * @brief Calculate the offset of the faces in openQCD. + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ + __device__ __host__ inline int face_offset(const int L[4], const int nproc[4], const int mu) + { + if (mu == 0) { + return 0; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2; + } + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2 + + ((1 - (nproc[2] % 2)) * surface(L, 2)) / 2; + } + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int x_quda[4], int x_openQCD[4]) + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + /** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. + * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) + { + int i, ix = x[0]; + +#pragma unroll + for (i = 1; i < Ndims; i++) { ix = (X[i] * ix + x[i]); } + return ix; + } + + /** + * @brief Return the volume + * + * @param[in] X Integer array of 4 dimensions + * + * @return Volume + */ + __device__ __host__ inline int vol(const int X[4]) { return X[0] * X[1] * X[2] * X[3]; } + + /** + * @brief Return cbs[]. This is the cache block size in openQCD, which + * the local lattice is divided into. + * + * @param[in] mu Direction + * @param[in] X Extents + * + * @return cbs + */ + __device__ __host__ inline int setup_cbs(const int mu, const int X[4]) + { + if (mu == 0) { + return X[0]; + } else if ((X[mu] % 4) == 0) { + return 4; + } else if ((X[mu] % 3) == 0) { + return 3; + } else if ((X[mu] % 2) == 0) { + return 2; + } else { + return 1; + } + } + + /** + * @brief Pure function to return ipt[iy], where + * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the + * ipt-array, but calculating the index on the fly. Notice that xi + * and Li are in openQCD (txyz) convention. If they come from + * QUDA, you have to rotate them first (see rotate_coords). + * + * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < L[i] + * @param[in] L Local lattice extents + * + * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of + * openQCD + */ + __device__ __host__ inline int ipt(const int x[4], const int L[4]) + { + int xb[4], xn[4]; + + // openQCDs cache block size in txyz convention + int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; + + // openQCDs cache block grid in txyz convention + int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]}; + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[0] = x[0] / cbs[0]; + xn[1] = x[1] / cbs[1]; + xn[2] = x[2] / cbs[2]; + xn[3] = x[3] / cbs[3]; + + return (lexi(xb, cbs, 4) / 2 + vol(cbs) * lexi(xn, cbn, 4) / 2 + + ((x[0] + x[1] + x[2] + x[3]) % 2 != 0) * (vol(L) / 2) // odd -> +VOLUME/2 + ); + } + + /** + * @brief Determines the number of boundary points in direction mu prior to + * the Carthesian index x with dimensions X + * + * @param[in] mu Direction + * @param[in] x Lattice point + * @param[in] X Dimensions of the lattice/block + * + * @return Number of prior boundary points + */ + __device__ __host__ inline int boundary_pts(const int mu, const int x[4], const int X[4]) + { + int ret = 0; + + if (mu == 3) { + ret = lexi(x, X, 3); // lexi without x[3] + } else if (mu == 2) { + ret = X[3] * lexi(x, X, 2); + if (x[2] == (X[2] - 1)) { + ret += x[3]; // lexi without x[2] + } + } else if (mu == 1) { + if (x[1] == (X[1] - 1)) { + ret = X[2] * X[3] * x[0] + X[3] * x[2] + x[3]; // lexi without x[1] + } else { + ret = surface(X, 1); + } + } else if (mu == 0) { + if (x[0] == (X[0] - 1)) { + ret = lexi(x + 1, X + 1, 3); // lexi without x[0] + } else { + ret = surface(X, 0); + } + } + + return ret; + } + + /** + * @brief Pure implementation of iup[ix][mu]. Returns neighbouring + * point of ix in positive mu direction. + * + * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, + * length 4 in txyz convention + * @param[in] mu Direction in txyz convention + * @param[in] L Local lattice extents, length 4 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * + * @return iup[ix][mu] + */ + __device__ __host__ inline int iup(const int x[4], const int mu, const int L[4], const int nproc[4]) + { + int i, ret, xb[4], xn[4]; + + if ((x[mu] == (L[mu] - 1)) && (nproc[mu] > 1)) { + + int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; + int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]}; + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[0] = x[0] / cbs[0]; + xn[1] = x[1] / cbs[1]; + xn[2] = x[2] / cbs[2]; + xn[3] = x[3] / cbs[3]; + + ret = vol(L) + ifc(L, nproc, mu); + if ((x[0] + x[1] + x[2] + x[3]) % 2 == 0) { ret += bndry(L, nproc) / 2; } + + ret += surface(cbs, mu) * boundary_pts(mu, xn, cbn) / 2; + ret += boundary_pts(mu, xb, cbs) / 2; + return ret; + + } else { +#pragma unroll + for (i = 0; i < 4; i++) { xb[i] = x[i]; } + + xb[mu] = (xb[mu] + 1) % (L[mu] * nproc[mu]); + return ipt(xb, L); + } + } + + } // namespace openqcd + /** @brief Compute the flattened 4-d index from a separate T and flattened 3-d XYZ index. diff --git a/include/instantiate.h b/include/instantiate.h index 9b40ede127..2b12b184e4 100644 --- a/include/instantiate.h +++ b/include/instantiate.h @@ -68,6 +68,11 @@ namespace quda template <> constexpr bool is_enabled() { return false; } template <> constexpr bool is_enabled() { return false; } #endif +#ifdef BUILD_OPENQCD_INTERFACE + template <> constexpr bool is_enabled() { return true; } +#else + template <> constexpr bool is_enabled() { return false; } +#endif /** @brief Helper function for returning if a given precision is enabled @@ -140,8 +145,13 @@ namespace quda }; struct ReconstructWilson { +#ifdef BUILD_QCD_PLUS_QED + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else static constexpr std::array recon = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif }; struct ReconstructStaggered { @@ -610,6 +620,21 @@ namespace quda template <> constexpr bool is_enabled() { return true; } #endif + struct WilsonReconstruct { +#ifdef BUILD_QCD_PLUS_QED + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif + }; + + struct StaggeredReconstruct { + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; + }; + #ifdef GPU_DISTANCE_PRECONDITIONING constexpr bool is_enabled_distance_precondition() { return true; } #else diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 505163cc93..c0fadbb376 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -220,6 +220,60 @@ namespace quda } }; + /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis): + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + [ 0 -1 0 1] + [ 1 0 1 0] + [ 0 1 0 1] / sqrt(2), + * see https://github.com/JeffersonLab/chroma/blob/master/docs/notes/gamma_conventions.tex + for further notes. */ + template struct OpenqcdToNonRelBasis { + template + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + FloatOut K1[4] + = {static_cast(-kP), static_cast(-kP), static_cast(kP), static_cast(kP)}; + FloatOut K2[4] + = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); + } + } + } + }; + + /** Transform from non-relativistic (aka ukqcd) into openqcd basis: + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + * [ 0 -1 0 1] + * [ 1 0 1 0] + * [ 0 1 0 1] / sqrt(2) + */ + template struct NonRelToOpenqcdBasis { + template + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + + FloatOut K1[4] + = {static_cast(-kU), static_cast(-kU), static_cast(kU), static_cast(kU)}; + FloatOut K2[4] + = {static_cast(kU), static_cast(kU), static_cast(kU), static_cast(kU)}; + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); + } + } + } + }; + template struct CopyColorSpinor_ { const Arg &arg; constexpr CopyColorSpinor_(const Arg &arg) : arg(arg) { } diff --git a/include/quda.h b/include/quda.h index 6cbfee5f04..0bd715c5f4 100644 --- a/include/quda.h +++ b/include/quda.h @@ -70,10 +70,7 @@ extern "C" { int staple_pad; /**< Used by link fattening */ int llfat_ga_pad; /**< Used by link fattening */ int mom_ga_pad; /**< Used by the gauge and fermion forces */ - union { - bool use_split_gauge_bkup; /**< Used by gauge split buffers (default=true keep split gauge after usage)*/ - int pad; /**< Forces 4-byte alignment */ - }; + int use_split_gauge_bkup; /**< Used by gauge split buffers (default=true keep split gauge after usage)*/ QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */ int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */ @@ -468,6 +465,9 @@ extern "C" { /** The t0 parameter for distance preconditioning, the timeslice where the source is located */ int distance_pc_t0; + /** Additional user-defined properties */ + void *additional_prop; + } QudaInvertParam; // Parameter set for solving eigenvalue problems. @@ -1015,7 +1015,7 @@ extern "C" { * initQuda. Calling initQudaMemory requires that the user has * previously called initQudaDevice. */ - void initQudaMemory(); + void initQudaMemory(void); /** * Initialize the library. This function is actually a wrapper @@ -1038,7 +1038,7 @@ extern "C" { * @details This should only be needed for automated testing when * different partitioning is applied within a single run. */ - void updateR(); + void updateR(void); /** * A new QudaGaugeParam should always be initialized immediately @@ -1860,7 +1860,7 @@ extern "C" { void flushPoolQuda(QudaMemoryType type); void setMPICommHandleQuda(void *mycomm); - + // Parameter set for quark smearing operations typedef struct QudaQuarkSmearParam_s { //------------------------------------------------- diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h new file mode 100644 index 0000000000..6d3257da8b --- /dev/null +++ b/include/quda_openqcd_interface.h @@ -0,0 +1,522 @@ +#pragma once + +#include +#include + +#define OPENQCD_MAX_INVERTERS 32 +#define OPENQCD_MAX_EIGENSOLVERS 32 + +/** + * The macro battle below is to trick quda.h to think that double_complex is + * defined to be the struct below. For this we need to set the __CUDACC_RTC__, + * which makes double_complex to be defined as double2 (see quda.h), which we + * redefine below as openqcd_complex_dble. The original definitions of + * __CUDACC_RTC__ and double2 are recovered below. We do this to be able to + * include this header file into a openQxD program and compile with flags + * "-std=C89 -pedantic -Werror". Else the compiler trows an + * "ISO C90 does not support complex types" error because of the + * "double _Complex" data types exposed in quda.h. + */ + +typedef struct { + double re, im; +} openqcd_complex_dble; + +#ifdef __CUDACC_RTC__ +#define __CUDACC_RTC_ORIGINAL__ __CUDACC_RTC__ +#endif + +#ifdef double2 +#define double2_ORIGINAL double2 +#endif + +#define __CUDACC_RTC__ +#define double2 openqcd_complex_dble +#include +#undef double2 +#undef __CUDACC_RTC__ + +#ifdef double2_ORIGINAL +#define double2 double2_ORIGINAL +#undef double2_ORIGINAL +#endif + +#ifdef __CUDACC_RTC_ORIGINAL__ +#define __CUDACC_RTC__ __CUDACC_RTC_ORIGINAL__ +#undef __CUDACC_RTC_ORIGINAL__ +#endif + +/** + * @file quda_openqcd_interface.h + * + * @section Description + * + * The header file defines the interface to enable easy + * interfacing between QUDA and the OpenQCD software. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * ############################################################################# + * OpenQCD immitations of structs as documented in main/README.global (see + * openQCD or openQxD official repositories). These structs have the same + * members as the ones from openQCD and are (mostly) alignment compatible, but + * they don't *have* to be. + * ############################################################################# + */ +typedef struct { + int type, cstar; + double phi3[2][3], phi1[2]; +} openQCD_bc_parms_t; + +typedef struct { + int qhat; + double m0, su3csw, u1csw, cF[2], theta[3]; +} openQCD_dirac_parms_t; + +typedef struct { + int gauge, nfl; +} openQCD_flds_parms_t; +/** + * ############################################################################# + */ + +typedef enum OpenQCDGaugeGroup_s { + OPENQCD_GAUGE_SU3 = 1, + OPENQCD_GAUGE_U1 = 2, + OPENQCD_GAUGE_SU3xU1 = 3, + OPENQCD_GAUGE_INVALID = QUDA_INVALID_ENUM +} OpenQCDGaugeGroup; + +typedef enum OpenQCDFieldType_s { + OPENQCD_FIELD_SPINOR = 1, + OPENQCD_FIELD_GAUGE = 2, + OPENQCD_FIELD_CLOVER = 3, + OPENQCD_FIELD_INVALID = QUDA_INVALID_ENUM +} OpenQCDFieldType; + +/** + * Parameters related to problem size and machine topology. They should hold the + * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] = + * L2, ... + */ +typedef struct { + MPI_Comm quda_comm; /** QUDA MPI communicator */ + MPI_Comm world_comm; /** World MPI communicator */ + int L[4]; /** Local lattice dimensions L1, L2, L3, L0 */ + int nproc[4]; /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/ + int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK, + is assumed to be [1, 1, 1, 1] */ + int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ + int device; /** GPU device number */ + int cstar; /** number of cstar directions, equals bc_cstar() */ + int *data; /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0: + data[0] = cstar; + data[1+i] = nproc[i] for 0 <= i < 4 + data[5+lex(ix,iy,iz,it)] returns rank number in + openQCD, where lex stands for lexicographical + indexing (in QUDA order (xyzt)) */ + openQCD_bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ + openQCD_flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ + openQCD_dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ + void *(*h_gauge)(void); /** function to return a pointer to the gauge field */ + void *(*h_sw)(void); /** function to return a pointer to the updated Clover field */ + int (*openqcd2quda)(OpenQCDFieldType type, void *in, void *out); /** spinor gather function */ + void (*quda2openqcd)(OpenQCDFieldType type, void *in, void *out); /** spinor scatter function */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */ +} openQCD_QudaLayout_t; + +/** + * Parameters used to create a QUDA context. + */ +typedef struct { + QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ + FILE *logfile; /** log file handler */ + void *gauge; /** base pointer to the gauge fields */ + int volume; /** VOLUME */ + int bndry; /** BNDRY */ + int two_grids_equal; /** Whether the QUDA and the openqxd process grids are equal or not */ + void *(*buffer_field)(MPI_Comm comm, int idx, void *field); /** obtain buffer field */ +} openQCD_QudaInitArgs_t; + + +typedef struct { + double kappa; /* kappa: hopping parameter */ + double mu; /* mu: twisted mass */ + double su3csw; /* su3csw: csw coefficient for SU(3) fields */ + double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ + int qhat; /* qhat: quda doesn't respect that parameter (yet) */ +} openQCD_QudaDiracParam_t; + +/** + * Initialize the QUDA context. + * + * @param[in] init Meta data for the QUDA context + * @param[in] layout Layout struct + * @param infile Input file + */ +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile); + +/** + * Destroy the QUDA context and deallocate all solvers. + */ +void openQCD_qudaFinalize(void); + +/** + * Copy a spinor to GPU and back to CPU. + * + * @param[in] h_in Spinor input field (from openQCD) + * @param[out] h_out Spinor output field + */ +void openQCD_back_and_forth(void *h_in, void *h_out); + +/** + * @brief Wrapper around openqcd::ipt + * + * @param[in] x Euclidean corrdinate in txyz convention + * + * @return ipt[x] + * + * @see openqcd::ipt() + */ +int openQCD_qudaIndexIpt(const int *x); +int openQCD_qudaIndexIptLexi(const int iy); + +/** + * @brief Wrapper around openqcd::iup + * + * @param[in] x Euclidean corrdinate in txyz convention + * @param[in] mu Direction + * + * @return iup[x][mu] + * + * @see openqcd::iup() + */ +int openQCD_qudaIndexIup(const int *x, const int mu); + +/** + * @brief Norm square in QUDA. + * + * @param[in] h_in Spinor input field (from openQCD) + * + * @return The norm + */ +double openQCD_qudaNorm(void *h_in); + +/** + * @brief Prototype function for the norm-square in QUDA without + * transfering the field. Should serve as an example. + * + * @param[in] d_in Spinor input field (device pointer) + * + * @return The norm + */ +double openQCD_qudaNorm_NoLoads(void *d_in); + +/** + * @brief Applies Dirac matrix to spinor. + * + * openQCD_out = gamma[dir] * openQCD_in + * + * @param[in] dir Dirac index, 0 <= dir <= 5, notice that dir is in + * openQCD convention, ie. (0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5) + * @param[in] openQCD_in of type spinor_dble[NSPIN] + * @param[out] openQCD_out of type spinor_dble[NSPIN] + */ +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out); + +/** + * @brief Explicit transfer of an openQCD field from host to device + * + * @param openQCD_field Input openQCD spinor (host pointer) + * + * @return Device pointer + */ +void *openQCD_qudaH2D(void *openQCD_field); + +/** + * @brief Explicit transfer of a QUDA field from device to host + * + * @param quda_field Input quda spinor field (device pointer) + * @param openQCD_field Output openQCD spinor (host pointer) + */ +void openQCD_qudaD2H(void *quda_field, void *openQCD_field); + +/** + * @brief Free a device field allocated by openQCD_qudaH2D() + * + * @param quda_field Pointer to device pointer + */ +void openQCD_qudaSpinorFree(void **quda_field); + +/** + * @brief Apply the Wilson-Clover Dirac operator to a field. All fields + * passed and returned are host (CPU) fields in openQCD order. + * + * @param[in] src Source spinor field + * @param[out] dst Destination spinor field + * @param[in] p Dirac parameter struct + * + * @deprecated Replaced by openQCD_qudaDw() + */ +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p); + +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are host (CPU) + * fields in openQCD order. + * + * @param[in] mu Twisted mass parameter + * @param[in] in Input spinor (host pointer) + * @param[out] out Output spinor (host pointer) + */ +void openQCD_qudaDw(double mu, void *in, void *out); + +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are devicde + * (GPU) fields returned by openQCD_qudaH2D(). + * + * @param[in] mu Twisted mass parameter + * @param[in] d_in Input spinor (device pointer) + * @param[out] d_out Output spinor (device pointer) + */ +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out); + +/** + * Setup the solver interface to quda. This function parses the file given by + * [infile] as an openQCD ini file. The solver section given by the [id] + * parameter must have a key-value pair like solver = QUDA and may contain every + * member of the struct [QudaInvertParam]. If one sets inv_type_precondition = + * QUDA_MG_INVERTER, one can additionally use all the members from the struct + * [QudaMultigridParam] in a section called "Solver {id} Multigrid", where {id} + * is replaced by [id]. For every level given by n_level in the above section, + * one has to provide a subsection called "Solver {id} Multigrid Level {level}", + * where {level} runs from 0 to n_level-1. All these subsections may have keys + * given by all the array-valued members of QudaMultigridParam, for example + * smoother_tol may appear in all subsections. This function must be called on + * all ranks simulaneously. + * + * @param[in] id The identifier of the solver section, i.e. "Solver #". The + * input file is taken from the arguments of quda_init(). If + * id is -1, then the section called "Lattice parameters" is + * parsed in the same way. + * + * @return Pointer to the solver context + */ + +void *openQCD_qudaSolverGetHandle(int id); + +/** + * @brief Return a hash from a subset of the settings in the + * QudaInvertParam struct. Return 0 if the struct is not initialized + * yet. + * + * @param[in] id The solver identifier + * + * @return Hash value + */ +int openQCD_qudaSolverGetHash(int id); + +/** + * @brief Print solver information about the QUDA solver. Print + * "Solver is not initialized yet" is the solver struct is nul + * initialized yet. + * + * @param[in] id The solver identifier + */ +void openQCD_qudaSolverPrintSetup(int id); + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a QUDA solver. + * All fields passed and returned are host (CPU) field in openQCD + * order. + * + * @param[in] id The solver identifier in the input file, i.e. + * "Solver #". The input file is the one given by + * quda_init + * @param[in] mu Twisted mass parameter + * @param[in] source The source + * @param[out] solution The solution + * @param[out] status If the function is able to solve the Dirac equation to + * the desired accuracy (invert_param->tol), status + * reports the total number of iteration steps. -1 + * indicates that the inversion failed. + * + * @return Residual + */ +double openQCD_qudaInvert(int id, double mu, void* source, void* solution, int *status); + + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a QUDA solver. All + * fields passed and returned are host (CPU) field in openQCD order. + * The source and solution fields can be multiple fields, indicated + * by num_src (see QudaInvertParam) + * + * @param[in] id The solver identifier in the input file, i.e. + * "Solver #". The input file is the one given by + * quda_init + * @param[in] mu Twisted mass parameter + * @param[in] sources The source(s) + * @param[out] solutions The solution(s) + * @param[out] status If the function is able to solve the Dirac equation to + * the desired accuracy (invert_param->tol), status[i] + * reports the total number of iteration steps for source + * i, where i = 0, ..., param->num_src-1. -1 indicates + * that the inversion failed. + * @param residual The num_src residuals + */ +void openQCD_qudaInvertMultiSrc(int id, double mu, void** sources, void** solutions, int *status, double *residual); + +/** + * @brief Set up a async solve. See [[openQCD_qudaInvert]] for details + * about the parameters. + */ +void openQCD_qudaInvertAsyncSetup(int id, double mu); + +/** + * @brief Register a solve to be started with [[openQCD_qudaInvertAsyncStart]]. + * The parameters of this function are exactly the same as for + * [[openQCD_qudaInvert]]. + */ +void openQCD_qudaInvertAsyncDispatch(void *source, void *solution, int *status); + +/** + * @brief Fire up the solves registered with [[openQCD_qudaInvertDispatch]] + * in order of their registration asynchronously. This spawns a + * thread with pthread_create that calls [[openQCD_qudaInvert]]. + * + * @return The MPI communicator with which the main threads should + * communicate among ewach other until + * [[openQCD_qudaInvertAsyncWait]] has returned. + */ +MPI_Comm openQCD_qudaInvertAsyncStart(void); + +/** + * @brief Wait for the thread started with [[openQCD_qudaInvertAsyncStart]] + * using pthread_join to finish and collect its results. + * + * @param residual The residuals of the dispatched solves. + */ +void openQCD_qudaInvertAsyncWait(double *residual); + +/** + * @brief Destroys an existing solver context and frees all involed + * structs. + * + * @param[in] id The solver identifier + */ +void openQCD_qudaSolverDestroy(int id); + +/** + * Setup the eigen-solver interface to quda. This function parses the file + * given by [infile] as an openQCD ini file. The solver section given by the + * [inv_section] parameter must have a key-value pair like solver = QUDA and may + * contain every member of the struct [QudaInvertParam]. See + * [openQCD_qudaSolverSetup] for more details about the solver. The eigen-solver + * section given by the [section] parameter may contain every member of the + * struct [QudaEigParam]. + * + * @param[in] id The section id of the eigensolver + * @param[in] solver_id The section id of the solver. If -1, the section is + * not read in. + * + * @return Pointer to the eigen-solver context + */ +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id); + +/** + * @brief Print the eigensolver setup. + * + * @param[in] id The eigensolver identifier + * @param[in] solver_id The solver identifier + */ +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id); + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a multigrid + * solver. All fields are fields passed and returned are host + * (CPU) field in openQCD order. This function requires an + * existing solver context created with openQCD_qudaSolverSetup(). + * + * @param[in] id The eigensolver section identifier + * @param[in] solver_id The solver section identifier + * @param[inout] h_evecs Allocated array of void-pointers to param->n_conf + * fields + * @param[out] h_evals Allocated array of param->n_conf complex_dbles + */ +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals); + +/** + * @brief Destroys an existing eigen-solver context and frees all involed + * structs. + * + * @param[in] id The section id of the eigensolver + */ +void openQCD_qudaEigensolverDestroy(int id); + +/** + * @brief Whether QUDA was built with QCD+QED or not. + */ +bool openQCD_qudaQCDPlusQEDEnabled(void); + +/** + * @brief Wrapper for the plaquette. We could call plaqQuda() directly in + * openQCD, but we have to make sure manually that the gauge field + * is loaded + * + * @return Plaquette value + * @see https://github.com/lattice/quda/wiki/Gauge-Measurements#wilson-plaquette-action + */ +double openQCD_qudaPlaquette(void); + +/** + * @brief Load the gauge fields from host to quda. Notice that the boundary + * fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud() + * before pass fields into this function. + * + * @param[in] gauge The gauge fields (in openqcd order) + * @param[in] prec Precision of the incoming gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition + */ +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); + +/** + * @brief Save the gauge fields from quda to host. + * + * @param[out] gauge The gauge fields (will be stored in openqcd order) + * @param[in] prec Precision of the outgoing gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition + */ +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); + +/** + * @brief Free the gauge field allocated in quda. + */ +void openQCD_qudaGaugeFree(void); + +/** + * @brief Load the clover fields from host to quda. + * + * @param[in] clover The clover fields (in openqcd order) + * @param[in] kappa The kappa (we need this, because quda has its clover + * field multiplied by kappa and we have to reverse this + * when loading ours) + * @param[in] csw The csw coefficient of the clover field + */ +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw); + +/** + * @brief Free the clover field allocated in quda. + */ +void openQCD_qudaCloverFree(void); + +#ifdef __cplusplus +} +#endif diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 65f33a6772..b0f4be1336 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -101,7 +101,7 @@ set (QUDA_OBJS copy_gauge_offset.cu copy_color_spinor_offset.cu copy_clover_offset.cu staggered_oprod.cu clover_trace_quda.cu hisq_paths_force_quda.cu - unitarize_force_quda.cu unitarize_links_quda.cu milc_interface.cpp milc_interface_internal.cpp + unitarize_force_quda.cu unitarize_links_quda.cu openqcd_interface.cpp milc_interface.cpp milc_interface_internal.cpp tune.cpp device_vector.cu inv_gmresdr_quda.cpp @@ -601,6 +601,10 @@ if(QUDA_BLOCKSOLVER) target_compile_definitions(quda PRIVATE BLOCKSOLVER) endif() +if(QUDA_QCD_PLUS_QED) + target_compile_definitions(quda PUBLIC BUILD_QCD_PLUS_QED) +endif(QUDA_QCD_PLUS_QED) + if(QUDA_INTERFACE_QDP OR QUDA_INTERFACE_ALL) target_compile_definitions(quda PUBLIC BUILD_QDP_INTERFACE) endif(QUDA_INTERFACE_QDP OR QUDA_INTERFACE_ALL) @@ -625,6 +629,10 @@ if(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) target_compile_definitions(quda PUBLIC BUILD_TIFR_INTERFACE) endif(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) +if(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + target_compile_definitions(quda PUBLIC BUILD_OPENQCD_INTERFACE) +endif(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + # MULTI GPU AND USQCD if(QUDA_MPI OR QUDA_QMP) target_compile_definitions(quda PUBLIC MULTI_GPU) diff --git a/lib/check_params.h b/lib/check_params.h index 15b9c524e7..6dbdc91dfd 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -111,13 +111,13 @@ void printQudaGaugeParam(QudaGaugeParam *param) { P(staggered_phase_applied, 0); P(i_mu, 0.0); P(overlap, 0); - P(use_split_gauge_bkup, true); + P(use_split_gauge_bkup, QUDA_BOOLEAN_TRUE); #else P(staggered_phase_type, QUDA_STAGGERED_PHASE_INVALID); P(staggered_phase_applied, INVALID_INT); P(i_mu, INVALID_DOUBLE); P(overlap, INVALID_INT); - P(use_split_gauge_bkup, false); + P(use_split_gauge_bkup, QUDA_BOOLEAN_FALSE); #endif #if defined INIT_PARAM @@ -214,6 +214,7 @@ void printQudaEigParam(QudaEigParam *param) { P(use_norm_op, QUDA_BOOLEAN_INVALID); P(compute_svd, QUDA_BOOLEAN_INVALID); P(require_convergence, QUDA_BOOLEAN_INVALID); + P(spectrum, QUDA_SPECTRUM_INVALID); P(n_ev, INVALID_INT); P(n_kr, INVALID_INT); P(n_conv, INVALID_INT); @@ -768,6 +769,7 @@ void printQudaInvertParam(QudaInvertParam *param) { #endif #ifdef INIT_PARAM + P(additional_prop, 0); P(distance_pc_alpha0, 0.0); P(distance_pc_t0, -1); #else diff --git a/lib/checksum.cu b/lib/checksum.cu index b759afde8d..af8374e723 100644 --- a/lib/checksum.cu +++ b/lib/checksum.cu @@ -51,9 +51,12 @@ namespace quda { } else if (u.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { ChecksumArg arg(u,mini); checksum = ChecksumCPU(arg); + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + ChecksumArg arg(u, mini); + checksum = ChecksumCPU(arg); } else { errorQuda("Checksum not implemented"); - } + } return checksum; } diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 2c581886da..11b55e6085 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -102,6 +102,12 @@ namespace quda Topology *topo = new Topology; +#ifdef BUILD_OPENQCD_INTERFACE + topo->cstar = (map_data == nullptr) ? 0 : static_cast(map_data)[0]; +#else + topo->cstar = 0; +#endif + topo->ndim = ndim; int nodes = 1; diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp index ae36f5b2d0..cd34b9cefe 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -113,6 +113,8 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + bool comm_dim_cstar(int dim) { return get_current_communicator().comm_dim_cstar(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } diff --git a/lib/copy_clover.cu b/lib/copy_clover.cu index 7a8273d2f3..fb27eb9416 100644 --- a/lib/copy_clover.cu +++ b/lib/copy_clover.cu @@ -129,6 +129,12 @@ namespace quda { copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d order not supported", in.Order()); diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu index f29e663c14..7bec58484c 100644 --- a/lib/copy_clover_offset.cu +++ b/lib/copy_clover_offset.cu @@ -45,6 +45,16 @@ namespace quda CopyFieldOffset copier(arg, in); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using C = OpenQCDOrder; + C out_accessor(out, inverse); + C in_accessor(in, inverse); + CopyFieldOffsetArg arg(out_accessor, out, in_accessor, in, offset); + CopyFieldOffset copier(arg, in); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d order not supported", in.Order()); diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 0c28f5849d..7ed4059ae1 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -27,6 +27,7 @@ namespace quda case QUDA_DEGRAND_ROSSI_GAMMA_BASIS: return "degrand_rossi"; case QUDA_UKQCD_GAMMA_BASIS: return "ukqcd"; case QUDA_CHIRAL_GAMMA_BASIS: return "chiral"; + case QUDA_OPENQCD_GAMMA_BASIS: return "openqcd"; case QUDA_DIRAC_PAULI_GAMMA_BASIS: return "dirac_pauli"; default: errorQuda("Unknown gamma basis %d", basis); } @@ -68,6 +69,10 @@ namespace quda launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) { launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS) { @@ -123,6 +128,12 @@ namespace quda CopyColorSpinor(out, in, param); else errorQuda("QDPJIT interface has not been built"); + } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + using O = OpenQCDDiracOrder; + if constexpr (is_enabled()) + CopyColorSpinor(out, in, param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -154,6 +165,12 @@ namespace quda genericCopyColorSpinor(param); else errorQuda("QDPJIT interface has not been built"); + } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + using ColorSpinor = OpenQCDDiracOrder; + if constexpr (is_enabled()) + genericCopyColorSpinor(param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index b0ac43f221..843d21b0de 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -65,7 +65,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -80,7 +80,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -111,10 +111,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + CopyGaugeEx(out, in, location, Out, In); +#else + errorQuda("OPENQCD interface has not been built"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -139,7 +147,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -154,7 +162,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else { errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order()); } @@ -185,10 +193,17 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + copyGaugeEx(out, in, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", in.Order()); } - } template diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index 9eeb36e655..f7d8289591 100644 --- a/lib/copy_gauge_inc.cu +++ b/lib/copy_gauge_inc.cu @@ -27,7 +27,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -56,7 +56,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -131,10 +131,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(out, Out, outGhost), inOrder, + out, in, location, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -161,7 +169,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -183,7 +191,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_QCD_PLUS_QED) } else { errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order()); } @@ -259,6 +267,15 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(in, In, inGhost), out, in, location, Out, + outGhost, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field order %d not supported", in.Order()); } diff --git a/lib/copy_gauge_offset.cu b/lib/copy_gauge_offset.cu index 9a8494c3a2..6f8bc2cb01 100644 --- a/lib/copy_gauge_offset.cu +++ b/lib/copy_gauge_offset.cu @@ -100,6 +100,13 @@ namespace quda copy_gauge_offset(out, in, offset); #else errorQuda("TIFR interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using G = typename gauge_order_mapper::type; + copy_gauge_offset(out, in, offset); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Gauge field %d order not supported", in.Order()); diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 9d53ffbd32..8af1ac2224 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -42,6 +42,18 @@ namespace quda { instantiate_recurse2(out, in, d); } + void ApplyGamma(cvector_ref &out, cvector_ref &in, QudaGammaDirection_s dir) + { + switch (dir) { + case QUDA_GAMMA_X: ApplyGamma(out, in, 0); break; + case QUDA_GAMMA_Y: ApplyGamma(out, in, 1); break; + case QUDA_GAMMA_Z: ApplyGamma(out, in, 2); break; + case QUDA_GAMMA_T: ApplyGamma(out, in, 3); break; + case QUDA_GAMMA_5: ApplyGamma(out, in, 4); break; + default: errorQuda("Unknown gamma: %d\n", dir); + } + } + // Applies out(x) = 1/2 * [(1 +/- gamma5) * in] + out void ApplyChiralProj(cvector_ref &out, cvector_ref &in, const int proj) { diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 08222a67cb..7aa60cbd3b 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -204,7 +204,7 @@ namespace quda // Compute eigenvalues computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eig_iram.cpp b/lib/eig_iram.cpp index aad8682076..0e59021b35 100644 --- a/lib/eig_iram.cpp +++ b/lib/eig_iram.cpp @@ -515,7 +515,7 @@ namespace quda // Compute the eigen/singular values. getProfile().TPSTART(QUDA_PROFILE_COMPUTE); computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); converged = true; } else { diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 9511810c7f..74920aa4e6 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -26,7 +26,8 @@ namespace quda beta.resize(n_kr, 0.0); // Thick restart specific checks - if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than or equal to n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_conv + 12) errorQuda("n_kr=%d must be greater than or equal to n_conv+12=%d\n", n_kr, n_conv + 12); if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); @@ -180,7 +181,7 @@ namespace quda // Compute eigenvalues/singular values computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 156969a191..9d1123fb3d 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -498,7 +498,7 @@ namespace quda } } - void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals) + void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals, bool dagger) { logQuda(QUDA_SUMMARIZE, "Computing SVD of M\n"); @@ -523,9 +523,15 @@ namespace quda // Lambda already contains the square root of the eigenvalue of the norm op. - // M*Rev_i = M*Rsv_i = sigma_i Lsv_i - mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, - {evecs.begin() + lower, evecs.begin() + upper}); + if (dagger) { + // Mdag*Lev_i = Mdag*Lsv_i = sigma_i Rsv_i + mat.Expose()->Mdag({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, + {evecs.begin() + lower, evecs.begin() + upper}); + } else { + // M*Rev_i = M*Rsv_i = sigma_i Lsv_i + mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, + {evecs.begin() + lower, evecs.begin() + upper}); + } // sigma_i = sqrt(sigma_i (Lsv_i)^dag * sigma_i * Lsv_i ) auto sigma = blas::norm2({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}); diff --git a/lib/extract_gauge_ghost.in.cu b/lib/extract_gauge_ghost.in.cu index 878de6118b..0639d2eaae 100644 --- a/lib/extract_gauge_ghost.in.cu +++ b/lib/extract_gauge_ghost.in.cu @@ -111,10 +111,17 @@ namespace quda { errorQuda("TIFR interface has not been built"); } + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + + if constexpr (is_enabled()) { + ExtractGhost>(u, Ghost, extract, offset); + } else { + errorQuda("OpenQCD interface has not been built"); + } + } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/extract_gauge_ghost_extended.cu b/lib/extract_gauge_ghost_extended.cu index 37c4f7235a..11b2661e6f 100644 --- a/lib/extract_gauge_ghost_extended.cu +++ b/lib/extract_gauge_ghost_extended.cu @@ -142,10 +142,17 @@ namespace quda { errorQuda("TIFR interface has not been built"); } + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + + if constexpr (is_enabled()) { + ExtractGhostEx>(u, dim, R, ghost, extract); + } else { + errorQuda("OpenQCD interface has not been built"); + } + } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index fadba50fa8..96e7a51d11 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -152,6 +152,25 @@ namespace quda { bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision; } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { bytes = volume * site_size; + } else if (order == QUDA_OPENQCD_GAUGE_ORDER) { + /** + * With an openQCD gauge field, we need all the links of even lattice + * points in positive direction. These are links that lie in the buffer + * space that spans 7*BNDRY/4 gauge fields. These boundary fields are + * located at base_ptr + 4*VOLUME. Therefore, we need to transfer more + * than 4*VOLUME matrices. + */ + + /* analogue to BNDRY in openQCD:include/global.h */ + long int bndry = 0; + bndry += (1 - (comm_dim(0) % 2)) * x[1] * x[2] * x[3]; + bndry += (1 - (comm_dim(1) % 2)) * x[0] * x[2] * x[3]; + bndry += (1 - (comm_dim(2) % 2)) * x[0] * x[1] * x[3]; + bndry += (1 - (comm_dim(3) % 2)) * x[0] * x[1] * x[2]; + bndry *= 2; + + length += 18 * 7 * bndry / 4; + bytes = length * precision; } else { bytes = length * precision; } @@ -191,7 +210,7 @@ namespace quda { } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER - || order == QUDA_MILC_SITE_GAUGE_ORDER) { + || order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_OPENQCD_GAUGE_ORDER) { // does not support device if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) { @@ -1391,7 +1410,8 @@ namespace quda { } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { std::memcpy(buffer, data(), Bytes()); } else { errorQuda("Unsupported order = %d", Order()); @@ -1411,7 +1431,8 @@ namespace quda { } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { std::memcpy(data(), buffer, Bytes()); } else { errorQuda("Unsupported order = %d", Order()); diff --git a/lib/instantiate.cpp b/lib/instantiate.cpp index 15d0b85148..c209a75398 100644 --- a/lib/instantiate.cpp +++ b/lib/instantiate.cpp @@ -1,7 +1,7 @@ #include /** - This file contains deinitions required when compiling with C++14. + This file contains definitions required when compiling with C++14. Without these, we can end up with undefined references at link time. We can remove this file when we jump to C++17 and declare these are inline variables in instantiate.h. diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 47290a14e8..67ec31436c 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -520,7 +520,7 @@ void initQudaDevice(int dev) /* * Any persistent memory allocations that QUDA uses are done here. */ -void initQudaMemory() +void initQudaMemory(void) { auto profile = pushProfile(profileInit); profileInit.TPSTART(QUDA_PROFILE_INIT); @@ -546,7 +546,7 @@ void initQudaMemory() profileInit.TPSTOP(QUDA_PROFILE_INIT); } -void updateR() +void updateR(void) { for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); } @@ -694,7 +694,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) // set update_split_gauge to reuse backup or not and free the buf if needed // always update the flag even gauge reuse with checksum // better way would be do the checks more consistently along with clover/stagger - if (param->use_split_gauge_bkup == true) { + if (param->use_split_gauge_bkup) { update_split_gauge = QUDA_UPDATE_SPLIT_GAUGE_TRUE; } else { update_split_gauge = QUDA_UPDATE_SPLIT_GAUGE_OFF; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp new file mode 100644 index 0000000000..c6034f518b --- /dev/null +++ b/lib/openqcd_interface.cpp @@ -0,0 +1,2271 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define WITH_COMM(expr) do { if (in_comm()) { expr; } } while (0) + +typedef struct { + bool ready; + bool created; + pthread_t thread; + pthread_mutex_t mutex; + pthread_cond_t cond; + MPI_Comm main_comm; /** MPI communicator holding the main threads */ + MPI_Comm worker_comm; /** MPI communicator holding the worker threads */ +} openQCD_QudaThread_t; + +typedef struct { + void *source; + void *solution; + int *status; + double retval; +} openQCD_qudaInvert_args_t; + +typedef struct { + int initialized; /** Whether openQCD_qudaInit() was called or not */ + int ud_rev; /** Revision of ud field from openqxd */ + int ad_rev; /** Revision of ad field from openqxd */ + int swd_ud_rev; /** Revision of ud field used to calc/transfer the SW field from openqxd */ + int swd_ad_rev; /** Revision of ad field used to calc/transfer the SW field from openqxd */ + double swd_kappa; /** kappa corresponding to the current SW field in QUDA */ + double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */ + double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ + int swd_qhat; /** qhat coefficient corresponding to the current SW field in QUDA */ + openQCD_QudaInitArgs_t init; + openQCD_QudaLayout_t layout; + openQCD_QudaThread_t thread; + std::vector inv_args; + QudaInvertParam async_params; + void *dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. + * Notice that this void pointer HAS to be directly before + * inv_handles[32], because it's possible to call + * openQCD_qudaSolverGetHandle with -1. */ + void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + char infile[1024]; /** Path to the input file (if given to quda_init()) */ +} openQCD_QudaState_t; + +typedef struct openQCD_QudaSolver_s { + char infile[1024]; /** Path to the input file (if given to quda_init()) */ + int id; /** Solver section identifier in the input file */ + QudaMultigridParam *mg_param; /** Pointer to the multigrid param struct */ + double u1csw; /** u1csw property */ + int qhat; /** qhat property */ + int mg_ud_rev; /** Revision of ud field from openqxd */ + int mg_ad_rev; /** Revision of ad field from openqxd */ + double mg_kappa; /** kappa corresponding to the current mg-instance in QUDA */ + double mg_su3csw; /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */ + double mg_u1csw; /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */ + int mg_qhat; /** qhat corresponding to the current mg-instance in QUDA */ +} openQCD_QudaSolver; + +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, 0, {}, {}, { false, false, 1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, MPI_COMM_NULL, MPI_COMM_NULL }, {}, {}, nullptr, {}, {}, ""}; + +using namespace quda; + +/** + * code for NVTX taken from Jiri Kraus' blog post: + * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + */ + +#ifdef INTERFACE_NVTX + +#if QUDA_NVTX_VERSION == 3 +#include "nvtx3/nvToolsExt.h" +#else +#include "nvToolsExt.h" +#endif + +static const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff}; +static const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +#define QUDA_OPENQCD_VERBOSE 1 + +template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) +{ + /* add NVTX markup if enabled */ + if (start) { + PUSH_RANGE(func, 1); + } else { + POP_RANGE; + } + +#ifdef QUDA_OPENQCD_VERBOSE + if (verb >= QUDA_VERBOSE) { + if (start) { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func); + } else { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (return) \n", func); + } + } +#endif +} + +template void inline qudaopenqcd_called(const char *func) +{ + qudaopenqcd_called(func, getVerbosity()); +} + +/** + * Mapping of enums to their actual values. We have this mapping such that we + * can use the named parameters in our input files rather than the number. this + * makes reading and writing the configuration more understandable. + */ +std::unordered_map enum_map + = {{"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, + {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, + {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, + {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, + {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, + {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, + {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, + {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, + {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, + {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, + {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, + {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, + {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, + {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, + {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, + {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, + {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, + {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, + {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, + {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, + {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, + {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, + {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, + {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, + {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, + {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, + {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, + {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, + {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, + {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, + {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, + {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, + {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, + {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, + {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, + {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, + {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, + {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, + {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, + {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, + {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, + {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, + {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, + {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, + {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, + {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, + {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, + {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, + {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, + {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, + {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"false", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"no", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"n", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"off", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"true", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"yes", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"y", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"on", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, + {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, + {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, + {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, + {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, + {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, + {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, + {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, + {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, + {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, + {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, + {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, + {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, + {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, + {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, + {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, + {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, + {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, + {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, + {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, + {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, + {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, + {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, + {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, + {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, + {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, + {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, + {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, + {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, + {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, + {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, + {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, + {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, + {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, + {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, + {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, + {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, + {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, + {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, + {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, + {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, + {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, + {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, + {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, + {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, + {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, + {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, + {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, + {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, + {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, + {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, + {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)}, + {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)}, + {"QUDA_EIG_BLK_TR_LANCZOS", std::to_string(QUDA_EIG_BLK_TR_LANCZOS)}, + {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)}, + {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)}, + {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)}, + {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)}, + {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)}, + {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)}, + {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)}, + {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)}, + {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)}, + {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)}, + {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)}, + {"QUDA_MEMORY_DEVICE_PINNED", std::to_string(QUDA_MEMORY_DEVICE_PINNED)}, + {"QUDA_MEMORY_HOST", std::to_string(QUDA_MEMORY_HOST)}, + {"QUDA_MEMORY_HOST_PINNED", std::to_string(QUDA_MEMORY_HOST_PINNED)}, + {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)}, + {"QUDA_MEMORY_MANAGED", std::to_string(QUDA_MEMORY_MANAGED)}, + {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)}, + {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)}, + {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)}, + {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)}}; + +/** + * @brief Just a simple key-value store + */ +class KeyValueStore +{ +private: + std::unordered_map>> store; + std::unordered_map *map = nullptr; + std::string filename = ""; + +public: + /** + * @brief Sets a key value pair + * + * @param[in] section The section + * @param[in] key The key + * @param[in] value The value + */ + void set(const std::string §ion, const std::string &key, const std::string &value) + { + if (map != nullptr) { + auto mvalue = map->find(value); + if (mvalue != map->end()) { + std::get<0>(store[section][key]) = mvalue->second; + std::get<1>(store[section][key]) = value; + return; + } + } + std::get<0>(store[section][key]) = value; + std::get<1>(store[section][key]) = value; + } + + void set_map(std::unordered_map *_map) { map = _map; } + + bool section_exists(const std::string §ion) { return store.find(section) != store.end(); } + + /** + * @brief Gets the specified key. + * + * @param[in] section The section + * @param[in] key The key + * @param[in] default_value The default value if section/key is absent + * + * @tparam T Desired return type + * + * @return The corresponding value + */ + template + T get(const std::string §ion, const std::string &key, T default_value = T(), bool fail = false) + { + int idx; + std::string rkey; + std::smatch match; + std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */ + auto sec = store.find(section); + + if (sec != store.end()) { + if (std::regex_search(key, match, p_key)) { + rkey = match[1]; + idx = std::stoi(match[2]); + } else { + rkey = key; + idx = 0; + } + + auto item = sec->second.find(rkey); + if (item != sec->second.end()) { + std::stringstream ss(std::get<0>(item->second)); + if constexpr (std::is_enum_v) { + typename std::underlying_type::type result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return static_cast(result); } + } else { + T result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return result; } + } + } + } + if (fail) { + errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", key.c_str(), section.c_str(), + filename.c_str()); + } + return default_value; /* Return default value for non-existent keys */ + } + + /** + * @brief Fill the store with entries from an ini-file + * + * @param[in] fname The fname + */ + void load(const std::string &fname) + { + std::string line, section; + std::smatch match; + filename = fname; + std::ifstream file(filename.c_str()); + + std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ + std::regex p_comment("^[^#]*(\\s*#.*)$"); /* line # comment */ + std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */ + + if (file.is_open()) { + + while (std::getline(file, line)) { + + /* remove all comments */ + if (std::regex_search(line, match, p_comment)) { line.erase(match.position(1)); } + + if (std::regex_search(line, match, p_section)) { + section = match[1]; + } else if (std::regex_search(line, match, p_key_val)) { + std::string key = match[1]; + std::string val = match[2]; + this->set(section, key, val); + } + } + + file.close(); + } else { + std::cerr << "Error opening file: " << filename << std::endl; + } + } + + /** + * @brief Dumps all entries in the store. + */ + void dump(std::string _section = "") + { + for (const auto §ion : store) { + if (_section == "" || _section == section.first) { + std::cout << "[" << section.first << "]" << std::endl; + for (const auto &pair : section.second) { + std::cout << " " << pair.first << " = " << std::get<1>(pair.second); + if (std::get<0>(pair.second) != std::get<1>(pair.second)) { std::cout << " # " << std::get<0>(pair.second); } + std::cout << std::endl; + } + } + } + } +}; + +/** + * @brief Returns the local lattice dimensions as lat_dim_t + * + * @return The local dimensions. + */ +static lat_dim_t get_local_dims(int *fill = nullptr) +{ + lat_dim_t X; + + for (int i = 0; i < 4; i++) { + if (fill) { + fill[i] = qudaState.layout.L[i]; + } else { + X[i] = qudaState.layout.L[i]; + } + } + + return X; +} + +/** + * @brief Calculate the rank from coordinates. + * + * @param[in] coords coords is the 4D cartesian coordinate of a rank + * @param[in] fdata should point to an instance of qudaLayout.ranks, + * @see struct openQCD_QudaLayout_t in + * @file include/quda_openqcd_interface.h + * + * @return rank + */ +int rankFromCoords(const int *coords, void *fdata) +{ + int *base = static_cast(fdata); + int *NPROC = base + 1; + int *ranks = base + 5; + int i; + + i = coords[3] + NPROC[3] * (coords[2] + NPROC[2] * (coords[1] + NPROC[1] * (coords[0]))); + return ranks[i]; +} + +/** + * Set set the local dimensions and machine topology for QUDA to use + * + * @param layout Struct defining local dimensions and machine topology + * @param infile Input file + */ +void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile) +{ + int my_rank; + char prefix[20]; + + for (int dir = 0; dir < 4; ++dir) { + if (layout.N[dir] % 2 != 0) { + errorQuda("Error: Odd lattice dimensions are not supported\n"); + exit(1); + } + } + + setMPICommHandleQuda(&layout.quda_comm); + +#ifdef MULTI_GPU + initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.data)); + static int device = -1; /* enable a default allocation of devices to processes */ +#else + static int device = layout.device; +#endif + + /* must happen *after* communication initialization */ + MPI_Comm_rank(layout.quda_comm, &my_rank); + sprintf(prefix, "QUDA (rank=%d): ", my_rank); + + strcpy(qudaState.infile, infile); + if (my_rank == 0) { + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + qudaState.init.verbosity = kv.get("QUDA", "verbosity", qudaState.init.verbosity); + } + + MPI_Bcast((void *)&qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, layout.quda_comm); + setVerbosityQuda(qudaState.init.verbosity, prefix, my_rank == 0 ? qudaState.init.logfile : stdout); + initQuda(device); +} + +static int getLinkPadding(const int dim[4]) +{ + int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2); + return padding; +} + +/** + * @brief Creates a new quda parameter struct + * + * @return The quda parameter struct. + */ +static QudaInvertParam newOpenQCDParam(void) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaInvertParam param = newQudaInvertParam(); + + param.verbosity = verbosity; + + param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ + param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA eigensolver */ + + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ + + /** + * The order of the input and output fermion fields. Imposes fieldOrder = + * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and + * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder. + */ + param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + + /** + * Gamma basis of the input and output host fields. Specifies the basis change + * into QUDAs internal gamma basis. Note that QUDA applies the basis change U + * to a spinor field when uploading and U^dagger when downloading. + */ + param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; + + return param; +} + +/** + * @brief Initialize quda gauge param struct + * + * @param[in] prec Precision + * @param[in] rec QUDA internal gauge field format + * @param[in] t_boundary Time boundary condition + * + * @return The quda gauge parameter struct. + */ +static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + QudaGaugeParam param = newQudaGaugeParam(); + + get_local_dims(param.X); + param.cuda_prec_sloppy = param.cpu_prec = param.cuda_prec = prec; + param.type = QUDA_SU3_LINKS; + + param.reconstruct_sloppy = param.reconstruct = rec; + + /* This makes quda to instantiate OpenQCDOrder */ + param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + + param.t_boundary = t_boundary; + param.gauge_fix = QUDA_GAUGE_FIXED_NO; + param.scale = 1.0; + param.anisotropy = 1.0; /* 1.0 means not anisotropic */ + param.ga_pad = getLinkPadding(param.X); /* Why this? */ + + return param; +} + +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile) +{ + if (qudaState.initialized) return; + qudaState.init = init; + qudaState.layout = layout; + + if (layout.quda_comm == MPI_COMM_NULL) return; + + qudaopenqcd_called(__func__); + openQCD_qudaSetLayout(qudaState.layout, infile); + qudaopenqcd_called(__func__); + qudaState.initialized = true; +} + +static int in_comm(void) +{ + return qudaState.layout.quda_comm != MPI_COMM_NULL; +} + +void openQCD_qudaFinalize(void) +{ + if (!in_comm()) return; + + for (int id = 0; id < OPENQCD_MAX_INVERTERS; ++id) { + if (qudaState.inv_handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } + } + + for (int id = 0; id < OPENQCD_MAX_EIGENSOLVERS; ++id) { + if (qudaState.eig_handles[id] != nullptr) { openQCD_qudaEigensolverDestroy(id); } + } + + qudaState.initialized = false; + endQuda(); +} + +double openQCD_qudaPlaquette(void) +{ + double plaq[3]; + + WITH_COMM(plaqQuda(plaq)); + + if (!qudaState.init.two_grids_equal) + MPI_Bcast(&plaq[0], 1, MPI_DOUBLE, 0, qudaState.layout.world_comm); + + /* Note different Nc normalization wrt openQCD! */ + return 3.0 * plaq[0]; +} + +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, gauge); + if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_GAUGE, gauge, buf)) { + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); + loadGaugeQuda(buf, ¶m); + } +} + +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, gauge); + if (in_comm()) { + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); + saveGaugeQuda(buf, ¶m); + } + qudaState.layout.quda2openqcd(OPENQCD_FIELD_GAUGE, buf, gauge); +} + +void openQCD_qudaGaugeFree(void) +{ + WITH_COMM(freeGaugeQuda()); + qudaState.ud_rev = -1; + qudaState.ad_rev = -1; + qudaState.swd_ud_rev = -1; + qudaState.swd_ad_rev = -1; +} + +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) +{ + QudaInvertParam param = newOpenQCDParam(); + param.clover_order = QUDA_OPENQCD_CLOVER_ORDER; + param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param.kappa = kappa; + param.clover_csw = csw; + param.clover_coeff = 0.0; + + void *buf = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, clover); + if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_CLOVER, clover, buf)) { + loadCloverQuda(buf, NULL, ¶m); + } +} + +void openQCD_qudaCloverFree(void) +{ + WITH_COMM(freeCloverQuda()); + qudaState.swd_ud_rev = 0; + qudaState.swd_ad_rev = 0; + qudaState.swd_kappa = 0.0; + qudaState.swd_su3csw = 0.0; + qudaState.swd_u1csw = 0.0; + qudaState.swd_qhat = 0; +} + +/** + * @brief Set the su3csw corfficient and all related properties. + * + * @param param The parameter struct + * @param[in] su3csw The su3csw coefficient + */ +inline void set_su3csw(QudaInvertParam *param, double su3csw) +{ + param->clover_csw = su3csw; + if (su3csw != 0.0) { + param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ + param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param->clover_coeff = 0.0; + + /* Set to Wilson Dirac operator with Clover term */ + param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + param->clover_order = QUDA_NATIVE_CLOVER_ORDER; /* what implication has this? */ + param->compute_clover = true; + } else { + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + } + } +} + +/** + * @brief Creates a new quda Dirac parameter struct + * + * @param[in] p OpenQCD Dirac parameter struct + * + * @return The quda Dirac parameter struct. + */ +static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDParam(); + + param.dslash_type = QUDA_WILSON_DSLASH; + param.kappa = p.kappa; + param.mu = p.mu; + param.dagger = QUDA_DAG_NO; + param.matpc_type = QUDA_MATPC_EVEN_EVEN; + + set_su3csw(¶m, p.su3csw); + + param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ + + /* What is the difference? only works with QUDA_MASS_NORMALIZATION */ + param.mass_normalization = QUDA_MASS_NORMALIZATION; + + /* Extent of the 5th dimension (for domain wall) */ + param.Ls = 1; + + return param; +} + + +static ColorSpinorField *openQCD_qudaSpinorAlloc(void) +{ + if (in_comm()) { + QudaInvertParam param = newOpenQCDParam(); + ColorSpinorParam cpuParam(nullptr, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + cudaParam.create = QUDA_NULL_FIELD_CREATE; + cudaParam.location = QUDA_CUDA_FIELD_LOCATION; + ColorSpinorField *out_d = new ColorSpinorField(cudaParam); + return out_d; + } else { + return nullptr; + } +} + + +void openQCD_qudaD2H(void *quda_field, void *openQCD_field) +{ + void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, openQCD_field); + + if (in_comm()) { + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(out, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField out_h(cpuParam); + + /* transfer the GPU field to CPU */ + out_h = *reinterpret_cast(quda_field); + } + + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, openQCD_field); +} + +void *openQCD_qudaH2D(void *openQCD_field) +{ + void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, openQCD_field); + + if (qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, openQCD_field, in)) { + + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + /* creates a field on the GPU with the same parameter set as the CPU field */ + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField *in_d = new ColorSpinorField(cudaParam); + + *in_d = in_h; /* transfer the CPU field to GPU */ + return in_d; + } + + return nullptr; +} + + +void openQCD_back_and_forth(void *h_in, void *h_out) +{ + ColorSpinorField *in = reinterpret_cast(openQCD_qudaH2D(h_in)); + ColorSpinorField *out = openQCD_qudaSpinorAlloc(); + WITH_COMM(*out = *in); + openQCD_qudaD2H(out, h_out); + openQCD_qudaSpinorFree((void**) &in); + openQCD_qudaSpinorFree((void**) &out); +} + + +int openQCD_qudaIndexIptLexi(const int iy) +{ + int x[4]; + int L_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + + x[3] = iy % L_openqcd[3]; + int c = (iy - x[3])/L_openqcd[3]; + x[2] = c % L_openqcd[2]; + c = (c - x[2])/L_openqcd[2]; + x[1] = c % L_openqcd[1]; + x[0] = (c - x[1])/L_openqcd[1]; + + return openqcd::ipt(x, L_openqcd); +} + +int openQCD_qudaIndexIpt(const int *x) +{ + int L_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + return openqcd::ipt(x, L_openqcd); +} + +int openQCD_qudaIndexIup(const int *x, const int mu) +{ + int L_openqcd[4], nproc_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + openqcd::rotate_coords(qudaState.layout.nproc, nproc_openqcd); + return openqcd::iup(x, mu, L_openqcd, nproc_openqcd); +} + +double openQCD_qudaNorm(void *h_in) +{ + ColorSpinorField *in = reinterpret_cast(openQCD_qudaH2D(h_in)); + double norm2 = in_comm() ? blas::norm2(*in) : 0.0; + openQCD_qudaSpinorFree((void**) &in); + if (!qudaState.init.two_grids_equal) { + MPI_Bcast(&norm2, 1, MPI_DOUBLE, 0, qudaState.layout.world_comm); + } + return norm2; +} + +double openQCD_qudaNorm_NoLoads(void *d_in) { + double norm2 = in_comm() ? blas::norm2(*reinterpret_cast(d_in)) : 0.0; + if (!qudaState.init.two_grids_equal) { + MPI_Bcast(&norm2, 1, MPI_DOUBLE, 0, qudaState.layout.world_comm); + } + return norm2; +} + +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) +{ + ColorSpinorField *in = reinterpret_cast(openQCD_qudaH2D(openQCD_in)); + ColorSpinorField *out = openQCD_qudaSpinorAlloc(); + + /* gamma_i run within QUDA using QUDA fields */ + if (in_comm()) { + switch (dir) { + case 0: ApplyGamma(*out, *in, QUDA_GAMMA_T); break; + case 1: ApplyGamma(*out, *in, QUDA_GAMMA_X); break; + case 2: ApplyGamma(*out, *in, QUDA_GAMMA_Y); break; + case 3: ApplyGamma(*out, *in, QUDA_GAMMA_Z); break; + case 4: + case 5: + ApplyGamma(*out, *in, QUDA_GAMMA_5); + /* UKQCD uses a different convention for Gamma matrices: + * gamma5_ukqcd = gammax gammay gammaz gammat, + * gamma5_openqcd = gammat gammax gammay gammaz, + * and thus + * gamma5_openqcd = -1 * U gamma5_ukqcd U^dagger, + * with U the transformation matrix from OpenQCD to UKQCD. */ + blas::ax(-1.0, *out); + break; + default: errorQuda("Unknown gamma: %d\n", dir); + } + } + + openQCD_qudaD2H(out, openQCD_out); + openQCD_qudaSpinorFree((void**) &in); + openQCD_qudaSpinorFree((void**) &out); +} + + +void openQCD_qudaSpinorFree(void **quda_field) +{ + delete reinterpret_cast(*quda_field); + *quda_field = nullptr; +} + + +/** + * @brief Check whether the gauge field from openQCD is in sync with the + * one from QUDA. + * + * @return true/false + */ +inline bool gauge_field_get_up2date(void) +{ + int ud_rev = -2, ad_rev = -2; + + /* get current residing gauge field revision (residing in openqxd) */ + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev; +} + +/** + * @brief Check whether the gauge field is not (yet) set in openQCD. + * + * @return true/false + */ +inline bool gauge_field_get_unset(void) +{ + int ud_rev = -2, ad_rev = -2; + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + return ud_rev == 0 && ad_rev == 0; +} + +/** + * @brief Check if the current SW field needs to update wrt the parameters from openQCD. + * + * @return true/false + */ +inline bool clover_field_get_up2date(void) +{ + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + bool test = gauge_field_get_up2date(); + return (test && qudaState.swd_ud_rev == qudaState.ud_rev && qudaState.swd_ad_rev == qudaState.ad_rev + && qudaState.swd_kappa == 1.0 / (2.0 * (dp.m0 + 4.0)) && qudaState.swd_su3csw == dp.su3csw + && qudaState.swd_u1csw == dp.u1csw && qudaState.swd_qhat == dp.qhat); +} + +/** + * @brief Check whether the multigrid instance associated to the parameter + * struct is up to date with the global gauge field revision, + * parameters are in sync, and clover/gauge fields are up to date. + * + * @param param The parameter struct + * + * @return true/false + */ +inline bool mg_get_up2date(QudaInvertParam *param) +{ + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + int test = param->preconditioner != nullptr; + + MPI_Bcast(&test, 1, MPI_INT, 0, qudaState.layout.world_comm); + return (test && gauge_field_get_up2date() && clover_field_get_up2date() + && additional_prop->mg_ud_rev == qudaState.ud_rev && additional_prop->mg_ad_rev == qudaState.ad_rev + && additional_prop->mg_kappa == 1.0 / (2.0 * (dp.m0 + 4.0)) && additional_prop->mg_su3csw == dp.su3csw + && additional_prop->mg_u1csw == dp.u1csw && additional_prop->mg_qhat == dp.qhat); +} + +/** + * @brief Sets the multigrid instance associated to the parameter struct to + * be in sync with openQxD. + * + * @param param The parameter struct + */ +inline void mg_set_revision(QudaInvertParam *param) +{ + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + additional_prop->mg_ud_rev = qudaState.ud_rev; + additional_prop->mg_ad_rev = qudaState.ad_rev; + additional_prop->mg_kappa = 1.0 / (2.0 * (dp.m0 + 4.0)); + additional_prop->mg_su3csw = dp.su3csw; + additional_prop->mg_u1csw = dp.u1csw; + additional_prop->mg_qhat = dp.qhat; +} + +/** + * @brief Set the global revisions numners for the SW field. + */ +inline void clover_field_set_revision(void) +{ + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + qudaState.swd_ud_rev = qudaState.ud_rev; + qudaState.swd_ad_rev = qudaState.ad_rev; + qudaState.swd_kappa = 1.0 / (2.0 * (dp.m0 + 4.0)); + qudaState.swd_su3csw = dp.su3csw; + qudaState.swd_u1csw = dp.u1csw; + qudaState.swd_qhat = dp.qhat; +} + +/** + * @brief Set the global revisions numners for the gauge field. + */ +inline void gauge_field_set_revision(void) { qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); } + + +/** + * @brief Check if the solver parameters are in sync with the parameters + * from openQCD. + * + * @param param_ The parameter struct + * + * @return Whether parameters are in sync or not + */ +static int openQCD_qudaInvertParamCheck(void *param_) +{ + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + bool ret = true; + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + + if (param->kappa != (1.0 / (2.0 * (dp.m0 + 4.0)))) { + WITH_COMM(logQuda( + QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + (1.0 / (2.0 * (dp.m0 + 4.0))), param->kappa)); + WITH_COMM(logQuda(QUDA_VERBOSE, " => need params update\n")); + ret = false; + } + + if (additional_prop->u1csw != dp.u1csw) { + WITH_COMM(logQuda( + QUDA_VERBOSE, + "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + dp.u1csw, additional_prop->u1csw)); + WITH_COMM(logQuda(QUDA_VERBOSE, " => need clover field and params update\n")); + ret = false; + } + + if (additional_prop->qhat != dp.qhat) { + WITH_COMM(logQuda(QUDA_VERBOSE, + "Property qhat does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %d, QUDA: %d)\n", + dp.qhat, additional_prop->qhat)); + WITH_COMM(logQuda(QUDA_VERBOSE, " => need gauge, clover field and params update\n")); + ret = false; + } + + if (param->clover_csw != dp.su3csw) { + WITH_COMM(logQuda( + QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + dp.su3csw, param->clover_csw)); + WITH_COMM(logQuda(QUDA_VERBOSE, " => need clover field and params update\n")); + ret = false; + } + + return ret; +} + +/** + * @brief Check if the solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_solver_id(int id) +{ + if (id < -1 || id > OPENQCD_MAX_INVERTERS - 1) { + WITH_COMM(errorQuda("Solver id %d is out of range [%d, %d).", id, -1, OPENQCD_MAX_INVERTERS)); + } +} + +/** + * @brief Check if the eigen-solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_eigensolver_id(int id) +{ + if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS - 1) { + WITH_COMM(errorQuda("Eigensolver id %d is out of range [%d, %d).", id, 0, OPENQCD_MAX_EIGENSOLVERS)); + } +} + + +/** + * @brief Transfer the gauge field if the gauge field was updated in + * openQxD. (Re-)calculate or transfer the clover field if + * parameters have changed or gauge field was updated. Update the + * settings kappa, qhat, su3csw and u1csw in the QudaInvertParam + * struct such that they are in sync with openQxD. Set up or update + * the multigrid instance if set in QudaInvertParam and if gauge- or + * clover-fields or parameters have changed. + * + * @param param_ The parameter struct, where in param->additional_prop a + * pointer to the QudaMultigridParam struct was placed. + */ +static void openQCD_qudaSolverUpdate(void *param_) +{ + if (param_ == nullptr) { WITH_COMM(errorQuda("Solver handle is NULL.")); } + + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + + bool do_param_update = !openQCD_qudaInvertParamCheck(param_); + bool do_gauge_transfer = (!gauge_field_get_up2date() && !gauge_field_get_unset()) + || additional_prop->qhat != dp.qhat; + bool do_clover_update = !clover_field_get_up2date() && !gauge_field_get_unset(); + bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER + && !mg_get_up2date(param) && !gauge_field_get_unset(); + bool do_multigrid_fat_update = do_multigrid_update + && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev + || additional_prop->mg_ad_rev != qudaState.ad_rev); + + if (do_gauge_transfer) { + if (qudaState.layout.h_gauge == nullptr) { WITH_COMM(errorQuda("qudaState.layout.h_gauge is not set.")); } + WITH_COMM(logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n")); + void *h_gauge = qudaState.layout.h_gauge(); + PUSH_RANGE("openQCD_qudaGaugeLoad", 3); + +#ifdef BUILD_QCD_PLUS_QED + QudaReconstructType rec = QUDA_RECONSTRUCT_9; + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + WITH_COMM(warningQuda("QUDA was built with QCD+QED support, but gauge group does not require it (set QUDA_QCD_PLUS_QED=OFF)")); + } +#else + QudaReconstructType rec = QUDA_RECONSTRUCT_8; + if (qudaState.layout.flds_parms().gauge != OPENQCD_GAUGE_SU3) { + WITH_COMM(errorQuda("QUDA was not built with QCD+QED support, but gauge group requires it (set QUDA_QCD_PLUS_QED=ON)")); + } +#endif + + /** + * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that + * tells QUDA the current state of the residing gauge field, that is the + * same state as the one we transfer from openqxd. In openqxd the hdfld + * exhibits phases of -1 for the temporal time boundaries, meaning that the + * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see + * chs_hd0() in hflds.c. The QUDA_ANTI_PERIODIC_T flag says that these + * phases are incorporated into the field and that QUDA has to add these + * phases on the t=0 time slice when reconstructing the field from + * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of + * QUDA_RECONSTRUCT_NO the value if t_boundary has no effect. + * + * @see https://github.com/lattice/quda/issues/1315 + * @see Reconstruct#Unpack() in gauge_field_order.h + * @see Reconstruct<8,...>#Unpack() in gauge_field_order.h + * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h + */ + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); + gauge_field_set_revision(); + POP_RANGE; + } + + if (do_param_update) { + WITH_COMM(logQuda(QUDA_VERBOSE, "Syncing kappa, qhat, su3csw, u1csw values from openQCD ...\n")); + param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0)); + additional_prop->u1csw = dp.u1csw; + additional_prop->qhat = dp.qhat; + set_su3csw(param, dp.su3csw); + + QudaInvertParam *mg_inv_param = additional_prop->mg_param->invert_param; + mg_inv_param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0)); + set_su3csw(mg_inv_param, dp.su3csw); + } + + if (do_clover_update) { + if (param->clover_csw == 0.0) { + WITH_COMM(logQuda(QUDA_VERBOSE, "Deallocating Clover field in QUDA ...\n")); + openQCD_qudaCloverFree(); + qudaState.swd_ud_rev = 0; + qudaState.swd_ad_rev = 0; + qudaState.swd_kappa = 0.0; + qudaState.swd_su3csw = 0.0; + qudaState.swd_u1csw = 0.0; + qudaState.swd_qhat = 0; + } else { + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + /** + * SU3 case: + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on + * the GPU and finally calls @createCloverQuda to calculate the clover + * field. + */ + WITH_COMM(logQuda(QUDA_VERBOSE, "Generating Clover field in QUDA ...\n")); + PUSH_RANGE("loadCloverQuda", 3); + WITH_COMM(loadCloverQuda(NULL, NULL, param)); + POP_RANGE; + clover_field_set_revision(); + } else { + /** + * U3 case: Transfer the SW-field from openQCD. + */ + + if (qudaState.layout.h_sw == nullptr) { WITH_COMM(errorQuda("qudaState.layout.h_sw is not set.")); } + + WITH_COMM(logQuda(QUDA_VERBOSE, "Loading Clover field from openQCD ...\n")); + void *h_sw = qudaState.layout.h_sw(); + PUSH_RANGE("openQCD_qudaCloverLoad", 3); + openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); + POP_RANGE; + clover_field_set_revision(); + + /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ + /* TODO: The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? + QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS + (timer.h:82 in start()) + (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const + char*, const char*, const char*, bool)()) QUDA: last kernel called was + (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ + } + } + } + + /* setup/update the multigrid instance or do nothing */ + if (do_multigrid_update) { + QudaMultigridParam *mg_param = additional_prop->mg_param; + + if (mg_param == nullptr) { WITH_COMM(errorQuda("No multigrid parameter struct set.")); } + + if (do_multigrid_fat_update && param->preconditioner != nullptr) { + WITH_COMM(logQuda(QUDA_VERBOSE, "Destroying existing multigrid instance ...\n")); + PUSH_RANGE("destroyMultigridQuda", 4); + WITH_COMM(destroyMultigridQuda(param->preconditioner)); + param->preconditioner = nullptr; + POP_RANGE; + + additional_prop->mg_ud_rev = 0; + additional_prop->mg_ad_rev = 0; + additional_prop->mg_kappa = 0.0; + additional_prop->mg_su3csw = 0.0; + additional_prop->mg_u1csw = 0.0; + additional_prop->mg_qhat = 0; + } + + if (in_comm()) { + if (param->preconditioner == nullptr) { + logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n"); + PUSH_RANGE("newMultigridQuda", 4); + param->preconditioner = newMultigridQuda(mg_param); + POP_RANGE; + } else { + logQuda(QUDA_VERBOSE, "Updating existing multigrid instance ...\n"); + PUSH_RANGE("updateMultigridQuda", 4); + updateMultigridQuda(param->preconditioner, mg_param); + POP_RANGE; + } + } + mg_set_revision(param); + } +} + + +static void *openQCD_qudaSolverReadIn(int id) +{ + int my_rank; + openQCD_dirac_parms_t dp = qudaState.layout.dirac_parms(); + + MPI_Comm_rank(qudaState.layout.world_comm, &my_rank); + + /* Allocate on the heap */ + QudaInvertParam *param = new QudaInvertParam(newQudaInvertParam()); + QudaInvertParam *invert_param_mg = new QudaInvertParam(newQudaInvertParam()); + QudaMultigridParam *multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); + std::string section = "Solver " + std::to_string(id); + + /* Some default settings */ + /* Some of them should not be changed */ + param->verbosity = QUDA_SUMMARIZE; + param->cpu_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + param->cuda_prec_sloppy = QUDA_SINGLE_PRECISION; + param->cuda_prec_precondition = QUDA_HALF_PRECISION; + param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; + param->dslash_type = QUDA_WILSON_DSLASH; + param->kappa = 1.0 / (2.0 * (dp.m0 + 4.0)); + param->mu = 0.0; + param->dagger = QUDA_DAG_NO; + param->solution_type = QUDA_MAT_SOLUTION; + param->solve_type = QUDA_DIRECT_SOLVE; + param->matpc_type = QUDA_MATPC_EVEN_EVEN; + param->solver_normalization = QUDA_DEFAULT_NORMALIZATION; + param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ + param->mass_normalization = QUDA_MASS_NORMALIZATION; + + set_su3csw(param, dp.su3csw); + + if (my_rank == 0 && id != -1) { + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + + if (!kv.section_exists(section)) { + WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", section.c_str(), qudaState.infile)); + } + + param->verbosity = kv.get(section, "verbosity", param->verbosity); + + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } + + if (kv.get(section, "solver") != "QUDA") { + WITH_COMM(errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", section.c_str(), + qudaState.infile, kv.get(section, "solver").c_str())); + } + + /* both fields reside on the CPU */ + param->input_location = kv.get(section, "input_location", QUDA_CPU_FIELD_LOCATION); + param->output_location = kv.get(section, "output_location", QUDA_CPU_FIELD_LOCATION); + + param->inv_type = kv.get(section, "inv_type", param->inv_type); + param->kappa = kv.get(section, "kappa", param->kappa); + param->mu = kv.get(section, "mu", param->mu); + param->tm_rho = kv.get(section, "tm_rho", param->tm_rho); + param->epsilon = kv.get(section, "epsilon", param->epsilon); + param->twist_flavor = kv.get(section, "twist_flavor", param->twist_flavor); + param->laplace3D = kv.get(section, "laplace3D", param->laplace3D); + + /* Solver settings */ + param->tol = kv.get(section, "tol", param->tol); + param->tol_restart = kv.get(section, "tol_restart", param->tol_restart); + param->tol_hq = kv.get(section, "tol_hq", param->tol_hq); + + param->compute_true_res = kv.get(section, "compute_true_res", param->compute_true_res); + param->true_res[0] = kv.get(section, "true_res", param->true_res[0]); /* TODO: mrhs */ + param->true_res_hq[0] = kv.get(section, "true_res_hq", param->true_res_hq[0]); + param->maxiter = kv.get(section, "maxiter", param->maxiter); + param->reliable_delta = kv.get(section, "reliable_delta", param->reliable_delta); + param->reliable_delta_refinement + = kv.get(section, "reliable_delta_refinement", param->reliable_delta_refinement); + param->use_alternative_reliable = kv.get(section, "use_alternative_reliable", param->use_alternative_reliable); + param->use_sloppy_partial_accumulator + = kv.get(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator); + + param->solution_accumulator_pipeline + = kv.get(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline); + + param->max_res_increase = kv.get(section, "max_res_increase", param->max_res_increase); + param->max_res_increase_total = kv.get(section, "max_res_increase_total", param->max_res_increase_total); + param->max_hq_res_increase = kv.get(section, "max_hq_res_increase", param->max_hq_res_increase); + param->max_hq_res_restart_total = kv.get(section, "max_hq_res_restart_total", param->max_hq_res_restart_total); + + param->heavy_quark_check = kv.get(section, "heavy_quark_check", param->heavy_quark_check); + + param->pipeline = kv.get(section, "pipeline", param->pipeline); + param->num_offset = kv.get(section, "num_offset", param->num_offset); + param->num_src = kv.get(section, "num_src", param->num_src); + param->num_src_per_sub_partition + = kv.get(section, "num_src_per_sub_partition", param->num_src_per_sub_partition); + + param->split_grid[0] = kv.get(section, "split_grid[1]", param->split_grid[0]); + param->split_grid[1] = kv.get(section, "split_grid[2]", param->split_grid[1]); + param->split_grid[2] = kv.get(section, "split_grid[3]", param->split_grid[2]); + param->split_grid[3] = kv.get(section, "split_grid[0]", param->split_grid[3]); + + param->overlap = kv.get(section, "overlap", param->overlap); + + for (int i = 0; i < param->num_offset; i++) { + std::string sub_key = "offset[" + std::to_string(i) + "]"; + param->offset[i] = kv.get(section, sub_key, param->offset[i]); + sub_key = "tol_offset[" + std::to_string(i) + "]"; + param->tol_offset[i] = kv.get(section, sub_key, param->tol_offset[i]); + sub_key = "tol_hq_offset[" + std::to_string(i) + "]"; + param->tol_hq_offset[i] = kv.get(section, sub_key, param->tol_hq_offset[i]); + } + + param->compute_action = kv.get(section, "compute_action", param->compute_action); + + param->solution_type = kv.get(section, "solution_type", param->solution_type); + param->solve_type = kv.get(section, "solve_type", param->solve_type); + param->matpc_type = kv.get(section, "matpc_type", param->matpc_type); + param->dagger = kv.get(section, "dagger", param->dagger); + param->mass_normalization = kv.get(section, "mass_normalization", param->mass_normalization); + param->solver_normalization + = kv.get(section, "solver_normalization", param->solver_normalization); + + param->preserve_source = kv.get(section, "preserve_source", param->preserve_source); + + param->cpu_prec = kv.get(section, "cpu_prec", param->cpu_prec); + param->cuda_prec = kv.get(section, "cuda_prec", param->cuda_prec); + param->cuda_prec_sloppy = kv.get(section, "cuda_prec_sloppy", param->cuda_prec_sloppy); + param->cuda_prec_refinement_sloppy + = kv.get(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy); + param->cuda_prec_precondition + = kv.get(section, "cuda_prec_precondition", param->cuda_prec_precondition); + param->cuda_prec_eigensolver = kv.get(section, "cuda_prec_eigensolver", param->cuda_prec_eigensolver); + + param->clover_location = kv.get(section, "clover_location", param->clover_location); + param->clover_cpu_prec = kv.get(section, "clover_cpu_prec", param->clover_cpu_prec); + param->clover_cuda_prec = kv.get(section, "clover_cuda_prec", param->clover_cuda_prec); + param->clover_cuda_prec_sloppy + = kv.get(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy); + param->clover_cuda_prec_refinement_sloppy + = kv.get(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy); + param->clover_cuda_prec_precondition + = kv.get(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition); + param->clover_cuda_prec_eigensolver + = kv.get(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver); + + param->use_init_guess = kv.get(section, "use_init_guess", param->use_init_guess); + + param->clover_csw = kv.get(section, "clover_csw", param->clover_csw); + param->clover_coeff = kv.get(section, "clover_coeff", param->clover_coeff); + param->clover_rho = kv.get(section, "clover_rho", param->clover_rho); + param->compute_clover_trlog = kv.get(section, "compute_clover_trlog", param->compute_clover_trlog); + param->Nsteps = kv.get(section, "Nsteps", param->Nsteps); + param->gcrNkrylov = kv.get(section, "gcrNkrylov", param->gcrNkrylov); + + param->inv_type_precondition + = kv.get(section, "inv_type_precondition", param->inv_type_precondition); + param->deflate = kv.get(section, "deflate", param->deflate); + param->verbosity_precondition + = kv.get(section, "verbosity_precondition", param->verbosity_precondition); + param->tol_precondition = kv.get(section, "tol_precondition", param->tol_precondition); + param->maxiter_precondition = kv.get(section, "maxiter_precondition", param->maxiter_precondition); + param->omega = kv.get(section, "omega", param->omega); + param->ca_basis = kv.get(section, "ca_basis", param->ca_basis); + param->ca_lambda_min = kv.get(section, "ca_lambda_min", param->ca_lambda_min); + param->ca_lambda_max = kv.get(section, "ca_lambda_max", param->ca_lambda_max); + param->ca_basis_precondition = kv.get(section, "ca_basis_precondition", param->ca_basis_precondition); + param->ca_lambda_min_precondition + = kv.get(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition); + param->ca_lambda_max_precondition + = kv.get(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition); + param->precondition_cycle = kv.get(section, "precondition_cycle", param->precondition_cycle); + param->schwarz_type = kv.get(section, "schwarz_type", param->schwarz_type); + param->accelerator_type_precondition + = kv.get(section, "accelerator_type_precondition", param->accelerator_type_precondition); + + param->madwf_diagonal_suppressor + = kv.get(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor); + param->madwf_ls = kv.get(section, "madwf_ls", param->madwf_ls); + param->madwf_null_miniter = kv.get(section, "madwf_null_miniter", param->madwf_null_miniter); + param->madwf_null_tol = kv.get(section, "madwf_null_tol", param->madwf_null_tol); + param->madwf_train_maxiter = kv.get(section, "madwf_train_maxiter", param->madwf_train_maxiter); + + param->madwf_param_load = kv.get(section, "madwf_param_load", param->madwf_param_load); + param->madwf_param_save = kv.get(section, "madwf_param_save", param->madwf_param_save); + strcpy(param->madwf_param_infile, + kv.get(section, "madwf_param_infile", param->madwf_param_infile).c_str()); + strcpy(param->madwf_param_outfile, + kv.get(section, "madwf_param_outfile", param->madwf_param_outfile).c_str()); + + param->residual_type = kv.get(section, "residual_type", param->residual_type); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + + std::string mg_section = section + " Multigrid"; + + if (!kv.section_exists(mg_section)) { + WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", mg_section.c_str(), qudaState.infile)); + } + + /* (shallow) copy the struct */ + *invert_param_mg = *param; + + /* these have to be fixed, and cannot be overwritten by the input file */ + invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; + + multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level, true); + multigrid_param->setup_type = kv.get(mg_section, "setup_type", multigrid_param->setup_type); + multigrid_param->pre_orthonormalize + = kv.get(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize); + multigrid_param->post_orthonormalize + = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); + multigrid_param->compute_null_vector + = kv.get(mg_section, "compute_null_vector", multigrid_param->compute_null_vector); + multigrid_param->generate_all_levels + = kv.get(mg_section, "generate_all_levels", multigrid_param->generate_all_levels); + multigrid_param->run_verify = kv.get(mg_section, "run_verify", multigrid_param->run_verify); + multigrid_param->run_low_mode_check + = kv.get(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check); + multigrid_param->run_oblique_proj_check + = kv.get(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check); + multigrid_param->coarse_guess = kv.get(mg_section, "coarse_guess", multigrid_param->coarse_guess); + multigrid_param->preserve_deflation + = kv.get(mg_section, "preserve_deflation", multigrid_param->preserve_deflation); + multigrid_param->allow_truncation + = kv.get(mg_section, "allow_truncation", multigrid_param->allow_truncation); + multigrid_param->staggered_kd_dagger_approximation = kv.get( + mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation); + multigrid_param->thin_update_only + = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); + + for (int i = 0; i < multigrid_param->n_level; i++) { + std::string subsection = section + " Multigrid Level " + std::to_string(i); + + if (!kv.section_exists(subsection)) { + WITH_COMM(errorQuda("Solver section \"%s\" in file %s does not exist.", subsection.c_str(), qudaState.infile)); + } + + multigrid_param->geo_block_size[i][0] + = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); + multigrid_param->geo_block_size[i][1] + = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); + multigrid_param->geo_block_size[i][2] + = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); + multigrid_param->geo_block_size[i][3] + = kv.get(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]); + multigrid_param->spin_block_size[i] + = kv.get(subsection, "spin_block_size", multigrid_param->spin_block_size[i]); + multigrid_param->n_vec[i] = kv.get(subsection, "n_vec", multigrid_param->n_vec[i]); + multigrid_param->n_vec_batch[i] = kv.get(subsection, "n_vec_batch", 1); + multigrid_param->precision_null[i] + = kv.get(subsection, "precision_null", multigrid_param->precision_null[i]); + multigrid_param->n_block_ortho[i] = kv.get(subsection, "n_block_ortho", multigrid_param->n_block_ortho[i]); + multigrid_param->block_ortho_two_pass[i] + = kv.get(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]); + multigrid_param->verbosity[i] = kv.get(subsection, "verbosity", multigrid_param->verbosity[i]); + multigrid_param->setup_inv_type[i] + = kv.get(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]); + multigrid_param->setup_use_mma[i] + = kv.get(subsection, "setup_use_mma", multigrid_param->setup_use_mma[i]); + multigrid_param->dslash_use_mma[i] + = kv.get(subsection, "dslash_use_mma", multigrid_param->dslash_use_mma[i]); + multigrid_param->num_setup_iter[i] + = kv.get(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]); + multigrid_param->setup_tol[i] = kv.get(subsection, "setup_tol", multigrid_param->setup_tol[i]); + multigrid_param->setup_maxiter[i] = kv.get(subsection, "setup_maxiter", multigrid_param->setup_maxiter[i]); + multigrid_param->setup_maxiter_refresh[i] + = kv.get(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]); + multigrid_param->setup_ca_basis[i] + = kv.get(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]); + multigrid_param->setup_ca_basis_size[i] + = kv.get(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]); + multigrid_param->setup_ca_lambda_min[i] + = kv.get(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]); + multigrid_param->setup_ca_lambda_max[i] + = kv.get(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]); + + multigrid_param->coarse_solver[i] + = kv.get(subsection, "coarse_solver", multigrid_param->coarse_solver[i]); + multigrid_param->coarse_solver_tol[i] + = kv.get(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]); + multigrid_param->coarse_solver_maxiter[i] + = kv.get(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]); + multigrid_param->coarse_solver_ca_basis[i] + = kv.get(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]); + multigrid_param->coarse_solver_ca_basis_size[i] + = kv.get(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]); + multigrid_param->coarse_solver_ca_lambda_min[i] + = kv.get(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]); + multigrid_param->coarse_solver_ca_lambda_max[i] + = kv.get(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]); + multigrid_param->smoother[i] = kv.get(subsection, "smoother", multigrid_param->smoother[i]); + multigrid_param->smoother_tol[i] = kv.get(subsection, "smoother_tol", multigrid_param->smoother_tol[i]); + multigrid_param->nu_pre[i] = kv.get(subsection, "nu_pre", multigrid_param->nu_pre[i]); + multigrid_param->nu_post[i] = kv.get(subsection, "nu_post", multigrid_param->nu_post[i]); + multigrid_param->smoother_solver_ca_basis[i] + = kv.get(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]); + multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get( + subsection, "smoother_solver_ca_lambda_min", multigrid_param->smoother_solver_ca_lambda_min[i]); + multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get( + subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]); + multigrid_param->omega[i] = kv.get(subsection, "omega", multigrid_param->omega[i]); + multigrid_param->smoother_halo_precision[i] + = kv.get(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]); + multigrid_param->smoother_schwarz_type[i] + = kv.get(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]); + multigrid_param->smoother_schwarz_cycle[i] + = kv.get(subsection, "smoother_schwarz_cycle", multigrid_param->smoother_schwarz_cycle[i]); + multigrid_param->coarse_grid_solution_type[i] = kv.get( + subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]); + multigrid_param->smoother_solve_type[i] + = kv.get(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]); + multigrid_param->cycle_type[i] + = kv.get(subsection, "cycle_type", multigrid_param->cycle_type[i]); + multigrid_param->global_reduction[i] + = kv.get(subsection, "global_reduction", multigrid_param->global_reduction[i]); + multigrid_param->location[i] = kv.get(subsection, "location", multigrid_param->location[i]); + multigrid_param->setup_location[i] + = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); + multigrid_param->use_eig_solver[i] + = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); + + multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); + multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); + /*strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", + multigrid_param->vec_infile[i]).c_str()); strcpy(multigrid_param->vec_outfile[i], + kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/ + + multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); + multigrid_param->transfer_type[i] + = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); + } + } + } + + /* transfer of the struct to all processes */ + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, qudaState.layout.world_comm); + MPI_Bcast((void *)invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, qudaState.layout.world_comm); + MPI_Bcast((void *)multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, qudaState.layout.world_comm); + multigrid_param->invert_param = invert_param_mg; + + /** + * We need a void* to store the multigrid_param (QudaMultigridParam) struct, + * such that we can access it and setup multigrid in a later stage, for + * instance right before calling invertQuda() if multigrid was not + * instantiated until then. + */ + openQCD_QudaSolver *additional_prop = new openQCD_QudaSolver(); + sprintf(additional_prop->infile, "%s", qudaState.infile); + additional_prop->id = id; + additional_prop->mg_param = multigrid_param; + additional_prop->u1csw = 0.0; + additional_prop->qhat = 0.0; + param->additional_prop = reinterpret_cast(additional_prop); + + return (void *)param; +} + +void *openQCD_qudaSolverGetHandle(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] == nullptr) { + if (id != -1) { + WITH_COMM(logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id)); + } + qudaState.inv_handles[id] = openQCD_qudaSolverReadIn(id); + } + + openQCD_qudaSolverUpdate(qudaState.inv_handles[id]); + return qudaState.inv_handles[id]; +} + +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) +{ + void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, src); + void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 1, dst); + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, src, in); + + if (in_comm()) { + + QudaInvertParam param = newOpenQCDDiracParam(p); + + /* both fields reside on the CPU */ + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(out), static_cast(in), ¶m); + + logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n"); + logQuda(QUDA_DEBUG_VERBOSE, " gflops = %.2e\n", param.gflops); + logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); + } + + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, dst); +} + +void openQCD_qudaDw(double mu, void *src, void *dst) +{ + if (gauge_field_get_unset()) { + WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); + } + + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(-1)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + WITH_COMM(errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync.")); + } + + /* both fields reside on the CPU */ + param->input_location = QUDA_CPU_FIELD_LOCATION; + param->output_location = QUDA_CPU_FIELD_LOCATION; + + void *in = qudaState.init.buffer_field(qudaState.layout.world_comm, 0, src); + void *out = qudaState.init.buffer_field(qudaState.layout.world_comm, 1, dst); + + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, src, in); + WITH_COMM(MatQuda(static_cast(out), static_cast(in), param)); + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, out, dst); +} + +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out) +{ + if (gauge_field_get_unset()) { + WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); + } + + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(-1)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + WITH_COMM(errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync.")); + } + + /* both fields reside on the GPU */ + param->input_location = QUDA_CUDA_FIELD_LOCATION; + param->output_location = QUDA_CUDA_FIELD_LOCATION; + + ColorSpinorField *in = reinterpret_cast(d_in); + ColorSpinorField *out = reinterpret_cast(d_out); + + /* truncated version of what MatQuda does */ + if (in_comm()) { + pushVerbosity(param->verbosity); + bool pc = (param->solution_type == QUDA_MATPC_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + + DiracParam diracParam; + setDiracParam(diracParam, param, pc); + + Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator + dirac->M(*out, *in); // apply the operator + delete dirac; // clean up + + if (pc) { + if (param->mass_normalization == QUDA_MASS_NORMALIZATION) { + blas::ax(0.25 / (param->kappa * param->kappa), *out); + } else if (param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5 / param->kappa, *out); + } + } else { + if (param->mass_normalization == QUDA_MASS_NORMALIZATION + || param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5 / param->kappa, *out); + } + } + popVerbosity(); + } +} + +/** + * @brief Take the string-hash over a struct using std::hash. + * + * @param[in] in Input struct + * + * @tparam T Type of input struct + * + * @return Hash value + */ +template int hash_struct(T *in) +{ + int hash = 0; + char *cstruct = reinterpret_cast(in); + + for (char *c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) { + if (strlen(c) != 0) { hash ^= (std::hash {}(std::string(c)) << 1); } + } + + return hash; +} + +int openQCD_qudaSolverGetHash(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = reinterpret_cast(qudaState.inv_handles[id]); + QudaInvertParam hparam = newQudaInvertParam(); + memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ + + /* Set some properties we want to take the hash over */ + hparam.inv_type = param->inv_type; + hparam.tol = param->tol; + hparam.tol_restart = param->tol_restart; + hparam.tol_hq = param->tol_hq; + hparam.maxiter = param->maxiter; + hparam.reliable_delta = param->reliable_delta; + hparam.solution_type = param->solution_type; + hparam.solve_type = param->solve_type; + hparam.matpc_type = param->matpc_type; + hparam.dagger = param->dagger; + hparam.mass_normalization = param->mass_normalization; + hparam.solver_normalization = param->solver_normalization; + hparam.cpu_prec = param->cpu_prec; + hparam.cuda_prec = param->cuda_prec; + hparam.use_init_guess = param->use_init_guess; + hparam.gcrNkrylov = param->gcrNkrylov; + + return hash_struct(&hparam); + } else { + return 0; + } +} + +void openQCD_qudaSolverPrintSetup(int id) +{ + check_solver_id(id); + if (!in_comm()) { return; } + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + printQudaInvertParam(param); + printfQuda("additional_prop->infile = %s\n", additional_prop->infile); + printfQuda("additional_prop->id = %d\n", additional_prop->id); + printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); + printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw); + printfQuda("additional_prop->qhat = %d\n", additional_prop->qhat); + printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev); + printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev); + printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa); + printfQuda("additional_prop->mg_su3csw = %.6e\n", additional_prop->mg_su3csw); + printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw); + printfQuda("additional_prop->mg_qhat = %d\n", additional_prop->mg_qhat); + printfQuda("handle = %p\n", param); + printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id)); + + printfQuda("inv_type_precondition = %d\n", param->inv_type_precondition); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { printQudaMultigridParam(additional_prop->mg_param); } + } else { + printfQuda("\n"); + } +} + +double openQCD_qudaInvert(int id, double mu, void* source, void* solution, int *status) +{ + double residual; + openQCD_qudaInvertMultiSrc(id, mu, &source, &solution, status, &residual); + return residual; +} + +void openQCD_qudaInvertMultiSrc(int id, double mu, void** sources, void** solutions, int *status, double *residual) +{ + if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); } + + /** + * This is to make sure we behave in the same way as openQCDs solvers, we call + * h_sw() which in turn calls sw_term(). We have to make sure that the SW-term + * in openQxD is setup and in sync with QUDAs. + */ + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + WITH_COMM(errorQuda("qudaState.layout.h_sw is not set.")); + } + + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync.")); + } + + void **h_sources = (void**) malloc(param->num_src*sizeof(void*)); + void **h_solutions = (void**) malloc(param->num_src*sizeof(void*)); + + for (int i = 0; i < param->num_src; ++i) { + h_sources[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i, sources[i]); + h_solutions[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i+param->num_src, solutions[i]); + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, sources[i], h_sources[i]); + if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, solutions[i], h_solutions[i]); + } + } + + WITH_COMM(logQuda(QUDA_VERBOSE, "Calling invertMultiSrcQuda() ...\n")); + PUSH_RANGE("invertMultiSrcQuda", 5); + WITH_COMM(invertMultiSrcQuda(h_solutions, h_sources, param)); + POP_RANGE; + + for (int i = 0; i < param->num_src; ++i) { + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, h_solutions[i], solutions[i]); + status[i] = param->true_res[i] <= param->tol ? param->iter : -1; + residual[i] = param->true_res[i]; + } + + if (!qudaState.init.two_grids_equal) { + MPI_Bcast(status, param->num_src, MPI_INT, 0, qudaState.layout.world_comm); + MPI_Bcast(residual, param->num_src, MPI_DOUBLE, 0, qudaState.layout.world_comm); + } + + WITH_COMM(logQuda(QUDA_VERBOSE, "openQCD_qudaInvertMultiSrc()\n")); + WITH_COMM(logQuda(QUDA_VERBOSE, " iter = %d\n", param->iter)); + WITH_COMM(logQuda(QUDA_VERBOSE, " gflops = %.2e\n", param->gflops)); + WITH_COMM(logQuda(QUDA_VERBOSE, " secs = %.2e\n", param->secs)); + for (int i = 0; i < param->num_src; ++i) { + WITH_COMM(logQuda(QUDA_VERBOSE, " true_res[%d] = %.2e\n", i, param->true_res[i])); + WITH_COMM(logQuda(QUDA_VERBOSE, " true_res_hq[%d] = %.2e\n", i, param->true_res_hq[i])); + WITH_COMM(logQuda(QUDA_VERBOSE, " status[%d] = %d\n", i, status[i])); + } + + free(h_sources); + free(h_solutions); +} + +static void *openQCD_qudaInvertAsyncWrapper(void*) +{ + /* enable the thread to use QUDA */ + WITH_COMM(device::init_thread()); + + /* split the world_comm communicator, then signal the main threads to continue */ + pthread_mutex_lock(&qudaState.thread.mutex); + MPI_Comm_split(qudaState.layout.world_comm, 0, 0, &qudaState.thread.worker_comm); + qudaState.thread.ready = true; + pthread_cond_signal(&qudaState.thread.cond); + pthread_mutex_unlock(&qudaState.thread.mutex); + + int Ns = qudaState.async_params.num_src; /* num sources */ + int Nd = qudaState.inv_args.size(); /* num dispatched */ + int Nc = Nd > Ns ? ((Ns+Nd-1)/Ns) : 1; /* num calls */ + + void **h_sources = (void**) malloc(Ns*sizeof(void*)); + void **h_solutions = (void**) malloc(Ns*sizeof(void*)); + + for (int c = 0; c < Nc; ++c) { + qudaState.async_params.num_src = (c+1==Nc && Nc>1 && Nd % Ns != 0) ? (Nd % Ns) : min(Nd, Ns); + + for (int s = 0; s < qudaState.async_params.num_src; ++s) { + int i = c*Ns + s; + h_sources[s] = qudaState.init.buffer_field(qudaState.thread.worker_comm, s, qudaState.inv_args[i].source); + h_solutions[s] = qudaState.init.buffer_field(qudaState.thread.worker_comm, s+qudaState.async_params.num_src, qudaState.inv_args[i].solution); + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, qudaState.inv_args[i].source, h_sources[s]); + if (qudaState.async_params.use_init_guess == QUDA_USE_INIT_GUESS_YES) { + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, qudaState.inv_args[i].solution, h_solutions[s]); + } + } + + WITH_COMM(logQuda(QUDA_VERBOSE, "Calling invertMultiSrcQuda() ...\n")); + PUSH_RANGE("invertMultiSrcQuda", 5); + WITH_COMM(invertMultiSrcQuda(h_solutions, h_sources, &qudaState.async_params)); + POP_RANGE; + + for (int s = 0; s < qudaState.async_params.num_src; ++s) { + int i = c*Ns + s; + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, h_solutions[s], qudaState.inv_args[i].solution); + + if (in_comm()) { + *(qudaState.inv_args[i].status) = qudaState.async_params.true_res[s] <= qudaState.async_params.tol ? qudaState.async_params.iter : -1; + qudaState.inv_args[i].retval = qudaState.async_params.true_res[s]; + } + } + + WITH_COMM(logQuda(QUDA_VERBOSE, "openQCD_qudaInvertAsync...()\n")); + WITH_COMM(logQuda(QUDA_VERBOSE, " iter = %d\n", qudaState.async_params.iter)); + WITH_COMM(logQuda(QUDA_VERBOSE, " gflops = %.2e\n", qudaState.async_params.gflops)); + WITH_COMM(logQuda(QUDA_VERBOSE, " secs = %.2e\n", qudaState.async_params.secs)); + for (int s = 0; s < qudaState.async_params.num_src; ++s) { + int i = c*Ns + s; + WITH_COMM(logQuda(QUDA_VERBOSE, " true_res[%d] = %.2e\n", s, qudaState.async_params.true_res[s])); + WITH_COMM(logQuda(QUDA_VERBOSE, " true_res_hq[%d] = %.2e\n", s, qudaState.async_params.true_res_hq[s])); + WITH_COMM(logQuda(QUDA_VERBOSE, " status[%d] = %d\n", s, *(qudaState.inv_args[i].status))); + + MPI_Bcast(qudaState.inv_args[i].status, 1, MPI_INT, 0, qudaState.thread.worker_comm); + MPI_Bcast((void *)&qudaState.inv_args[i].retval, 1, MPI_DOUBLE, 0, qudaState.thread.worker_comm); + } + } + free(h_sources); + free(h_solutions); + MPI_Comm_free(&qudaState.thread.worker_comm); + + return nullptr; +} + +void check_mpi_init(void) +{ + int flag; + MPI_Query_thread(&flag); + if (flag < MPI_THREAD_MULTIPLE) + WITH_COMM(errorQuda("MPI was not initialized with thread support. " + "Initialize MPI with quda_mpi_init instead of MPI_Init.")); +} + +void openQCD_qudaInvertAsyncSetup(int id, double mu) +{ + check_mpi_init(); + if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); } + + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + WITH_COMM(errorQuda("qudaState.layout.h_sw is not set.")); + } + + qudaState.async_params = *static_cast(openQCD_qudaSolverGetHandle(id)); + qudaState.async_params.mu = mu; + + if (!openQCD_qudaInvertParamCheck(&qudaState.async_params)) { + WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync.")); + } +} + +void openQCD_qudaInvertAsyncDispatch(void *source, void *solution, int *status) +{ + check_mpi_init(); + + openQCD_qudaInvert_args_t args; + args.source = source; + args.solution = solution; + args.status = status; + + qudaState.inv_args.push_back(args); +} + +MPI_Comm openQCD_qudaInvertAsyncStart(void) +{ + check_mpi_init(); + + if (qudaState.thread.created == true) + WITH_COMM(errorQuda("Thread already created")); + + MPI_Comm_split(qudaState.layout.world_comm, 1, 0, &qudaState.thread.main_comm); + + int rc = pthread_create(&qudaState.thread.thread, nullptr, openQCD_qudaInvertAsyncWrapper, nullptr); + if (rc != 0) { + perror("Error in openQCD_qudaInvertStart"); + WITH_COMM(errorQuda("pthread_create failed")); + } + + /* wait for the worker threads to split the world_comm communicator properly */ + pthread_mutex_lock(&qudaState.thread.mutex); + while (!qudaState.thread.ready) { + pthread_cond_wait(&qudaState.thread.cond, &qudaState.thread.mutex); + } + pthread_mutex_unlock(&qudaState.thread.mutex); + + qudaState.thread.created = true; + return qudaState.thread.main_comm; +} + +void openQCD_qudaInvertAsyncWait(double *residual) +{ + check_mpi_init(); + + if (!qudaState.thread.created) + WITH_COMM(errorQuda("openQCD_qudaInvertStart not called")); + + int rc = pthread_join(qudaState.thread.thread, nullptr); + if (rc != 0) { + perror("Error in openQCD_qudaInvertWait"); + WITH_COMM(errorQuda("pthread_join failed")); + } + + /* reset the thread state properly */ + MPI_Comm_free(&qudaState.thread.main_comm); + qudaState.thread.created = false; + qudaState.thread.ready = false; + qudaState.thread.cond = PTHREAD_COND_INITIALIZER; + qudaState.thread.mutex = PTHREAD_MUTEX_INITIALIZER; + qudaState.thread.worker_comm = MPI_COMM_NULL; + qudaState.thread.main_comm = MPI_COMM_NULL; + + if (residual != nullptr) { + int i = 0; + for (openQCD_qudaInvert_args_t args : qudaState.inv_args) { + residual[i] = args.retval; + i++; + } + } + + qudaState.inv_args.clear(); +} + + +void openQCD_qudaSolverDestroy(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); } + + delete static_cast(param->additional_prop)->mg_param; + delete static_cast(param->additional_prop); + delete param; + qudaState.inv_handles[id] = nullptr; + } +} + +void *openQCD_qudaEigensolverReadIn(int id, int solver_id) +{ + int my_rank; + QudaEigParam *param; + + MPI_Comm_rank(qudaState.layout.world_comm, &my_rank); + QudaVerbosity verbosity = QUDA_SUMMARIZE; + + /* Allocate on the heap */ + if (qudaState.eig_handles[id] == nullptr) { + param = new QudaEigParam(newQudaEigParam()); + } else { + param = static_cast(qudaState.eig_handles[id]); + } + + if (my_rank == 0) { + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + + std::string section = "Eigensolver " + std::to_string(id); + + verbosity = kv.get(section, "verbosity", verbosity); + + if (verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } + + if (kv.get(section, "solver") != "QUDA") { + WITH_COMM(errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n", + section.c_str(), qudaState.infile, kv.get(section, "solver").c_str())); + } + + param->eig_type = kv.get(section, "eig_type", param->eig_type); + param->use_poly_acc = kv.get(section, "use_poly_acc", param->use_poly_acc); + param->poly_deg = kv.get(section, "poly_deg", param->poly_deg); + param->a_min = kv.get(section, "a_min", param->a_min); + param->a_max = kv.get(section, "a_max", param->a_max); + param->preserve_deflation = kv.get(section, "preserve_deflation", param->preserve_deflation); + /*param->*preserve_deflation_space = kv.get(section, *"*preserve_deflation_space", param->preserve_deflation_space);*/ + param->preserve_evals = kv.get(section, "preserve_evals", param->preserve_evals); + param->use_dagger = kv.get(section, "use_dagger", param->use_dagger); + param->use_norm_op = kv.get(section, "use_norm_op", param->use_norm_op); + param->use_pc = kv.get(section, "use_pc", param->use_pc); + param->use_eigen_qr = kv.get(section, "use_eigen_qr", param->use_eigen_qr); + param->compute_svd = kv.get(section, "compute_svd", param->compute_svd); + param->compute_gamma5 = kv.get(section, "compute_gamma5", param->compute_gamma5); + param->require_convergence = kv.get(section, "require_convergence", param->require_convergence); + param->spectrum = kv.get(section, "spectrum", param->spectrum); + param->n_ev = kv.get(section, "n_ev", param->n_ev); + param->n_kr = kv.get(section, "n_kr", param->n_kr); + param->n_conv = kv.get(section, "n_conv", param->n_conv); + param->n_ev_deflate = kv.get(section, "n_ev_deflate", param->n_ev_deflate); + param->tol = kv.get(section, "tol", param->tol); + param->qr_tol = kv.get(section, "qr_tol", param->qr_tol); + param->check_interval = kv.get(section, "check_interval", param->check_interval); + param->max_restarts = kv.get(section, "max_restarts", param->max_restarts); + param->batched_rotate = kv.get(section, "batched_rotate", param->batched_rotate); + param->block_size = kv.get(section, "block_size", param->block_size); + param->arpack_check = kv.get(section, "arpack_check", param->arpack_check); + strcpy(param->QUDA_logfile, kv.get(section, "QUDA_logfile", param->QUDA_logfile).c_str()); + strcpy(param->arpack_logfile, kv.get(section, "arpack_logfile", param->arpack_logfile).c_str()); + + param->nk = kv.get(section, "nk", param->nk); + param->np = kv.get(section, "np", param->np); + param->import_vectors = kv.get(section, "import_vectors", param->import_vectors); + param->cuda_prec_ritz = kv.get(section, "cuda_prec_ritz", param->cuda_prec_ritz); + param->mem_type_ritz = kv.get(section, "mem_type_ritz", param->mem_type_ritz); + param->location = kv.get(section, "location", param->location); + param->run_verify = kv.get(section, "run_verify", param->run_verify); + /*strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str());*/ + /*strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str());*/ + param->vec_outfile[0] = '\0'; + param->vec_infile[0] = '\0'; + param->save_prec = kv.get(section, "save_prec", param->save_prec); + param->io_parity_inflate = kv.get(section, "io_parity_inflate", param->io_parity_inflate); + param->extlib_type = kv.get(section, "extlib_type", param->extlib_type); + } + + /* transfer of the struct to all the processes */ + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, qudaState.layout.world_comm); + + void *inv_param = openQCD_qudaSolverGetHandle(solver_id); + param->invert_param = static_cast(inv_param); + + param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); + + if (solver_id != -1 && param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { + WITH_COMM(printQudaInvertParam(param->invert_param)); + } + + if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { WITH_COMM(printQudaEigParam(param)); } + + return (void *)param; +} + +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + + if (qudaState.eig_handles[id] == nullptr) { + WITH_COMM(logQuda(QUDA_VERBOSE, "Read in eigensolver parameters from file %s for eigensolver (id=%d)\n", qudaState.infile, id)); + qudaState.eig_handles[id] = openQCD_qudaEigensolverReadIn(id, solver_id); + } + + openQCD_qudaSolverUpdate(static_cast(qudaState.eig_handles[id])->invert_param); + return qudaState.eig_handles[id]; +} + +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + if (!in_comm()) { return; } + + if (qudaState.eig_handles[id] != nullptr) { + QudaEigParam *param = static_cast(qudaState.eig_handles[id]); + printQudaEigParam(param); + printfQuda("\n"); + openQCD_qudaSolverPrintSetup(solver_id); + } else { + printfQuda("\n"); + } +} + +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals) +{ + if (gauge_field_get_unset()) { WITH_COMM(errorQuda("Gauge field not populated in openQxD.")); } + + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + WITH_COMM(errorQuda("qudaState.layout.h_sw is not set.")); + } + + QudaEigParam *eig_param = static_cast(openQCD_qudaEigensolverGetHandle(id, solver_id)); + + if (!openQCD_qudaInvertParamCheck(eig_param->invert_param)) { + WITH_COMM(errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync.")); + } + + int N = eig_param->n_conv*(eig_param->compute_svd ? 2 : 1); + void **evecs = (void**) malloc(N*sizeof(void*)); + for (int i = 0; i < N; ++i) { + evecs[i] = qudaState.init.buffer_field(qudaState.layout.world_comm, i, h_evecs[i]); + qudaState.layout.openqcd2quda(OPENQCD_FIELD_SPINOR, h_evecs[i], evecs[i]); + } + + WITH_COMM(logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n")); + PUSH_RANGE("eigensolveQuda", 6); + WITH_COMM(eigensolveQuda(evecs, static_cast(h_evals), eig_param)); + POP_RANGE; + + WITH_COMM(logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n")); + WITH_COMM(logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->invert_param->gflops)); + WITH_COMM(logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->invert_param->secs)); + WITH_COMM(logQuda(QUDA_SUMMARIZE, " iter = %d\n", eig_param->invert_param->iter)); + + if (!qudaState.init.two_grids_equal) + MPI_Bcast(h_evals, 2*N, MPI_DOUBLE, 0, qudaState.layout.world_comm); + + for (int i = 0; i < N; ++i) { + qudaState.layout.quda2openqcd(OPENQCD_FIELD_SPINOR, evecs[i], h_evecs[i]); + } + + free(evecs); +} + +void openQCD_qudaEigensolverDestroy(int id) +{ + check_eigensolver_id(id); + + if (qudaState.eig_handles[id] != nullptr) { + QudaEigParam *eig_param = static_cast(qudaState.eig_handles[id]); + + delete eig_param; + qudaState.eig_handles[id] = nullptr; + } +} + +bool openQCD_qudaQCDPlusQEDEnabled(void) { +#ifdef BUILD_QCD_PLUS_QED + return true; +#else + return false; +#endif +} + +#undef WITH_COMM diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 2f52d56aae..494d77f136 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -368,6 +368,9 @@ module quda_fortran ! The t0 parameter for distance preconditioning, the timeslice where the source is located integer(4) distance_pc_t0 + ! Additional user-defined properties + integer(8) :: additional_prop + end type quda_invert_param end module quda_fortran diff --git a/lib/targets/cuda/comm_target.cpp b/lib/targets/cuda/comm_target.cpp index bd9fa1fd3e..89bc7f2a38 100644 --- a/lib/targets/cuda/comm_target.cpp +++ b/lib/targets/cuda/comm_target.cpp @@ -75,8 +75,11 @@ namespace quda // TODO: We maybe can force loopback comms to use the IB path here if (comm_dim(dim) == 1) continue; #endif - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir + = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? + 1 : + 2; for (int dir = 0; dir < num_dir; dir++) { remote[dim][dir] = nullptr; #ifndef NVSHMEM_COMMS diff --git a/lib/targets/hip/comm_target.cpp b/lib/targets/hip/comm_target.cpp index cbb4725253..30812d4440 100644 --- a/lib/targets/hip/comm_target.cpp +++ b/lib/targets/hip/comm_target.cpp @@ -69,9 +69,11 @@ namespace quda // open the remote memory handles and set the send ghost pointers for (int dim = 0; dim < 4; ++dim) { if (comm_dim(dim) == 1) continue; - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir - = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) + && comm_peer2peer_enabled(1, dim)) ? + 1 : + 2; for (int dir = 0; dir < num_dir; ++dir) { remote[dim][dir] = nullptr; if (!comm_peer2peer_enabled(dir, dim)) continue; diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 3f04cfcd9c..7e2bbd8078 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -121,11 +121,7 @@ void init(int argc, char **argv) { // Set QUDA's internal parameters gauge_param = newQudaGaugeParam(); - if (use_split_gauge_bkup == 1) { - gauge_param.use_split_gauge_bkup = true; - } else { - gauge_param.use_split_gauge_bkup = false; - } + gauge_param.use_split_gauge_bkup = use_split_gauge_bkup == 1; setWilsonGaugeParam(gauge_param); inv_param = newQudaInvertParam(); diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 1f5c7c726e..7e10bc8b6f 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -140,11 +140,7 @@ void init() { // Set QUDA internal parameters gauge_param = newQudaGaugeParam(); - if (use_split_gauge_bkup == 1) { - gauge_param.use_split_gauge_bkup = true; - } else { - gauge_param.use_split_gauge_bkup = false; - } + gauge_param.use_split_gauge_bkup = use_split_gauge_bkup == 1; setStaggeredGaugeParam(gauge_param); QudaGaugeSmearParam smear_param; if (gauge_smear) {