diff --git a/.gitignore b/.gitignore
index a60ac7da48..9f7d2843b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,18 @@
 *.o
+*.d
 *.f90
 *.mod
 *.a
 *~
 tests/*_test
+tests/*_ctest
+make.inc
 milc_interface/*
 *#*
 *.pyc
 tunecache.tsv
 profile.tsv
+profile_*.tsv
 config.log
 CMakeCache.txt
 CMakeFiles
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 803f5dba41..2ffbd8994c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,11 +72,11 @@ else()
   set(DEFTARGET "CUDA")
 endif()
 
-set(VALID_TARGET_TYPES CUDA HIP SYCL)
+set(VALID_TARGET_TYPES CUDA HIP SYCL OMPTARGET)
 set(QUDA_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
-set_property(CACHE QUDA_TARGET_TYPE PROPERTY STRINGS CUDA HIP SYCL)
+set_property(CACHE QUDA_TARGET_TYPE PROPERTY STRINGS CUDA HIP SYCL OMPTARGET)
 
 string(TOUPPER ${QUDA_TARGET_TYPE} CHECK_TARGET_TYPE)
 list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
@@ -240,10 +240,10 @@ option(QUDA_ALTERNATIVE_I_TO_F "enable using alternative integer-to-float conver
 option(QUDA_OPENMP "enable OpenMP" OFF)
 
 set(QUDA_CXX_STANDARD
-    17
-    CACHE STRING "set the CXX Standard (14 or 17)")
+    20
+    CACHE STRING "set the CXX Standard (14, 17, 20)")
 
-set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 14 17)
+set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 14 17 20)
 
 option(QUDA_BACKWARDS "Enable stacktrace generation using backwards-cpp")
diff --git a/include/array.h b/include/array.h
index 3005087c85..e243043317 100644
--- a/include/array.h
+++ b/include/array.h
@@ -17,13 +17,6 @@ namespace quda
     constexpr T &operator[](int i) { return data[i]; }
     constexpr const T &operator[](int i) const { return data[i]; }
     constexpr int size() const { return n; }
-
-    array() = default;
-    array(const array &) = default;
-    array(array &&) = default;
-
-    array &operator=(const array &) = default;
-    array &operator=(array &&) = default;
   };
 
   template std::ostream &operator<<(std::ostream &output, const array &a)
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 4341cf58d9..8c3567308b 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -1,5 +1,6 @@
 #pragma once
 #include
+#include
 #include
 #include
 #include
diff --git a/include/communicator_quda.h b/include/communicator_quda.h
index aec02b8c2a..3365097027 100644
--- a/include/communicator_quda.h
+++ b/include/communicator_quda.h
@@ -544,7 +544,7 @@ namespace quda
     if (gpuid < 0) {
       int device_count = device::get_device_count();
-      if (device_count == 0) { errorQuda("No devices found"); }
+      if (device_count == 0) { warningQuda("No devices found"); }
       // We initialize gpuid if it's still negative.
       gpuid = 0;
@@ -558,7 +558,7 @@ namespace quda
         gpuid = gpuid % device_count;
         printf("MPS enabled, rank=%3d -> gpu=%d\n", comm_rank(), gpuid);
       } else {
-        errorQuda("Too few GPUs available on %s", comm_hostname());
+        warningQuda("Too few GPUs available on %s", comm_hostname());
       }
     }
   } // -ve gpuid
diff --git a/include/complex_quda.h b/include/complex_quda.h
index 18da63def5..369598ea52 100644
--- a/include/complex_quda.h
+++ b/include/complex_quda.h
@@ -27,6 +27,8 @@
 #include
 #include // for double2 / float2
+#include
+
 namespace quda {
   namespace gauge {
     template struct fieldorder_wrapper;
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index df176549fc..26bbdfb82b 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -12,7 +12,7 @@
 #include
 #include
-#if defined(_NVHPC_CUDA)
+#if defined(_NVHPC_CUDA) || defined(QUDA_TARGET_OMPTARGET)
 #include
 constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE;
 #else
diff --git a/include/kernel_helper.h b/include/kernel_helper.h
index dcb33baba0..2183c1c28d 100644
--- a/include/kernel_helper.h
+++ b/include/kernel_helper.h
@@ -5,6 +5,17 @@
 namespace quda
 {
+  enum ThreadsSync {
+    ThreadsSyncNo = 0,
+    ThreadsSyncX = 1,
+    ThreadsSyncY = 2,
+    ThreadsSyncXY = 3,
+    ThreadsSyncZ = 4,
+    ThreadsSyncXZ = 5,
+    ThreadsSyncYZ = 6,
+    ThreadsSyncAll = 7
+  };
+
   struct kernel_t {
     const void *func;
     const std::string name;
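The ThreadsSync values added to kernel_helper.h above form a bit mask over the block dimensions (ThreadsSyncXY == ThreadsSyncX | ThreadsSyncY, ThreadsSyncAll sets all three bits), and the kernel argument structs touched in the hunks below advertise their synchronization needs through a static requires_threads_sync member. This diff does not show how the OpenMP-target launcher consumes the flag; the following is only a minimal sketch, with a hypothetical needs_sync_x trait that is not part of the patch, of how such a mask could be queried at compile time.

#include <type_traits>

// Hypothetical trait (illustration only): true when an Arg type declares
// requires_threads_sync and the x-dimension bit is set.
template <typename Arg, typename = void> struct needs_sync_x : std::false_type { };

template <typename Arg>
struct needs_sync_x<Arg, std::void_t<decltype(Arg::requires_threads_sync)>>
  : std::bool_constant<(Arg::requires_threads_sync & quda::ThreadsSyncX) != 0> { };

// A launcher could then branch on needs_sync_x<Arg>::value to decide whether
// the OpenMP threads of a team must be kept in lockstep along x.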
diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index 03f9d4b75e..f9bbc7247f 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1029,6 +1029,7 @@ namespace quda {
   template __device__ __host__ inline int virtualThreadIdx(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int warp_id = threadIdx.x / device::warp_size();
     int warp_lane = threadIdx.x % device::warp_size();
     int tx = warp_id * (device::warp_size() / arg.aggregates_per_block) + warp_lane / arg.aggregates_per_block;
@@ -1037,12 +1038,14 @@
   template __device__ __host__ inline int virtualBlockDim(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int block_dim_x = blockDim.x / arg.aggregates_per_block;
     return block_dim_x;
   }
 
   template __device__ __host__ inline int coarseIndex(const Arg &arg)
   {
+    QUDA_RT_CONSTS;
     int warp_lane = threadIdx.x % device::warp_size();
     int x_coarse = (arg.coarse_color_wave ? blockIdx.y : blockIdx.x) * arg.aggregates_per_block + warp_lane % arg.aggregates_per_block;
     return x_coarse;
@@ -1396,6 +1399,7 @@
     template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, int parity, const Pack &pack, const Arg &arg)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
@@ -1638,6 +1642,7 @@
     template __device__ inline void operator()(int &parity_coarse, int &x_coarse_cb, int &parity, int &x_cb, int &parity_c_row, int &c_row, int &c_col, const Arg &arg)
     {
+      QUDA_RT_CONSTS;
       if (arg.coarse_color_wave) {
         int parity_c_row_block_idx_z = blockDim.y*blockIdx.x + threadIdx.y;
         int c_row_block_idx_z = arg.parity_flip ? (parity_c_row_block_idx_z % arg.coarse_color_grid_z ) : (parity_c_row_block_idx_z / 2); // coarse color row index
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index b3637e1652..b182219a1b 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -64,6 +64,7 @@ namespace quda {
   template struct PackGhostArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncX;
     static constexpr bool block_float = sizeof(store_t) == QUDA_SINGLE_PRECISION && isFixed::value;
 
     // ensure we only compile supported block-float kernels
diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 18dd1da1a5..f2a9847625 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -31,6 +31,8 @@ namespace quda {
   template struct DslashCoarseArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
+
     static constexpr bool dslash = dslash_;
     static constexpr bool clover = clover_;
     static constexpr bool dagger = dagger_;
@@ -364,11 +366,18 @@
       if (doBulk() && Arg::clover && dir==0 && dim==0) applyClover(out, arg, x_cb, src_idx, parity, s, color_block, color_offset);
 
+#ifdef QUDA_TARGET_OMPTARGET
+      // reduce down to the first group of column-split threads, do it for every thread in OpenMP.
+      out = warp_combine(out);
+#endif
+
       if (dir==0 && dim==0) {
         const int my_spinor_parity = (arg.nParity == 2) ? parity : 0;
 
+#ifndef QUDA_TARGET_OMPTARGET
         // reduce down to the first group of column-split threads
         out = warp_combine(out);
+#endif
 
 #pragma unroll
         for (int color_local=0; color_local
 __device__ __host__ inline Vector d5(const Arg &arg, const Vector &in, int parity, int x_cb, int s, int src_idx)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       constexpr bool is_variable = true;
@@ -378,6 +379,7 @@
     __device__ __host__ inline Vector constantInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       const auto k = arg.kappa;
       const auto inv = arg.inv;
@@ -436,6 +438,7 @@
     __device__ __host__ inline Vector variableInv(const Arg &arg, const Vector &in, int parity, int x_cb, int s_, int src_idx)
     {
+      QUDA_RT_CONSTS;
       constexpr int nSpin = 4;
       using real = typename Arg::real;
       typedef ColorSpinor HalfVector;
diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh
index c46ffa1d62..7106fa70ec 100644
--- a/include/kernels/dslash_mobius_eofa.cuh
+++ b/include/kernels/dslash_mobius_eofa.cuh
@@ -106,6 +106,7 @@
     __device__ __host__ inline void operator()(int x_cb, int src_s, int parity)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       typedef ColorSpinor Vector;
@@ -186,6 +187,7 @@
     __device__ __host__ inline void operator()(int x_cb, int src_s, int parity)
     {
+      QUDA_RT_CONSTS;
       using real = typename Arg::real;
       typedef ColorSpinor Vector;
diff --git a/include/kernels/gauge_fix_ovr.cuh b/include/kernels/gauge_fix_ovr.cuh
index c43649c15e..4707275586 100644
--- a/include/kernels/gauge_fix_ovr.cuh
+++ b/include/kernels/gauge_fix_ovr.cuh
@@ -101,6 +101,7 @@ namespace quda {
    */
   template struct GaugeFixArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
     using real = typename mapper::type;
     static constexpr int gauge_dir = gauge_dir_;
     static constexpr bool halo = halo_;
diff --git a/include/kernels/momentum.cuh b/include/kernels/momentum.cuh
index 41f8bc929c..be3f79bbd5 100644
--- a/include/kernels/momentum.cuh
+++ b/include/kernels/momentum.cuh
@@ -90,6 +90,17 @@ namespace quda {
     constexpr MomUpdate(const Arg &arg) : arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
+#ifdef QUDA_TARGET_OMPTARGET
+    static reduce_t reduce_omp(const reduce_t &a, const reduce_t &b)
+    {
+      auto c = a;
+      if (b[0] > a[0]) c[0] = b[0];
+      if (b[1] > a[1]) c[1] = b[1];
+      return c;
+    }
+    static reduce_t init_omp() { return reduce_t(); } // see UpdateMomArg::init().
+#endif
+
     // calculate the momentum contribution to the action. This uses the
     // MILC convention where we subtract 4.0 from each matrix norm in
     // order to increase stability
diff --git a/include/kernels/multi_blas_core.cuh b/include/kernels/multi_blas_core.cuh
index bc49a2335b..e05402b46a 100644
--- a/include/kernels/multi_blas_core.cuh
+++ b/include/kernels/multi_blas_core.cuh
@@ -35,6 +35,7 @@ namespace quda
   struct MultiBlasArg : kernel_param<>, SpinorXZ, SpinorYW(), store_t, N, y_store_t, Ny, Functor_::use_w> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncAll;
     using real = real_;
     using Functor = Functor_;
     static constexpr int warp_split = warp_split_;
diff --git a/include/kernels/random_init.cuh b/include/kernels/random_init.cuh
index ef0eb93d47..13a37afd2e 100644
--- a/include/kernels/random_init.cuh
+++ b/include/kernels/random_init.cuh
@@ -9,6 +9,7 @@ namespace quda {
   struct rngArg : kernel_param<> {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncNo;
     int commCoord[QUDA_MAX_DIM];
     int X[QUDA_MAX_DIM];
     int X_global[QUDA_MAX_DIM];
diff --git a/include/kernels/reduce_init.cuh b/include/kernels/reduce_init.cuh
index f5a130a385..1915e24188 100644
--- a/include/kernels/reduce_init.cuh
+++ b/include/kernels/reduce_init.cuh
@@ -6,6 +6,7 @@ namespace quda {
   namespace reducer {
     template struct init_arg : kernel_param<> {
+      static constexpr ThreadsSync requires_threads_sync = ThreadsSyncNo;
       using T = T_;
       T *count;
       init_arg(T *count, int n_reduce) :
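The static reduce_omp/init_omp hooks added to MomUpdate in momentum.cuh above provide a combiner and an identity value that an OpenMP-target reduction can plug in where the CUDA path uses warp/block reduction primitives. The wiring inside QUDA's launchers is not part of this hunk; the snippet below is only a self-contained sketch of the general pattern, using a plain std::array and a made-up reduction identifier max2, of how such hooks map onto #pragma omp declare reduction.

#include <array>

using reduce_t = std::array<double, 2>;

// Element-wise max combiner, mirroring the shape of MomUpdate::reduce_omp above.
static reduce_t reduce_max2(const reduce_t &a, const reduce_t &b)
{
  reduce_t c = a;
  if (b[0] > a[0]) c[0] = b[0];
  if (b[1] > a[1]) c[1] = b[1];
  return c;
}

#pragma omp declare reduction(max2 : reduce_t : omp_out = reduce_max2(omp_out, omp_in)) \
    initializer(omp_priv = reduce_t{})

// Example use: per-thread partial maxima are combined with the custom reduction.
double max_of_pairs(const double *p, int n)
{
  reduce_t r{};
  #pragma omp parallel for reduction(max2 : r)
  for (int i = 0; i < n; i++) {
    if (p[2 * i] > r[0]) r[0] = p[2 * i];
    if (p[2 * i + 1] > r[1]) r[1] = p[2 * i + 1];
  }
  return r[0] > r[1] ? r[0] : r[1];
}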
diff --git a/include/quda_api.h b/include/quda_api.h
index e1ec69bbe1..fb14992513 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -5,6 +5,13 @@
 #include
 #include
 
+/* We have to override some CUDA-isms here even for the public interface,
+   otherwise we can't compile the tests.
+ */
+#ifdef QUDA_TARGET_OMPTARGET
+#include "targets/omptarget/quda_api.h"
+#endif
+
 /**
    @file quda_api.h
diff --git a/include/quda_arch.h b/include/quda_arch.h
index 45a8ed34e4..f1558b6c5e 100644
--- a/include/quda_arch.h
+++ b/include/quda_arch.h
@@ -14,6 +14,9 @@
 #elif defined(QUDA_TARGET_SYCL)
 #include
+
+#elif defined(QUDA_TARGET_OMPTARGET)
+#include
 #endif
 
 #ifdef QUDA_OPENMP
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index ea6657120f..390517d073 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -238,6 +238,12 @@ static_assert(QUDA_ORDER_FP_MG == 2 || QUDA_ORDER_FP_MG == 4 || QUDA_ORDER_FP_MG
  */
 #cmakedefine QUDA_TARGET_SYCL @QUDA_TARGET_SYCL@
 
-#if !defined(QUDA_TARGET_CUDA) && !defined(QUDA_TARGET_HIP) && !defined(QUDA_TARGET_SYCL)
+/**
+ * @def QUDA_TARGET_OMPTARGET
+ * @brief This macro is set by CMake if the OMPTARGET Build Target is selected
+ */
+#cmakedefine QUDA_TARGET_OMPTARGET @QUDA_TARGET_OMPTARGET@
+
+#if !defined(QUDA_TARGET_CUDA) && !defined(QUDA_TARGET_HIP) && !defined(QUDA_TARGET_SYCL) && !defined(QUDA_TARGET_OMPTARGET)
 #error "No QUDA_TARGET selected"
 #endif
diff --git a/include/quda_ptr.h b/include/quda_ptr.h
index aab76f6b89..7ee91bf717 100644
--- a/include/quda_ptr.h
+++ b/include/quda_ptr.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include
 #include "malloc_quda.h"
 
 namespace quda
diff --git a/include/targets/omptarget/FFT_Plans.h b/include/targets/omptarget/FFT_Plans.h
new file mode 100644
index 0000000000..baed179446
--- /dev/null
+++ b/include/targets/omptarget/FFT_Plans.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include
+#include
+
+using FFTPlanHandle = int;
+/*
+#include
+
+using FFTPlanHandle = cufftHandle;
+#define FFT_FORWARD CUFFT_FORWARD
+#define FFT_INVERSE CUFFT_INVERSE
+
+#ifndef GPU_GAUGE_ALG
+*/
+#ifdef QUDA_TARGET_OMPTARGET
+#define CUFFT_SAFE_CALL(call)
+
+inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int)
+{
+  errorQuda("unimplemented");
+}
+
+inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, int)
+{
+  errorQuda("unimplemented");
+}
+
+inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision)
+{
+  errorQuda("unimplemented");
+}
+
+inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision)
+{
+  errorQuda("unimplemented");
+}
+
+inline void FFTDestroyPlan(FFTPlanHandle &)
+{
+  errorQuda("unimplemented");
+}
+#else
+
+/*-------------------------------------------------------------------------------*/
+#define CUFFT_SAFE_CALL( call) {                                        \
+    cufftResult err = call;                                             \
+    if ( CUFFT_SUCCESS != err ) {                                       \
+      fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",         \
+              __FILE__, __LINE__);                                      \
+      exit(EXIT_FAILURE);                                               \
+    } }
+/*-------------------------------------------------------------------------------*/
+
+/**
+ * @brief Call CUFFT to perform a single-precision complex-to-complex
+ * transform plan in the transform direction as specified by direction
+ * parameter
+ * @param[in] CUFFT plan
+ * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform
+ * @param[out] data_out, pointer to the complex output data (in GPU memory)
+ * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE
+ */
+inline void ApplyFFT(FFTPlanHandle &plan, float2 *data_in, float2 *data_out, int direction){
+  CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)data_in, (cufftComplex *)data_out, direction));
+}
+
+/**
+ * @brief Call CUFFT to perform a double-precision complex-to-complex transform plan in the transform direction
+as specified by direction parameter
+ * @param[in] CUFFT plan
+ * @param[in] data_in, pointer to the complex input data (in GPU memory) to transform
+ * @param[out] data_out, pointer to the complex output data (in GPU memory)
+ * @param[in] direction, the transform direction: CUFFT_FORWARD or CUFFT_INVERSE
+ */
+inline void ApplyFFT(FFTPlanHandle &plan, double2 *data_in, double2 *data_out, int direction){
+  CUFFT_SAFE_CALL(cufftExecZ2Z(plan, (cufftDoubleComplex *)data_in, (cufftDoubleComplex *)data_out, direction));
+}
+
+/**
+ * @brief Creates a CUFFT plan supporting 4D (1D+3D) data layouts for complex-to-complex
+ * @param[out] plan, CUFFT plan
+ * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt)
+ * @param[in] dim, 1 for 1D plan along the temporal direction with batch size Nx*Ny*Nz, 3 for 3D plan along Nx, Ny and Nz with batch size Nt
+ * @param[in] precision The precision of the computation
+ */
+
+inline void SetPlanFFTMany(FFTPlanHandle &plan, int4 size, int dim, QudaPrecision precision)
+{
+  auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C;
+  switch (dim) {
+  case 1:
+  {
+    int n[1] = { size.w };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 1, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y * size.z));
+  }
+  break;
+  case 3:
+  {
+    int n[3] = { size.x, size.y, size.z };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 3, n, NULL, 1, 0, NULL, 1, 0, type, size.w));
+  }
+  break;
+  }
+}
+
+/**
+ * @brief Creates a CUFFT plan supporting 4D (2D+2D) data layouts for complex-to-complex
+ * @param[out] plan, CUFFT plan
+ * @param[in] size, int4 with lattice size dimensions, (.x,.y,.z,.w) -> (Nx, Ny, Nz, Nt)
+ * @param[in] dim, 0 for 2D plan in Z-T planes with batch size Nx*Ny, 1 for 2D plan in X-Y planes with batch size Nz*Nt
+ * @param[in] precision The precision of the computation
+ */
+inline void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, QudaPrecision precision)
+{
+  auto type = precision == QUDA_DOUBLE_PRECISION ? CUFFT_Z2Z : CUFFT_C2C;
+  switch (dim) {
+  case 0:
+  {
+    int n[2] = { size.w, size.z };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.x * size.y));
+  }
+  break;
+  case 1:
+  {
+    int n[2] = { size.x, size.y };
+    CUFFT_SAFE_CALL(cufftPlanMany(&plan, 2, n, NULL, 1, 0, NULL, 1, 0, type, size.z * size.w));
+  }
+  break;
+  }
+}
+
+inline void FFTDestroyPlan( FFTPlanHandle &plan) {
+  CUFFT_SAFE_CALL(cufftDestroy(plan));
+}
+
+#endif
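For orientation, the cuFFT-backed branch above is used roughly as in the sketch below. This is a hedged usage example assuming the CUDA path with cufft.h available (on the OMPTARGET branch every helper simply calls errorQuda); Nx, Ny, Nz, Nt and the device buffers d_in/d_out are placeholders, not names from this patch.

// Sketch only: batched 2D FFTs over the X-Y planes of an Nx*Ny*Nz*Nt lattice.
FFTPlanHandle plan;
int4 size = make_int4(Nx, Ny, Nz, Nt);                   // lattice extents (placeholders)
SetPlanFFT2DMany(plan, size, 1, QUDA_SINGLE_PRECISION);  // dim==1: X-Y planes, batch Nz*Nt
ApplyFFT(plan, d_in, d_out, CUFFT_FORWARD);              // d_in/d_out: device float2 buffers
FFTDestroyPlan(plan);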
diff --git a/include/targets/omptarget/atomic_helper.h b/include/targets/omptarget/atomic_helper.h
new file mode 100644
index 0000000000..94ee26ba77
--- /dev/null
+++ b/include/targets/omptarget/atomic_helper.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include
+
+/**
+   @file atomic_helper.h
+
+   @section Provides definitions of atomic functions that are used in QUDA.
+ */
+
+namespace quda
+{
+
+  /**
+     @brief atomic_fetch_add function performs similarly to atomic_ref::fetch_add
+     @param[in,out] addr The memory address of the variable we are
+     updating atomically
+     @param[in] val The value we are summing to the value at addr
+   */
+  template __device__ __host__ inline void atomic_fetch_add(T *addr, T val)
+  {
+#pragma omp atomic update
+    *addr += val;
+  }
+
+  template __device__ __host__ inline void atomic_fetch_add(complex *addr, complex val)
+  {
+    atomic_fetch_add(reinterpret_cast(addr) + 0, val.real());
+    atomic_fetch_add(reinterpret_cast(addr) + 1, val.imag());
+  }
+
+  template __device__ __host__ inline void atomic_fetch_add(array *addr, array val)
+  {
+    for (int i = 0; i < n; i++) atomic_fetch_add(&(*addr)[i], val[i]);
+  }
+
+  /**
+     @brief atomic_fetch_max function that does an atomic max.
+     @param[in,out] addr The memory address of the variable we are
+     updating atomically
+     @param[in] val The value we are comparing against. Must be
+     positive valued else result is undefined.
+   */
+  __device__ __host__ inline void atomic_fetch_abs_max(float *addr, float val)
+  {
+#pragma omp atomic compare
+    if(*addr
+  inline T atomic_read(T &x)
+  {
+    T v;
+    #pragma omp atomic read
+    v = x;
+    return v;
+  }
+  template
+  inline array atomic_read(array &x)
+  {
+    array v;
+    for (int i = 0; i < N; ++i)
+      v[i] = atomic_read(x[i]);
+    return v;
+  }
+  template
+  inline complex atomic_read(complex &x)
+  {
+    complex v (atomic_read(x.x), atomic_read(x.y));
+    return v;
+  }
+  template
+  inline deviation_t atomic_read(deviation_t &x)
+  {
+    deviation_t v;
+    v.diff = atomic_read(x.diff);
+    v.ref = atomic_read(x.ref);
+    return v;
+  }
+} // namespace quda
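The atomic_fetch_abs_max helper above leans on the OpenMP 5.1 atomic compare construct. Below is a minimal, self-contained sketch of that conditional-update pattern; it mirrors, but is not verbatim, the helper in this file, and it assumes a compiler with OpenMP 5.1 support.

#include <cmath>

// OpenMP 5.1 atomic compare: update *addr only when val is larger.
inline void atomic_max_sketch(float *addr, float val)
{
#pragma omp atomic compare
  if (*addr < val) *addr = val;
}

// Typical use: many threads race to record a running maximum of |x[i]|.
void abs_max(const float *x, int n, float *m)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++) atomic_max_sketch(m, std::fabs(x[i]));
}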
diff --git a/include/targets/omptarget/block_reduce_helper.h b/include/targets/omptarget/block_reduce_helper.h
new file mode 100644
index 0000000000..02134f0c2e
--- /dev/null
+++ b/include/targets/omptarget/block_reduce_helper.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include
+#include
+
+/**
+   @file block_reduce_helper.h
+
+   @section This file contains the OpenMP target specializations for
+   warp- and block-level reductions
+ */
+
+using namespace quda;
+
+namespace quda
+{
+
+  namespace target
+  {
+    template
+    constexpr bool enough_shared_mem(void)
+    {
+      constexpr auto max_nthr = device::max_block_size();
+      return max_nthr*sizeof(T) <= device::max_shared_memory_size()-sizeof(device::get_shared_cache()[0])*128; // FIXME arbitrary, the number is arbitrary, offset 128 below & in reduce_helper.h:/reduce
+    }
+    /**
+       @brief OpenMP reduction over a group of consecutive threads smaller than omp_num_threads()
+     */
+    template
+    inline T any_reduce_impl(const reducer_t &r, const T &value_, const int batch, const int block_size, const bool all, const bool async)
+    {
+      static_assert(enough_shared_mem(), "Shared cache not large enough for tempStorage");
+      T *storage = (T*)&device::get_shared_cache()[128]; // FIXME arbitrary
+      const int tid = omp_get_thread_num();
+      const auto& v0 = r.init();
+#if 1
+      const auto batch_begin = block_size*batch;
+      const auto batch_end = batch_begin+block_size;
+      auto value = value_;
+      for(int offset=1;offset1 || !async){ // only synchronize if we are not pipelining
+          #pragma omp barrier
+        }
+        storage[tid] = value;
+        const auto j = tid+offset;
+        #pragma omp barrier
+        if(j
+    inline T any_reduce(const R &r, const T &value_, const int batch, const int block_size, const bool all, const bool async)
+    {
+      if constexpr (enough_shared_mem())
+        return any_reduce_impl(r, value_, batch, block_size, all, async);
+      else{
+        using V = typename T::value_type;
+        constexpr auto N = T::N;
+        if constexpr (
+          std::is_same_v> &&
+          std::is_same_v &&
+          std::is_same_v>){
+          // make sure the implementation is still compatible: ../../array.h
+          constexpr auto N0 = N/2;
+          constexpr auto N1 = N-N0;
+          using T0 = array;
+          using T1 = array;
+          const constexpr plus r0 {};
+          const constexpr plus r1 {};
+          auto value = value_;
+          // recurse to myself
+          reinterpret_cast(value[0]) = any_reduce(r0, reinterpret_cast(value_[0]), batch, block_size, all, async);
+          // recurse with async==false
+          reinterpret_cast(value[N0]) = any_reduce(r1, reinterpret_cast(value_[N0]), batch, block_size, all, false);
+          return value;
+        }else
+          static_assert(sizeof(T)==0, "unimplemented reduction"); // let me fail at compile time
+      }
+    }
+  }
+
+  // pre-declaration of warp_reduce that we wish to specialize
+  template struct warp_reduce;
+
+  /**
+     @brief OpenMP target specialization of warp_reduce
+   */
+  template <> struct warp_reduce {
+
+    /**
+       @brief Perform a warp-wide reduction
+       @param[in] value_ thread-local value to be reduced
+       @param[in] all Whether we want all threads to have visibility
+       to the result (all = true) or just the first thread in the
+       warp (all = false)
+       @param[in] r The reduction operation we want to apply
+       @return The warp-wide reduced value
+     */
+    template
+    __device__ inline T operator()(const T &value_, bool all, const reducer_t &r, const param_t &)
+    {
+      constexpr int block_size = device::warp_size();
+      const int batch = omp_get_thread_num() / block_size;
+      return target::any_reduce(r, value_, batch, block_size, all, false);
+    }
+  };
+
+  // pre-declaration of block_reduce that we wish to specialize
+  template struct block_reduce;
+
+  /**
+     @brief OpenMP target specialization of block_reduce
+   */
+  template <> struct block_reduce {
+
+    /**
+       @brief Perform a block-wide reduction
+       @param[in] value_ thread-local value to be reduced
+       @param[in] async Whether this reduction will be performed
+       asynchronously with respect to the calling threads
+       @param[in] batch The batch index of the reduction
+       @param[in] all Whether we want all threads to have visibility
+       to the result (all = true) or just the first thread in the
+       block (all = false)
+       @param[in] r The reduction operation we want to apply
+       @return The block-wide reduced value
+     */
+    template
+    __device__ inline T operator()(const T &value_, bool async, int batch, bool all, const reducer_t &r, const param_t &)
+    {
+      const auto block_size = target::block_size();
+      return target::any_reduce(r, value_, batch, block_size, all, async);
+    }
+  };
+
+} // namespace quda
+
+#include "../generic/block_reduce_helper.h"
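any_reduce_impl above realizes a barrier-based tree reduction over a contiguous batch of OpenMP threads, staging partial values in a shared scratch array. The sketch below shows the same idea stripped of the QUDA type machinery; it assumes it is called from inside a parallel region by every thread of the team, that the team size is a power of two, and that storage is a shared buffer with one slot per thread.

#include <omp.h>

// Tree-reduce one value per OpenMP thread; thread 0 ends up with the full sum.
double team_sum(double value, double *storage)
{
  const int tid = omp_get_thread_num();
  const int nthreads = omp_get_num_threads();
  for (int offset = 1; offset < nthreads; offset *= 2) {
    storage[tid] = value;
    #pragma omp barrier                 // publish this round's partials
    if (tid + offset < nthreads) value += storage[tid + offset];
    #pragma omp barrier                 // keep storage stable until everyone has read
  }
  return value;
}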
diff --git a/include/targets/omptarget/block_reduction_kernel.h b/include/targets/omptarget/block_reduction_kernel.h
new file mode 100644
index 0000000000..d0a8c7f0e1
--- /dev/null
+++ b/include/targets/omptarget/block_reduction_kernel.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace quda
+{
+
+  /**
+     @brief This helper function swizzles the block index through
+     mapping the block index onto a matrix and transposing it. This is
+     done to potentially increase the cache utilization. Requires
+     that the argument class has a member parameter "swizzle" which
+     determines if we are swizzling and a parameter "swizzle_factor"
+     which is the effective matrix dimension that we are transposing in
+     this mapping.
+
+     Specifically, the thread block id is remapped by
+     transposing its coordinates: if the original order can be
+     parameterized by
+
+     blockIdx.x = j * swizzle + i,
+
+     then the new order is
+
+     block_idx = i * (gridDim.x / swizzle) + j
+
+     We need to factor out any remainder and leave this in original
+     ordering.
+
+     @param arg Kernel argument struct
+     @return Swizzled block index
+   */
+  template __device__ constexpr int virtual_block_idx(const Arg &arg)
+  {
+    QUDA_RT_CONSTS;
+    auto block_idx = blockIdx.x;
+    if (arg.swizzle) {
+      // the portion of the grid that is exactly divisible by the number of SMs
+      const auto gridp = gridDim.x - gridDim.x % arg.swizzle_factor;
+
+      if (block_idx < gridp) {
+        // this is the portion of the block that we are going to transpose
+        const int i = blockIdx.x % arg.swizzle_factor;
+        const int j = blockIdx.x / arg.swizzle_factor;
+
+        // transpose the coordinates
+        block_idx = i * (gridp / arg.swizzle_factor) + j;
+      }
+    }
+    return block_idx;
+  }
+
+  /**
+     @brief This class is derived from the arg class that the functor
+     creates and curries in the block size. This allows the block
+     size to be set statically at launch time in the actual argument
+     class that is passed to the kernel.
+
+     @tparam block_size x-dimension block-size
+     @param[in] arg Kernel argument
+   */
+  template struct BlockKernelArg : Arg_ {
+    static constexpr ThreadsSync requires_threads_sync = ThreadsSyncYZ;
+    using Arg = Arg_;
+    static constexpr unsigned int block_size = block_size_;
+    BlockKernelArg(const Arg &arg) : Arg(arg) { }
+  };
+
+  /**
+     @brief BlockKernel2D_impl is the implementation of the Generic
+     block kernel. Here, we split the block (CTA) and thread indices
+     and pass them separately to the transform functor. The x thread
+     dimension is templated (Arg::block_size), e.g., for efficient
+     reductions.
+
+     @tparam Functor Kernel functor that defines the kernel
+     @tparam Arg Kernel argument struct that sets any required meta
+     data for the kernel
+     @param[in] arg Kernel argument
+   */
+  template