diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7826d25cbf..66c42c9c2c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1124,20 +1124,6 @@ foreach(pol IN LISTS DSLASH_POLICIES)
 
 endforeach(pol)
 
-# enable the precisions that are compiled
-math(EXPR double_prec  "${QUDA_PRECISION} & 8")
-math(EXPR single_prec  "${QUDA_PRECISION} & 4")
-math(EXPR half_prec    "${QUDA_PRECISION} & 2")
-math(EXPR quarter_prec "${QUDA_PRECISION} & 1")
-
-if(double_prec AND single_prec)
-  set(TEST_PRECS single double)
-elseif(double_prec)
-  set(TEST_PRECS double)
-elseif(single_prec)
-  set(TEST_PRECS single)
-endif()
-
 # Wilson-type Inversions
 if(QUDA_DIRAC_WILSON)
   add_test(NAME invert_test_wilson
@@ -1301,60 +1287,44 @@ if(QUDA_DIRAC_DOMAIN_WALL)
 endif()
 
 # Staggered-type Inversions
-foreach(prec IN LISTS TEST_PRECS)
+if(QUDA_DIRAC_STAGGERED)
+  # --compute-fat-long true is necessary to get well-behaved fields
 
-  # These require looser tolerances to keep iterations to solution in check
-  if(${prec} STREQUAL "double")
-    set(tol 1e-6)
-  elseif(${prec} STREQUAL "single")
-    set(tol 1e-5)
-  endif()
+  add_test(NAME invert_test_staggered
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
+    --dim 2 4 6 8 --niter 1000 --enable-testing true
+    --gtest_output=xml:invert_test_staggered.xml)
 
-  if(QUDA_DIRAC_STAGGERED)
-    # --compute-fat-long true is necessary to get well-behaved fields
+  add_test(NAME invert_test_splitgrid_staggered
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
+    --dim 2 4 6 8 --niter 1000 --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS}
+    --enable-testing true
+    --gtest_output=xml:invert_test_splitgrid_staggered.xml)
 
-    add_test(NAME invert_test_staggered_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-      --enable-testing true
-      --gtest_output=xml:invert_test_staggered_${prec}.xml)
+  set_tests_properties(invert_test_splitgrid_staggered PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
 
-    add_test(NAME invert_test_splitgrid_staggered_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-      --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS}
-      --enable-testing true
-      --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml)
+  add_test(NAME invert_test_asqtad
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
+    --dim 6 6 6 8 --niter 1000 --enable-testing true --nsrc 4 --nsrc-tile 4
+    --gtest_output=xml:invert_test_asqtad.xml)
 
-    set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+  add_test(NAME invert_test_splitgrid_asqtad
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
+    --dim 6 6 6 8 --niter 1000 --nsrc ${QUDA_TEST_NUM_PROCS} --enable-testing true
+    --gtest_output=xml:invert_test_splitgrid_asqtad.xml)
 
-    add_test(NAME invert_test_asqtad_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
-      --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-      --enable-testing true --nsrc 4 --nsrc-tile 4
-      --gtest_output=xml:invert_test_asqtad_${prec}.xml)
-
-    add_test(NAME invert_test_splitgrid_asqtad_${prec}
+  if (QUDA_DIRAC_LAPLACE)
+    add_test(NAME invert_test_laplace
       COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true
-      --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-      --nsrc ${QUDA_TEST_NUM_PROCS}
-      --enable-testing true
-      --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml)
-
-    if (QUDA_DIRAC_LAPLACE)
-      add_test(NAME invert_test_laplace_${prec}
-        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
-        --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true
-        --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-        --enable-testing true
-        --gtest_output=xml:invert_test_laplace_${prec}.xml)
-    endif()
+      --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true
+      --dim 2 4 6 8 --niter 1000 --enable-testing true
+      --gtest_output=xml:invert_test_laplace.xml)
   endif()
-endforeach(prec)
+endif()
 
 # Distance preconditioning for Wilson/clover
 if (QUDA_DIRAC_DISTANCE_PRECONDITIONING)
@@ -1618,44 +1588,27 @@ add_test(NAME gauge_path
   --dim 2 4 6 8 --enable-testing true --niter 1
   --gtest_output=xml:gauge_path_test.xml)
 
-foreach(prec IN LISTS TEST_PRECS)
-
-  if(QUDA_DIRAC_STAGGERED)
-    add_test(NAME unitarize_link_${prec}
-             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:unitarize_link_test> ${MPIEXEC_POSTFLAGS}
-                     --dim 2 4 6 8 --prec ${prec}
-                     --gtest_output=xml:unitarize_link_test_${prec}.xml)
-
-    add_test(NAME hisq_paths_force_${prec}
-             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_paths_force_test> ${MPIEXEC_POSTFLAGS}
-                     --dim 2 4 6 8 --prec ${prec}
-                     --gtest_output=xml:hisq_paths_force_test_${prec}.xml)
-
-    add_test(NAME hisq_unitarize_force_${prec}
-             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_unitarize_force_test> ${MPIEXEC_POSTFLAGS}
-                     --dim 2 4 6 8 --prec ${prec}
-                     --gtest_output=xml:hisq_unitarize_force_test_${prec}.xml)
-
-    set(KERNEL_TYPE TwoLink GaussianSmear)
-    foreach(kerneltp IN LISTS KERNEL_TYPE)
-      if(${kerneltp} STREQUAL "TwoLink")
-	set(laplace3D 4)
-	set(smear_t0 -1)
-      elseif(${kerneltp} STREQUAL "GaussianSmear")
-	set(laplace3D 3)
-	set(smear_t0 1)
-      endif()	    
-      add_test(NAME staggered_gsmear_${kerneltp}_${prec}
-        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_gsmear_test> ${MPIEXEC_POSTFLAGS}
-        --dim 6 8 10 12 --prec ${prec}
-        --test ${kerneltp}
-	--laplace3D ${laplace3D}
-	--smear-t0 ${smear_t0}
-        --gtest_output=xml:staggered_gsmear_test_${kerneltp}_${prec}.xml)
-    endforeach(kerneltp)
-  endif()
-
-endforeach(prec)
+if(QUDA_DIRAC_STAGGERED)
+  add_test(NAME unitarize_link
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:unitarize_link_test> ${MPIEXEC_POSTFLAGS}
+           --dim 2 4 6 8 --enable-testing true
+           --gtest_output=xml:unitarize_link_test.xml)
+
+  add_test(NAME hisq_unitarize_force
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_unitarize_force_test> ${MPIEXEC_POSTFLAGS}
+                   --dim 2 4 6 8 --enable-testing true
+                   --gtest_output=xml:hisq_unitarize_force_test.xml)
+
+  add_test(NAME hisq_paths_force
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_paths_force_test> ${MPIEXEC_POSTFLAGS}
+                   --dim 2 4 6 8 --enable-testing true
+                   --gtest_output=xml:hisq_paths_force_test.xml)
+
+  add_test(NAME staggered_gsmear
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_gsmear_test> ${MPIEXEC_POSTFLAGS}
+           --dim 6 8 10 12 --enable-testing true
+           --gtest_output=xml:staggered_gsmear_test.xml)
+endif()
 
 add_test(NAME gauge_alg
   COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:gauge_alg_test> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 3e9646b634..3b97b30978 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -7,7 +7,7 @@
 #include <ks_improved_force.h>
 #include <momentum.h>
 #include <timer.h>
-#include <gtest/gtest.h>
+#include <instantiate.h>
 
 #include "host_utils.h"
 #include "momentum_utils.h"
@@ -18,41 +18,10 @@
 
 using namespace quda;
 
-GaugeField *cpuGauge = NULL;
-GaugeField *cudaForce = NULL;
-GaugeField *cpuForce = NULL;
-GaugeField *hostVerifyForce = NULL;
-
-GaugeField *cudaMom = NULL;
-GaugeField *cpuMom = NULL;
-GaugeField *refMom = NULL;
-
 QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
 
-GaugeField *cpuOprod = NULL;
-GaugeField *cudaOprod = NULL;
-GaugeField *cpuLongLinkOprod = NULL;
-GaugeField *cudaLongLinkOprod = NULL;
-
 int ODD_BIT = 1;
 
-QudaPrecision force_prec = QUDA_DOUBLE_PRECISION;
-
-GaugeField *cudaGauge_ex = NULL;
-GaugeField *cpuGauge_ex = NULL;
-GaugeField *cudaForce_ex = NULL;
-GaugeField *cpuForce_ex = NULL;
-GaugeField *cpuOprod_ex = NULL;
-GaugeField *cudaOprod_ex = NULL;
-GaugeField *cpuLongLinkOprod_ex = NULL;
-GaugeField *cudaLongLinkOprod_ex = NULL;
-
-static void setPrecision(QudaPrecision precision)
-{
-  force_prec = precision;
-  return;
-}
-
 /**
   @brief Compute the aggregate bytes and flops for various components within the HISQ force
 
@@ -156,15 +125,32 @@ static lat_dim_t R = {2, 2, 2, 2};
 static lat_dim_t R = {0, 0, 0, 0};
 #endif
 
-// one-time initializations at start of tests
-static void hisq_force_startup()
+using test_t = ::testing::tuple<QudaPrecision, bool>;
+
+class HisqForceTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  QudaPrecision force_prec;
+  bool lepage;
+
+public:
+  HisqForceTest() : force_prec(::testing::get<0>(GetParam())), lepage(::testing::get<1>(GetParam())) { }
+};
+
+int hisq_force_test(QudaPrecision force_prec, bool lepage)
 {
+  if (force_prec != QUDA_DOUBLE_PRECISION && force_prec != QUDA_SINGLE_PRECISION)
+    errorQuda("Invalid precision %d", force_prec);
+  // FIXME: debugging recon 12
+  if (link_recon != QUDA_RECONSTRUCT_NO && link_recon != QUDA_RECONSTRUCT_13)
+    errorQuda("Invalid reconstruct %d", link_recon);
+
   int X[4] = {xdim, ydim, zdim, tdim};
   setDims(X);
 
   setVerbosity(verbosity);
 
-  quda::RNG *rng;
+  quda::RNG rng;
 
   // initialize RNG
   {
@@ -183,7 +169,7 @@ static void hisq_force_startup()
     param.location = QUDA_CPU_FIELD_LOCATION;
     for (int d = 0; d < 4; d++) param.x[d] = X[d];
     quda::ColorSpinorField spinor_in(param);
-    rng = new quda::RNG(spinor_in, 1234);
+    rng = quda::RNG(spinor_in, 1234);
   }
 
   QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();
@@ -192,7 +178,7 @@ static void hisq_force_startup()
   for (int d = 0; d < 4; d++) qudaGaugeParam.X[d] = X[d];
 
   // need to do some thinking for recon
-  qudaGaugeParam.cpu_prec = force_prec;
+  qudaGaugeParam.cpu_prec = cpu_prec;
   qudaGaugeParam.cuda_prec = force_prec;
   qudaGaugeParam.reconstruct = (link_recon == QUDA_RECONSTRUCT_12 ? QUDA_RECONSTRUCT_13 : link_recon);
   qudaGaugeParam.type = QUDA_GENERAL_LINKS;
@@ -216,11 +202,8 @@ static void hisq_force_startup()
 #endif
   qudaGaugeParam_ex.ga_pad = 3 * pad_size; // long links
 
-  GaugeFieldParam gParam_ex;
-  GaugeFieldParam gParam;
-
   // create device gauge field
-  gParam_ex = GaugeFieldParam(qudaGaugeParam_ex);
+  GaugeFieldParam gParam_ex = GaugeFieldParam(qudaGaugeParam_ex);
   gParam_ex.location = QUDA_CUDA_FIELD_LOCATION;
   gParam_ex.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   gParam_ex.create = QUDA_NULL_FIELD_CREATE;
@@ -230,18 +213,18 @@ static void hisq_force_startup()
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = X[d] + 2 * gParam_ex.r[d];
   } // set halo region for GPU
-  cudaGauge_ex = new GaugeField(gParam_ex);
+  GaugeField cudaGauge_ex(gParam_ex);
 
   // Create the host gauge field
   memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
 
-  gParam = GaugeFieldParam(qudaGaugeParam);
+  GaugeFieldParam gParam = GaugeFieldParam(qudaGaugeParam);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuGauge = new GaugeField(gParam);
+  GaugeField cpuGauge(gParam);
 
   gParam_ex = GaugeFieldParam(qudaGaugeParam_ex);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
@@ -253,14 +236,14 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuGauge_ex = new GaugeField(gParam_ex);
+  GaugeField cpuGauge_ex(gParam_ex);
 
   auto generated_link_type
     = (link_recon == QUDA_RECONSTRUCT_NO ?
          SiteLinkType::SITELINK_PHASE_NO :
          (link_recon == QUDA_RECONSTRUCT_13 ? SiteLinkType::SITELINK_PHASE_U1 : SiteLinkType::SITELINK_PHASE_MILC));
-  createSiteLinkCPU(*cpuGauge, qudaGaugeParam.cpu_prec, generated_link_type);
-  copyExtendedGauge(*cpuGauge_ex, *cpuGauge, QUDA_CPU_FIELD_LOCATION);
+  createSiteLinkCPU(cpuGauge, qudaGaugeParam.cpu_prec, generated_link_type);
+  copyExtendedGauge(cpuGauge_ex, cpuGauge, QUDA_CPU_FIELD_LOCATION);
 
   qudaGaugeParam.type = QUDA_GENERAL_LINKS;
   qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
@@ -279,16 +262,16 @@ static void hisq_force_startup()
    **************************/
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
-  gParam.setPrecision(prec);
+  gParam.setPrecision(cpu_prec);
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuForce = new GaugeField(gParam);
-  hostVerifyForce = new GaugeField(gParam);
+  GaugeField cpuForce(gParam);
+  GaugeField hostVerifyForce(gParam);
 
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
   gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
-  gParam_ex.setPrecision(prec);
+  gParam_ex.setPrecision(cpu_prec);
   gParam_ex.create = QUDA_NULL_FIELD_CREATE;
   gParam_ex.link_type = QUDA_GENERAL_LINKS;
   gParam_ex.order = gauge_order;
@@ -296,33 +279,32 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   }
-  cpuForce_ex = new GaugeField(gParam_ex);
+  GaugeField cpuForce_ex(gParam_ex);
 
   // create the momentum matrix
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParam.setPrecision(prec);
+  gParam.setPrecision(cpu_prec);
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.order = QUDA_MILC_GAUGE_ORDER;
   gParam.create = QUDA_NULL_FIELD_CREATE;
-  cpuMom = new GaugeField(gParam);
-  refMom = new GaugeField(gParam);
+  GaugeField cpuMom(gParam);
+  GaugeField refMom(gParam);
 
   /**********************************
    * Create the outer product fields *
    **********************************/
 
   // Create four full-volume random spinor fields
-  void *stag_for_oprod = safe_malloc(4 * cpuGauge->Volume() * stag_spinor_site_size * force_prec);
+  std::vector<char> stag_for_oprod(4 * cpuGauge.Volume() * stag_spinor_site_size * cpu_prec);
 
   // Allocate the outer product fields and populate them with the random spinor fields
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.order = gauge_order;
-  cpuOprod = new GaugeField(gParam);
-  cpuLongLinkOprod = new GaugeField(gParam);
-
+  GaugeField cpuOprod(gParam);
+  GaugeField cpuLongLinkOprod(gParam);
   // Create extended outer product fields
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
   gParam_ex.link_type = QUDA_GENERAL_LINKS;
@@ -332,97 +314,63 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuOprod_ex = new GaugeField(gParam_ex);
-  cpuLongLinkOprod_ex = new GaugeField(gParam_ex);
+  GaugeField cpuOprod_ex(gParam_ex);
+  GaugeField cpuLongLinkOprod_ex(gParam_ex);
 
   // initialize the CPU outer product fields and exchange once
-  createStagForOprodCPU(stag_for_oprod, force_prec, qudaGaugeParam.X, *rng);
-  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuOprod, force_prec, 1);
-  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuLongLinkOprod, force_prec, 3);
+  createStagForOprodCPU(stag_for_oprod.data(), cpu_prec, qudaGaugeParam.X, rng);
+  computeLinkOrderedOuterProduct(stag_for_oprod.data(), cpuOprod, 1);
 
-  copyExtendedGauge(*cpuOprod_ex, *cpuOprod, QUDA_CPU_FIELD_LOCATION);
-  copyExtendedGauge(*cpuLongLinkOprod_ex, *cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
+  computeLinkOrderedOuterProduct(stag_for_oprod.data(), cpuLongLinkOprod, 3);
 
-  // free the initial spinor field
-  host_free(stag_for_oprod);
+  copyExtendedGauge(cpuOprod_ex, cpuOprod, QUDA_CPU_FIELD_LOCATION);
+  copyExtendedGauge(cpuLongLinkOprod_ex, cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
 
   /**************************
    * Create remaining fields *
    ***************************/
   gParam_ex.location = QUDA_CUDA_FIELD_LOCATION;
   gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
-  gParam_ex.setPrecision(prec, true);
+  gParam_ex.setPrecision(force_prec, true);
   for (int d = 0; d < 4; d++) {
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region
-  cudaForce_ex = new GaugeField(gParam_ex);
-  cudaOprod_ex = new GaugeField(gParam_ex);
-  cudaLongLinkOprod_ex = new GaugeField(gParam_ex);
+  GaugeField cudaForce_ex(gParam_ex);
+  GaugeField cudaOprod_ex(gParam_ex);
+  GaugeField cudaLongLinkOprod_ex(gParam_ex);
 
   // create a device force for verify
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
-  gParam.setPrecision(prec, true);
-  cudaForce = new GaugeField(gParam);
+  gParam.setPrecision(force_prec, true);
+  GaugeField cudaForce(gParam);
 
   // create the device momentum field
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParam.setPrecision(prec, true);
-  cudaMom = new GaugeField(gParam);
+  gParam.setPrecision(force_prec, true);
+  GaugeField cudaMom(gParam);
 
   /********************************************************************
    * Copy to and exchange gauge and outer product fields on the device *
    ********************************************************************/
-  cpuGauge_ex->exchangeExtendedGhost(R, true);
-  cudaGauge_ex->copy(*cpuGauge);
-  cudaGauge_ex->exchangeExtendedGhost(cudaGauge_ex->R());
-
-  cpuOprod_ex->exchangeExtendedGhost(R, true);
-  cudaOprod_ex->copy(*cpuOprod);
-  cudaOprod_ex->exchangeExtendedGhost(cudaOprod_ex->R());
-
-  cpuLongLinkOprod_ex->exchangeExtendedGhost(R, true);
-  cudaLongLinkOprod_ex->copy(*cpuLongLinkOprod);
-  cudaLongLinkOprod_ex->exchangeExtendedGhost(cudaLongLinkOprod_ex->R());
-
-  /**********************
-   * Do a little cleanup *
-   **********************/
-  delete rng;
-}
+  cpuGauge_ex.exchangeExtendedGhost(R, true);
+  cudaGauge_ex.copy(cpuGauge);
+  cudaGauge_ex.exchangeExtendedGhost(cudaGauge_ex.R());
 
-// one-time teardown at end of tests
-static void hisq_force_teardown()
-{
-  delete cudaMom;
-  delete cudaForce;
-  delete cudaForce_ex;
-  delete cudaGauge_ex;
-  delete cudaOprod_ex;
-  delete cudaLongLinkOprod_ex;
-
-  delete cpuGauge;
-  delete cpuForce;
-  delete hostVerifyForce;
-  delete cpuMom;
-  delete refMom;
-  delete cpuOprod;
-  delete cpuLongLinkOprod;
-
-  delete cpuGauge_ex;
-  delete cpuForce_ex;
-  delete cpuOprod_ex;
-  delete cpuLongLinkOprod_ex;
-}
+  cpuOprod_ex.exchangeExtendedGhost(R, true);
+  cudaOprod_ex.copy(cpuOprod);
+  cudaOprod_ex.exchangeExtendedGhost(cudaOprod_ex.R());
+
+  cpuLongLinkOprod_ex.exchangeExtendedGhost(R, true);
+  cudaLongLinkOprod_ex.copy(cpuLongLinkOprod);
+  cudaLongLinkOprod_ex.exchangeExtendedGhost(cudaLongLinkOprod_ex.R());
 
-static int hisq_force_test(bool lepage)
-{
   // float d_weight = 1.0;
   // { one, naik, three, five, seven, lepage }
   // double d_act_path_coeff[6] = { 1., 0., 0., 0., 0., 0. };
@@ -438,41 +386,41 @@ static int hisq_force_test(bool lepage)
   /********************************
    * Zero momenta and force fields *
    ********************************/
-  cpuForce->zero();
-  cpuForce_ex->zero();
-  cpuMom->zero();
-  refMom->zero();
+  cpuForce.zero();
+  cpuForce_ex.zero();
+  cpuMom.zero();
+  refMom.zero();
 
-  cudaForce->zero();
-  cudaForce_ex->zero();
-  cudaMom->zero();
+  cudaForce.zero();
+  cudaForce_ex.zero();
+  cudaMom.zero();
 
   /**************************************
    * Force contribution from the staples *
    **************************************/
   host_timer.start();
-  fermion_force::hisqStaplesForce(*cudaForce_ex, *cudaOprod_ex, *cudaGauge_ex, d_act_path_coeff);
+  fermion_force::hisqStaplesForce(cudaForce_ex, cudaOprod_ex, cudaGauge_ex, d_act_path_coeff);
   qudaDeviceSynchronize();
   host_timer.stop();
   staple_time_sec = host_timer.last();
 
   if (verify_results) {
     host_timer.start();
-    hisqStaplesForceCPU(d_act_path_coeff, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
+    hisqStaplesForceCPU(d_act_path_coeff, cpuOprod_ex, cpuGauge_ex, &cpuForce_ex);
     host_timer.stop();
     host_time_sec = host_timer.last();
 
-    copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
-    copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-    hostVerifyForce->copy(*cudaForce);
+    copyExtendedGauge(cpuForce, cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
+    copyExtendedGauge(cudaForce, cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
+    hostVerifyForce.copy(cudaForce);
 
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
-      res &= compare_floats(cpuForce->data<void *>(dir), hostVerifyForce->data<void *>(dir), V * gauge_site_size,
-                            getTolerance(force_prec), force_prec);
+      res &= compare_floats(cpuForce.data<void *>(dir), hostVerifyForce.data<void *>(dir), V * gauge_site_size,
+                            getTolerance(force_prec), cpu_prec);
     }
 
-    strong_check_link(*hostVerifyForce, "GPU result:", *cpuForce, "CPU reference results:");
+    strong_check_link(hostVerifyForce, "GPU result:", cpuForce, "CPU reference results:");
     logQuda(QUDA_SUMMARIZE, "Lepage %s staples force test %s\n\n", lepage ? "enabled" : "disabled",
             (1 == res) ? "PASSED" : "FAILED");
   }
@@ -485,53 +433,53 @@ static int hisq_force_test(bool lepage)
   // This is consistent with the chain rule for HISQ
   if (lepage && d_act_path_coeff[1] != 0.) {
     host_timer.start();
-    fermion_force::hisqLongLinkForce(*cudaForce_ex, *cudaLongLinkOprod_ex, *cudaGauge_ex, d_act_path_coeff[1]);
+    fermion_force::hisqLongLinkForce(cudaForce_ex, cudaLongLinkOprod_ex, cudaGauge_ex, d_act_path_coeff[1]);
     qudaDeviceSynchronize();
     host_timer.stop();
     long_time_sec = host_timer.last();
 
     if (verify_results) {
       host_timer.start();
-      hisqLongLinkForceCPU(d_act_path_coeff[1], *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
+      hisqLongLinkForceCPU(d_act_path_coeff[1], cpuLongLinkOprod_ex, cpuGauge_ex, &cpuForce_ex);
       host_timer.stop();
       host_time_sec += host_timer.last();
 
-      copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
-      copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-      hostVerifyForce->copy(*cudaForce);
+      copyExtendedGauge(cpuForce, cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
+      copyExtendedGauge(cudaForce, cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
+      hostVerifyForce.copy(cudaForce);
 
       int res = 1;
       for (int dir = 0; dir < 4; dir++) {
-        res &= compare_floats(cpuForce->data(dir), hostVerifyForce->data(dir), V * gauge_site_size,
-                              getTolerance(force_prec), force_prec);
+        res &= compare_floats(cpuForce.data(dir), hostVerifyForce.data(dir), V * gauge_site_size,
+                              getTolerance(force_prec), cpu_prec);
       }
 
-      strong_check_link(*hostVerifyForce, "GPU results: ", *cpuForce, "CPU reference results:");
+      strong_check_link(hostVerifyForce, "GPU results: ", cpuForce, "CPU reference results:");
       logQuda(QUDA_SUMMARIZE, "Long link force test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
     }
   }
 
   host_timer.start();
-  fermion_force::hisqCompleteForce(*cudaForce_ex, *cudaGauge_ex);
-  updateMomentum(*cudaMom, 1.0, *cudaForce_ex, __func__);
+  fermion_force::hisqCompleteForce(cudaForce_ex, cudaGauge_ex);
+  updateMomentum(cudaMom, 1.0, cudaForce_ex, __func__);
   qudaDeviceSynchronize();
   host_timer.stop();
   complete_time_sec = host_timer.last();
 
   if (verify_results) {
     host_timer.start();
-    hisqCompleteForceCPU(*cpuForce_ex, *cpuGauge_ex, refMom);
+    hisqCompleteForceCPU(cpuForce_ex, cpuGauge_ex, &refMom);
     host_timer.stop();
     host_time_sec += host_timer.last();
 
-    cpuMom->copy(*cudaMom);
+    cpuMom.copy(cudaMom);
   }
 
   int accuracy_level = 3;
   if (verify_results) {
-    int res = compare_floats(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume() * mom_site_size,
-                             getTolerance(force_prec), force_prec);
-    accuracy_level = strong_check_mom(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume(), force_prec);
+    int res = compare_floats(cpuMom.data(), refMom.data(), 4 * cpuMom.Volume() * mom_site_size,
+                             getTolerance(force_prec), cpu_prec);
+    accuracy_level = strong_check_mom(cpuMom.data(), refMom.data(), 4 * cpuMom.Volume(), cpu_prec);
     logQuda(QUDA_SUMMARIZE, "Test (lepage coeff %e) %s\n", d_act_path_coeff[5], (1 == res) ? "PASSED" : "FAILED");
   }
   long long staple_io, staple_flops, long_io, long_flops, complete_io, complete_flops;
@@ -564,77 +512,54 @@ static int hisq_force_test(bool lepage)
   return accuracy_level;
 }
 
-static void display_test_info()
+TEST_P(HisqForceTest, verify)
 {
-  printfQuda("running the following fermion force computation test:\n");
-
-  printfQuda(
-    "force_precision           link_reconstruct           space_dim(x/y/z)         T_dimension       Gauge_order\n");
-  printfQuda("%s                       %s                         %d/%d/%d                  %d                %s\n",
-             get_prec_str(force_prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim,
-             get_gauge_order_str(gauge_order));
+  if (!quda::is_enabled(force_prec)) GTEST_SKIP();
+  int level = hisq_force_test(force_prec, lepage);
+  // prevent tests from failing when verify is set to false (level is then only the default placeholder value)
+  if (verify_results) { ASSERT_GE(level, getNegLog10Tolerance(force_prec)) << "CPU and GPU implementations do not agree"; }
 }
 
-TEST(paths, verify)
-{
-  int level = hisq_force_test(true);
-  // prevent tests from failing when verify is set to false
-  if (verify_results) {
-    int tolerance = getNegLog10Tolerance(force_prec);
-    ASSERT_GE(level, tolerance) << "CPU and GPU implementations do not agree";
-  }
-}
+auto test_str = [](testing::TestParamInfo<test_t> param) {
+  return std::string(get_prec_str(::testing::get<0>(param.param))) + (::testing::get<1>(param.param) ? "_lepage" : "");
+};
 
-TEST(paths_no_lepage, verify)
-{
-  int level = hisq_force_test(false);
-  // prevent tests from failing when verify is set to false
-  if (verify_results) {
-    int tolerance = getNegLog10Tolerance(force_prec);
-    ASSERT_GE(level, tolerance) << "CPU and GPU implementations do not agree";
-  }
-}
-
-int main(int argc, char **argv)
-{
-  // initalize google test
-  ::testing::InitGoogleTest(&argc, argv);
+INSTANTIATE_TEST_SUITE_P(, HisqForceTest,
+                         ::testing::Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
+                                            ::testing::Values(false, true)),
+                         test_str);
 
-  auto app = make_app();
+static bool do_lepage = false;
 
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
+struct hisq_path_force_test : public quda_test {
+  void display_info() const override
+  {
+    printfQuda(
+      "force_precision           link_reconstruct           space_dim(x/y/z)         T_dimension       Gauge_order\n");
+    printfQuda("%s                       %s                         %d/%d/%d                  %d                %s\n",
+               get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_gauge_order_str(gauge_order));
   }
 
-  initComms(argc, argv, gridsize_from_cmdline);
-  initRand();
-  initQuda(device_ordinal);
-
-  setPrecision(prec);
-
-  display_test_info();
-
-  if (prec != QUDA_DOUBLE_PRECISION && prec != QUDA_SINGLE_PRECISION) errorQuda("Invalid precision %d", prec);
-  // FIXME: debugging recon 12
-  if (link_recon != QUDA_RECONSTRUCT_NO && link_recon != QUDA_RECONSTRUCT_13 /* && link_recon != QUDA_RECONSTRUCT_12*/)
-    errorQuda("Invalid reconstruct %d", link_recon);
-
-  // one-time setup
-  hisq_force_startup();
-
-  // Ensure gtest prints only from rank 0
-  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
-  if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
-
-  int test_rc = RUN_ALL_TESTS();
-  if (test_rc != 0) warningQuda("Tests failed");
+  void add_command_line_group(std::shared_ptr<QUDAApp> app) const override
+  {
+    quda_test::add_command_line_group(app);
+    app->add_option("--lepage", do_lepage, "Include the Lepage term in the force computation (default false)");
+  }
 
-  hisq_force_teardown();
+  hisq_path_force_test(int argc, char **argv) : quda_test("hisq_path_force_test", argc, argv) { }
+};
 
-  endQuda();
-  finalizeComms();
+int main(int argc, char **argv)
+{
+  hisq_path_force_test test(argc, argv);
+  test.init();
+
+  int test_rc = 0;
+  if (!enable_testing) {
+    hisq_force_test(prec, do_lepage);
+  } else {
+    test_rc = test.execute();
+  }
 
   return test_rc;
 }
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index fe4b927223..46e846f774 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -1,35 +1,34 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <sys/time.h>
 
 #include <quda.h>
+#include <instantiate.h>
+#include <gauge_field.h>
+
 #include "host_utils.h"
-#include <command_line_params.h>
-#include "gauge_field.h"
+#include "command_line_params.h"
 #include "misc.h"
 #include "test.h"
 #include "hisq_force_reference.h"
 #include "ks_improved_force.h"
-#include <sys/time.h>
-#include <gtest/gtest.h>
-
-quda::GaugeField *cudaFatLink = NULL;
-quda::GaugeField *cpuFatLink = NULL;
 
-quda::GaugeField *cudaOprod = NULL;
-quda::GaugeField *cpuOprod = NULL;
+using test_t = ::testing::tuple<QudaPrecision>;
 
-quda::GaugeField *cudaResult = NULL;
-quda::GaugeField *cpuResult = NULL;
-
-quda::GaugeField *cpuReference = NULL;
+class HisqUnitarizeTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  QudaPrecision precision;
 
-static QudaGaugeParam gaugeParam;
+public:
+  HisqUnitarizeTest() : precision(::testing::get<0>(GetParam())) { }
+};
 
-// allocate memory
-// set the layout, etc.
-static void hisq_force_init()
+void hisq_unitarize(QudaPrecision prec)
 {
+  QudaGaugeParam gaugeParam = newQudaGaugeParam();
+
   gaugeParam.X[0] = xdim;
   gaugeParam.X[1] = ydim;
   gaugeParam.X[2] = zdim;
@@ -42,51 +41,34 @@ static void hisq_force_init()
   gaugeParam.cuda_prec = prec;
   gaugeParam.reconstruct = link_recon;
   gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER;
+  gaugeParam.type = QUDA_WILSON_LINKS;
   quda::GaugeFieldParam gParam(gaugeParam);
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.anisotropy = 1;
 
-  cpuFatLink = new quda::GaugeField(gParam);
-  cpuOprod = new quda::GaugeField(gParam);
-  cpuResult = new quda::GaugeField(gParam);
-  cpuReference = new quda::GaugeField(gParam);
+  quda::GaugeField cpuFatLink = quda::GaugeField(gParam);
+
+  auto cpuOprod = quda::GaugeField(gParam);
+  auto cpuResult = quda::GaugeField(gParam);
+  auto cpuReference = quda::GaugeField(gParam);
 
   // create "gauge fields"
-  createSiteLinkCPU(*cpuFatLink, gaugeParam.cpu_prec, SiteLinkType::SITELINK_NOISY);
-  createSiteLinkCPU(*cpuOprod, gaugeParam.cpu_prec, SiteLinkType::SITELINK_NOISY);
+  createSiteLinkCPU(cpuFatLink, gaugeParam.cpu_prec, SiteLinkType::SITELINK_NOISY);
+  createSiteLinkCPU(cpuOprod, gaugeParam.cpu_prec, SiteLinkType::SITELINK_NOISY);
 
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.setPrecision(gaugeParam.cuda_prec, true);
 
-  cudaFatLink = new quda::GaugeField(gParam);
-  cudaOprod = new quda::GaugeField(gParam);
-  cudaResult = new quda::GaugeField(gParam);
+  auto cudaFatLink = quda::GaugeField(gParam);
+  auto cudaOprod = quda::GaugeField(gParam);
+  auto cudaResult = quda::GaugeField(gParam);
 
   gParam.order = QUDA_QDP_GAUGE_ORDER;
 
-  cudaFatLink->copy(*cpuFatLink);
-  cudaOprod->copy(*cpuOprod);
-}
-
-static void hisq_force_end()
-{
-  delete cpuFatLink;
-  delete cpuOprod;
-  delete cpuResult;
-
-  delete cudaFatLink;
-  delete cudaOprod;
-  delete cudaResult;
-
-  delete cpuReference;
-}
-
-TEST(hisq_force_unitarize, verify)
-{
-  setVerbosity(verbosity);
-  hisq_force_init();
+  cudaFatLink.copy(cpuFatLink);
+  cudaOprod.copy(cpuOprod);
 
   double unitarize_eps = 1e-5;
   const double hisq_force_filter = 5e-5;
@@ -103,64 +85,65 @@ TEST(hisq_force_unitarize, verify)
   qudaMemset(num_failures_dev, 0, sizeof(int));
 
   printfQuda("Calling unitarizeForce\n");
-  quda::fermion_force::unitarizeForce(*cudaResult, *cudaOprod, *cudaFatLink, num_failures_dev);
+  quda::fermion_force::unitarizeForce(cudaResult, cudaOprod, cudaFatLink, num_failures_dev);
 
   device_free(num_failures_dev);
 
   if (verify_results) {
     printfQuda("Calling unitarizeForceCPU\n");
-    quda::fermion_force::unitarizeForceCPU(*cpuResult, *cpuOprod, *cpuFatLink);
+    quda::fermion_force::unitarizeForceCPU(cpuResult, cpuOprod, cpuFatLink);
   }
 
-  cpuReference->copy(*cudaResult);
+  cpuReference.copy(cudaResult);
 
   printfQuda("Comparing CPU and GPU results\n");
   int res[4];
 
   double accuracy = prec == QUDA_DOUBLE_PRECISION ? 1e-10 : 1e-5;
   for (int dir = 0; dir < 4; ++dir) {
-    res[dir] = compare_floats(cpuReference->data<void *>(dir), cpuResult->data<void *>(dir),
-                              cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec);
+    res[dir] = compare_floats(cpuReference.data<void *>(dir), cpuResult.data<void *>(dir),
+                              cpuReference.Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec);
 
     quda::comm_allreduce_int(res[dir]);
     res[dir] /= quda::comm_size();
   }
 
-  hisq_force_end();
-
   for (int dir = 0; dir < 4; ++dir) { ASSERT_EQ(res[dir], 1) << "Dir:" << dir; }
 }
 
-static void display_test_info()
+TEST_P(HisqUnitarizeTest, verify)
 {
-  printfQuda("running the following fermion force computation test:\n");
-
-  printfQuda("link_precision           link_reconstruct           space_dim(x/y/z)         T_dimension\n");
-  printfQuda("%s                       %s                         %d/%d/%d                  %d \n", get_prec_str(prec),
-             get_recon_str(link_recon), xdim, ydim, zdim, tdim);
+  prec = ::testing::get<0>(GetParam());
+  if (!quda::is_enabled(prec)) GTEST_SKIP();
+  hisq_unitarize(prec);
 }
 
-int main(int argc, char **argv)
-{
-  // initalize google test
-  ::testing::InitGoogleTest(&argc, argv);
-
-  auto app = make_app();
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-
-  initComms(argc, argv, gridsize_from_cmdline);
-  initQuda(device_ordinal);
+auto test_str
+  = [](testing::TestParamInfo<test_t> param) { return std::string(get_prec_str(::testing::get<0>(param.param))); };
 
-  display_test_info();
+INSTANTIATE_TEST_SUITE_P(, HisqUnitarizeTest, ::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), test_str);
 
-  int test_rc = RUN_ALL_TESTS();
+struct hisq_unitarize_test : public quda_test {
+  void display_info() const override
+  {
+    printfQuda("link_precision           link_reconstruct           space_dim(x/y/z)         T_dimension\n");
+    printfQuda("%s                       %s                         %d/%d/%d                  %d \n",
+               get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim);
+  }
 
-  endQuda();
-  finalizeComms();
+  hisq_unitarize_test(int argc, char **argv) : quda_test("hisq_unitarize_test", argc, argv) { }
+};
 
+int main(int argc, char **argv)
+{
+  hisq_unitarize_test test(argc, argv);
+  test.init();
+  int test_rc = 0;
+
+  if (!enable_testing) {
+    hisq_unitarize(prec);
+  } else {
+    test_rc = test.execute();
+  }
   return test_rc;
 }
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index c7b00d73fa..14bb13ee3f 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -101,10 +101,10 @@ template <typename real_t> struct ComputePathProduct {
                   const void *const loop_coeff_, int coeff_index, int dir, const lattice_t &lat)
   {
     using matrix = Matrix<3, std::complex<real_t>>;
-    auto sitelink = reinterpret_cast<const matrix *const *const>(sitelink_);
+    auto sitelink = reinterpret_cast<const matrix *const *>(sitelink_);
 
-    auto staple = reinterpret_cast<matrix *const>(staple_);
-    auto loop_coeff = reinterpret_cast<const real_t *const>(loop_coeff_);
+    auto staple = reinterpret_cast<matrix *>(staple_);
+    auto loop_coeff = reinterpret_cast<const real_t *>(loop_coeff_);
     auto coeff = loop_coeff[coeff_index];
 
 #pragma omp parallel for
@@ -144,7 +144,7 @@ template <typename real_t> struct ComputeLoopTrace {
                                   const lattice_t &lat)
   {
     using matrix = Matrix<3, std::complex<real_t>>;
-    auto sitelink = reinterpret_cast<const matrix *const *const>(sitelink_);
+    auto sitelink = reinterpret_cast<const matrix *const *>(sitelink_);
 
     std::complex<double> accum = 0;
 
@@ -185,9 +185,9 @@ template <typename real_t> struct UpdateMomentum {
   {
     using matrix = Matrix<3, std::complex<real_t>>;
 
-    auto momentum = reinterpret_cast<anti_hermitmat<real_t> *const>(momentum_);
-    auto sitelink = reinterpret_cast<const matrix *const *const>(sitelink_);
-    auto staple = reinterpret_cast<const matrix *const>(staple_);
+    auto momentum = reinterpret_cast<anti_hermitmat<real_t> *>(momentum_);
+    auto sitelink = reinterpret_cast<const matrix *const *>(sitelink_);
+    auto staple = reinterpret_cast<const matrix *>(staple_);
 
 #pragma omp parallel for
     for (size_t i = 0; i < lat.volume; i++) {
@@ -228,9 +228,9 @@ template <typename real_t> struct UpdateGauge {
   {
     using matrix = Matrix<3, std::complex<real_t>>;
 
-    auto gauge = reinterpret_cast<matrix *const>(gauge_);
-    auto sitelink = reinterpret_cast<const matrix *const *const>(sitelink_);
-    auto staple = reinterpret_cast<const matrix *const>(staple_);
+    auto gauge = reinterpret_cast<matrix *>(gauge_);
+    auto sitelink = reinterpret_cast<const matrix *const *>(sitelink_);
+    auto staple = reinterpret_cast<const matrix *>(staple_);
 
 #pragma omp parallel for
     for (size_t i = 0; i < lat.volume; i++) {
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 6b761d6925..489612d030 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -18,7 +18,7 @@ extern int Vh;
 template <typename real_t> struct ComputeLinkOrderedOuterProduct {
   void operator()(const void *const src_, quda::GaugeField &dest, size_t nhops)
   {
-    auto src = reinterpret_cast<const su3_vector<real_t> *const>(src_);
+    auto src = reinterpret_cast<const su3_vector<real_t> *>(src_);
 
 #pragma omp parallel for
     for (int i = 0; i < V; ++i) {
@@ -35,9 +35,9 @@ template <typename real_t> struct ComputeLinkOrderedOuterProduct {
   }
 };
 
-void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops)
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, size_t nhops)
 {
-  instantiate_host<ComputeLinkOrderedOuterProduct>(precision, src, dst, nhops);
+  instantiate_host<ComputeLinkOrderedOuterProduct>(dst.Precision(), src, dst, nhops);
 }
 
 #define RETURN_IF_ERR                                                                                                  \
@@ -814,9 +814,9 @@ template <typename real_t> struct HisqStaplesForce {
 void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
                          quda::GaugeField *newOprod)
 {
+  auto precision = quda::checkPrecision(oprod, link, *newOprod);
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
-  QudaPrecision precision = oprod.Precision();
 
   uint64_t len = is_multi_gpu() ? (2 * Vh_ex) : (X_[0] * X_[1] * X_[2] * X_[3]);
 
@@ -890,9 +890,9 @@ template <class real_t> struct ComputeLongLinkField {
   void operator()(const int dim[4], const void *const *const oprod_, const void *const *const link_, int sig,
                   real_t coeff, void *const *const output_)
   {
-    auto oprod = reinterpret_cast<const matrix *const *const>(oprod_);
-    auto link = reinterpret_cast<const matrix *const *const>(link_);
-    auto output = reinterpret_cast<matrix *const *const>(output_);
+    auto oprod = reinterpret_cast<const matrix *const *>(oprod_);
+    auto link = reinterpret_cast<const matrix *const *>(link_);
+    auto output = reinterpret_cast<matrix *const *>(output_);
 
     int volume = 1;
     for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
@@ -913,9 +913,9 @@ template <class real_t> struct ComputeLongLinkField {
 
 void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod)
 {
+  auto precision = quda::checkPrecision(oprod, link, *newOprod);
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
-  QudaPrecision precision = oprod.Precision();
 
   for (int sig = 0; sig < 4; ++sig) {
     instantiate_host<ComputeLongLinkField>(precision, X_, oprod.data_array().data, link.data_array().data, sig, coeff,
@@ -947,9 +947,9 @@ template <class real_t> struct CompleteForceField {
   void operator()(const int dim[4], const void *const *const oprod_, const void *const *const link_, int sig,
                   void *const mom_)
   {
-    auto oprod = reinterpret_cast<const matrix *const *const>(oprod_);
-    auto link = reinterpret_cast<const matrix *const *const>(link_);
-    auto mom = reinterpret_cast<real_t *const>(mom_);
+    auto oprod = reinterpret_cast<const matrix *const *>(oprod_);
+    auto link = reinterpret_cast<const matrix *const *>(link_);
+    auto mom = reinterpret_cast<real_t *>(mom_);
 
     int volume = dim[0] * dim[1] * dim[2] * dim[3];
     const int half_volume = volume / 2;
@@ -964,12 +964,12 @@ template <class real_t> struct CompleteForceField {
 
 void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom)
 {
+  auto precision = quda::checkPrecision(oprod, link, *mom);
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
-  QudaPrecision precision = oprod.Precision();
 
   for (int sig = 0; sig < 4; ++sig) {
     instantiate_host<CompleteForceField>(precision, X_, oprod.data_array().data, link.data_array().data, sig,
                                          mom->data());
   } // loop over sig
-}
\ No newline at end of file
+}
diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h
index bdf78c4750..06052df319 100644
--- a/tests/host_reference/hisq_force_reference.h
+++ b/tests/host_reference/hisq_force_reference.h
@@ -12,7 +12,6 @@
-   @param[in] precision Precision of data (single or double)
    @param[in] separation Offset for outer product (1 for fat links, 3 for long links)
 */
-void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, QudaPrecision precision, size_t separation);
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, size_t separation);
 
 /**
    @brief Compute the force contribution from the fat links, CPU version
diff --git a/tests/staggered_gsmear_test.cpp b/tests/staggered_gsmear_test.cpp
index 84f793dcb9..cdcab64d9c 100644
--- a/tests/staggered_gsmear_test.cpp
+++ b/tests/staggered_gsmear_test.cpp
@@ -1,4 +1,3 @@
-#include "test.h"
 #include "staggered_gsmear_test_utils.h"
 
 using namespace quda;
@@ -6,43 +5,52 @@ using namespace quda;
 int argc_copy;
 char **argv_copy;
 
-class StaggeredGSmearTest : public ::testing::Test
+using test_t = ::testing::tuple<QudaPrecision, gsmear_test_type>;
+
+class StaggeredGSmearTest : public ::testing::TestWithParam<test_t>
 {
 protected:
   StaggeredGSmearTestWrapper gsmear_test_wrapper;
 
-  void display_test_info()
-  {
-    printfQuda("running the following test:\n");
-    printfQuda("prec     recon    test_type     S_dim         T_dimension\n");
-    printfQuda("%s   %s       %s       %d/%d/%d      %d \n", get_prec_str(prec), get_recon_str(link_recon),
-               get_string(gtest_type_map, gtest_type).c_str(), xdim, ydim, zdim, tdim);
-    printfQuda("Grid partition info:     X  Y  Z  T\n");
-    printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
-               dimPartitioned(3));
-  }
-
 public:
   StaggeredGSmearTest() = default;
 
   virtual void SetUp()
   {
+    prec = ::testing::get<0>(GetParam());
+    gtest_type = ::testing::get<1>(GetParam());
+    if (!quda::is_enabled(prec)) GTEST_SKIP();
     gsmear_test_wrapper.init_test(argc_copy, argv_copy);
-    display_test_info();
   }
 
-  virtual void TearDown() { gsmear_test_wrapper.end(); }
+  virtual void TearDown()
+  {
+    // nothing to end: SetUp() skipped before init_test() when the precision is disabled
+    if (quda::is_enabled(prec)) gsmear_test_wrapper.end();
+  }
 
-  static void SetUpTestCase() { initQuda(device_ordinal); }
+  static void SetUpTestCase() { }
 
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase() { }
 };
 
-TEST_F(StaggeredGSmearTest, benchmark) { gsmear_test_wrapper.run_test(niter, /**show_metrics =*/true); }
-
-TEST_F(StaggeredGSmearTest, verify)
+TEST_P(StaggeredGSmearTest, verify)
 {
-  if (!verify_results) GTEST_SKIP();
+  prec = ::testing::get<0>(GetParam());
+  gtest_type = ::testing::get<1>(GetParam());
+  if (!quda::is_enabled(prec)) GTEST_SKIP();
+
+  switch (gtest_type) {
+  case gsmear_test_type::TwoLink:
+    laplace3D = 4;
+    smear_t0 = -1;
+    break;
+  case gsmear_test_type::GaussianSmear:
+    laplace3D = 3;
+    smear_t0 = 1;
+    break;
+  default: errorQuda("Unexpected gsmear_type = %s", get_string(gtest_type_map, gtest_type).c_str());
+  }
 
   gsmear_test_wrapper.staggeredGSmearRef();
   gsmear_test_wrapper.run_test(2);
@@ -52,41 +60,55 @@ TEST_F(StaggeredGSmearTest, verify)
   ASSERT_LE(deviation, tol) << "reference and QUDA implementations do not agree";
 }
 
+struct gsmear_test : public quda_test {
+  void display_info() const override
+  {
+    printfQuda("prec     recon    test_type     S_dim         T_dimension\n");
+    printfQuda("%s   %s       %s       %d/%d/%d      %d \n", get_prec_str(prec), get_recon_str(link_recon),
+               get_string(gtest_type_map, gtest_type).c_str(), xdim, ydim, zdim, tdim);
+  }
 
-int main(int argc, char **argv)
-{
-  // initalize google test
-  ::testing::InitGoogleTest(&argc, argv);
-
-  // command line options
-  auto app = make_app();
-  app->add_option("--test", gtest_type, "Test method")->transform(CLI::CheckedTransformer(gtest_type_map));
-  add_quark_smear_option_group(app);
-  add_su3_option_group(app);
-  add_comms_option_group(app);
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
+  void add_command_line_group(std::shared_ptr<QUDAApp> app) const override
+  {
+    quda_test::add_command_line_group(app);
+    app->add_option("--test", gtest_type, "Test method")->transform(CLI::CheckedTransformer(gtest_type_map));
+    add_quark_smear_option_group(app);
+    add_su3_option_group(app);
   }
 
-  initComms(argc, argv, gridsize_from_cmdline);
+  gsmear_test(int argc, char **argv) : quda_test("gsmear_test", argc, argv) { }
+};
+
+auto test_str = [](testing::TestParamInfo<test_t> param) {
+  return std::string(get_prec_str(::testing::get<0>(param.param))) + "_"
+    + get_string(gtest_type_map, ::testing::get<1>(param.param));
+};
+
+using ::testing::Combine;
+using ::testing::Values;
+
+INSTANTIATE_TEST_SUITE_P(, StaggeredGSmearTest,
+                         Combine(Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
+                                 Values(gsmear_test_type::TwoLink, gsmear_test_type::GaussianSmear)),
+                         test_str);
+
+int main(int argc, char **argv)
+{
+  gsmear_test test(argc, argv);
+  test.init();
 
   // Same approach as in Staggered DslashTest
   argc_copy = argc;
   argv_copy = argv;
 
-  // Ensure gtest prints only from rank 0
-  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
-  if (comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+  if (link_recon != QUDA_RECONSTRUCT_NO) errorQuda("Error: link reconstruction is currently not supported");
 
-  if (link_recon != QUDA_RECONSTRUCT_NO) {
-    printfQuda("Error: link reconstruction is currently not supported.\n");
-    exit(0);
-  }
+  int test_rc = 0;
+  if (!enable_testing) {
 
-  int test_rc = RUN_ALL_TESTS();
+  } else {
+    test_rc = test.execute();
+  }
 
-  finalizeComms();
   return test_rc;
 }
diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h
index 2cc7f1c2a8..8e4e2a9014 100644
--- a/tests/staggered_gsmear_test_utils.h
+++ b/tests/staggered_gsmear_test_utils.h
@@ -11,13 +11,13 @@
 #include <tune_quda.h>
 #include <gauge_field.h>
 #include <dirac_quda.h>
+#include <instantiate.h>
 
-#include <gtest/gtest.h>
-
-#include <misc.h>
-#include <host_utils.h>
+#include "misc.h"
+#include "host_utils.h"
 #include "gauge_utils.h"
 #include "command_line_params.h"
+#include "test.h"
 
 using namespace quda;
 
@@ -152,7 +152,7 @@ struct StaggeredGSmearTestWrapper { //
   void init_ctest_once()
   {
     static bool has_been_called = false;
-    if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
+    if (has_been_called) { errorQuda("This function is not supposed to be called twice"); }
     is_ctest = true; // Is being used in dslash_ctest.
     has_been_called = true;
   }
@@ -160,7 +160,7 @@ struct StaggeredGSmearTestWrapper { //
   void end_ctest_once()
   {
     static bool has_been_called = false;
-    if (has_been_called) { errorQuda("This function is not supposed to be called twice.\n"); }
+    if (has_been_called) { errorQuda("This function is not supposed to be called twice"); }
     has_been_called = true;
   }
 
@@ -209,7 +209,6 @@ struct StaggeredGSmearTestWrapper { //
 
   void init(int argc, char **argv)
   {
-    setVerbosity(verbosity);
     inv_param.split_grid[0] = grid_partition[0];
     inv_param.split_grid[1] = grid_partition[1];
     inv_param.split_grid[2] = grid_partition[2];
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 1f5c7c726e..f5d2f31505 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -31,6 +31,14 @@ bool use_multi_src = false;
 // print instructions on how to run the old tests
 bool print_legacy_info = false;
 
+QudaPrecision last_prec = QUDA_INVALID_PRECISION;
+
+GaugeField cpuInQDP = {};
+GaugeField cpuFatQDP = {};
+GaugeField cpuLongQDP = {};
+GaugeField cpuFatMILC = {};
+GaugeField cpuLongMILC = {};
+
 // if --enable-testing true is passed, we run the tests defined in here
 #include <staggered_invert_test_gtest.hpp>
 
@@ -131,11 +139,6 @@ void display_legacy_info()
     "--test 6 -> --solve-type direct-pc --solution-type mat-pc --inv-type cg --matpc odd-odd   --multishift 8\n");
 }
 
-GaugeField cpuFatQDP = {};
-GaugeField cpuLongQDP = {};
-GaugeField cpuFatMILC = {};
-GaugeField cpuLongMILC = {};
-
 void init()
 {
   // Set QUDA internal parameters
@@ -202,7 +205,7 @@ void init()
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
   cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuParam.create = QUDA_NULL_FIELD_CREATE;
-  GaugeField cpuIn = GaugeField(cpuParam);
+  cpuInQDP = GaugeField(cpuParam);
   cpuFatQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
   cpuFatMILC = GaugeField(cpuParam);
@@ -214,7 +217,7 @@ void init()
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
   cpuLongMILC = GaugeField(cpuParam);
 
-  void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
+  void *qdp_inlink[4] = {cpuInQDP.data(0), cpuInQDP.data(1), cpuInQDP.data(2), cpuInQDP.data(3)};
   void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
   void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
   constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, 0, nullptr, true);
@@ -254,20 +257,22 @@ void init()
 
 std::vector<std::array<double, 2>> solve(test_t param)
 {
-  inv_param.inv_type = ::testing::get<0>(param);
-  inv_param.solution_type = ::testing::get<1>(param);
-  inv_param.solve_type = ::testing::get<2>(param);
-  inv_param.cuda_prec_sloppy = ::testing::get<3>(param);
-  multishift = ::testing::get<4>(param);
-  inv_param.solution_accumulator_pipeline = ::testing::get<5>(param);
+  inv_param.cuda_prec = ::testing::get<0>(param);
+  inv_param.cuda_prec_sloppy = ::testing::get<1>(param);
+  inv_param.cuda_prec_refinement_sloppy = ::testing::get<1>(param);
+  inv_param.inv_type = ::testing::get<2>(param);
+  inv_param.solution_type = ::testing::get<3>(param);
+  inv_param.solve_type = ::testing::get<4>(param);
+  multishift = ::testing::get<5>(param);
+  inv_param.solution_accumulator_pipeline = ::testing::get<6>(param);
 
   // schwarz parameters
-  auto schwarz_param = ::testing::get<6>(param);
+  auto schwarz_param = ::testing::get<7>(param);
   inv_param.schwarz_type = ::testing::get<0>(schwarz_param);
   inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param);
   inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param);
 
-  inv_param.residual_type = ::testing::get<7>(param);
+  inv_param.residual_type = ::testing::get<8>(param);
 
   // reset lambda_max if we're doing a testing loop to ensure correct lambma_max
   if (enable_testing) inv_param.ca_lambda_max = -1.0;
@@ -463,6 +468,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
 
 void cleanup()
 {
+  cpuInQDP = {};
   cpuFatQDP = {};
   cpuLongQDP = {};
   cpuFatMILC = {};
@@ -569,7 +575,7 @@ int main(int argc, char **argv)
     result = RUN_ALL_TESTS();
   } else {
     for (int rep = 0; rep < nrepeat; rep++)
-      solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline,
+      solve(test_t {prec, prec_sloppy, inv_type, solution_type, solve_type, multishift, solution_accumulator_pipeline,
                     schwarz_t {precon_schwarz_type, inv_multigrid ? QUDA_MG_INVERTER : precon_type, prec_precondition},
                     inv_param.residual_type});
   }
diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp
index af150fea86..19371b4944 100644
--- a/tests/staggered_invert_test_gtest.hpp
+++ b/tests/staggered_invert_test_gtest.hpp
@@ -4,30 +4,23 @@
 // tuple containing parameters for Schwarz solver
 using schwarz_t = ::testing::tuple<QudaSchwarzType, QudaInverterType, QudaPrecision>;
 
-using test_t
-  = ::testing::tuple<QudaInverterType, QudaSolutionType, QudaSolveType, QudaPrecision, int, int, schwarz_t, QudaResidualType>;
-
-class StaggeredInvertTest : public ::testing::TestWithParam<test_t>
-{
-protected:
-  test_t param;
-
-public:
-  StaggeredInvertTest() : param(GetParam()) { }
-};
+using test_t = ::testing::tuple<QudaPrecision, QudaPrecision, QudaInverterType, QudaSolutionType, QudaSolveType, int,
+                                int, schwarz_t, QudaResidualType>;
 
 bool skip_test(test_t param)
 {
-  auto inverter_type = ::testing::get<0>(param);
-  auto solution_type = ::testing::get<1>(param);
-  auto solve_type = ::testing::get<2>(param);
-  auto prec_sloppy = ::testing::get<3>(param);
-  auto multishift = ::testing::get<4>(param);
-  auto solution_accumulator_pipeline = ::testing::get<5>(param);
-  auto schwarz_param = ::testing::get<6>(param);
+  auto prec = ::testing::get<0>(param);
+  auto prec_sloppy = ::testing::get<1>(param);
+  auto inverter_type = ::testing::get<2>(param);
+  auto solution_type = ::testing::get<3>(param);
+  auto solve_type = ::testing::get<4>(param);
+  auto multishift = ::testing::get<5>(param);
+  auto solution_accumulator_pipeline = ::testing::get<6>(param);
+  auto schwarz_param = ::testing::get<7>(param);
   auto prec_precondition = ::testing::get<2>(schwarz_param);
 
   if (prec < prec_sloppy) return true;              // outer precision >= sloppy precision
+  if (!(QUDA_PRECISION & prec)) return true;        // precision not enabled so skip it
   if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it
   if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION)
     return true;                                    // precision not enabled so skip it
@@ -70,21 +63,73 @@ bool skip_test(test_t param)
   return false;
 }
 
+class StaggeredInvertTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  test_t param;
+
+public:
+  StaggeredInvertTest() : param(GetParam()) { }
+
+  // Skips unsupported parameter combinations, then reloads the device gauge fields whenever the
+  // outer precision differs from the one recorded in last_prec by an earlier test.
+  virtual void SetUp()
+  {
+    if (skip_test(GetParam())) GTEST_SKIP();
+
+    const auto outer_prec = ::testing::get<0>(param);
+    if (outer_prec == last_prec) return; // nothing to reload for an unchanged outer precision
+
+    if (last_prec != QUDA_INVALID_PRECISION) freeGaugeQuda();
+
+    // all device gauge precisions follow the outer precision
+    gauge_param.cuda_prec = outer_prec;
+    gauge_param.cuda_prec_sloppy = outer_prec;
+    gauge_param.cuda_prec_precondition = outer_prec;
+    gauge_param.cuda_prec_refinement_sloppy = outer_prec;
+    gauge_param.cuda_prec_eigensolver = outer_prec;
+
+    // Compute the plaquette. Routine is aware that the gauge fields already have the phases on them.
+    // It has to run before `loadFatLongGaugeQuda` since it also loads the gauge fields, with other parameters.
+    double plaq[3];
+    void *qdp_inlink[4] = {cpuInQDP.data(0), cpuInQDP.data(1), cpuInQDP.data(2), cpuInQDP.data(3)};
+    computeStaggeredPlaquetteQDPOrder(qdp_inlink, plaq, gauge_param, dslash_type);
+    printfQuda("Computed plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
+
+    if (dslash_type == QUDA_ASQTAD_DSLASH) {
+      // repeat the measurement on the fat links
+      void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
+      computeStaggeredPlaquetteQDPOrder(qdp_fatlink, plaq, gauge_param, dslash_type);
+      printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
+    }
+
+    // free the fields the plaquette routine loaded (per the note above) before loading the fat and long links
+    freeGaugeQuda();
+    loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param);
+
+    last_prec = outer_prec;
+  }
+};
+
 std::vector<std::array<double, 2>> solve(test_t param);
 
 TEST_P(StaggeredInvertTest, verify)
 {
   if (skip_test(GetParam())) GTEST_SKIP();
 
+  // Loosen tolerances to keep iterations to solution in check
+  tol = ::testing::get<0>(param) == QUDA_DOUBLE_PRECISION ? 1e-6 : 1e-5;
+  tol_hq = tol;
+
   inv_param.tol = 0.0;
   inv_param.tol_hq = 0.0;
-  auto res_t = ::testing::get<7>(GetParam());
+  auto res_t = ::testing::get<8>(GetParam());
   if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol;
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
 
-  auto inverter_type = ::testing::get<0>(param);
-  auto solution_type = ::testing::get<1>(param);
-  auto solve_type = ::testing::get<2>(param);
+  auto inverter_type = ::testing::get<2>(param);
+  auto solution_type = ::testing::get<3>(param);
+  auto solve_type = ::testing::get<4>(param);
 
   // Make a local copy of "tol" for modification in place
   auto verify_tol = tol;
@@ -126,21 +171,22 @@ TEST_P(StaggeredInvertTest, verify)
 std::string gettestname(::testing::TestParamInfo<test_t> param)
 {
   std::string name;
-  name += get_solver_str(::testing::get<0>(param.param)) + std::string("_");
-  name += get_solution_str(::testing::get<1>(param.param)) + std::string("_");
-  name += get_solve_str(::testing::get<2>(param.param)) + std::string("_");
-  name += get_prec_str(::testing::get<3>(param.param));
-  if (::testing::get<4>(param.param) > 1)
-    name += std::string("_shift") + std::to_string(::testing::get<4>(param.param));
+  name += get_prec_str(::testing::get<0>(param.param)) + std::string("_");
+  name += get_prec_str(::testing::get<1>(param.param)) + std::string("_");
+  name += get_solver_str(::testing::get<2>(param.param)) + std::string("_");
+  name += get_solution_str(::testing::get<3>(param.param)) + std::string("_");
+  name += get_solve_str(::testing::get<4>(param.param));
   if (::testing::get<5>(param.param) > 1)
-    name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<5>(param.param));
-  auto &schwarz_param = ::testing::get<6>(param.param);
+    name += std::string("_shift") + std::to_string(::testing::get<5>(param.param));
+  if (::testing::get<6>(param.param) > 1)
+    name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<6>(param.param));
+  auto &schwarz_param = ::testing::get<7>(param.param);
   if (::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) {
     name += std::string("_") + get_schwarz_str(::testing::get<0>(schwarz_param));
     name += std::string("_") + get_solver_str(::testing::get<1>(schwarz_param));
     name += std::string("_") + get_prec_str(::testing::get<2>(schwarz_param));
   }
-  auto res_t = ::testing::get<7>(param.param);
+  auto res_t = ::testing::get<8>(param.param);
   if (res_t & QUDA_L2_RELATIVE_RESIDUAL) name += std::string("_l2");
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) name += std::string("_heavy_quark");
   return name;
@@ -159,6 +205,8 @@ auto direct_solvers = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGN
                              QUDA_CG3NE_INVERTER, QUDA_CG3NR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER,
                              QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
+auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION);
+
 auto sloppy_precisions
   = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION);
 
@@ -173,35 +221,36 @@ auto no_heavy_quark = Values(QUDA_L2_RELATIVE_RESIDUAL);
 
 // preconditioned solves
 INSTANTIATE_TEST_SUITE_P(EvenOdd, StaggeredInvertTest,
-                         Combine(staggered_pc_solvers, Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION),
-                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, staggered_pc_solvers,
+                                 Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
+                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // full system normal solve
 INSTANTIATE_TEST_SUITE_P(NormalFull, StaggeredInvertTest,
-                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION),
-                                 Values(QUDA_NORMOP_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, normal_solvers,
+                                 Values(QUDA_MATDAG_MAT_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
+                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // full system direct solve
 INSTANTIATE_TEST_SUITE_P(Full, StaggeredInvertTest,
-                         Combine(direct_solvers, Values(QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_SOLVE), sloppy_precisions,
-                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, direct_solvers, Values(QUDA_MAT_SOLUTION),
+                                 Values(QUDA_DIRECT_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz,
+                                 no_heavy_quark),
                          gettestname);
 
 // preconditioned multi-shift solves
 INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, StaggeredInvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
-                                 sloppy_precisions, Values(10), solution_accumulator_pipelines, no_schwarz,
+                         Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), Values(10), solution_accumulator_pipelines, no_schwarz,
                                  no_heavy_quark),
                          gettestname);
 
 // Heavy-Quark preconditioned solves
 INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, StaggeredInvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
-                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz,
+                         Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz,
                                  Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
                          gettestname);
 
diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp
index 6a175204f8..1341405122 100644
--- a/tests/unitarize_link_test.cpp
+++ b/tests/unitarize_link_test.cpp
@@ -3,24 +3,19 @@
 #include <string.h>
 #include <sys/time.h>
 
-#include "quda.h"
-#include "timer.h"
-#include "gauge_field.h"
+#include <quda.h>
+#include <timer.h>
+#include <gauge_field.h>
+#include <instantiate.h>
+#include "util_quda.h"
+#include <unitarization_links.h>
+
 #include "host_utils.h"
-#include <command_line_params.h>
+#include "command_line_params.h"
 #include "misc.h"
 #include "test.h"
-#include "util_quda.h"
-#include "llfat_quda.h"
-#include <unitarization_links.h>
 #include "ks_improved_force.h"
-
-#ifdef MULTI_GPU
 #include "comm_quda.h"
-#endif
-
-// google test frame work
-#include <gtest/gtest.h>
 
 #define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec))
 
@@ -33,34 +28,23 @@ static double max_allowed_error = 1e-11;
 
 static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER;
 
-quda::GaugeField *cpuFatLink, *cpuULink, *cudaResult;
-quda::GaugeField *cudaFatLink, *cudaULink;
-
 const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-10 : 1e-6;
 
-TEST(unitarization, verify)
-{
-  unitarizeLinksCPU(*cpuULink, *cpuFatLink);
-  cudaResult->copy(*cudaULink);
-
-  int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol,
-                           cpu_prec);
+using test_t = ::testing::tuple<QudaPrecision>;
 
-#ifdef MULTI_GPU
-  quda::comm_allreduce_int(res);
-  res /= quda::comm_size();
-#endif
+class UnitarizeTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  QudaPrecision precision;
 
-  ASSERT_EQ(res, 1) << "CPU and CUDA implementations do not agree";
-}
+public:
+  UnitarizeTest() : precision(::testing::get<0>(GetParam())) { }
+};
 
-static int unitarize_link_test(int &test_rc)
+void unitarize(QudaPrecision prec)
 {
-  setVerbosity(verbosity);
   QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();
 
-  qudaGaugeParam.anisotropy = 1.0;
-
   qudaGaugeParam.X[0] = xdim;
   qudaGaugeParam.X[1] = ydim;
   qudaGaugeParam.X[2] = zdim;
@@ -70,6 +54,8 @@ static int unitarize_link_test(int &test_rc)
 
   qudaGaugeParam.type = QUDA_WILSON_LINKS;
 
+  qudaGaugeParam.anisotropy = 1.0;
+
   qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
   qudaGaugeParam.anisotropy = 1.0;
   qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
@@ -125,21 +111,21 @@ static int unitarize_link_test(int &test_rc)
   gParam.create = QUDA_REFERENCE_FIELD_CREATE;
   gParam.gauge = fatlink;
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  cpuFatLink = new quda::GaugeField(gParam);
+  auto cpuFatLink = quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cpuULink = new quda::GaugeField(gParam);
+  auto cpuULink = quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cudaResult = new quda::GaugeField(gParam);
+  auto cudaResult = quda::GaugeField(gParam);
 
   gParam.pad = 0;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.setPrecision(prec, true);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  cudaFatLink = new quda::GaugeField(gParam);
-  cudaULink = new quda::GaugeField(gParam);
+  auto cudaFatLink = quda::GaugeField(gParam);
+  auto cudaULink = quda::GaugeField(gParam);
 
   { // create fat links
     double act_path_coeff[6];
@@ -152,7 +138,7 @@ static int unitarize_link_test(int &test_rc)
 
     computeKSLinkQuda(fatlink, NULL, NULL, inlink, act_path_coeff, &qudaGaugeParam);
 
-    cudaFatLink->copy(*cpuFatLink);
+    cudaFatLink.copy(cpuFatLink);
   }
 
   quda::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
@@ -165,19 +151,22 @@ static int unitarize_link_test(int &test_rc)
   struct timeval t0, t1;
 
   gettimeofday(&t0, NULL);
-  unitarizeLinks(*cudaULink, *cudaFatLink, num_failures_d);
+  unitarizeLinks(cudaULink, cudaFatLink, num_failures_d);
   gettimeofday(&t1, NULL);
 
   if (verify_results) {
-    test_rc = RUN_ALL_TESTS();
-    if (test_rc != 0) warningQuda("Tests failed");
+    unitarizeLinksCPU(cpuULink, cpuFatLink); // host reference result
+    cudaResult.copy(cudaULink);              // copy the device result into a CPU-located field for comparison
+
+    int test_rc
+      = compare_floats(cudaResult.data(), cpuULink.data(), 4 * cudaResult.Volume() * gauge_site_size, unittol, cpu_prec);
+
+    quda::comm_allreduce_int(test_rc);
+    test_rc /= quda::comm_size(); // presumably a sum reduction, so 1 means every rank agreed - TODO confirm
+    // Non-fatal check: a fatal ASSERT_* would return from unitarize() here and skip the host_free() cleanup below.
+    EXPECT_EQ(test_rc, 1) << "CPU and CUDA implementations do not agree";
   }
 
-  delete cudaResult;
-  delete cpuULink;
-  delete cpuFatLink;
-  delete cudaFatLink;
-  delete cudaULink;
   for (int dir = 0; dir < 4; ++dir) host_free(sitelink[dir]);
 
   host_free(fatlink);
@@ -191,69 +180,61 @@ static int unitarize_link_test(int &test_rc)
 #endif
 
   printfQuda("Unitarization time: %g ms\n", TDIFF(t0, t1) * 1000);
-  return num_failures;
+
+  quda::comm_allreduce_int(num_failures); // combine the per-rank failure counts before reporting
+  printfQuda("Number of failures = %d\n", num_failures);
+  if (num_failures > 0) {
+    printfQuda("Failure rate = %lf\n", num_failures / (4.0 * V * quda::comm_size()));
+    printfQuda("You may want to increase the error tolerance or vary the unitarization parameters\n");
+  } else {
+    printfQuda("Unitarization successfull!\n");
+  }
+  ASSERT_EQ(num_failures, 0); // asserted last: a fatal failure returns at once, which made the diagnostics above unreachable
 }
 
-static void display_test_info()
+TEST_P(UnitarizeTest, verify)
 {
-  printfQuda("running the following test:\n");
+  prec = ::testing::get<0>(GetParam()); // overwrites the global prec (also printed by display_info()) with this case's precision
+  if (!quda::is_enabled(prec)) GTEST_SKIP(); // presumably: this precision is not compiled into the build - TODO confirm
+  unitarize(prec);
+}
 
-  printfQuda("link_precision      link_reconstruct           space_dimension        T_dimension    algorithm           "
-             "max allowed error  deviation tolerance\n");
-  printfQuda("%8s              %s                         %d/%d/%d/                 %d            %s         %g        "
-             "     %g\n",
-             get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim,
-             get_unitarization_str(reunit_svd_only), max_allowed_error, unittol);
+auto test_str
+  = [](testing::TestParamInfo<test_t> param) { return std::string(get_prec_str(::testing::get<0>(param.param))); };
+
+INSTANTIATE_TEST_SUITE_P(, UnitarizeTest, ::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), test_str);
+
+struct unitarize_test : public quda_test {
+  void display_info() const override
+  {
+    printfQuda(
+      "link_precision      link_reconstruct           space_dimension        T_dimension    algorithm           "
+      "max allowed error  deviation tolerance\n");
+    printfQuda(
+      "%8s              %s                         %d/%d/%d/                 %d            %s         %g        "
+      "     %g\n",
+      get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_unitarization_str(reunit_svd_only),
+      max_allowed_error, unittol);
+  }
 
-#ifdef MULTI_GPU
-  printfQuda("Grid partition info:     X  Y  Z  T\n");
-  printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
-             dimPartitioned(3));
-#endif
-}
+  unitarize_test(int argc, char **argv) : quda_test("unitarize_test", argc, argv) { }
+};
 
 int main(int argc, char **argv)
 {
-  // initalize google test, includes command line options
-  ::testing::InitGoogleTest(&argc, argv);
-  int test_rc = 0;
+  unitarize_test test(argc, argv);
 
   // default to 18 reconstruct, 8^3 x 8
   link_recon = QUDA_RECONSTRUCT_NO;
   xdim = ydim = zdim = tdim = 8;
 
-  auto app = make_app();
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-
-  initComms(argc, argv, gridsize_from_cmdline);
-  initQuda(device_ordinal);
+  test.init();
 
-  // Ensure gtest prints only from rank 0
-  ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
-  if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
-
-  display_test_info();
-  int num_failures = unitarize_link_test(test_rc);
-  int num_procs = 1;
-#ifdef MULTI_GPU
-  quda::comm_allreduce_int(num_failures);
-  num_procs = quda::comm_size();
-#endif
-
-  printfQuda("Number of failures = %d\n", num_failures);
-  if (num_failures > 0) {
-    printfQuda("Failure rate = %lf\n", num_failures / (4.0 * V * num_procs));
-    printfQuda("You may want to increase the error tolerance or vary the unitarization parameters\n");
+  int test_rc = 0;
+  if (!enable_testing) {
+    unitarize(prec);
   } else {
-    printfQuda("Unitarization successfull!\n");
+    test_rc = test.execute();
   }
-
-  endQuda();
-  finalizeComms();
-
   return test_rc;
 }
diff --git a/tests/utils/force_utils.hpp b/tests/utils/force_utils.hpp
index c8e06e3919..8a42907809 100644
--- a/tests/utils/force_utils.hpp
+++ b/tests/utils/force_utils.hpp
@@ -30,13 +30,13 @@ template <typename real_t_> struct anti_hermitmat {
 
 template <typename real_t> su3_matrix<real_t> *get_su3_matrix(quda::GaugeField &p, int idx, int dir)
 {
-  auto data = static_cast<su3_matrix<real_t> *const>(p.data(dir));
+  auto data = static_cast<su3_matrix<real_t> *>(p.data(dir)); // top-level const on the cast target had no effect on the result, so it is dropped
   return data + idx;
 }
 
 template <typename real_t> const su3_matrix<real_t> *get_su3_matrix(const quda::GaugeField &p, int idx, int dir)
 {
-  auto data = static_cast<const su3_matrix<real_t> *const>(p.data(dir));
+  auto data = static_cast<const su3_matrix<real_t> *>(p.data(dir)); // pointee stays const; only the redundant top-level const is dropped
   return data + idx;
 }