From 84c7c7f8e1edf638ec74d325ba199fb7d6c6ae25 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 13 Apr 2022 16:12:55 -0400 Subject: [PATCH 01/15] Initial hacky support for Chebyshev setup: controlled by environment variables, level 2 (critical path for now) only --- lib/multigrid.cpp | 474 +++++++++++++++++++++++++++++++++------------- 1 file changed, 339 insertions(+), 135 deletions(-) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 4cb3d7dd62..36cbc2ecbc 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1412,137 +1412,193 @@ namespace quda { pushLevel(param.level); - SolverParam solverParam(param); // Set solver field parameters: - // set null-space generation options - need to expose these - solverParam.maxiter - = refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]; - solverParam.tol = param.mg_global.setup_tol[param.level]; - solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; - solverParam.delta = 1e-1; - solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; - // Hard coded for now... - if (is_ca_solver(solverParam.inv_type)) { - solverParam.ca_basis = param.mg_global.setup_ca_basis[param.level]; - solverParam.ca_lambda_min = param.mg_global.setup_ca_lambda_min[param.level]; - solverParam.ca_lambda_max = param.mg_global.setup_ca_lambda_max[param.level]; - solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; - } else if (solverParam.inv_type == QUDA_GCR_INVERTER || solverParam.inv_type == QUDA_BICGSTABL_INVERTER) { - solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; - } else { - solverParam.Nkrylov = 4; - } - solverParam.pipeline - = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB - solverParam.precision = r->Precision(); + static bool envset = false; - if (is_fine_grid()) { - solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; - solverParam.precision_precondition = param.mg_global.invert_param->cuda_prec_precondition; - } else { - solverParam.precision_precondition = solverParam.precision; - } + // use chebyshev filter or not + static bool use_cheby; - solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); - solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: - csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering - csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now - csParam.gammaBasis = B[0]->Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : - QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered - csParam.create = QUDA_ZERO_FIELD_CREATE; - ColorSpinorField b(csParam); - ColorSpinorField x(csParam); + // bottom of window for filtering stage, eigenvalues smaller than lamda_min enhanced + static double lambda_min; - csParam.create = QUDA_NULL_FIELD_CREATE; + // how many filtering iterations should we do + static int filter_iterations; - // if we not using GCR/MG smoother then we need to switch off Schwarz since regular Krylov solvers do not support it - bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER - && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; - if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); - int commDim[QUDA_MAX_DIM]; - for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; - diracSmootherSloppy->setCommDim(commDim); - } + // how many near-nulls to generate from each base vector + static int vectors_per_set; - // if quarter precision halo, promote for null-space finding to half precision - QudaPrecision halo_precision = diracSmootherSloppy->HaloPrecision(); - if (halo_precision == QUDA_QUARTER_PRECISION) diracSmootherSloppy->setHaloPrecision(QUDA_HALF_PRECISION); - - Solver *solve; - DiracMdagM *mdagm = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmoother) : nullptr; - DiracMdagM *mdagmSloppy = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmootherSloppy) : nullptr; - if (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) { - solve = Solver::create(solverParam, *mdagm, *mdagmSloppy, *mdagmSloppy, *mdagmSloppy, profile); - } else if (solverParam.inv_type == QUDA_MG_INVERTER) { - // in case MG has not been created, we create the Smoother - if (!transfer) createSmoother(); - - // run GCR with the MG as a preconditioner - solverParam.inv_type_precondition = QUDA_MG_INVERTER; - solverParam.schwarz_type = QUDA_ADDITIVE_SCHWARZ; - solverParam.precondition_cycle = 1; - solverParam.tol_precondition = 1e-1; - solverParam.maxiter_precondition = 1; - solverParam.omega = 1.0; - solverParam.verbosity_precondition = param.mg_global.verbosity[param.level+1]; - solverParam.precision_sloppy = solverParam.precision; - solverParam.compute_true_res = 0; - solverParam.preconditioner = this; - - solverParam.inv_type = QUDA_GCR_INVERTER; - solve = Solver::create(solverParam, *param.matSmooth, *param.matSmooth, *param.matSmoothSloppy, - *param.matSmoothSloppy, profile); - solverParam.inv_type = QUDA_MG_INVERTER; - } else { - solve = Solver::create(solverParam, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, - *param.matSmoothSloppy, profile); + // how many chebyshev iterations between each vector in the set + static int iterations_for_next; + + if (!envset) { + + char *cheby = getenv("QUDA_USE_CHEBY"); + if (cheby) { use_cheby = (atoi(cheby) == 1) ? true : false; } + + if (use_cheby) { + char *tmp_string; + + tmp_string = getenv("QUDA_CHEBY_LMIN"); + if (tmp_string) { + lambda_min = atof(tmp_string); + } else { + errorQuda("QUDA_CHEBY_LMIN unset"); + } + + tmp_string = getenv("QUDA_VEC_FILTER_ITER"); + if (tmp_string) { + filter_iterations = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_FILTER_ITER unset"); + } + + tmp_string = getenv("QUDA_VEC_PER_SET"); + if (tmp_string) { + vectors_per_set = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_PER_SET unset"); + } + + tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT"); + if (tmp_string) { + iterations_for_next = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_ITER_FOR_NEXT unset"); + } + } + + envset = true; } - for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); + // Just constraining to level 2 for now + if (param.level == 2 && use_cheby) { + + // Filtering approach based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. + + // use the right space; diracResidual is guaranteed to be full-parity + DiracMdagM dirac_mdm(diracResidual); + + // temporaries for Dirac applications + ColorSpinorField tmp1(*B[0]); + ColorSpinorField tmp2(*B[0]); + + // temporaries needed for Chebyshev filter + ColorSpinorField pk(*B[0]); + ColorSpinorField pkm1(*B[0]); + ColorSpinorField pkm2(*B[0]); + ColorSpinorField Apkm1(*B[0]); + + // approximate lambda_max + double lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, + 100, 10, tmp1, tmp2); + + // create filter interpolators + double m_map_filter = 2. / (lambda_max - lambda_min); + double b_map_filter = - (lambda_max + lambda_min) / (lambda_max - lambda_min); + + // create interpolators for generating more near-nulls + double m_map_generate = 2. / lambda_max; + double b_map_generate = -1.; + + // we create batches of near-nulls from `B.size() / vectors_per_set` starting + // random vectors + int num_vec = static_cast(B.size()); - // global orthonormalization of the initial null-space vectors - if(param.mg_global.pre_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j caxpy(-alpha, *B[j], *B[i]); // i-j } double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ + if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *B[i]);// i/ else errorQuda("\nCannot normalize %u vector\n", i); } } - // launch solver for each source - for (int i=0; i<(int)B.size(); i++) { - if (param.mg_global.setup_type == QUDA_TEST_VECTOR_SETUP) { // DDalphaAMG test vector idea - b = *B[i]; // inverting against the vector - zero(x); // with zero initial guess - } else { - x = *B[i]; - zero(b); + for (int s = 0; s < num_starting_vectors; s++) { + + // filter + { + // P0 + blas::copy(pk, *B[s]); + + if (filter_iterations > 0) { + // P1 = m Ap_0 + b p_0 + std::swap(pkm1, pk); // p_k -> p_{k - 1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); + blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); + + if (filter_iterations > 1) { + // Enter recursion relation + for (int k = 2; k <= filter_iterations; k++) { + std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} + std::swap(pkm1, pk); // p_k -> p_{k-1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} + blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); + + // heuristic rescale to keep norms in check... + if (k % 50 == 0) { + double tmp_nrm2 = blas::norm2(pk); + printf("Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); + double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); + ax(tmp_inv_nrm, pk); + ax(tmp_inv_nrm, pkm1); + ax(tmp_inv_nrm, pkm2); + ax(tmp_inv_nrm, Apkm1); + } + } + } + } + + // print the norm (to check for enhancement, normalize) + double nrm2 = blas::norm2(pk); + printfQuda("Enhanced norm2 for start %d is %e\n", s, nrm2); + ax(1.0 / sqrt(nrm2), pk); + + // This is the first filtered vector + blas::copy(*B[s], pk); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); + // Now we generate further vectors by another Chebyshev filter from 0 -> lambda_max + // P0 is trivially in place + + // P1 + std::swap(pkm1, pk); // p_k -> p_{k - 1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); + blas::axpbyz(m_map_generate, Apkm1, b_map_generate, pkm1, pk); + + bool first = true; - ColorSpinorField *out=nullptr, *in=nullptr; - diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); - (*solve)(*out, *in); - diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); + for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { + + for (int k = first ? 2 : 0; k < iterations_for_next; k++) { + std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} + std::swap(pkm1, pk); // p_k -> p_{k-1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} + blas::axpbypczw(2. * m_map_generate, Apkm1, 2. * b_map_generate, pkm1, -1., pkm2, pk); + } + + blas::copy(*B[i], pk); + + if (first) first = false; + + } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); - *B[i] = x; + } + + // print norms + printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); + for (int i = 0; i < num_vec; i++) { + printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); } // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j caxpy(-alpha, *B[j], *B[i]); // i-j } @@ -1552,43 +1608,191 @@ namespace quda } } - if (solverParam.inv_type == QUDA_MG_INVERTER) { + //errorQuda("Quit out..."); - if (transfer) { - resetTransfer = true; - reset(); - if ( param.level < param.Nlevel-2 ) { - if ( param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE ) { - coarse->generateNullVectors(*B_coarse, refresh); - } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + } else { + + // Traditional inverse iterations setup + + SolverParam solverParam(param); // Set solver field parameters: + // set null-space generation options - need to expose these + solverParam.maxiter + = refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]; + solverParam.tol = param.mg_global.setup_tol[param.level]; + solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; + solverParam.delta = 1e-1; + solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; + // Hard coded for now... + if (is_ca_solver(solverParam.inv_type)) { + solverParam.ca_basis = param.mg_global.setup_ca_basis[param.level]; + solverParam.ca_lambda_min = param.mg_global.setup_ca_lambda_min[param.level]; + solverParam.ca_lambda_max = param.mg_global.setup_ca_lambda_max[param.level]; + solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; + } else if (solverParam.inv_type == QUDA_GCR_INVERTER || solverParam.inv_type == QUDA_BICGSTABL_INVERTER) { + solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; + } else { + solverParam.Nkrylov = 4; + } + solverParam.pipeline + = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB + solverParam.precision = r->Precision(); + + if (is_fine_grid()) { + solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; + solverParam.precision_precondition = param.mg_global.invert_param->cuda_prec_precondition; + } else { + solverParam.precision_precondition = solverParam.precision; + } + + solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); + solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; + ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: + csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering + csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now + csParam.gammaBasis = B[0]->Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : + QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered + csParam.create = QUDA_ZERO_FIELD_CREATE; + ColorSpinorField b(csParam); + ColorSpinorField x(csParam); + + csParam.create = QUDA_NULL_FIELD_CREATE; + + // if we not using GCR/MG smoother then we need to switch off Schwarz since regular Krylov solvers do not support it + bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER + && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; + if (schwarz_reset) { + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); + int commDim[QUDA_MAX_DIM]; + for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; + diracSmootherSloppy->setCommDim(commDim); + } + + // if quarter precision halo, promote for null-space finding to half precision + QudaPrecision halo_precision = diracSmootherSloppy->HaloPrecision(); + if (halo_precision == QUDA_QUARTER_PRECISION) diracSmootherSloppy->setHaloPrecision(QUDA_HALF_PRECISION); + + Solver *solve; + DiracMdagM *mdagm = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmoother) : nullptr; + DiracMdagM *mdagmSloppy = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmootherSloppy) : nullptr; + if (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) { + solve = Solver::create(solverParam, *mdagm, *mdagmSloppy, *mdagmSloppy, *mdagmSloppy, profile); + } else if (solverParam.inv_type == QUDA_MG_INVERTER) { + // in case MG has not been created, we create the Smoother + if (!transfer) createSmoother(); + + // run GCR with the MG as a preconditioner + solverParam.inv_type_precondition = QUDA_MG_INVERTER; + solverParam.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + solverParam.precondition_cycle = 1; + solverParam.tol_precondition = 1e-1; + solverParam.maxiter_precondition = 1; + solverParam.omega = 1.0; + solverParam.verbosity_precondition = param.mg_global.verbosity[param.level+1]; + solverParam.precision_sloppy = solverParam.precision; + solverParam.compute_true_res = 0; + solverParam.preconditioner = this; + + solverParam.inv_type = QUDA_GCR_INVERTER; + solve = Solver::create(solverParam, *param.matSmooth, *param.matSmooth, *param.matSmoothSloppy, + *param.matSmoothSloppy, profile); + solverParam.inv_type = QUDA_MG_INVERTER; + } else { + solve = Solver::create(solverParam, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, + *param.matSmoothSloppy, profile); + } + + for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { + if (getVerbosity() >= QUDA_VERBOSE) + printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); + + // global orthonormalization of the initial null-space vectors + if(param.mg_global.pre_orthonormalize) { + for(int i=0; i<(int)B.size(); i++) { + for (int j=0; j + caxpy(-alpha, *B[j], *B[i]); // i-j + } + double nrm2 = norm2(*B[i]); + if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector\n", i); + } + } + + // launch solver for each source + for (int i=0; i<(int)B.size(); i++) { + if (param.mg_global.setup_type == QUDA_TEST_VECTOR_SETUP) { // DDalphaAMG test vector idea + b = *B[i]; // inverting against the vector + zero(x); // with zero initial guess + } else { + x = *B[i]; + zero(b); + } + + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); + + ColorSpinorField *out=nullptr, *in=nullptr; + diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); + (*solve)(*out, *in); + diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); + + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); + *B[i] = x; + } + + // global orthonormalization of the generated null-space vectors + if (param.mg_global.post_orthonormalize) { + for(int i=0; i<(int)B.size(); i++) { + for (int j=0; j + caxpy(-alpha, *B[j], *B[i]); // i-j + } + double nrm2 = norm2(*B[i]); + if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); + } + } + + if (solverParam.inv_type == QUDA_MG_INVERTER) { + + if (transfer) { + resetTransfer = true; + reset(); + if ( param.level < param.Nlevel-2 ) { + if ( param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE ) { + coarse->generateNullVectors(*B_coarse, refresh); + } else { + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); + for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + } + // rebuild the transfer operator in the coarse level + coarse->resetTransfer = true; + coarse->reset(); } - // rebuild the transfer operator in the coarse level - coarse->resetTransfer = true; - coarse->reset(); } + } else { + reset(); } - } else { - reset(); } } - } - delete solve; - if (mdagm) delete mdagm; - if (mdagmSloppy) delete mdagmSloppy; + delete solve; + if (mdagm) delete mdagm; + if (mdagmSloppy) delete mdagmSloppy; - diracSmootherSloppy->setHaloPrecision(halo_precision); // restore halo precision + diracSmootherSloppy->setHaloPrecision(halo_precision); // restore halo precision + + // reenable Schwarz + if (schwarz_reset) { + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); + int commDim[QUDA_MAX_DIM]; + for (int i=0; isetCommDim(commDim); + } - // reenable Schwarz - if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); - int commDim[QUDA_MAX_DIM]; - for (int i=0; isetCommDim(commDim); } if (param.mg_global.vec_store[param.level] == QUDA_BOOLEAN_TRUE) { // conditional store of null vectors From da231089b4e7dd5bcb520a846def2ba5aa179f01 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 13 Apr 2022 21:01:02 -0400 Subject: [PATCH 02/15] Added solver.hpp include --- lib/multigrid.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 36cbc2ecbc..6c67f20f42 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -4,6 +4,7 @@ #include #include #include +#include // for building the KD inverse op #include From 4dce64ec03923511873ba34f75e0b5cdcac6071f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 14 Apr 2022 17:32:06 -0400 Subject: [PATCH 03/15] Separate flags for level 1 and 2 --- lib/multigrid.cpp | 112 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 6c67f20f42..aa7c3c7e4e 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1416,54 +1416,108 @@ namespace quda static bool envset = false; // use chebyshev filter or not - static bool use_cheby; + static bool use_cheby[QUDA_MAX_MG_LEVEL]; // bottom of window for filtering stage, eigenvalues smaller than lamda_min enhanced - static double lambda_min; + static double lambda_min[QUDA_MAX_MG_LEVEL]; // how many filtering iterations should we do - static int filter_iterations; + static int filter_iterations[QUDA_MAX_MG_LEVEL]; + + // how frequently we rescale while we filter + static int filter_rescale_freq[QUDA_MAX_MG_LEVEL]; // how many near-nulls to generate from each base vector - static int vectors_per_set; + static int vectors_per_set[QUDA_MAX_MG_LEVEL]; // how many chebyshev iterations between each vector in the set - static int iterations_for_next; + static int iterations_for_next[QUDA_MAX_MG_LEVEL]; if (!envset) { - char *cheby = getenv("QUDA_USE_CHEBY"); - if (cheby) { use_cheby = (atoi(cheby) == 1) ? true : false; } + // level 1 + char *cheby = getenv("QUDA_USE_CHEBY_L1"); + if (cheby) { use_cheby[1] = (atoi(cheby) == 1) ? true : false; } - if (use_cheby) { + if (use_cheby[1]) { char *tmp_string; - tmp_string = getenv("QUDA_CHEBY_LMIN"); + tmp_string = getenv("QUDA_CHEBY_LMIN_L1"); + if (tmp_string) { + lambda_min[1] = atof(tmp_string); + } else { + errorQuda("QUDA_CHEBY_LMIN_L1 unset"); + } + + tmp_string = getenv("QUDA_VEC_FILTER_ITER_L1"); + if (tmp_string) { + filter_iterations[1] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_FILTER_ITER_L1 unset"); + } + + tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L1"); + if (tmp_string) { + filter_rescale_freq[1] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L1 unset"); + } + + tmp_string = getenv("QUDA_VEC_PER_SET_L1"); + if (tmp_string) { + vectors_per_set[1] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_PER_SET_L1 unset_L1"); + } + + tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L1"); + if (tmp_string) { + iterations_for_next[1] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_ITER_FOR_NEXT_L1 unset"); + } + } + + // level 2 + cheby = getenv("QUDA_USE_CHEBY_L1"); + if (cheby) { use_cheby[2] = (atoi(cheby) == 1) ? true : false; } + + if (use_cheby[2]) { + char *tmp_string; + + tmp_string = getenv("QUDA_CHEBY_LMIN_L2"); + if (tmp_string) { + lambda_min[2] = atof(tmp_string); + } else { + errorQuda("QUDA_CHEBY_LMIN_L2 unset"); + } + + tmp_string = getenv("QUDA_VEC_FILTER_ITER_L2"); if (tmp_string) { - lambda_min = atof(tmp_string); + filter_iterations[2] = atoi(tmp_string); } else { - errorQuda("QUDA_CHEBY_LMIN unset"); + errorQuda("QUDA_VEC_FILTER_ITER_L2 unset"); } - tmp_string = getenv("QUDA_VEC_FILTER_ITER"); + tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L2"); if (tmp_string) { - filter_iterations = atoi(tmp_string); + filter_rescale_freq[2] = atoi(tmp_string); } else { - errorQuda("QUDA_VEC_FILTER_ITER unset"); + errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L2 unset"); } - tmp_string = getenv("QUDA_VEC_PER_SET"); + tmp_string = getenv("QUDA_VEC_PER_SET_L2"); if (tmp_string) { - vectors_per_set = atoi(tmp_string); + vectors_per_set[2] = atoi(tmp_string); } else { - errorQuda("QUDA_VEC_PER_SET unset"); + errorQuda("QUDA_VEC_PER_SET_L2 unset_L1"); } - tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT"); + tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L2"); if (tmp_string) { - iterations_for_next = atoi(tmp_string); + iterations_for_next[2] = atoi(tmp_string); } else { - errorQuda("QUDA_VEC_ITER_FOR_NEXT unset"); + errorQuda("QUDA_VEC_ITER_FOR_NEXT_L2 unset"); } } @@ -1471,7 +1525,7 @@ namespace quda } // Just constraining to level 2 for now - if (param.level == 2 && use_cheby) { + if ((param.level == 1 || param.level == 2) && use_cheby) { // Filtering approach based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. @@ -1493,8 +1547,8 @@ namespace quda 100, 10, tmp1, tmp2); // create filter interpolators - double m_map_filter = 2. / (lambda_max - lambda_min); - double b_map_filter = - (lambda_max + lambda_min) / (lambda_max - lambda_min); + double m_map_filter = 2. / (lambda_max - lambda_min[param.level]); + double b_map_filter = - (lambda_max + lambda_min[param.level]) / (lambda_max - lambda_min[param.level]); // create interpolators for generating more near-nulls double m_map_generate = 2. / lambda_max; @@ -1504,7 +1558,7 @@ namespace quda // random vectors int num_vec = static_cast(B.size()); - int num_starting_vectors = (num_vec + vectors_per_set - 1) / vectors_per_set; + int num_starting_vectors = (num_vec + vectors_per_set[param.level] - 1) / vectors_per_set[param.level]; // orthonormalize if (param.mg_global.pre_orthonormalize) { @@ -1526,22 +1580,22 @@ namespace quda // P0 blas::copy(pk, *B[s]); - if (filter_iterations > 0) { + if (filter_iterations[param.level] > 0) { // P1 = m Ap_0 + b p_0 std::swap(pkm1, pk); // p_k -> p_{k - 1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); - if (filter_iterations > 1) { + if (filter_iterations[param.level] > 1) { // Enter recursion relation - for (int k = 2; k <= filter_iterations; k++) { + for (int k = 2; k <= filter_iterations[param.level]; k++) { std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} std::swap(pkm1, pk); // p_k -> p_{k-1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); // heuristic rescale to keep norms in check... - if (k % 50 == 0) { + if (k % filter_rescale_freq[param.level] == 0) { double tmp_nrm2 = blas::norm2(pk); printf("Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); @@ -1575,7 +1629,7 @@ namespace quda for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { - for (int k = first ? 2 : 0; k < iterations_for_next; k++) { + for (int k = first ? 2 : 0; k < iterations_for_next[param.level]; k++) { std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} std::swap(pkm1, pk); // p_k -> p_{k-1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} From 7f659a86cdc8d7e37a24d542414d10d1e736768f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 27 Apr 2022 18:29:20 -0400 Subject: [PATCH 04/15] Big commit refactoring near-null setup, exposing multiple types on a per-level basis; chebyshev filter parameters are still only setable via the command line --- include/dirac_quda.h | 10 + include/enum_quda.h | 20 +- include/enum_quda_fortran.h | 17 +- include/invert_quda.h | 8 +- include/multigrid.h | 32 +- include/quda.h | 10 +- lib/check_params.h | 26 +- lib/inv_bicgstab_quda.cpp | 6 +- lib/inv_bicgstabl_quda.cpp | 4 +- lib/inv_ca_cg.cpp | 2 +- lib/inv_ca_gcr.cpp | 2 +- lib/inv_cg3_quda.cpp | 2 +- lib/inv_cg_quda.cpp | 2 +- lib/inv_gcr_quda.cpp | 2 +- lib/inv_pcg_quda.cpp | 2 +- lib/milc_interface.cpp | 9 +- lib/multigrid.cpp | 799 +++++++++++++++------------- lib/transfer.cpp | 2 +- tests/invert_test.cpp | 2 +- tests/staggered_invert_test.cpp | 2 +- tests/utils/command_line_params.cpp | 25 +- tests/utils/command_line_params.h | 4 +- tests/utils/set_params.cpp | 16 +- 23 files changed, 524 insertions(+), 480 deletions(-) diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 6884072332..455c4e8f48 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -1296,12 +1296,22 @@ namespace quda { virtual QudaDiracType getDiracType() const { return QUDA_STAGGERED_DIRAC; } +<<<<<<< HEAD /** @brief Return the one-hop field for staggered operators for MG setup @return Gauge field */ virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; } +======= + /** @brief Return the one-hop field for staggered operators for MG setup + + @return Gauge field + */ + virtual cudaGaugeField* getStaggeredShortLinkField() const { + return gauge; + } +>>>>>>> 19643384c (Big commit refactoring near-null setup, exposing multiple types on a per-level basis; chebyshev filter parameters are still only setable via the command line) /** * @brief Create the coarse staggered operator. diff --git a/include/enum_quda.h b/include/enum_quda.h index b996f02e61..69c572ffc9 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -438,17 +438,15 @@ typedef enum QudaDeflatedGuess_s { QUDA_DEFLATED_GUESS_INVALID = QUDA_INVALID_ENUM } QudaDeflatedGuess; -typedef enum QudaComputeNullVector_s { - QUDA_COMPUTE_NULL_VECTOR_NO, - QUDA_COMPUTE_NULL_VECTOR_YES, - QUDA_COMPUTE_NULL_VECTOR_INVALID = QUDA_INVALID_ENUM -} QudaComputeNullVector; - -typedef enum QudaSetupType_s { - QUDA_NULL_VECTOR_SETUP, - QUDA_TEST_VECTOR_SETUP, - QUDA_INVALID_SETUP_TYPE = QUDA_INVALID_ENUM -} QudaSetupType; +typedef enum QudaNullVectorSetupType_s { + QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS, + QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER, + QUDA_SETUP_NULL_VECTOR_EIGENVECTORS, + QUDA_SETUP_NULL_VECTOR_TEST_VECTORS, + QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE, + QUDA_SETUP_NULL_VECTOR_FREE_FIELD, + QUDA_SETUP_NULL_VECTOR_INVALID = QUDA_INVALID_ENUM +} QudaNullVectorSetupType; typedef enum QudaTransferType_s { QUDA_TRANSFER_AGGREGATE, diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index a77b849a81..f793c9726a 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -399,15 +399,14 @@ #define QUDA_USE_INIT_GUESS_YES 1 #define QUDA_USE_INIT_GUESS_INVALID QUDA_INVALID_ENUM -#define QudaComputeNullVector integer(4) -#define QUDA_COMPUTE_NULL_VECTOR_NO 0 -#define QUDA_COMPUTE_NULL_VECTOR_YES 1 -#define QUDA_COMPUTE_NULL_VECTOR_INVALID QUDA_INVALID_ENUM - -#define QudaSetupType integer(4) -#define QUDA_NULL_VECTOR_SETUP 0 -#define QUDA_TEST_VECTOR_SETUP 1 -#define QUDA_INVALID_SETUP_TYPE QUDA_INVALID_ENUM +#define QudaNullVectorSetupType integer(4) +#define QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS 0 +#define QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER 1 +#define QUDA_SETUP_NULL_VECTOR_EIGENVECTOES 2 +#define QUDA_SETUP_NULL_VECTOR_TEST_VECTORS 3 +#define QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE 4 +#define QUDA_SETUP_NULL_VECTOR_FREE_FIELD 5 +#define QUDA_SETUP_NULL_VECTOR_INVALID QUDA_INVALID_ENUM #define QudaTransferType integer(4) #define QUDA_TRANSFER_AGGREGATE 0 diff --git a/include/invert_quda.h b/include/invert_quda.h index b17b41ac58..e4b9925c72 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -60,8 +60,8 @@ namespace quda { /**< Whether to use an initial guess in the solver or not */ QudaUseInitGuess use_init_guess; - /**< Whether to solve linear system with zero RHS */ - QudaComputeNullVector compute_null_vector; + /**< Whether or not to allow a zero RHS solve for near-null vector generation */ + bool compute_null_vector; /**< Reliable update tolerance */ double delta; @@ -269,7 +269,7 @@ namespace quda { Default constructor */ SolverParam() : - compute_null_vector(QUDA_COMPUTE_NULL_VECTOR_NO), + compute_null_vector(false), compute_true_res(true), sloppy_converge(false), verbosity_precondition(QUDA_SILENT), @@ -291,7 +291,7 @@ namespace quda { residual_type(param.residual_type), deflate(param.eig_param != 0), use_init_guess(param.use_init_guess), - compute_null_vector(QUDA_COMPUTE_NULL_VECTOR_NO), + compute_null_vector(false), delta(param.reliable_delta), use_alternative_reliable(param.use_alternative_reliable), use_sloppy_partial_accumulator(param.use_sloppy_partial_accumulator), diff --git a/include/multigrid.h b/include/multigrid.h index cf8bfe7452..cd90b40f3e 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -231,7 +231,7 @@ namespace quda { SolverParam *param_coarse_solver; /** The coarse-grid representation of the null space vectors */ - std::vector *B_coarse; + std::vector B_coarse; /** Residual vector */ ColorSpinorField *r; @@ -394,9 +394,8 @@ namespace quda { /** @brief Load the null space vectors in from file - @param B Loaded null-space vectors (pre-allocated) */ - void loadVectors(std::vector &B); + void loadVectors(); /** @brief Save the null space vectors in from file @@ -404,23 +403,40 @@ namespace quda { */ void saveVectors(const std::vector &B) const; + + /** + @brief Wrapping function that manages logic for constructing near-null vectors + */ + void constructNearNulls(bool refresh = false); + + /** + @brief Generate the null-space vectors via inverse iterations + @param B Generated null-space vectors + @param refresh Whether we refreshing pre-exising vectors or starting afresh + */ + void generateInverseIterations(std::vector &B, bool refresh = false); + /** - @brief Generate the null-space vectors + @brief Generate the null-space vectors via a Chebyshev filter @param B Generated null-space vectors @param refresh Whether we refreshing pre-exising vectors or starting afresh */ - void generateNullVectors(std::vector &B, bool refresh=false); + void generateChebyshevFilter(std::vector &B, bool refresh = false); /** @brief Generate lowest eigenvectors */ - void generateEigenVectors(); + void generateEigenvectors(); + + /** + @brief Generate near-null vectors via restricting finer near-nulls + */ + void generateRestrictedVectors(); /** @brief Build free-field null-space vectors - @param B Free-field null-space vectors */ - void buildFreeVectors(std::vector &B); + void generateFreeVectors(); /** @brief Return the total flops done on this and all coarser levels. diff --git a/include/quda.h b/include/quda.h index 8995013576..9867684b72 100644 --- a/include/quda.h +++ b/include/quda.h @@ -651,7 +651,7 @@ extern "C" { double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; /** Null-space type to use in the setup phase */ - QudaSetupType setup_type; + QudaNullVectorSetupType setup_type[QUDA_MAX_MG_LEVEL]; /** Pre orthonormalize vectors in the setup phase */ QudaBoolean pre_orthonormalize; @@ -714,7 +714,7 @@ extern "C" { int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; /** The type of residual to send to the next coarse grid, and thus the - type of solution to receive back from this coarse grid */ + type of solution to receive back from this coarse grid */ QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; /** The type of smoother solve to do on each grid (e/o preconditioning or not)*/ @@ -740,12 +740,6 @@ extern "C" { memory */ QudaBoolean setup_minimize_memory; - /** Whether to compute the null vectors or reload them */ - QudaComputeNullVector compute_null_vector; - - /** Whether to generate on all levels or just on level 0 */ - QudaBoolean generate_all_levels; - /** Whether to run the verification checks once set up is complete */ QudaBoolean run_verify; diff --git a/lib/check_params.h b/lib/check_params.h index 358ce71730..ae46c0e920 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -760,12 +760,6 @@ void printQudaMultigridParam(QudaMultigridParam *param) { int n_level = param->n_level; #endif -#ifdef INIT_PARAM - P(setup_type, QUDA_NULL_VECTOR_SETUP); -#else - P(setup_type, QUDA_INVALID_SETUP_TYPE); -#endif - #ifdef INIT_PARAM P(pre_orthonormalize, QUDA_BOOLEAN_FALSE); #else @@ -779,6 +773,12 @@ void printQudaMultigridParam(QudaMultigridParam *param) { #endif for (int i=0; igenerate_all_levels == QUDA_BOOLEAN_FALSE && param->compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) { - for (int i=1; in_vec[0] != param->n_vec[i]) - errorQuda("n_vec %d != %d must be equal on all levels if generate_all_levels == false", - param->n_vec[0], param->n_vec[i]); - } -#endif - P(run_verify, QUDA_BOOLEAN_INVALID); #ifdef INIT_PARAM diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 7374241575..ab8874c2f4 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -92,7 +92,7 @@ namespace quda { // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (!param.compute_null_vector) { warningQuda("inverting on zero-field source"); x = b; param.true_res = 0.0; @@ -110,7 +110,7 @@ namespace quda { if (param.precision_sloppy == x.Precision()) { r_sloppy = &r; - if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) + if(!param.compute_null_vector) { r_0 = &b; } @@ -333,7 +333,7 @@ namespace quda { delete r_0; delete r_sloppy; } - else if(param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) + else if (param.compute_null_vector) { delete r_0; } diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp index a134efdf75..c664bf8082 100644 --- a/lib/inv_bicgstabl_quda.cpp +++ b/lib/inv_bicgstabl_quda.cpp @@ -549,7 +549,7 @@ namespace quda { // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (!param.compute_null_vector) { warningQuda("inverting on zero-field source"); x = b; param.true_res = 0.0; @@ -567,7 +567,7 @@ namespace quda { // There probably be bugs and headaches hiding here. if (param.precision_sloppy == x.Precision()) { r[0] = &r_full; // r[0] \equiv r_sloppy points to the same memory location as r. - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) + if (!param.compute_null_vector) { r0p = &b; // r0, b point to the same vector in memory. } diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp index b0506aeddc..9f96b093f6 100644 --- a/lib/inv_ca_cg.cpp +++ b/lib/inv_ca_cg.cpp @@ -498,7 +498,7 @@ namespace quda // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (!param.compute_null_vector) { warningQuda("inverting on zero-field source\n"); x = b; param.true_res = 0.0; diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index 8c947dd2b2..3779ad1c4d 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -240,7 +240,7 @@ namespace quda // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (!param.compute_null_vector) { warningQuda("inverting on zero-field source\n"); x = b; param.true_res = 0.0; diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp index 567a9c639e..ec79421563 100644 --- a/lib/inv_cg3_quda.cpp +++ b/lib/inv_cg3_quda.cpp @@ -204,7 +204,7 @@ namespace quda { // Check to see that we're not trying to invert on a zero-field source double b2 = blas::norm2(b); if(b2 == 0 && - (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO || param.use_init_guess == QUDA_USE_INIT_GUESS_NO)){ + (!param.compute_null_vector || param.use_init_guess == QUDA_USE_INIT_GUESS_NO)){ profile.TPSTOP(QUDA_PROFILE_INIT); printfQuda("Warning: inverting on zero-field source\n"); x = b; diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp index 8883e35b1b..417f13f8f4 100644 --- a/lib/inv_cg_quda.cpp +++ b/lib/inv_cg_quda.cpp @@ -252,7 +252,7 @@ namespace quda { double b2 = blas::norm2(b); // Check to see that we're not trying to invert on a zero-field source - if (b2 == 0 && param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (b2 == 0 && !param.compute_null_vector) { if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_INIT); printfQuda("Warning: inverting on zero-field source\n"); x = b; diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp index 22546d17ee..72baaaf277 100644 --- a/lib/inv_gcr_quda.cpp +++ b/lib/inv_gcr_quda.cpp @@ -327,7 +327,7 @@ namespace quda { // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { - if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (!param.compute_null_vector) { profile.TPSTOP(QUDA_PROFILE_INIT); warningQuda("inverting on zero-field source\n"); x = b; diff --git a/lib/inv_pcg_quda.cpp b/lib/inv_pcg_quda.cpp index d2aa935628..a507cacd70 100644 --- a/lib/inv_pcg_quda.cpp +++ b/lib/inv_pcg_quda.cpp @@ -108,7 +108,7 @@ namespace quda double b2 = blas::norm2(b); // Check to see that we're not trying to invert on a zero-field source - if (b2 == 0 && param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO) { + if (b2 == 0 && !param.compute_null_vector) { profile.TPSTOP(QUDA_PROFILE_INIT); printfQuda("Warning: inverting on zero-field source\n"); x = b; diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index 5d5b817f65..bf4f880f0f 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -1985,6 +1985,9 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis mg_param.setup_ca_basis[i] = QUDA_POWER_BASIS; // setup_ca_basis[i]; } + // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) + mg_param.setup_type[i] = QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS; // setup_type[i]; + // Basis size for CA solver setup mg_param.setup_ca_basis_size[i] = input_struct.setup_ca_basis_size[i]; @@ -2167,15 +2170,9 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis || input_struct.optimized_kd == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) mg_param.spin_block_size[1] = 0; - mg_param.setup_type = QUDA_NULL_VECTOR_SETUP; // setup_type; mg_param.pre_orthonormalize = QUDA_BOOLEAN_FALSE; // pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.post_orthonormalize = QUDA_BOOLEAN_TRUE; // post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.compute_null_vector - = QUDA_COMPUTE_NULL_VECTOR_YES; // generate_nullspace ? QUDA_COMPUTE_NULL_VECTOR_YES : QUDA_COMPUTE_NULL_VECTOR_NO; - - mg_param.generate_all_levels = QUDA_BOOLEAN_TRUE; // generate_all_levels ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.run_verify = input_struct.verify_results ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_low_mode_check = QUDA_BOOLEAN_FALSE; // low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_oblique_proj_check = QUDA_BOOLEAN_FALSE; // oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index aa7c3c7e4e..782c0efd17 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -31,6 +31,7 @@ namespace quda param_presmooth(nullptr), param_postsmooth(nullptr), param_coarse_solver(nullptr), + B_coarse(), r(nullptr), b_tilde(nullptr), r_coarse(nullptr), @@ -85,37 +86,13 @@ namespace quda rng = new RNG(*param.B[0], 1234); - if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) { - if (param.level < param.Nlevel - 1) { - if (param.mg_global.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) { - if (param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE || param.level == 0) { - - // Initializing to random vectors - for (int i = 0; i < (int)param.B.size(); i++) { - spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - *param.B[i] = *r; - } - } - if (param.mg_global.num_setup_iter[param.level] > 0) { - if (param.mg_global.vec_load[param.level] == QUDA_BOOLEAN_TRUE - && strcmp(param.mg_global.vec_infile[param.level], "") - != 0) { // only load if infile is defined and not computing - loadVectors(param.B); - } else if (param.mg_global.use_eig_solver[param.level]) { - generateEigenVectors(); // Run the eigensolver - } else { - generateNullVectors(param.B); - } - } - } else if (strcmp(param.mg_global.vec_infile[param.level], "") - != 0) { // only load if infile is defined and not computing - if (param.mg_global.num_setup_iter[param.level] > 0) generateNullVectors(param.B); - } else if (param.mg_global.vec_load[param.level] == QUDA_BOOLEAN_TRUE) { // only conditional load of null vectors - loadVectors(param.B); - } else { // generate free field vectors - buildFreeVectors(param.B); - } - } + if (param.transfer_type == QUDA_TRANSFER_AGGREGATE && param.level < param.Nlevel - 1) { + + constructNearNulls(); + + // only save if outfile is defined + if (strcmp(param.mg_global.vec_outfile[param.level], "") != 0) + saveVectors(param.B); } // in case of iterative setup with MG the coarse level may be already built @@ -141,8 +118,9 @@ namespace quda // if we aren't doing a staggered KD solve if (param.level != 0 || param.transfer_type == QUDA_TRANSFER_AGGREGATE) { // Refresh the null-space vectors if we need to + // FIXME: can we meaningfully refresh Chebyshev filter null vecs? if (refresh && param.level < param.Nlevel - 1) { - if (param.mg_global.setup_maxiter_refresh[param.level]) generateNullVectors(param.B, refresh); + if (param.mg_global.setup_maxiter_refresh[param.level]) constructNearNulls(refresh); } } @@ -184,23 +162,14 @@ namespace quda x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), param.mg_global.location[param.level + 1]); - B_coarse = new std::vector(); - int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - B_coarse->resize(nVec_coarse); + int n_vec_coarse = param.mg_global.n_vec[param.level + 1]; + B_coarse.resize(n_vec_coarse); // only have single precision B vectors on the coarse grid QudaPrecision B_coarse_precision = std::max(param.mg_global.precision_null[param.level+1], QUDA_SINGLE_PRECISION); - for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, B_coarse_precision, param.mg_global.setup_location[param.level+1]); - - // if we're not generating on all levels then we need to propagate the vectors down - if ((param.level != 0 || param.Nlevel - 1) && param.mg_global.generate_all_levels == QUDA_BOOLEAN_FALSE) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); - } - } + for (int i = 0; i < n_vec_coarse; i++) + B_coarse[i] = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, B_coarse_precision, param.mg_global.setup_location[param.level+1]); + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer operator done\n"); } @@ -229,7 +198,7 @@ namespace quda coarse->reset(refresh); } else { // create the next multigrid level - param_coarse = new MGParam(param, *B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, + param_coarse = new MGParam(param, B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, param.level + 1); param_coarse->fine = this; param_coarse->delta = 1e-20; @@ -781,16 +750,13 @@ namespace quda if (param.level < param.Nlevel - 1) { if (coarse) delete coarse; if (param.level == param.Nlevel-1 || param.cycle_type == QUDA_MG_CYCLE_RECURSIVE) { - if (coarse_solver) delete coarse_solver; - if (param_coarse_solver) delete param_coarse_solver; + if (coarse_solver) delete coarse_solver; + if (param_coarse_solver) delete param_coarse_solver; } - if (B_coarse) { - int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - for (int i = 0; i < nVec_coarse; i++) - if ((*B_coarse)[i]) delete (*B_coarse)[i]; - delete B_coarse; - } + int n_vec_coarse = param.mg_global.n_vec[param.level + 1]; + for (int i = 0; i < n_vec_coarse; i++) + if (B_coarse[i]) delete B_coarse[i]; if (transfer) delete transfer; if (matCoarseSmootherSloppy) delete matCoarseSmootherSloppy; if (diracCoarseSmootherSloppy) delete diracCoarseSmootherSloppy; @@ -1172,7 +1138,7 @@ namespace quda // Reuse the space for the Null vectors. By this point, // the coarse grid has already been constructed. - generateEigenVectors(); + generateEigenvectors(); for (int i = 0; i < param.Nvec; i++) { @@ -1355,7 +1321,7 @@ namespace quda } // supports separate reading or single file read - void MG::loadVectors(std::vector &B) + void MG::loadVectors() { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot load near-null vectors for top level of staggered MG solve."); @@ -1370,7 +1336,7 @@ namespace quda vec_infile += "_nvec_"; vec_infile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_infile); - io.load(B); + io.load(param.B); popLevel(); profile_global.TPSTOP(QUDA_PROFILE_IO); if (is_running) profile_global.TPSTART(QUDA_PROFILE_INIT); @@ -1409,14 +1375,240 @@ namespace quda if (param.level < param.Nlevel - 2) coarse->dumpNullVectors(); } - void MG::generateNullVectors(std::vector &B, bool refresh) + void MG::constructNearNulls(bool refresh) { + pushLevel(param.level); + + auto setup_type = param.mg_global.setup_type[param.level]; + + // if a near-null vector input file is specified, always load it and + // ignore other setup information + if (strcmp(param.mg_global.vec_infile[param.level], "") != 0) { + loadVectors(); + } else { + + if (setup_type == QUDA_SETUP_NULL_VECTOR_FREE_FIELD) { + // Generate free field near-null vectors. Consistency checks on the number of + // generated vectors are done within this function. + generateFreeVectors(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE) { + // Restrict near-null vectors from the finer level + generateRestrictedVectors(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { + + // Run the eigensolver + generateEigenvectors(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || + setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER || + setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { + + // multiple setup iterations only matters for test vector setup + if (param.mg_global.num_setup_iter[param.level] > 1 && + setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) + errorQuda("Multiple setup iterations can only be used with test vector setup"); + + // Initializing to random vectors + // FIXME: only need to initialize a subset for the chebyshev filter + for (int i = 0; i < (int)param.B.size(); i++) { + spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); + *param.B[i] = *r; + } + + if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || + setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { + generateInverseIterations(param.B, refresh); + } else { + // Chebyshev filter + generateChebyshevFilter(param.B, refresh); + } + } else { + errorQuda("Invalid setup type %d", setup_type); + } + + } + + popLevel(); + } + + void MG::generateInverseIterations(std::vector &B, bool refresh) { pushLevel(param.level); - static bool envset = false; + SolverParam solverParam(param); // Set solver field parameters: + // set null-space generation options - need to expose these + solverParam.maxiter + = refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]; + solverParam.tol = param.mg_global.setup_tol[param.level]; + solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; + solverParam.delta = 1e-1; + solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; + // Hard coded for now... + if (is_ca_solver(solverParam.inv_type)) { + solverParam.ca_basis = param.mg_global.setup_ca_basis[param.level]; + solverParam.ca_lambda_min = param.mg_global.setup_ca_lambda_min[param.level]; + solverParam.ca_lambda_max = param.mg_global.setup_ca_lambda_max[param.level]; + solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; + } else if (solverParam.inv_type == QUDA_GCR_INVERTER || solverParam.inv_type == QUDA_BICGSTABL_INVERTER) { + solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; + } else { + solverParam.Nkrylov = 4; + } + solverParam.pipeline + = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB + solverParam.precision = r->Precision(); - // use chebyshev filter or not - static bool use_cheby[QUDA_MAX_MG_LEVEL]; + if (is_fine_grid()) { + solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; + solverParam.precision_precondition = param.mg_global.invert_param->cuda_prec_precondition; + } else { + solverParam.precision_precondition = solverParam.precision; + } + + solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); + solverParam.compute_null_vector = true; + ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: + csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering + csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now + csParam.gammaBasis = B[0]->Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : + QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered + csParam.create = QUDA_ZERO_FIELD_CREATE; + ColorSpinorField b(csParam); + ColorSpinorField x(csParam); + + csParam.create = QUDA_NULL_FIELD_CREATE; + + // if we not using GCR/MG smoother then we need to switch off Schwarz since regular Krylov solvers do not support it + bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER + && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; + if (schwarz_reset) { + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); + int commDim[QUDA_MAX_DIM]; + for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; + diracSmootherSloppy->setCommDim(commDim); + } + + // if quarter precision halo, promote for null-space finding to half precision + QudaPrecision halo_precision = diracSmootherSloppy->HaloPrecision(); + if (halo_precision == QUDA_QUARTER_PRECISION) diracSmootherSloppy->setHaloPrecision(QUDA_HALF_PRECISION); + + Solver *solve; + DiracMdagM *mdagm = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmoother) : nullptr; + DiracMdagM *mdagmSloppy = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmootherSloppy) : nullptr; + if (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) { + solve = Solver::create(solverParam, *mdagm, *mdagmSloppy, *mdagmSloppy, *mdagmSloppy, profile); + } else if (solverParam.inv_type == QUDA_MG_INVERTER) { + // in case MG has not been created, we create the Smoother + if (!transfer) createSmoother(); + + // run GCR with the MG as a preconditioner + solverParam.inv_type_precondition = QUDA_MG_INVERTER; + solverParam.schwarz_type = QUDA_ADDITIVE_SCHWARZ; + solverParam.precondition_cycle = 1; + solverParam.tol_precondition = 1e-1; + solverParam.maxiter_precondition = 1; + solverParam.omega = 1.0; + solverParam.verbosity_precondition = param.mg_global.verbosity[param.level+1]; + solverParam.precision_sloppy = solverParam.precision; + solverParam.compute_true_res = 0; + solverParam.preconditioner = this; + + solverParam.inv_type = QUDA_GCR_INVERTER; + solve = Solver::create(solverParam, *param.matSmooth, *param.matSmooth, *param.matSmoothSloppy, + *param.matSmoothSloppy, profile); + solverParam.inv_type = QUDA_MG_INVERTER; + } else { + solve = Solver::create(solverParam, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, + *param.matSmoothSloppy, profile); + } + + for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { + if (getVerbosity() >= QUDA_VERBOSE) + printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); + + // global orthonormalization of the initial null-space vectors + if(param.mg_global.pre_orthonormalize) { + for(int i=0; i<(int)B.size(); i++) { + for (int j=0; j + caxpy(-alpha, *B[j], *B[i]); // i-j + } + double nrm2 = norm2(*B[i]); + if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector\n", i); + } + } + + // launch solver for each source + for (int i=0; i<(int)B.size(); i++) { + if (param.mg_global.setup_type[param.level] == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { // DDalphaAMG test vector idea + b = *B[i]; // inverting against the vector + zero(x); // with zero initial guess + } else { + x = *B[i]; + zero(b); + } + + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); + + ColorSpinorField *out=nullptr, *in=nullptr; + diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); + (*solve)(*out, *in); + diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); + + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); + *B[i] = x; + } + + // global orthonormalization of the generated null-space vectors + if (param.mg_global.post_orthonormalize) { + for(int i=0; i<(int)B.size(); i++) { + for (int j=0; j + caxpy(-alpha, *B[j], *B[i]); // i-j + } + double nrm2 = norm2(*B[i]); + if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); + } + } + + if (solverParam.inv_type == QUDA_MG_INVERTER) { + + if (transfer) { + resetTransfer = true; + reset(); + if ( param.level < param.Nlevel-2 ) { + coarse->constructNearNulls(); + } + } else { + reset(); + } + } + } + + delete solve; + if (mdagm) delete mdagm; + if (mdagmSloppy) delete mdagmSloppy; + + diracSmootherSloppy->setHaloPrecision(halo_precision); // restore halo precision + + // reenable Schwarz + if (schwarz_reset) { + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); + int commDim[QUDA_MAX_DIM]; + for (int i=0; isetCommDim(commDim); + } + + popLevel(); + } + + void MG::generateChebyshevFilter(std::vector &B, bool refresh) + { + pushLevel(param.level); + + static bool envset[QUDA_MAX_MG_LEVEL]; // bottom of window for filtering stage, eigenvalues smaller than lamda_min enhanced static double lambda_min[QUDA_MAX_MG_LEVEL]; @@ -1433,14 +1625,52 @@ namespace quda // how many chebyshev iterations between each vector in the set static int iterations_for_next[QUDA_MAX_MG_LEVEL]; - if (!envset) { + if (!envset[param.level]) { - // level 1 - char *cheby = getenv("QUDA_USE_CHEBY_L1"); - if (cheby) { use_cheby[1] = (atoi(cheby) == 1) ? true : false; } + char *tmp_string; + + // level 0 + if (param.level == 0) { + + tmp_string = getenv("QUDA_CHEBY_LMIN_L0"); + if (tmp_string) { + lambda_min[0] = atof(tmp_string); + } else { + errorQuda("QUDA_CHEBY_LMIN_L0 unset"); + } + + tmp_string = getenv("QUDA_VEC_FILTER_ITER_L0"); + if (tmp_string) { + filter_iterations[0] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_FILTER_ITER_L0 unset"); + } - if (use_cheby[1]) { - char *tmp_string; + tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L0"); + if (tmp_string) { + filter_rescale_freq[0] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L0 unset"); + } + + tmp_string = getenv("QUDA_VEC_PER_SET_L0"); + if (tmp_string) { + vectors_per_set[0] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_PER_SET_L0 unset"); + } + + tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L0"); + if (tmp_string) { + iterations_for_next[0] = atoi(tmp_string); + } else { + errorQuda("QUDA_VEC_ITER_FOR_NEXT_L0 unset"); + } + + } + + // level 1 + if (param.level == 1) { tmp_string = getenv("QUDA_CHEBY_LMIN_L1"); if (tmp_string) { @@ -1467,7 +1697,7 @@ namespace quda if (tmp_string) { vectors_per_set[1] = atoi(tmp_string); } else { - errorQuda("QUDA_VEC_PER_SET_L1 unset_L1"); + errorQuda("QUDA_VEC_PER_SET_L1 unset"); } tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L1"); @@ -1476,14 +1706,11 @@ namespace quda } else { errorQuda("QUDA_VEC_ITER_FOR_NEXT_L1 unset"); } + } // level 2 - cheby = getenv("QUDA_USE_CHEBY_L1"); - if (cheby) { use_cheby[2] = (atoi(cheby) == 1) ? true : false; } - - if (use_cheby[2]) { - char *tmp_string; + if (param.level == 2) { tmp_string = getenv("QUDA_CHEBY_LMIN_L2"); if (tmp_string) { @@ -1510,7 +1737,7 @@ namespace quda if (tmp_string) { vectors_per_set[2] = atoi(tmp_string); } else { - errorQuda("QUDA_VEC_PER_SET_L2 unset_L1"); + errorQuda("QUDA_VEC_PER_SET_L2 unset"); } tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L2"); @@ -1521,337 +1748,145 @@ namespace quda } } - envset = true; + envset[param.level] = true; } - // Just constraining to level 2 for now - if ((param.level == 1 || param.level == 2) && use_cheby) { + if (!envset[param.level]) errorQuda("Chebyshev filter setup params unset"); - // Filtering approach based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. + // Filtering approach based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. - // use the right space; diracResidual is guaranteed to be full-parity - DiracMdagM dirac_mdm(diracResidual); + // use the right space; diracResidual is guaranteed to be full-parity + DiracMdagM dirac_mdm(diracResidual); - // temporaries for Dirac applications - ColorSpinorField tmp1(*B[0]); - ColorSpinorField tmp2(*B[0]); + // temporaries for Dirac applications + ColorSpinorField tmp1(*B[0]); + ColorSpinorField tmp2(*B[0]); - // temporaries needed for Chebyshev filter - ColorSpinorField pk(*B[0]); - ColorSpinorField pkm1(*B[0]); - ColorSpinorField pkm2(*B[0]); - ColorSpinorField Apkm1(*B[0]); + // temporaries needed for Chebyshev filter + ColorSpinorField pk(*B[0]); + ColorSpinorField pkm1(*B[0]); + ColorSpinorField pkm2(*B[0]); + ColorSpinorField Apkm1(*B[0]); - // approximate lambda_max - double lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, - 100, 10, tmp1, tmp2); + // approximate lambda_max + double lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, + 100, 10, tmp1, tmp2); - // create filter interpolators - double m_map_filter = 2. / (lambda_max - lambda_min[param.level]); - double b_map_filter = - (lambda_max + lambda_min[param.level]) / (lambda_max - lambda_min[param.level]); + // create filter interpolators + double m_map_filter = 2. / (lambda_max - lambda_min[param.level]); + double b_map_filter = - (lambda_max + lambda_min[param.level]) / (lambda_max - lambda_min[param.level]); - // create interpolators for generating more near-nulls - double m_map_generate = 2. / lambda_max; - double b_map_generate = -1.; + // create interpolators for generating more near-nulls + double m_map_generate = 2. / lambda_max; + double b_map_generate = -1.; - // we create batches of near-nulls from `B.size() / vectors_per_set` starting - // random vectors - int num_vec = static_cast(B.size()); + // we create batches of near-nulls from `B.size() / vectors_per_set` starting + // random vectors + int num_vec = static_cast(B.size()); - int num_starting_vectors = (num_vec + vectors_per_set[param.level] - 1) / vectors_per_set[param.level]; + int num_starting_vectors = (num_vec + vectors_per_set[param.level] - 1) / vectors_per_set[param.level]; - // orthonormalize - if (param.mg_global.pre_orthonormalize) { - for (int i = 0; i < num_starting_vectors; i++) { - for (int j = 0; j < i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]);// - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector\n", i); + // orthonormalize + if (param.mg_global.pre_orthonormalize) { + for (int i = 0; i < num_starting_vectors; i++) { + for (int j = 0; j < i; j++) { + Complex alpha = cDotProduct(*B[j], *B[i]);// + caxpy(-alpha, *B[j], *B[i]); // i-j } + double nrm2 = norm2(*B[i]); + if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector\n", i); } + } - for (int s = 0; s < num_starting_vectors; s++) { + for (int s = 0; s < num_starting_vectors; s++) { - // filter - { - // P0 - blas::copy(pk, *B[s]); - - if (filter_iterations[param.level] > 0) { - // P1 = m Ap_0 + b p_0 - std::swap(pkm1, pk); // p_k -> p_{k - 1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); - blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); - - if (filter_iterations[param.level] > 1) { - // Enter recursion relation - for (int k = 2; k <= filter_iterations[param.level]; k++) { - std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} - std::swap(pkm1, pk); // p_k -> p_{k-1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} - blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); - - // heuristic rescale to keep norms in check... - if (k % filter_rescale_freq[param.level] == 0) { - double tmp_nrm2 = blas::norm2(pk); - printf("Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); - double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); - ax(tmp_inv_nrm, pk); - ax(tmp_inv_nrm, pkm1); - ax(tmp_inv_nrm, pkm2); - ax(tmp_inv_nrm, Apkm1); - } + // filter + { + // P0 + blas::copy(pk, *B[s]); + + if (filter_iterations[param.level] > 0) { + // P1 = m Ap_0 + b p_0 + std::swap(pkm1, pk); // p_k -> p_{k - 1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); + blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); + + if (filter_iterations[param.level] > 1) { + // Enter recursion relation + for (int k = 2; k <= filter_iterations[param.level]; k++) { + std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} + std::swap(pkm1, pk); // p_k -> p_{k-1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} + blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); + + // heuristic rescale to keep norms in check... + if (k % filter_rescale_freq[param.level] == 0) { + double tmp_nrm2 = blas::norm2(pk); + printf("Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); + double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); + ax(tmp_inv_nrm, pk); + ax(tmp_inv_nrm, pkm1); + ax(tmp_inv_nrm, pkm2); + ax(tmp_inv_nrm, Apkm1); } } } - - // print the norm (to check for enhancement, normalize) - double nrm2 = blas::norm2(pk); - printfQuda("Enhanced norm2 for start %d is %e\n", s, nrm2); - ax(1.0 / sqrt(nrm2), pk); - - // This is the first filtered vector - blas::copy(*B[s], pk); } - // Now we generate further vectors by another Chebyshev filter from 0 -> lambda_max - // P0 is trivially in place - - // P1 - std::swap(pkm1, pk); // p_k -> p_{k - 1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); - blas::axpbyz(m_map_generate, Apkm1, b_map_generate, pkm1, pk); - - bool first = true; - - for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { - - for (int k = first ? 2 : 0; k < iterations_for_next[param.level]; k++) { - std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} - std::swap(pkm1, pk); // p_k -> p_{k-1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} - blas::axpbypczw(2. * m_map_generate, Apkm1, 2. * b_map_generate, pkm1, -1., pkm2, pk); - } - - blas::copy(*B[i], pk); - - if (first) first = false; - - } + // print the norm (to check for enhancement, normalize) + double nrm2 = blas::norm2(pk); + printfQuda("Enhanced norm2 for start %d is %e\n", s, nrm2); + ax(1.0 / sqrt(nrm2), pk); + // This is the first filtered vector + blas::copy(*B[s], pk); } - // print norms - printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); - for (int i = 0; i < num_vec; i++) { - printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); - } + // Now we generate further vectors by another Chebyshev filter from 0 -> lambda_max + // P0 is trivially in place - // global orthonormalization of the generated null-space vectors - if (param.mg_global.post_orthonormalize) { - for (int i = 0; i < num_vec; i++) { - for (int j = 0; j < i ; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]);// - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); - } - } + // P1 + std::swap(pkm1, pk); // p_k -> p_{k - 1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); + blas::axpbyz(m_map_generate, Apkm1, b_map_generate, pkm1, pk); - //errorQuda("Quit out..."); + bool first = true; - } else { - - // Traditional inverse iterations setup - - SolverParam solverParam(param); // Set solver field parameters: - // set null-space generation options - need to expose these - solverParam.maxiter - = refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]; - solverParam.tol = param.mg_global.setup_tol[param.level]; - solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; - solverParam.delta = 1e-1; - solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; - // Hard coded for now... - if (is_ca_solver(solverParam.inv_type)) { - solverParam.ca_basis = param.mg_global.setup_ca_basis[param.level]; - solverParam.ca_lambda_min = param.mg_global.setup_ca_lambda_min[param.level]; - solverParam.ca_lambda_max = param.mg_global.setup_ca_lambda_max[param.level]; - solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; - } else if (solverParam.inv_type == QUDA_GCR_INVERTER || solverParam.inv_type == QUDA_BICGSTABL_INVERTER) { - solverParam.Nkrylov = param.mg_global.setup_ca_basis_size[param.level]; - } else { - solverParam.Nkrylov = 4; - } - solverParam.pipeline - = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB - solverParam.precision = r->Precision(); - - if (is_fine_grid()) { - solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; - solverParam.precision_precondition = param.mg_global.invert_param->cuda_prec_precondition; - } else { - solverParam.precision_precondition = solverParam.precision; - } + for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { - solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); - solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: - csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering - csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now - csParam.gammaBasis = B[0]->Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : - QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered - csParam.create = QUDA_ZERO_FIELD_CREATE; - ColorSpinorField b(csParam); - ColorSpinorField x(csParam); - - csParam.create = QUDA_NULL_FIELD_CREATE; - - // if we not using GCR/MG smoother then we need to switch off Schwarz since regular Krylov solvers do not support it - bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER - && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; - if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); - int commDim[QUDA_MAX_DIM]; - for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; - diracSmootherSloppy->setCommDim(commDim); - } - - // if quarter precision halo, promote for null-space finding to half precision - QudaPrecision halo_precision = diracSmootherSloppy->HaloPrecision(); - if (halo_precision == QUDA_QUARTER_PRECISION) diracSmootherSloppy->setHaloPrecision(QUDA_HALF_PRECISION); - - Solver *solve; - DiracMdagM *mdagm = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmoother) : nullptr; - DiracMdagM *mdagmSloppy = (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) ? new DiracMdagM(*diracSmootherSloppy) : nullptr; - if (solverParam.inv_type == QUDA_CG_INVERTER || solverParam.inv_type == QUDA_CA_CG_INVERTER) { - solve = Solver::create(solverParam, *mdagm, *mdagmSloppy, *mdagmSloppy, *mdagmSloppy, profile); - } else if (solverParam.inv_type == QUDA_MG_INVERTER) { - // in case MG has not been created, we create the Smoother - if (!transfer) createSmoother(); - - // run GCR with the MG as a preconditioner - solverParam.inv_type_precondition = QUDA_MG_INVERTER; - solverParam.schwarz_type = QUDA_ADDITIVE_SCHWARZ; - solverParam.precondition_cycle = 1; - solverParam.tol_precondition = 1e-1; - solverParam.maxiter_precondition = 1; - solverParam.omega = 1.0; - solverParam.verbosity_precondition = param.mg_global.verbosity[param.level+1]; - solverParam.precision_sloppy = solverParam.precision; - solverParam.compute_true_res = 0; - solverParam.preconditioner = this; - - solverParam.inv_type = QUDA_GCR_INVERTER; - solve = Solver::create(solverParam, *param.matSmooth, *param.matSmooth, *param.matSmoothSloppy, - *param.matSmoothSloppy, profile); - solverParam.inv_type = QUDA_MG_INVERTER; - } else { - solve = Solver::create(solverParam, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, - *param.matSmoothSloppy, profile); - } - - for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); - - // global orthonormalization of the initial null-space vectors - if(param.mg_global.pre_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector\n", i); - } + for (int k = first ? 2 : 0; k < iterations_for_next[param.level]; k++) { + std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} + std::swap(pkm1, pk); // p_k -> p_{k-1} + dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} + blas::axpbypczw(2. * m_map_generate, Apkm1, 2. * b_map_generate, pkm1, -1., pkm2, pk); } - // launch solver for each source - for (int i=0; i<(int)B.size(); i++) { - if (param.mg_global.setup_type == QUDA_TEST_VECTOR_SETUP) { // DDalphaAMG test vector idea - b = *B[i]; // inverting against the vector - zero(x); // with zero initial guess - } else { - x = *B[i]; - zero(b); - } - - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); - - ColorSpinorField *out=nullptr, *in=nullptr; - diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); - (*solve)(*out, *in); - diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); + blas::copy(*B[i], pk); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); - *B[i] = x; - } + if (first) first = false; - // global orthonormalization of the generated null-space vectors - if (param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); - } - } - - if (solverParam.inv_type == QUDA_MG_INVERTER) { - - if (transfer) { - resetTransfer = true; - reset(); - if ( param.level < param.Nlevel-2 ) { - if ( param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE ) { - coarse->generateNullVectors(*B_coarse, refresh); - } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); - } - // rebuild the transfer operator in the coarse level - coarse->resetTransfer = true; - coarse->reset(); - } - } - } else { - reset(); - } - } } - delete solve; - if (mdagm) delete mdagm; - if (mdagmSloppy) delete mdagmSloppy; - - diracSmootherSloppy->setHaloPrecision(halo_precision); // restore halo precision - - // reenable Schwarz - if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); - int commDim[QUDA_MAX_DIM]; - for (int i=0; isetCommDim(commDim); - } + } + // print norms + printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); + for (int i = 0; i < num_vec; i++) { + printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); } - if (param.mg_global.vec_store[param.level] == QUDA_BOOLEAN_TRUE) { // conditional store of null vectors - saveVectors(B); + // global orthonormalization of the generated null-space vectors + if (param.mg_global.post_orthonormalize) { + for (int i = 0; i < num_vec; i++) { + for (int j = 0; j < i ; j++) { + Complex alpha = cDotProduct(*B[j], *B[i]);// + caxpy(-alpha, *B[j], *B[i]); // i-j + } + double nrm2 = norm2(*B[i]); + if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ + else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); + } } popLevel(); @@ -1859,9 +1894,10 @@ namespace quda // generate a full span of free vectors. // FIXME: Assumes fine level is SU(3). - void MG::buildFreeVectors(std::vector &B) + void MG::generateFreeVectors() { pushLevel(param.level); + std::vector &B = param.B; const int Nvec = B.size(); // Given the number of colors and spins, figure out if the number @@ -2030,10 +2066,13 @@ namespace quda popLevel(); } - void MG::generateEigenVectors() + void MG::generateEigenvectors() { pushLevel(param.level); + if (param.mg_global.eig_param[param.level] == nullptr) + errorQuda("eig_param for level %d has not been populated", param.level); + // Extract eigensolver params int n_conv = param.mg_global.eig_param[param.level]->n_conv; bool dagger = param.mg_global.eig_param[param.level]->use_dagger; @@ -2091,8 +2130,22 @@ namespace quda // Local clean-up for (auto b : B_evecs) { delete b; } - // only save if outfile is defined - if (strcmp(param.mg_global.vec_outfile[param.level], "") != 0) { saveVectors(param.B); } + popLevel(); + } + + void MG::generateRestrictedVectors() + { + pushLevel(param.level); + + if (is_fine_grid()) errorQuda("There are no fine near-null vectors to restrict on level 0"); + if (param.Nvec != param.fine->param.Nvec) errorQuda("The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match", param.fine->param.Nvec, param.Nvec); + + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); + + for (int i = 0; i < param.Nvec; i++) { + zero(*(param.B[i])); + param.fine->transfer->R(*(param.B[i]), *(param.fine->param.B[i])); + } popLevel(); } diff --git a/lib/transfer.cpp b/lib/transfer.cpp index dadc1298f8..9489debca2 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -455,7 +455,7 @@ namespace quda { if (use_gpu) { if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = coarse_tmp_d; - if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis()) + if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis() || in.FieldOrder() != QUDA_FLOAT2_FIELD_ORDER) input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even(); if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); } else { diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index cd52cf0958..d5fcf2242a 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -128,7 +128,7 @@ void init(int argc, char **argv) // Set sub structures mg_param.invert_param = &mg_inv_param; for (int i = 0; i < mg_levels; i++) { - if (mg_eig[i]) { + if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { mg_eig_param[i] = newQudaEigParam(); setMultigridEigParam(mg_eig_param[i], i); mg_param.eig_param[i] = &mg_eig_param[i]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 12b67cec13..2ce6ea3867 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -185,7 +185,7 @@ int main(int argc, char **argv) mg_param.invert_param = &mg_inv_param; for (int i = 0; i < mg_levels; i++) { - if (mg_eig[i]) { + if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { mg_eig_param[i] = newQudaEigParam(); setMultigridEigParam(mg_eig_param[i], i); mg_param.eig_param[i] = &mg_eig_param[i]; diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index 01ee7db612..b0c9d7a049 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -138,7 +138,7 @@ quda::mgarray setup_ca_basis = {}; quda::mgarray setup_ca_basis_size = {}; quda::mgarray setup_ca_lambda_min = {}; quda::mgarray setup_ca_lambda_max = {}; -QudaSetupType setup_type = QUDA_NULL_VECTOR_SETUP; +quda::mgarray setup_type = {}; bool pre_orthonormalize = false; bool post_orthonormalize = true; double omega = 0.85; @@ -155,8 +155,6 @@ quda::mgarray coarse_solver_ca_basis = {}; quda::mgarray coarse_solver_ca_basis_size = {}; quda::mgarray coarse_solver_ca_lambda_min = {}; quda::mgarray coarse_solver_ca_lambda_max = {}; -bool generate_nullspace = true; -bool generate_all_levels = true; quda::mgarray mg_schwarz_type = {}; quda::mgarray mg_schwarz_cycle = {}; bool mg_evolve_thin_updates = false; @@ -385,7 +383,12 @@ namespace {"SR", QUDA_SPECTRUM_SR_EIG}, {"LR", QUDA_SPECTRUM_LR_EIG}, {"SM", QUDA_SPECTRUM_SM_EIG}, {"LM", QUDA_SPECTRUM_LM_EIG}, {"SI", QUDA_SPECTRUM_SI_EIG}, {"LI", QUDA_SPECTRUM_LI_EIG}}; - CLI::TransformPairs setup_type_map {{"test", QUDA_TEST_VECTOR_SETUP}, {"null", QUDA_TEST_VECTOR_SETUP}}; + CLI::TransformPairs setup_type_map {{"inverse-iterations", QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS}, + {"chebyshev-filter", QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER}, + {"eigenvectors", QUDA_SETUP_NULL_VECTOR_EIGENVECTORS}, + {"test-vectors", QUDA_SETUP_NULL_VECTOR_TEST_VECTORS}, + {"restrict-fine", QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE}, + {"free-field", QUDA_SETUP_NULL_VECTOR_FREE_FIELD}}; CLI::TransformPairs extlib_map {{"eigen", QUDA_EIGEN_EXTLIB}}; @@ -833,19 +836,13 @@ void add_multigrid_option_group(std::shared_ptr quda_app) "Solve the MdagM problem instead of M (MMdag if eig-use-dagger == true) (default false)"); quda_app->add_mgoption(opgroup, "--mg-eig-use-poly-acc", mg_eig_use_poly_acc, CLI::Validator(), "Use Chebyshev polynomial acceleration in the eigensolver (default true)"); - opgroup->add_option( - "--mg-generate-all-levels", - generate_all_levels, "true=generate null-space on all levels, false=generate on level 0 and create other levels from that (default true)"); opgroup->add_option("--mg-evolve-thin-updates", mg_evolve_thin_updates, "Utilize thin updates for multigrid evolution tests (default false)"); - opgroup->add_option("--mg-generate-nullspace", generate_nullspace, - "Generate the null-space vector dynamically (default true, if set false and mg-load-vec isn't " - "set, creates free-field null vectors)"); opgroup->add_option("--mg-levels", mg_levels, "The number of multigrid levels to do (default 2)"); // TODO quda_app->add_mgoption(opgroup, "--mg-load-vec", mg_vec_infile, CLI::Validator(), - "Load the vectors for the multigrid_test (requires QIO)"); + "Load the vectors for the multigrid_test, overrides setup (requires QIO)"); quda_app->add_mgoption(opgroup, "--mg-save-vec", mg_vec_outfile, CLI::Validator(), "Save the generated null-space vectors from the multigrid_test (requires QIO)"); @@ -900,7 +897,7 @@ void add_multigrid_option_group(std::shared_ptr quda_app) quda_app->add_mgoption(opgroup, "--mg-setup-inv", setup_inv, solver_trans, "The inverter to use for the setup of multigrid (default bicgstab)"); quda_app->add_mgoption(opgroup, "--mg-setup-iters", num_setup_iter, CLI::PositiveNumber, - "The number of setup iterations to use for the multigrid (default 1)"); + "The number of setup iterations to use for the multigrid, requires test vector setup (default 1)"); quda_app->add_mgoption(opgroup, "--mg-setup-location", setup_location, CLI::QUDACheckedTransformer(field_location_map), "The location where the multigrid setup will be computed (default cuda)"); @@ -913,8 +910,8 @@ void add_multigrid_option_group(std::shared_ptr quda_app) quda_app->add_mgoption(opgroup, "--mg-setup-tol", setup_tol, CLI::Validator(), "The tolerance to use for the setup of multigrid (default 5e-6)"); - opgroup->add_option("--mg-setup-type", setup_type, "The type of setup to use for the multigrid (default null)") - ->transform(CLI::QUDACheckedTransformer(setup_type_map)); + quda_app->add_mgoption(opgroup, "--mg-setup-type", setup_type, CLI::QUDACheckedTransformer(setup_type_map), + "The type of setup to use for the multigrid; ignored if --mg-load-vec is set (inverse-iterations (default), chebyshev-filter, eigenvectors, test-vectors, restrict-fine, free-field)"); opgroup ->add_option( diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index b80e683d7c..7942b9b5ed 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -275,7 +275,7 @@ extern quda::mgarray setup_ca_basis; extern quda::mgarray setup_ca_basis_size; extern quda::mgarray setup_ca_lambda_min; extern quda::mgarray setup_ca_lambda_max; -extern QudaSetupType setup_type; +extern quda::mgarray setup_type; extern bool pre_orthonormalize; extern bool post_orthonormalize; extern double omega; @@ -292,8 +292,6 @@ extern quda::mgarray coarse_solver_ca_basis; extern quda::mgarray coarse_solver_ca_basis_size; extern quda::mgarray coarse_solver_ca_lambda_min; extern quda::mgarray coarse_solver_ca_lambda_max; -extern bool generate_nullspace; -extern bool generate_all_levels; extern quda::mgarray mg_schwarz_type; extern quda::mgarray mg_schwarz_cycle; extern bool mg_evolve_thin_updates; diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 486fbce958..021abe3da0 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -428,6 +428,9 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.setup_maxiter[i] = setup_maxiter[i]; mg_param.setup_maxiter_refresh[i] = setup_maxiter_refresh[i]; + // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) + mg_param.setup_type[i] = setup_type[i]; + // Basis to use for CA solver setups mg_param.setup_ca_basis[i] = setup_ca_basis[i]; @@ -574,14 +577,9 @@ void setMultigridParam(QudaMultigridParam &mg_param) // only coarsen the spin on the first restriction mg_param.spin_block_size[0] = 2; - mg_param.setup_type = setup_type; mg_param.pre_orthonormalize = pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.post_orthonormalize = post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.compute_null_vector = generate_nullspace ? QUDA_COMPUTE_NULL_VECTOR_YES : QUDA_COMPUTE_NULL_VECTOR_NO; - - mg_param.generate_all_levels = generate_all_levels ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.run_verify = verify_results ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_low_mode_check = low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_oblique_proj_check = oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; @@ -1026,6 +1024,9 @@ void setStaggeredMultigridParam(QudaMultigridParam &mg_param) mg_param.setup_tol[i] = setup_tol[i]; mg_param.setup_maxiter[i] = setup_maxiter[i]; + // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) + mg_param.setup_type[i] = setup_type[i]; + // Basis to use for CA solver setups mg_param.setup_ca_basis[i] = setup_ca_basis[i]; @@ -1178,14 +1179,9 @@ void setStaggeredMultigridParam(QudaMultigridParam &mg_param) mg_param.spin_block_size[1] = 0; // we're coarsening the optimized KD op } - mg_param.setup_type = setup_type; mg_param.pre_orthonormalize = pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.post_orthonormalize = post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.compute_null_vector = generate_nullspace ? QUDA_COMPUTE_NULL_VECTOR_YES : QUDA_COMPUTE_NULL_VECTOR_NO; - - mg_param.generate_all_levels = generate_all_levels ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.run_verify = verify_results ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_low_mode_check = low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_oblique_proj_check = oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; From cef28f50f460e34cb1b40e297d77c09c00a16079 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 28 Apr 2022 19:12:59 -0400 Subject: [PATCH 05/15] Threaded in command line support for Chebyshev filter setup --- include/dirac_quda.h | 12 +- include/multigrid.h | 6 +- include/quda.h | 18 ++ lib/check_params.h | 17 ++ lib/multigrid.cpp | 202 +++------------ tests/invert_test.cpp | 2 +- tests/multigrid_evolve_test.cpp | 2 +- tests/utils/command_line_params.cpp | 34 ++- tests/utils/command_line_params.h | 15 +- tests/utils/host_utils.cpp | 8 + tests/utils/host_utils.h | 15 ++ tests/utils/set_params.cpp | 376 +++++++--------------------- 12 files changed, 239 insertions(+), 468 deletions(-) diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 455c4e8f48..f5f3af9979 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -1296,22 +1296,12 @@ namespace quda { virtual QudaDiracType getDiracType() const { return QUDA_STAGGERED_DIRAC; } -<<<<<<< HEAD /** @brief Return the one-hop field for staggered operators for MG setup @return Gauge field - */ - virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; } -======= - /** @brief Return the one-hop field for staggered operators for MG setup - - @return Gauge field */ - virtual cudaGaugeField* getStaggeredShortLinkField() const { - return gauge; - } ->>>>>>> 19643384c (Big commit refactoring near-null setup, exposing multiple types on a per-level basis; chebyshev filter parameters are still only setable via the command line) + virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; } /** * @brief Create the coarse staggered operator. diff --git a/include/multigrid.h b/include/multigrid.h index cd90b40f3e..53255ef5e3 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -417,11 +417,11 @@ namespace quda { void generateInverseIterations(std::vector &B, bool refresh = false); /** - @brief Generate the null-space vectors via a Chebyshev filter + @brief Generate the null-space vectors via a Chebyshev filter; this approach is + based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. @param B Generated null-space vectors - @param refresh Whether we refreshing pre-exising vectors or starting afresh */ - void generateChebyshevFilter(std::vector &B, bool refresh = false); + void generateChebyshevFilter(std::vector &B); /** @brief Generate lowest eigenvectors diff --git a/include/quda.h b/include/quda.h index 9867684b72..9c9d608772 100644 --- a/include/quda.h +++ b/include/quda.h @@ -650,6 +650,24 @@ extern "C" { /** Maximum eigenvalue for Chebyshev CA basis */ double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Number of random starting vectors for Chebyshev filter null space generation */ + int filter_startup_vectors[QUDA_MAX_MG_LEVEL]; + + /** Number of iterations for initial Chebyshev filter */ + int filter_startup_iterations[QUDA_MAX_MG_LEVEL]; + + /** Frequency of rescales during initial filtering which helps avoid overflow */ + int filter_startup_rescale_frequency[QUDA_MAX_MG_LEVEL]; + + /** Number of iterations between null vectors generated from each starting vector */ + int filter_iterations_between_vectors[QUDA_MAX_MG_LEVEL]; + + /** Conservative estimate of largest eigenvalue of operator used for Chebyshev filter setup */ + int filter_lambda_min[QUDA_MAX_MG_LEVEL]; + + /** Lower bound of eigenvalues that are not enhanced by the initial Chebyshev filter */ + int filter_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Null-space type to use in the setup phase */ QudaNullVectorSetupType setup_type[QUDA_MAX_MG_LEVEL]; diff --git a/lib/check_params.h b/lib/check_params.h index ae46c0e920..f9534c5298 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -821,6 +821,23 @@ void printQudaMultigridParam(QudaMultigridParam *param) { P(setup_ca_lambda_max[i], INVALID_DOUBLE); #endif +#ifdef INIT_PARAM + P(filter_startup_vectors[i], 1); + P(filter_startup_iterations[i], 1000); + P(filter_startup_rescale_frequency[i], 50); + P(filter_iterations_between_vectors[i], 150); + P(filter_lambda_min[i], 1.0); + P(filter_lambda_max[i], -1.0); +#else + P(filter_startup_vectors[i], INVALID_INT); + P(filter_startup_iterations[i], INVALID_INT); + P(filter_startup_rescale_frequency[i], INVALID_INT); + P(filter_iterations_between_vectors[i], INVALID_INT); + P(filter_lambda_min[i], INVALID_DOUBLE); + P(filter_lambda_max[i], INVALID_DOUBLE); +#endif + + #ifdef INIT_PARAM P(n_block_ortho[i], 1); P(block_ortho_two_pass[i], QUDA_BOOLEAN_TRUE); diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 782c0efd17..5ba5706c36 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1407,8 +1407,9 @@ namespace quda errorQuda("Multiple setup iterations can only be used with test vector setup"); // Initializing to random vectors - // FIXME: only need to initialize a subset for the chebyshev filter - for (int i = 0; i < (int)param.B.size(); i++) { + // FIXME: do not do if in the *middle* of test vector setup + int num_initialize = (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) ? param.mg_global.filter_startup_vectors[param.level] : (int)param.B.size(); + for (int i = 0; i < num_initialize; i++) { spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); *param.B[i] = *r; } @@ -1418,7 +1419,7 @@ namespace quda generateInverseIterations(param.B, refresh); } else { // Chebyshev filter - generateChebyshevFilter(param.B, refresh); + generateChebyshevFilter(param.B); } } else { errorQuda("Invalid setup type %d", setup_type); @@ -1440,6 +1441,7 @@ namespace quda solverParam.tol = param.mg_global.setup_tol[param.level]; solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; solverParam.delta = 1e-1; + solverParam.tol_hq = 0; solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; // Hard coded for now... if (is_ca_solver(solverParam.inv_type)) { @@ -1604,156 +1606,11 @@ namespace quda popLevel(); } - void MG::generateChebyshevFilter(std::vector &B, bool refresh) + void MG::generateChebyshevFilter(std::vector &B) { pushLevel(param.level); - static bool envset[QUDA_MAX_MG_LEVEL]; - - // bottom of window for filtering stage, eigenvalues smaller than lamda_min enhanced - static double lambda_min[QUDA_MAX_MG_LEVEL]; - - // how many filtering iterations should we do - static int filter_iterations[QUDA_MAX_MG_LEVEL]; - - // how frequently we rescale while we filter - static int filter_rescale_freq[QUDA_MAX_MG_LEVEL]; - - // how many near-nulls to generate from each base vector - static int vectors_per_set[QUDA_MAX_MG_LEVEL]; - - // how many chebyshev iterations between each vector in the set - static int iterations_for_next[QUDA_MAX_MG_LEVEL]; - - if (!envset[param.level]) { - - char *tmp_string; - - // level 0 - if (param.level == 0) { - - tmp_string = getenv("QUDA_CHEBY_LMIN_L0"); - if (tmp_string) { - lambda_min[0] = atof(tmp_string); - } else { - errorQuda("QUDA_CHEBY_LMIN_L0 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_ITER_L0"); - if (tmp_string) { - filter_iterations[0] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_ITER_L0 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L0"); - if (tmp_string) { - filter_rescale_freq[0] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L0 unset"); - } - - tmp_string = getenv("QUDA_VEC_PER_SET_L0"); - if (tmp_string) { - vectors_per_set[0] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_PER_SET_L0 unset"); - } - - tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L0"); - if (tmp_string) { - iterations_for_next[0] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_ITER_FOR_NEXT_L0 unset"); - } - - } - - // level 1 - if (param.level == 1) { - - tmp_string = getenv("QUDA_CHEBY_LMIN_L1"); - if (tmp_string) { - lambda_min[1] = atof(tmp_string); - } else { - errorQuda("QUDA_CHEBY_LMIN_L1 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_ITER_L1"); - if (tmp_string) { - filter_iterations[1] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_ITER_L1 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L1"); - if (tmp_string) { - filter_rescale_freq[1] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L1 unset"); - } - - tmp_string = getenv("QUDA_VEC_PER_SET_L1"); - if (tmp_string) { - vectors_per_set[1] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_PER_SET_L1 unset"); - } - - tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L1"); - if (tmp_string) { - iterations_for_next[1] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_ITER_FOR_NEXT_L1 unset"); - } - - } - - // level 2 - if (param.level == 2) { - - tmp_string = getenv("QUDA_CHEBY_LMIN_L2"); - if (tmp_string) { - lambda_min[2] = atof(tmp_string); - } else { - errorQuda("QUDA_CHEBY_LMIN_L2 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_ITER_L2"); - if (tmp_string) { - filter_iterations[2] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_ITER_L2 unset"); - } - - tmp_string = getenv("QUDA_VEC_FILTER_RESCALE_FREQ_L2"); - if (tmp_string) { - filter_rescale_freq[2] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_FILTER_RESCALE_FREQ_L2 unset"); - } - - tmp_string = getenv("QUDA_VEC_PER_SET_L2"); - if (tmp_string) { - vectors_per_set[2] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_PER_SET_L2 unset"); - } - - tmp_string = getenv("QUDA_VEC_ITER_FOR_NEXT_L2"); - if (tmp_string) { - iterations_for_next[2] = atoi(tmp_string); - } else { - errorQuda("QUDA_VEC_ITER_FOR_NEXT_L2 unset"); - } - } - - envset[param.level] = true; - } - - if (!envset[param.level]) errorQuda("Chebyshev filter setup params unset"); - - // Filtering approach based on arXiv:2103.05034, P. Boyle and A. Yamaguchi. + // TBD: implement a refresh // use the right space; diracResidual is guaranteed to be full-parity DiracMdagM dirac_mdm(diracResidual); @@ -1768,13 +1625,24 @@ namespace quda ColorSpinorField pkm2(*B[0]); ColorSpinorField Apkm1(*B[0]); - // approximate lambda_max - double lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, - 100, 10, tmp1, tmp2); + const int num_starting_vectors = param.mg_global.filter_startup_vectors[param.level]; + const int filter_iterations = param.mg_global.filter_startup_iterations[param.level]; + const int filter_rescale_freq = param.mg_global.filter_startup_rescale_frequency[param.level]; + const int iterations_for_next = param.mg_global.filter_iterations_between_vectors[param.level]; + const double lambda_min = param.mg_global.filter_lambda_min[param.level]; + double lambda_max = param.mg_global.filter_lambda_max[param.level]; + if (lambda_max < lambda_min) { + // approximate lambda_max + lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, + 100, 10, tmp1, tmp2); + } + + logQuda(getVerbosity(), "Nums starting %d Filter iter %d Filter rescale %d Filter next %d Lambda min %e lambda max %e\n", + num_starting_vectors, filter_iterations, filter_rescale_freq, iterations_for_next, lambda_min, lambda_max); // create filter interpolators - double m_map_filter = 2. / (lambda_max - lambda_min[param.level]); - double b_map_filter = - (lambda_max + lambda_min[param.level]) / (lambda_max - lambda_min[param.level]); + double m_map_filter = 2. / (lambda_max - lambda_min); + double b_map_filter = - (lambda_max + lambda_min) / (lambda_max - lambda_min); // create interpolators for generating more near-nulls double m_map_generate = 2. / lambda_max; @@ -1784,8 +1652,6 @@ namespace quda // random vectors int num_vec = static_cast(B.size()); - int num_starting_vectors = (num_vec + vectors_per_set[param.level] - 1) / vectors_per_set[param.level]; - // orthonormalize if (param.mg_global.pre_orthonormalize) { for (int i = 0; i < num_starting_vectors; i++) { @@ -1806,24 +1672,24 @@ namespace quda // P0 blas::copy(pk, *B[s]); - if (filter_iterations[param.level] > 0) { + if (filter_iterations > 0) { // P1 = m Ap_0 + b p_0 std::swap(pkm1, pk); // p_k -> p_{k - 1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); - if (filter_iterations[param.level] > 1) { + if (filter_iterations > 1) { // Enter recursion relation - for (int k = 2; k <= filter_iterations[param.level]; k++) { + for (int k = 2; k <= filter_iterations; k++) { std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} std::swap(pkm1, pk); // p_k -> p_{k-1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); // heuristic rescale to keep norms in check... - if (k % filter_rescale_freq[param.level] == 0) { + if (k % filter_rescale_freq == 0) { double tmp_nrm2 = blas::norm2(pk); - printf("Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); + logQuda(getVerbosity(), "Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); ax(tmp_inv_nrm, pk); ax(tmp_inv_nrm, pkm1); @@ -1836,7 +1702,7 @@ namespace quda // print the norm (to check for enhancement, normalize) double nrm2 = blas::norm2(pk); - printfQuda("Enhanced norm2 for start %d is %e\n", s, nrm2); + logQuda(getVerbosity(), "Enhanced norm2 for start %d is %e\n", s, nrm2); ax(1.0 / sqrt(nrm2), pk); // This is the first filtered vector @@ -1855,7 +1721,7 @@ namespace quda for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { - for (int k = first ? 2 : 0; k < iterations_for_next[param.level]; k++) { + for (int k = first ? 2 : 0; k < iterations_for_next; k++) { std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} std::swap(pkm1, pk); // p_k -> p_{k-1} dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} @@ -1871,9 +1737,11 @@ namespace quda } // print norms - printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); - for (int i = 0; i < num_vec; i++) { - printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); + if (getVerbosity() >= QUDA_VERBOSE) { + printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); + for (int i = 0; i < num_vec; i++) { + printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); + } } // global orthonormalization of the generated null-space vectors diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index d5fcf2242a..f93733fe44 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -137,7 +137,7 @@ void init(int argc, char **argv) } } // Set MG - setMultigridParam(mg_param); + setWilsonMultigridParam(mg_param); } else { setInvertParam(inv_param); } diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp index 105e2e2951..f08164ede0 100644 --- a/tests/multigrid_evolve_test.cpp +++ b/tests/multigrid_evolve_test.cpp @@ -168,7 +168,7 @@ int main(int argc, char **argv) } } // Set MG - setMultigridParam(mg_param); + setWilsonMultigridParam(mg_param); } else { setInvertParam(inv_param); } diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index b0c9d7a049..e82928614c 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -130,6 +130,10 @@ quda::mgarray mg_verbosity = {}; quda::mgarray setup_inv = {}; quda::mgarray coarse_solve_type = {}; quda::mgarray smoother_solve_type = {}; + +quda::mgarray setup_type = {}; + +// Parameters for inverse iterations setup quda::mgarray num_setup_iter = {}; quda::mgarray setup_tol = {}; quda::mgarray setup_maxiter = {}; @@ -138,7 +142,15 @@ quda::mgarray setup_ca_basis = {}; quda::mgarray setup_ca_basis_size = {}; quda::mgarray setup_ca_lambda_min = {}; quda::mgarray setup_ca_lambda_max = {}; -quda::mgarray setup_type = {}; + +// Parameters for Chebyshev filter setup +quda::mgarray filter_startup_vectors = {}; +quda::mgarray filter_startup_iterations = {}; +quda::mgarray filter_startup_rescale_frequency = {}; +quda::mgarray filter_iterations_between_vectors = {}; +quda::mgarray filter_lambda_min = {}; +quda::mgarray filter_lambda_max = {}; + bool pre_orthonormalize = false; bool post_orthonormalize = true; double omega = 0.85; @@ -883,6 +895,10 @@ void add_multigrid_option_group(std::shared_ptr quda_app) ->transform(CLI::QUDACheckedTransformer(schwarz_type_map)); quda_app->add_mgoption(opgroup, "--mg-schwarz-cycle", mg_schwarz_cycle, CLI::PositiveNumber, "The number of Schwarz cycles to apply per smoother application (default=1)"); + + quda_app->add_mgoption(opgroup, "--mg-setup-type", setup_type, CLI::QUDACheckedTransformer(setup_type_map), + "The type of setup to use for the multigrid; ignored if --mg-load-vec is set (inverse-iterations (default), chebyshev-filter, eigenvectors, test-vectors, restrict-fine, free-field)"); + quda_app->add_mgoption(opgroup, "--mg-setup-ca-basis-size", setup_ca_basis_size, CLI::PositiveNumber, "The basis size to use for CA solver setup of multigrid (default 4)"); quda_app->add_mgoption(opgroup, "--mg-setup-ca-basis-type", setup_ca_basis, CLI::QUDACheckedTransformer(ca_basis_map), @@ -910,8 +926,20 @@ void add_multigrid_option_group(std::shared_ptr quda_app) quda_app->add_mgoption(opgroup, "--mg-setup-tol", setup_tol, CLI::Validator(), "The tolerance to use for the setup of multigrid (default 5e-6)"); - quda_app->add_mgoption(opgroup, "--mg-setup-type", setup_type, CLI::QUDACheckedTransformer(setup_type_map), - "The type of setup to use for the multigrid; ignored if --mg-load-vec is set (inverse-iterations (default), chebyshev-filter, eigenvectors, test-vectors, restrict-fine, free-field)"); + quda_app->add_mgoption(opgroup, "--mg-setup-filter-startup-vectors", filter_startup_vectors, CLI::PositiveNumber, + "Number of random starting vectors for Chebyshev filter null space generation (default 1)"); + quda_app->add_mgoption(opgroup, "--mg-setup-filter-startup-iterations", filter_startup_iterations, CLI::PositiveNumber, + "Number of iterations for initial Chebyshev filter (default 1000)"); + quda_app->add_mgoption(opgroup, "--mg-setup-filter-startup-rescale-frequency", filter_startup_rescale_frequency, CLI::PositiveNumber, + "Frequency of rescales during initial filtering which helps avoid overflow (default 50)"); + quda_app->add_mgoption(opgroup, "--mg-setup-filter-iterations-between-vectors", filter_iterations_between_vectors, CLI::PositiveNumber, + "Number of iterations between null vectors generated from each starting vector (default 150)"); + quda_app->add_mgoption( + opgroup, "--mg-setup-filter-lambda-max", filter_lambda_max, CLI::Validator(), + "Conservative estimate of largest eigenvalue of operator used for Chebyshev filter setup (default is to guess with power iterations)"); + quda_app->add_mgoption( + opgroup, "--mg-setup-filter-lambda-min", filter_lambda_min, CLI::PositiveNumber, + "Lower bound of eigenvalues that are not enhanced by the initial Chebyshev filter (default 1)"); opgroup ->add_option( diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index 7942b9b5ed..53bc790041 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -267,6 +267,10 @@ extern quda::mgarray mg_verbosity; extern quda::mgarray setup_inv; extern quda::mgarray coarse_solve_type; extern quda::mgarray smoother_solve_type; + +extern quda::mgarray setup_type; + +// Parameters for inverse iterations setup extern quda::mgarray num_setup_iter; extern quda::mgarray setup_tol; extern quda::mgarray setup_maxiter; @@ -275,7 +279,16 @@ extern quda::mgarray setup_ca_basis; extern quda::mgarray setup_ca_basis_size; extern quda::mgarray setup_ca_lambda_min; extern quda::mgarray setup_ca_lambda_max; -extern quda::mgarray setup_type; + +// Parameters for Chebyshev filter setup +extern quda::mgarray filter_startup_vectors; +extern quda::mgarray filter_startup_iterations; +extern quda::mgarray filter_startup_rescale_frequency; +extern quda::mgarray filter_iterations_between_vectors; +extern quda::mgarray filter_lambda_min; +extern quda::mgarray filter_lambda_max; + + extern bool pre_orthonormalize; extern bool post_orthonormalize; extern double omega; diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index dd0f10e256..409188343a 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -128,6 +128,14 @@ void setQudaDefaultMgTestParams() mg_eig_amax[i] = -1.0; // use power iterations mg_eig_save_prec[i] = QUDA_DOUBLE_PRECISION; + setup_type[i] = QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS; + filter_startup_vectors[i] = 1; + filter_startup_iterations[i] = 1000; + filter_startup_rescale_frequency[i] = 50; + filter_iterations_between_vectors[i] = 150; + filter_lambda_min[i] = 1.0; + filter_lambda_max[i] = -1.0; + setup_ca_basis[i] = QUDA_POWER_BASIS; setup_ca_basis_size[i] = 4; setup_ca_lambda_min[i] = 0.0; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 569cae8643..3876eddf16 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -260,7 +260,22 @@ inline void safe_strcpy(char *cstr, const std::string &str, size_t limit, const } // MG param types +/** + @brief Set multigrid parameters that are universal across Wilson-type + and staggered-type +*/ void setMultigridParam(QudaMultigridParam &mg_param); + +/** + @brief Set multigrid parameters that are specific to Wilson-type + multigrid. Calls setUniversalMultigridParam. +*/ +void setWilsonMultigridParam(QudaMultigridParam &mg_param); + +/** + @brief Set multigrid parameters that are specific to staggered-type + multigrid. Calls setUniversalMultigridParam. +*/ void setStaggeredMultigridParam(QudaMultigridParam &mg_param); // Eig param types diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 021abe3da0..f89d2b6b1c 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -343,6 +343,7 @@ void setEigParam(QudaEigParam &eig_param) eig_param.struct_size = sizeof(eig_param); } + void setMultigridParam(QudaMultigridParam &mg_param) { QudaInvertParam &inv_param = *mg_param.invert_param; // this will be used to setup SolverParam parent in MGParam class @@ -360,60 +361,21 @@ void setMultigridParam(QudaMultigridParam &mg_param) inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; - if (kappa == -1.0) { - inv_param.mass = mass; - inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass)); - } else { - inv_param.kappa = kappa; - inv_param.mass = 0.5 / kappa - (1 + 3 / anisotropy); - } - - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH - || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - inv_param.clover_cpu_prec = cpu_prec; - inv_param.clover_cuda_prec = cuda_prec; - inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; - inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; - inv_param.clover_cuda_prec_eigensolver = cuda_prec_eigensolver; - inv_param.clover_cuda_prec_refinement_sloppy = cuda_prec_refinement_sloppy; - inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; - // Use kappa * csw or supplied clover_coeff - inv_param.clover_csw = clover_csw; - if (clover_coeff == 0.0) { - inv_param.clover_coeff = clover_csw * inv_param.kappa; - } else { - inv_param.clover_coeff = clover_coeff; - } - inv_param.compute_clover_trlog = compute_clover_trlog ? 1 : 0; - } - inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.dslash_type = dslash_type; - if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - inv_param.mu = mu; - inv_param.epsilon = epsilon; - inv_param.twist_flavor = twist_flavor; - inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1; - - if (twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { - printfQuda("Twisted-mass doublet non supported (yet)\n"); - exit(0); - } - } - inv_param.dagger = QUDA_DAG_NO; - inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; - inv_param.matpc_type = matpc_type; inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.solve_type = QUDA_DIRECT_SOLVE; - mg_param.invert_param = &inv_param; + mg_param.use_mma = mg_use_mma ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + mg_param.n_level = mg_levels; + for (int i = 0; i < mg_param.n_level; i++) { for (int j = 0; j < 4; j++) { // if not defined use 4 @@ -434,15 +396,22 @@ void setMultigridParam(QudaMultigridParam &mg_param) // Basis to use for CA solver setups mg_param.setup_ca_basis[i] = setup_ca_basis[i]; - // Basis size for CA solver setup + // Basis size for CA solver setups mg_param.setup_ca_basis_size[i] = setup_ca_basis_size[i]; // Minimum and maximum eigenvalue for Chebyshev CA basis setup mg_param.setup_ca_lambda_min[i] = setup_ca_lambda_min[i]; mg_param.setup_ca_lambda_max[i] = setup_ca_lambda_max[i]; + // Parameters for Chebyshev filter multigrid setup + mg_param.filter_startup_vectors[i] = filter_startup_vectors[i]; + mg_param.filter_startup_iterations[i] = filter_startup_iterations[i]; + mg_param.filter_startup_rescale_frequency[i] = filter_startup_rescale_frequency[i]; + mg_param.filter_iterations_between_vectors[i] = filter_iterations_between_vectors[i]; + mg_param.filter_lambda_min[i] = filter_lambda_min[i]; + mg_param.filter_lambda_max[i] = filter_lambda_max[i]; + mg_param.spin_block_size[i] = 1; - mg_param.n_vec[i] = nvec[i] == 0 ? 24 : nvec[i]; // default to 24 vectors if not set mg_param.n_block_ortho[i] = n_block_ortho[i]; // number of times to Gram-Schmidt mg_param.block_ortho_two_pass[i] = block_ortho_two_pass[i] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; // whether to use a two-pass block ortho @@ -454,9 +423,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.cycle_type[i] = QUDA_MG_CYCLE_RECURSIVE; - // Is not a staggered solve, always aggregate - mg_param.transfer_type[i] = QUDA_TRANSFER_AGGREGATE; - // set the coarse solver wrappers including bottom solver mg_param.coarse_solver[i] = coarse_solver[i]; mg_param.coarse_solver_tol[i] = coarse_solver_tol[i]; @@ -497,7 +463,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.smoother_solver_ca_lambda_min[i] = smoother_solver_ca_lambda_min[i]; mg_param.smoother_solver_ca_lambda_max[i] = smoother_solver_ca_lambda_max[i]; - // Set set coarse_grid_solution_type: this defines which linear + // Set set coarse_grid_solution_type: this defines which linear // system we are solving on a given level // * QUDA_MAT_SOLUTION - we are solving the full system and inject // a full field into coarse grid @@ -543,15 +509,15 @@ void setMultigridParam(QudaMultigridParam &mg_param) if (i == 0) { // top-level treatment if (coarse_solve_type[0] != solve_type) - errorQuda("Mismatch between top-level MG solve type %d and outer solve type %d", coarse_solve_type[0], - solve_type); + errorQuda("Mismatch between top-level MG solve type %s and outer solve type %s", + get_solve_str(coarse_solve_type[0]), get_solve_str(solve_type)); if (solve_type == QUDA_DIRECT_SOLVE) { mg_param.coarse_grid_solution_type[i] = QUDA_MAT_SOLUTION; } else if (solve_type == QUDA_DIRECT_PC_SOLVE) { mg_param.coarse_grid_solution_type[i] = QUDA_MATPC_SOLUTION; } else { - errorQuda("Unexpected solve_type = %d\n", solve_type); + errorQuda("Unexpected solve_type = %s\n", get_solve_str(solve_type)); } } else { @@ -561,7 +527,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) } else if (coarse_solve_type[i] == QUDA_DIRECT_PC_SOLVE) { mg_param.coarse_grid_solution_type[i] = QUDA_MATPC_SOLUTION; } else { - errorQuda("Unexpected solve_type = %d\n", coarse_solve_type[i]); + errorQuda("Unexpected solve_type = %s\n", get_solve_str(coarse_solve_type[i])); } } @@ -574,9 +540,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE; - // only coarsen the spin on the first restriction - mg_param.spin_block_size[0] = 2; - mg_param.pre_orthonormalize = pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.post_orthonormalize = post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; @@ -584,18 +547,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.run_low_mode_check = low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; mg_param.run_oblique_proj_check = oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.use_mma = mg_use_mma ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - // Whether or not to use thin restarts in the evolve tests - mg_param.thin_update_only = mg_evolve_thin_updates ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // whether or not to let MG coarsening drop improvements - // ex: for asqtad, dropping the long links for aggregation dimensions smaller than 3 - mg_param.allow_truncation = mg_allow_truncation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // whether or not to use the dagger approximation to Xinv, which is X^dagger - mg_param.staggered_kd_dagger_approximation - = mg_staggered_kd_dagger_approximation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - // set file i/o parameters for (int i = 0; i < mg_param.n_level; i++) { safe_strcpy(mg_param.vec_infile[i], mg_vec_infile[i], 256, "mg_vec_infile[" + std::to_string(i) + "]"); @@ -606,7 +557,12 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.coarse_guess = mg_eig_coarse_guess ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.struct_size = sizeof(mg_param); + // Whether or not to use thin restarts in the evolve tests + mg_param.thin_update_only = mg_evolve_thin_updates ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + + // whether or not to let MG coarsening drop improvements + // ex: for asqtad, dropping the long links for aggregation dimensions smaller than 3 + mg_param.allow_truncation = mg_allow_truncation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; // these need to tbe set for now but are actually ignored by the MG setup // needed to make it pass the initialization test @@ -619,6 +575,70 @@ void setMultigridParam(QudaMultigridParam &mg_param) inv_param.verbosity = verbosity; inv_param.verbosity_precondition = verbosity; + inv_param.struct_size = sizeof(inv_param); +} + +void setWilsonMultigridParam(QudaMultigridParam &mg_param) +{ + QudaInvertParam &inv_param = *mg_param.invert_param; // this will be used to setup SolverParam parent in MGParam class + + setMultigridParam(mg_param); + + if (kappa == -1.0) { + inv_param.mass = mass; + inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass)); + } else { + inv_param.kappa = kappa; + inv_param.mass = 0.5 / kappa - (1 + 3 / anisotropy); + } + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH + || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { + inv_param.clover_cpu_prec = cpu_prec; + inv_param.clover_cuda_prec = cuda_prec; + inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; + inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; + inv_param.clover_cuda_prec_eigensolver = cuda_prec_eigensolver; + inv_param.clover_cuda_prec_refinement_sloppy = cuda_prec_refinement_sloppy; + inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + // Use kappa * csw or supplied clover_coeff + inv_param.clover_csw = clover_csw; + if (clover_coeff == 0.0) { + inv_param.clover_coeff = clover_csw * inv_param.kappa; + } else { + inv_param.clover_coeff = clover_coeff; + } + inv_param.compute_clover_trlog = compute_clover_trlog ? 1 : 0; + } + + if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + inv_param.mu = mu; + inv_param.epsilon = epsilon; + inv_param.twist_flavor = twist_flavor; + inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1; + + if (twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + printfQuda("Twisted-mass doublet non supported (yet)\n"); + exit(0); + } + } + + inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; + + mg_param.invert_param = &inv_param; + + for (int i = 0; i < mg_param.n_level; i++) { + + mg_param.n_vec[i] = nvec[i] == 0 ? 24 : nvec[i]; // default to 24 vectors if not set + + // Is not a staggered solve, always aggregate + mg_param.transfer_type[i] = QUDA_TRANSFER_AGGREGATE; + + } + + // only coarsen the spin on the first restriction + mg_param.spin_block_size[0] = 2; + // Use kappa * csw or supplied clover_coeff inv_param.clover_csw = clover_csw; if (clover_coeff == 0.0) { @@ -970,244 +990,38 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) void setStaggeredMultigridParam(QudaMultigridParam &mg_param) { - QudaInvertParam &inv_param = *mg_param.invert_param; // this will be used to setup SolverParam parent in MGParam class - - // Whether or not to use native BLAS LAPACK - inv_param.native_blas_lapack = (native_blas_lapack ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE); - - inv_param.Ls = 1; - inv_param.cpu_prec = cpu_prec; - inv_param.cuda_prec = cuda_prec; - inv_param.cuda_prec_sloppy = cuda_prec_sloppy; - inv_param.cuda_prec_precondition = cuda_prec_precondition; - inv_param.cuda_prec_eigensolver = cuda_prec_eigensolver; - inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - inv_param.dirac_order = QUDA_DIRAC_ORDER; - - inv_param.input_location = QUDA_CPU_FIELD_LOCATION; - inv_param.output_location = QUDA_CPU_FIELD_LOCATION; + QudaInvertParam &inv_param = *mg_param.invert_param; // this will be used to setup SolverParam parent in MGParam class - inv_param.dslash_type = dslash_type; + setMultigridParam(mg_param); inv_param.mass = mass; inv_param.kappa = 1.0 / (2.0 * (4.0 + inv_param.mass)); - inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_MASS_NORMALIZATION; - inv_param.matpc_type = matpc_type; - inv_param.solution_type = QUDA_MAT_SOLUTION; - - inv_param.solve_type = QUDA_DIRECT_SOLVE; - - mg_param.use_mma = mg_use_mma ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // whether or not to allow dropping the long links for aggregation dimensions smaller than 3 - mg_param.allow_truncation = mg_allow_truncation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // whether or not to use the dagger approximation to Xinv, which is X^dagger - mg_param.staggered_kd_dagger_approximation - = mg_staggered_kd_dagger_approximation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.invert_param = &inv_param; - mg_param.n_level = mg_levels; + for (int i = 0; i < mg_param.n_level; i++) { - for (int j = 0; j < 4; j++) { - // if not defined use 4 - mg_param.geo_block_size[i][j] = geo_block_size[i][j] ? geo_block_size[i][j] : 4; - } - mg_param.use_eig_solver[i] = mg_eig[i] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.verbosity[i] = mg_verbosity[i]; - mg_param.setup_inv_type[i] = setup_inv[i]; - mg_param.num_setup_iter[i] = num_setup_iter[i]; - mg_param.setup_tol[i] = setup_tol[i]; - mg_param.setup_maxiter[i] = setup_maxiter[i]; - - // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) - mg_param.setup_type[i] = setup_type[i]; - - // Basis to use for CA solver setups - mg_param.setup_ca_basis[i] = setup_ca_basis[i]; - - // Basis size for CA solver setups - mg_param.setup_ca_basis_size[i] = setup_ca_basis_size[i]; - - // Minimum and maximum eigenvalue for Chebyshev CA basis setup - mg_param.setup_ca_lambda_min[i] = setup_ca_lambda_min[i]; - mg_param.setup_ca_lambda_max[i] = setup_ca_lambda_max[i]; - mg_param.spin_block_size[i] = 1; mg_param.n_vec[i] = nvec[i] == 0 ? 64 : nvec[i]; // default to 64 vectors if not set - mg_param.n_block_ortho[i] = n_block_ortho[i]; // number of times to Gram-Schmidt - mg_param.block_ortho_two_pass[i] - = block_ortho_two_pass[i] ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; // whether to use a two-pass block ortho - mg_param.precision_null[i] = prec_null; // precision to store the null-space basis - mg_param.smoother_halo_precision[i] = smoother_halo_prec; // precision of the halo exchange in the smoother - mg_param.nu_pre[i] = nu_pre[i]; - mg_param.nu_post[i] = nu_post[i]; - mg_param.mu_factor[i] = mu_factor[i]; mg_param.transfer_type[i] = (i == 0) ? staggered_transfer_type : QUDA_TRANSFER_AGGREGATE; - mg_param.cycle_type[i] = QUDA_MG_CYCLE_RECURSIVE; - - // set the coarse solver wrappers including bottom solver - mg_param.coarse_solver[i] = coarse_solver[i]; - mg_param.coarse_solver_tol[i] = coarse_solver_tol[i]; - mg_param.coarse_solver_maxiter[i] = coarse_solver_maxiter[i]; - - // Basis to use for CA coarse solvers - mg_param.coarse_solver_ca_basis[i] = coarse_solver_ca_basis[i]; - - // Basis size for CA coarse solvers - mg_param.coarse_solver_ca_basis_size[i] = coarse_solver_ca_basis_size[i]; - - // Minimum and maximum eigenvalue for Chebyshev CA basis - mg_param.coarse_solver_ca_lambda_min[i] = coarse_solver_ca_lambda_min[i]; - mg_param.coarse_solver_ca_lambda_max[i] = coarse_solver_ca_lambda_max[i]; - - mg_param.smoother[i] = smoother_type[i]; - - // set the smoother / bottom solver tolerance (for MR smoothing this will be ignored) - mg_param.smoother_tol[i] = smoother_tol[i]; - - // set to QUDA_DIRECT_SOLVE for no even/odd preconditioning on the smoother - // set to QUDA_DIRECT_PC_SOLVE for to enable even/odd preconditioning on the smoother - mg_param.smoother_solve_type[i] = smoother_solve_type[i]; - - // set to QUDA_ADDITIVE_SCHWARZ for Additive Schwarz precondioned smoother (presently only impelemented for MR) - mg_param.smoother_schwarz_type[i] = mg_schwarz_type[i]; - - // if using Schwarz preconditioning then use local reductions only - mg_param.global_reduction[i] = (mg_schwarz_type[i] == QUDA_INVALID_SCHWARZ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // set number of Schwarz cycles to apply - mg_param.smoother_schwarz_cycle[i] = mg_schwarz_cycle[i]; - - // Basis to use for CA smoothers - mg_param.smoother_solver_ca_basis[i] = smoother_solver_ca_basis[i]; - - // Minimum and maximum eigenvalue for Chebyshev CA basis smoothers - mg_param.smoother_solver_ca_lambda_min[i] = smoother_solver_ca_lambda_min[i]; - mg_param.smoother_solver_ca_lambda_max[i] = smoother_solver_ca_lambda_max[i]; - - // Set set coarse_grid_solution_type: this defines which linear - // system we are solving on a given level - // * QUDA_MAT_SOLUTION - we are solving the full system and inject - // a full field into coarse grid - // * QUDA_MATPC_SOLUTION - we are solving the e/o-preconditioned - // system, and only inject single parity field into coarse grid - // - // Multiple possible scenarios here - // - // 1. **Direct outer solver and direct smoother**: here we use - // full-field residual coarsening, and everything involves the - // full system so coarse_grid_solution_type = QUDA_MAT_SOLUTION - // - // 2. **Direct outer solver and preconditioned smoother**: here, - // only the smoothing uses e/o preconditioning, so - // coarse_grid_solution_type = QUDA_MAT_SOLUTION_TYPE. - // We reconstruct the full residual prior to coarsening after the - // pre-smoother, and then need to project the solution for post - // smoothing. - // - // 3. **Preconditioned outer solver and preconditioned smoother**: - // here we use single-parity residual coarsening throughout, so - // coarse_grid_solution_type = QUDA_MATPC_SOLUTION. This is a bit - // questionable from a theoretical point of view, since we don't - // coarsen the preconditioned operator directly, rather we coarsen - // the full operator and preconditioned that, but it just works. - // This is the optimal combination in general for Wilson-type - // operators: although there is an occasional increase in - // iteration or two), by working completely in the preconditioned - // space, we save the cost of reconstructing the full residual - // from the preconditioned smoother, and re-projecting for the - // subsequent smoother, as well as reducing the cost of the - // ancillary blas operations in the coarse-grid solve. - // - // Note, we cannot use preconditioned outer solve with direct - // smoother - // - // Finally, we have to treat the top level carefully: for all - // other levels the entry into and out of the grid will be a - // full-field, which we can then work in Schur complement space or - // not (e.g., freedom to choose coarse_grid_solution_type). For - // the top level, if the outer solver is for the preconditioned - // system, then we must use preconditoning, e.g., option 3.) above. - - if (i == 0) { // top-level treatment - if (coarse_solve_type[0] != solve_type) - errorQuda("Mismatch between top-level MG solve type %s and outer solve type %s", - get_solve_str(coarse_solve_type[0]), get_solve_str(solve_type)); - - if (solve_type == QUDA_DIRECT_SOLVE) { - mg_param.coarse_grid_solution_type[i] = QUDA_MAT_SOLUTION; - } else if (solve_type == QUDA_DIRECT_PC_SOLVE) { - mg_param.coarse_grid_solution_type[i] = QUDA_MATPC_SOLUTION; - } else { - errorQuda("Unexpected solve_type = %s\n", get_solve_str(solve_type)); - } - - } else { - - if (coarse_solve_type[i] == QUDA_DIRECT_SOLVE) { - mg_param.coarse_grid_solution_type[i] = QUDA_MAT_SOLUTION; - } else if (coarse_solve_type[i] == QUDA_DIRECT_PC_SOLVE) { - mg_param.coarse_grid_solution_type[i] = QUDA_MATPC_SOLUTION; - } else { - errorQuda("Unexpected solve_type = %s\n", get_solve_str(coarse_solve_type[i])); - } - } - - mg_param.omega[i] = omega; // over/under relaxation factor - - mg_param.location[i] = solver_location[i]; - mg_param.setup_location[i] = setup_location[i]; - nu_pre[i] = 2; - nu_post[i] = 2; } - // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory - mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE; - // coarsening the spin on the first restriction is undefined for staggered fields. mg_param.spin_block_size[0] = 0; + // whether or not to use the dagger approximation to Xinv, which is X^dagger + mg_param.staggered_kd_dagger_approximation + = mg_staggered_kd_dagger_approximation ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + if (staggered_transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || staggered_transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) { mg_param.spin_block_size[1] = 0; // we're coarsening the optimized KD op } - mg_param.pre_orthonormalize = pre_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.post_orthonormalize = post_orthonormalize ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - mg_param.run_verify = verify_results ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.run_low_mode_check = low_mode_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - mg_param.run_oblique_proj_check = oblique_proj_check ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // set file i/o parameters - for (int i = 0; i < mg_param.n_level; i++) { - safe_strcpy(mg_param.vec_infile[i], mg_vec_infile[i], 256, "mg_vec_infile[" + std::to_string(i) + "]"); - safe_strcpy(mg_param.vec_outfile[i], mg_vec_outfile[i], 256, "mg_vec_outfile[" + std::to_string(i) + "]"); - if (mg_vec_infile[i].size() > 0) mg_param.vec_load[i] = QUDA_BOOLEAN_TRUE; - if (mg_vec_outfile[i].size() > 0) mg_param.vec_store[i] = QUDA_BOOLEAN_TRUE; - } - - mg_param.coarse_guess = mg_eig_coarse_guess ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - - // these need to tbe set for now but are actually ignored by the MG setup - // needed to make it pass the initialization test - inv_param.inv_type = QUDA_GCR_INVERTER; - inv_param.tol = 1e-10; - inv_param.maxiter = 1000; - inv_param.reliable_delta = reliable_delta; - inv_param.gcrNkrylov = 10; - - inv_param.verbosity = verbosity; - inv_param.verbosity_precondition = verbosity; - - inv_param.struct_size = sizeof(inv_param); } void setDeflatedInvertParam(QudaInvertParam &inv_param) From f954cec3b97ba65add2736301eae3b5d02b575ae Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 5 May 2022 00:34:07 -0400 Subject: [PATCH 06/15] Various cleanup --- include/invert_quda.h | 6 + include/multigrid.h | 16 +++ lib/inv_cg_quda.cpp | 2 +- lib/multigrid.cpp | 307 ++++++++++++++++++------------------------ lib/solver.cpp | 20 +++ 5 files changed, 175 insertions(+), 176 deletions(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index e4b9925c72..1c79a1b178 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -1694,4 +1694,10 @@ namespace quda { */ bool is_ca_solver(QudaInverterType type); + /** + @brief Returns if a solver supports deflation or not + @return true if solver supports deflation, false otherwise + */ + bool is_deflatable_solver(QudaInverterType type); + } // namespace quda diff --git a/include/multigrid.h b/include/multigrid.h index 53255ef5e3..b2e264156c 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -438,6 +438,14 @@ namespace quda { */ void generateFreeVectors(); + /** + @brief Orthonormalize a vector of ColorSpinorField, erroring out if + the fields are not sufficiently linearly independent + @param vecs vector of ColorSpinorFields to normalize + @param count number of near-null vectors to orthonormalize, default all + */ + void orthonormalize_vectors(std::vector& vecs, int count = -1) const; + /** @brief Return the total flops done on this and all coarser levels. */ @@ -456,6 +464,14 @@ namespace quda { return (param.level == 0 || kd_nearnull_gen); } + + /** + @brief Return if we're on the coarsest grid right now + */ + bool is_coarsest_grid() const { + return (param.level == (param.Nlevel - 1)); + } + }; /** diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp index 417f13f8f4..19627367a4 100644 --- a/lib/inv_cg_quda.cpp +++ b/lib/inv_cg_quda.cpp @@ -567,7 +567,7 @@ namespace quda { } // if L2 broke down already we turn off reliable updates and restart the CG - if (ru.reliable_heavy_quark_break(L2breakdown, heavy_quark_res, heavy_quark_res_old, heavy_quark_restart)) { + if (use_heavy_quark_res && ru.reliable_heavy_quark_break(L2breakdown, heavy_quark_res, heavy_quark_res_old, heavy_quark_restart)) { break; } diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 5ba5706c36..8e70b3a039 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -86,7 +86,19 @@ namespace quda rng = new RNG(*param.B[0], 1234); - if (param.transfer_type == QUDA_TRANSFER_AGGREGATE && param.level < param.Nlevel - 1) { + if (!is_coarsest_grid()) { + // allocate coarse temporary vectors + tmp_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); + tmp2_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); + r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); + x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); + } + + if (param.transfer_type == QUDA_TRANSFER_AGGREGATE && !is_coarsest_grid()) { constructNearNulls(); @@ -119,13 +131,13 @@ namespace quda if (param.level != 0 || param.transfer_type == QUDA_TRANSFER_AGGREGATE) { // Refresh the null-space vectors if we need to // FIXME: can we meaningfully refresh Chebyshev filter null vecs? - if (refresh && param.level < param.Nlevel - 1) { + if (refresh && !is_coarsest_grid()) { if (param.mg_global.setup_maxiter_refresh[param.level]) constructNearNulls(refresh); } } // if not on the coarsest level, update next - if (param.level < param.Nlevel-1) { + if (!is_coarsest_grid()) { if (transfer) { // restoring FULL parity in Transfer changed at the end of this procedure @@ -142,26 +154,6 @@ namespace quda param.mg_global.transfer_type[param.level], profile); for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse temporary vector if not already created in verify() - if (!tmp2_coarse) - tmp2_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - int n_vec_coarse = param.mg_global.n_vec[param.level + 1]; B_coarse.resize(n_vec_coarse); @@ -183,7 +175,7 @@ namespace quda // delay allocating smoother until after coarse-links have been created createSmoother(); - if (param.level < param.Nlevel-1) { + if (!is_coarsest_grid()) { // If enabled, verify the coarse links and fine solvers were correctly built if (param.mg_global.run_verify) verify(); @@ -300,7 +292,7 @@ namespace quda pushLevel(param.level); // create the smoother for this level - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating smoother\n"); + logQuda(QUDA_VERBOSE, "Creating smoother\n"); destroySmoother(); param_presmooth = new SolverParam(param); @@ -318,7 +310,7 @@ namespace quda param_presmooth->inv_type_precondition = QUDA_INVALID_INVERTER; param_presmooth->residual_type = (param_presmooth->inv_type == QUDA_MR_INVERTER) ? QUDA_INVALID_RESIDUAL : QUDA_L2_RELATIVE_RESIDUAL; param_presmooth->Nsteps = param.mg_global.smoother_schwarz_cycle[param.level]; - param_presmooth->maxiter = (param.level < param.Nlevel-1) ? param.nu_pre : param.nu_pre + param.nu_post; + param_presmooth->maxiter = (is_coarsest_grid()) ? (param.nu_pre + param.nu_post) : param.nu_pre; param_presmooth->Nkrylov = param_presmooth->maxiter; param_presmooth->pipeline = param_presmooth->maxiter; @@ -338,12 +330,12 @@ namespace quda // inner solver should recompute the true residual after each cycle if using Schwarz preconditioning param_presmooth->compute_true_res = (param_presmooth->schwarz_type != QUDA_INVALID_SCHWARZ) ? true : false; - presmoother = ((param.level < param.Nlevel - 1 || param_presmooth->schwarz_type != QUDA_INVALID_SCHWARZ) + presmoother = ((!is_coarsest_grid() || param_presmooth->schwarz_type != QUDA_INVALID_SCHWARZ) && param_presmooth->inv_type != QUDA_INVALID_INVERTER && param_presmooth->maxiter > 0) ? Solver::create(*param_presmooth, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, *param.matSmoothSloppy, profile) : nullptr; - if (param.level < param.Nlevel - 1) { // Create the post smoother + if (!is_coarsest_grid()) { // Create the post smoother param_postsmooth = new SolverParam(*param_presmooth); param_postsmooth->return_residual = false; // post smoother does not need to return the residual vector param_postsmooth->use_init_guess = QUDA_USE_INIT_GUESS_YES; @@ -633,11 +625,8 @@ namespace quda param_coarse_solver->use_init_guess = QUDA_USE_INIT_GUESS_YES; } - // Deflation on the coarse is supported for 6 solvers only - if (param_coarse_solver->inv_type != QUDA_CA_CGNR_INVERTER && param_coarse_solver->inv_type != QUDA_CGNR_INVERTER - && param_coarse_solver->inv_type != QUDA_CA_CGNE_INVERTER && param_coarse_solver->inv_type != QUDA_CGNE_INVERTER - && param_coarse_solver->inv_type != QUDA_CA_GCR_INVERTER && param_coarse_solver->inv_type != QUDA_GCR_INVERTER - && param_coarse_solver->inv_type != QUDA_BICGSTABL_INVERTER) { + // Deflation is not supported for all solvers + if (!is_deflatable_solver(param_coarse_solver->inv_type)) { errorQuda("Coarse grid deflation not supported with coarse solver %d", param_coarse_solver->inv_type); } @@ -719,8 +708,7 @@ namespace quda // vectors stored in the parent MG instead coarse_solver_inner.setDeflateCompute(false); coarse_solver_inner.setRecomputeEvals(true); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Transferring deflation space size %d to coarse solver\n", defl_size); + logQuda(QUDA_VERBOSE,"Transferring deflation space size %d to coarse solver\n", defl_size); // Create space in coarse solver to hold deflation space, destroy space in MG. coarse_solver_inner.injectDeflationSpace(evecs); } @@ -734,11 +722,11 @@ namespace quda param_coarse_solver->maxiter = param.mg_global.coarse_solver_maxiter[param.level + 1]; } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Assigned coarse solver to preconditioned GCR solver\n"); + logQuda(QUDA_VERBOSE, "Assigned coarse solver to preconditioned GCR solver\n"); } else { errorQuda("Multigrid cycle type %d not supported", param.cycle_type); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Coarse solver wrapper done\n"); + logQuda(QUDA_VERBOSE, "Coarse solver wrapper done\n"); popLevel(); } @@ -791,6 +779,23 @@ namespace quda popLevel(); } + void MG::orthonormalize_vectors(std::vector& vecs, int count) const { + int num_vec = (count == -1) ? static_cast(vecs.size()) : count; + + if (num_vec > (int)vecs.size()) + errorQuda("Trying to orthonormalize %d vectors which is larger than std::vector size %ld", num_vec, vecs.size()); + + for (int i = 0; i < num_vec; i++) { + for (int j = 0; j < i ; j++) { + Complex alpha = cDotProduct(*vecs[j], *vecs[i]);// + caxpy(-alpha, *vecs[j], *vecs[i]); // i-j + } + double nrm2 = norm2(*vecs[i]); + if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *vecs[i]);// i/ + else errorQuda("Cannot normalize %u vector", i); + } + } + // FIXME need to make this more robust (implement Solver::flops() for all solvers) double MG::flops() const { double flops = 0; @@ -851,8 +856,7 @@ namespace quda // No need to check (projector) v_k for staggered case if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) { - if (getVerbosity() >= QUDA_SUMMARIZE) - printfQuda("Checking 0 = (1 - P P^\\dagger) v_k for %d vectors\n", param.Nvec); + logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P P^\\dagger) v_k for %d vectors\n", param.Nvec); for (int i = 0; i < param.Nvec; i++) { // as well as copying to the correct location this also changes basis if necessary @@ -917,28 +921,6 @@ namespace quda } #endif - // We need to create temporary coarse vectors here for various verifications - // Otherwise these get created in the coarse `reset()` routine later - - if (!tmp_coarse) - tmp_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse temporary vector if not already created in verify() - if (!tmp2_coarse) - tmp2_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - - // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); - if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Checking 0 = (1 - P^\\dagger P) eta_c\n"); spinorNoise(*x_coarse, *rng, QUDA_NOISE_UNIFORM); @@ -1218,7 +1200,7 @@ namespace quda if ( debug ) printfQuda("entering V-cycle with x2=%e, r2=%e\n", norm2(x), norm2(b)); - if (param.level < param.Nlevel-1) { + if (!is_coarsest_grid()) { //transfer->setTransferGPU(false); // use this to force location of transfer (need to check if still works for multi-level) // do the pre smoothing @@ -1367,17 +1349,26 @@ namespace quda void MG::dumpNullVectors() const { - if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { - warningQuda("Cannot dump near-null vectors for top level of staggered MG solve."); - } else { - saveVectors(param.B); + if (!is_coarsest_grid()) { + if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { + warningQuda("Cannot dump near-null vectors for top level of staggered MG solve."); + } else { + saveVectors(param.B); + } + coarse->dumpNullVectors(); } - if (param.level < param.Nlevel - 2) coarse->dumpNullVectors(); } void MG::constructNearNulls(bool refresh) { + pushLevel(param.level); + if (is_coarsest_grid()) { + logQuda(QUDA_VERBOSE, "On coarsest level, not generating new near nulls..."); + popLevel(); + return; + } + auto setup_type = param.mg_global.setup_type[param.level]; // if a near-null vector input file is specified, always load it and @@ -1404,22 +1395,44 @@ namespace quda // multiple setup iterations only matters for test vector setup if (param.mg_global.num_setup_iter[param.level] > 1 && setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) - errorQuda("Multiple setup iterations can only be used with test vector setup"); + errorQuda("Multiple setup iterations can only be used with test vector setup"); - // Initializing to random vectors + // Initializing to random vectors // FIXME: do not do if in the *middle* of test vector setup - int num_initialize = (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) ? param.mg_global.filter_startup_vectors[param.level] : (int)param.B.size(); - for (int i = 0; i < num_initialize; i++) { - spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - *param.B[i] = *r; + if (!refresh) { + int num_initialize = (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) ? param.mg_global.filter_startup_vectors[param.level] : (int)param.B.size(); + for (int i = 0; i < num_initialize; i++) { + spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); + *param.B[i] = *r; + } } - if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || + for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); + + if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { - generateInverseIterations(param.B, refresh); - } else { - // Chebyshev filter - generateChebyshevFilter(param.B); + generateInverseIterations(param.B, refresh); + } else { + // Chebyshev filter + generateChebyshevFilter(param.B); + } + + // recurse when doing test setups + if (setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS && + param.mg_global.setup_inv_type[param.level] == QUDA_MG_INVERTER) { + if (transfer) { + resetTransfer = true; + reset(); + if ( param.level < param.Nlevel - 2) { + coarse->constructNearNulls(); + } + } else { + reset(); + } + } + } } else { errorQuda("Invalid setup type %d", setup_type); @@ -1441,7 +1454,6 @@ namespace quda solverParam.tol = param.mg_global.setup_tol[param.level]; solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; solverParam.delta = 1e-1; - solverParam.tol_hq = 0; solverParam.inv_type = param.mg_global.setup_inv_type[param.level]; // Hard coded for now... if (is_ca_solver(solverParam.inv_type)) { @@ -1465,7 +1477,7 @@ namespace quda solverParam.precision_precondition = solverParam.precision; } - solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); + solverParam.residual_type = QUDA_L2_RELATIVE_RESIDUAL; solverParam.compute_null_vector = true; ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering @@ -1482,7 +1494,7 @@ namespace quda bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Disabling Schwarz for null-space finding"); int commDim[QUDA_MAX_DIM]; for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; diracSmootherSloppy->setCommDim(commDim); @@ -1522,71 +1534,38 @@ namespace quda *param.matSmoothSloppy, profile); } - for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); - - // global orthonormalization of the initial null-space vectors - if(param.mg_global.pre_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector\n", i); - } - } - - // launch solver for each source - for (int i=0; i<(int)B.size(); i++) { - if (param.mg_global.setup_type[param.level] == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { // DDalphaAMG test vector idea - b = *B[i]; // inverting against the vector - zero(x); // with zero initial guess - } else { - x = *B[i]; - zero(b); - } - - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); - - ColorSpinorField *out=nullptr, *in=nullptr; - diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); - (*solve)(*out, *in); - diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); + // global orthonormalization of the initial null-space vectors + if(param.mg_global.pre_orthonormalize) { + orthonormalize_vectors(B); + } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); - *B[i] = x; + // launch solver for each source + for (int i=0; i<(int)B.size(); i++) { + if (param.mg_global.setup_type[param.level] == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { // DDalphaAMG test vector idea + b = *B[i]; // inverting against the vector + zero(x); // with zero initial guess + } else { + x = *B[i]; + zero(b); } - // global orthonormalization of the generated null-space vectors - if (param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); - } + if (getVerbosity() >= QUDA_VERBOSE) { + printfQuda("Initial guess = %g\n", norm2(x)); + printfQuda("Initial rhs = %g\n", norm2(b)); } - if (solverParam.inv_type == QUDA_MG_INVERTER) { + ColorSpinorField *out=nullptr, *in=nullptr; + diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); + (*solve)(*out, *in); + diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); - if (transfer) { - resetTransfer = true; - reset(); - if ( param.level < param.Nlevel-2 ) { - coarse->constructNearNulls(); - } - } else { - reset(); - } - } + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); + *B[i] = x; + } + + // global orthonormalization of the generated null-space vectors + if (param.mg_global.post_orthonormalize) { + orthonormalize_vectors(B); } delete solve; @@ -1597,7 +1576,7 @@ namespace quda // reenable Schwarz if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Reenabling Schwarz for null-space finding"); int commDim[QUDA_MAX_DIM]; for (int i=0; isetCommDim(commDim); @@ -1637,7 +1616,7 @@ namespace quda 100, 10, tmp1, tmp2); } - logQuda(getVerbosity(), "Nums starting %d Filter iter %d Filter rescale %d Filter next %d Lambda min %e lambda max %e\n", + logQuda(QUDA_VERBOSE, "Nums starting %d Filter iter %d Filter rescale %d Filter next %d Lambda min %e lambda max %e\n", num_starting_vectors, filter_iterations, filter_rescale_freq, iterations_for_next, lambda_min, lambda_max); // create filter interpolators @@ -1654,15 +1633,7 @@ namespace quda // orthonormalize if (param.mg_global.pre_orthonormalize) { - for (int i = 0; i < num_starting_vectors; i++) { - for (int j = 0; j < i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]);// - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector\n", i); - } + orthonormalize_vectors(B, num_starting_vectors); } for (int s = 0; s < num_starting_vectors; s++) { @@ -1689,7 +1660,7 @@ namespace quda // heuristic rescale to keep norms in check... if (k % filter_rescale_freq == 0) { double tmp_nrm2 = blas::norm2(pk); - logQuda(getVerbosity(), "Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); + logQuda(QUDA_VERBOSE, "Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); ax(tmp_inv_nrm, pk); ax(tmp_inv_nrm, pkm1); @@ -1702,7 +1673,7 @@ namespace quda // print the norm (to check for enhancement, normalize) double nrm2 = blas::norm2(pk); - logQuda(getVerbosity(), "Enhanced norm2 for start %d is %e\n", s, nrm2); + logQuda(QUDA_VERBOSE, "Enhanced norm2 for start %d is %e\n", s, nrm2); ax(1.0 / sqrt(nrm2), pk); // This is the first filtered vector @@ -1746,15 +1717,7 @@ namespace quda // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - for (int i = 0; i < num_vec; i++) { - for (int j = 0; j < i ; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]);// - caxpy(-alpha, *B[j], *B[i]); // i-j - } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); - } + orthonormalize_vectors(B); } popLevel(); @@ -1780,8 +1743,7 @@ namespace quda // There needs to be 6 null vectors -> 12 after chirality. if (Nvec != 6) errorQuda("\nError in MG::buildFreeVectors: Wilson-type fermions require Nvec = 6"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Wilson-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Wilson-type fermions\n", Nvec); // Zero the null vectors. for (int i = 0; i < Nvec; i++) zero(*B[i]); @@ -1807,8 +1769,7 @@ namespace quda // There needs to be 24 null vectors -> 48 after chirality. if (Nvec != 24) errorQuda("\nError in MG::buildFreeVectors: Staggered-type fermions require Nvec = 24\n"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Staggered-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Staggered-type fermions\n", Nvec); // Zero the null vectors. for (int i = 0; i < Nvec; i++) zero(*B[i]); @@ -1879,7 +1840,7 @@ namespace quda // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. for (int i = 0; i < Nvec; i++) zero(*B[i]); @@ -1900,7 +1861,7 @@ namespace quda // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. for (int i = 0; i < Nvec; i++) zero(*B[i]); @@ -1922,14 +1883,10 @@ namespace quda // global orthonormalization of the generated null-space vectors if(param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/ - else errorQuda("\nCannot normalize %u vector\n", i); - } + orthonormalize_vectors(B); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Done building free vectors\n"); + logQuda(QUDA_VERBOSE, "Done building free vectors\n"); popLevel(); } @@ -2008,7 +1965,7 @@ namespace quda if (is_fine_grid()) errorQuda("There are no fine near-null vectors to restrict on level 0"); if (param.Nvec != param.fine->param.Nvec) errorQuda("The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match", param.fine->param.Nvec, param.Nvec); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); + logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); for (int i = 0; i < param.Nvec; i++) { zero(*(param.B[i])); diff --git a/lib/solver.cpp b/lib/solver.cpp index cd2f0b6b91..080b14a389 100644 --- a/lib/solver.cpp +++ b/lib/solver.cpp @@ -450,4 +450,24 @@ namespace quda { } } + /** + @brief Returns if a solver supports deflation or not + @return true if solver supports deflation, false otherwise + */ + bool is_deflatable_solver(QudaInverterType type) { + switch (type) { + case QUDA_CG_INVERTER: + case QUDA_CGNR_INVERTER: + case QUDA_CGNE_INVERTER: + case QUDA_PCG_INVERTER: + case QUDA_CA_CG_INVERTER: + case QUDA_CA_CGNR_INVERTER: + case QUDA_CA_CGNE_INVERTER: + case QUDA_GCR_INVERTER: + case QUDA_CA_GCR_INVERTER: + case QUDA_BICGSTABL_INVERTER: return true; + default: return false; + } + } + } // namespace quda From 4b3d7b6acdb2c71060cf977fe6250eb621f3b037 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 5 May 2022 14:52:31 -0400 Subject: [PATCH 07/15] Split out test vector setup, disabled for the time being. --- lib/multigrid.cpp | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 8e70b3a039..5fda2b9879 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -116,7 +116,7 @@ namespace quda void MG::reset(bool refresh) { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s level %d\n", transfer ? "Resetting" : "Creating", param.level); + logQuda(QUDA_VERBOSE, "%s level %d\n", transfer ? "Resetting" : "Creating", param.level); destroySmoother(); destroyCoarseSolver(); @@ -1174,7 +1174,7 @@ namespace quda { pushOutputPrefix(prefix); - if (param.level < param.Nlevel - 1) { // set parity for the solver in the transfer operator + if (!is_coarsest_grid()) { // set parity for the solver in the transfer operator QudaSiteSubset site_subset = param.coarse_grid_solution_type == QUDA_MATPC_SOLUTION ? QUDA_PARITY_SITE_SUBSET : QUDA_FULL_SITE_SUBSET; QudaMatPCType matpc_type = param.mg_global.invert_param->matpc_type; @@ -1372,8 +1372,8 @@ namespace quda auto setup_type = param.mg_global.setup_type[param.level]; // if a near-null vector input file is specified, always load it and - // ignore other setup information - if (strcmp(param.mg_global.vec_infile[param.level], "") != 0) { + // ignore other setup information, UNLESS we're refreshing + if (strcmp(param.mg_global.vec_infile[param.level], "") != 0 && !refresh) { loadVectors(); } else { @@ -1389,16 +1389,13 @@ namespace quda // Run the eigensolver generateEigenvectors(); } else if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || - setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER || - setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { + setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) { // multiple setup iterations only matters for test vector setup - if (param.mg_global.num_setup_iter[param.level] > 1 && - setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) + if (param.mg_global.num_setup_iter[param.level] > 1) errorQuda("Multiple setup iterations can only be used with test vector setup"); // Initializing to random vectors - // FIXME: do not do if in the *middle* of test vector setup if (!refresh) { int num_initialize = (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) ? param.mg_global.filter_startup_vectors[param.level] : (int)param.B.size(); for (int i = 0; i < num_initialize; i++) { @@ -1407,21 +1404,27 @@ namespace quda } } + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d\n", param.level); + + if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) { + generateInverseIterations(param.B, refresh); + } else { + // Chebyshev filter + generateChebyshevFilter(param.B); + } + + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { + errorQuda("Test vector setup is currently not enabled."); + for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, param.mg_global.num_setup_iter[param.level]); - if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || - setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { - generateInverseIterations(param.B, refresh); - } else { - // Chebyshev filter - generateChebyshevFilter(param.B); - } - - // recurse when doing test setups - if (setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS && - param.mg_global.setup_inv_type[param.level] == QUDA_MG_INVERTER) { + // can we do test vectors + chebyshev filter setup? + generateInverseIterations(param.B, refresh); + + // recurse when doing test setups, currently buggy + if (param.mg_global.setup_inv_type[param.level] == QUDA_MG_INVERTER) { if (transfer) { resetTransfer = true; reset(); @@ -1434,6 +1437,7 @@ namespace quda } } + } else { errorQuda("Invalid setup type %d", setup_type); } From ae5552386480d91a2204ec26c3a9e9a19839dae4 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 5 May 2022 17:02:55 -0400 Subject: [PATCH 08/15] Added (untested) support to polish near-nulls with inverse iterations, generate leftover near nulls when restricting some fine near null vectors --- include/multigrid.h | 17 ++- include/quda.h | 6 + lib/check_params.h | 4 + lib/milc_interface.cpp | 10 +- lib/multigrid.cpp | 168 +++++++++++++++++++--------- lib/staggered_kd_build_xinv.cu | 4 + tests/utils/command_line_params.cpp | 8 ++ tests/utils/command_line_params.h | 2 + tests/utils/host_utils.cpp | 2 + tests/utils/set_params.cpp | 4 + 10 files changed, 163 insertions(+), 62 deletions(-) diff --git a/include/multigrid.h b/include/multigrid.h index b2e264156c..392160400d 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -403,18 +403,24 @@ namespace quda { */ void saveVectors(const std::vector &B) const; + /** + @brief Initialize a set of vectors to random noise + @param B set of vectors + */ + void initializeRandomVectors(const std::vector &B) const; /** @brief Wrapping function that manages logic for constructing near-null vectors + @param refresh whether or not we're only refreshing near null vectors */ void constructNearNulls(bool refresh = false); /** @brief Generate the null-space vectors via inverse iterations @param B Generated null-space vectors - @param refresh Whether we refreshing pre-exising vectors or starting afresh + @param iterations if non-zero, override the max number of iterations */ - void generateInverseIterations(std::vector &B, bool refresh = false); + void generateInverseIterations(std::vector &B, int iterations = 0); /** @brief Generate the null-space vectors via a Chebyshev filter; this approach is @@ -429,9 +435,10 @@ namespace quda { void generateEigenvectors(); /** - @brief Generate near-null vectors via restricting finer near-nulls + @brief Generate near-null vectors via restricting finer near-nulls, generating extras if need be + @param refresh Whether or not we're only refreshing extra near nulls */ - void generateRestrictedVectors(); + void generateRestrictedVectors(bool refresh = false); /** @brief Build free-field null-space vectors @@ -444,7 +451,7 @@ namespace quda { @param vecs vector of ColorSpinorFields to normalize @param count number of near-null vectors to orthonormalize, default all */ - void orthonormalize_vectors(std::vector& vecs, int count = -1) const; + void orthonormalize_vectors(const std::vector& vecs, int count = -1) const; /** @brief Return the total flops done on this and all coarser levels. diff --git a/include/quda.h b/include/quda.h index 9c9d608772..14b448d74c 100644 --- a/include/quda.h +++ b/include/quda.h @@ -638,6 +638,9 @@ extern "C" { /** Maximum number of iterations for refreshing the null-space vectors */ int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for inverse iteration polishing of null-space vectors */ + int setup_maxiter_inverse_iterations_polish[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA solver setup */ QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; @@ -671,6 +674,9 @@ extern "C" { /** Null-space type to use in the setup phase */ QudaNullVectorSetupType setup_type[QUDA_MAX_MG_LEVEL]; + /** Null-space type to use to "fill in" extra vectors after restricting some */ + QudaNullVectorSetupType setup_restrict_remaining_type[QUDA_MAX_MG_LEVEL]; + /** Pre orthonormalize vectors in the setup phase */ QudaBoolean pre_orthonormalize; diff --git a/lib/check_params.h b/lib/check_params.h index f9534c5298..3e7bdfd47e 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -775,8 +775,10 @@ void printQudaMultigridParam(QudaMultigridParam *param) { for (int i=0; i& vecs, int count) const { + void MG::orthonormalize_vectors(const std::vector& vecs, int count) const { int num_vec = (count == -1) ? static_cast(vecs.size()) : count; if (num_vec > (int)vecs.size()) @@ -1359,6 +1359,13 @@ namespace quda } } + void MG::initializeRandomVectors(const std::vector &B) const { + for (auto b : B) { + spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); + *b = *r; + } + } + void MG::constructNearNulls(bool refresh) { pushLevel(param.level); @@ -1369,6 +1376,8 @@ namespace quda return; } + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d\n", param.level); + auto setup_type = param.mg_global.setup_type[param.level]; // if a near-null vector input file is specified, always load it and @@ -1377,69 +1386,77 @@ namespace quda loadVectors(); } else { - if (setup_type == QUDA_SETUP_NULL_VECTOR_FREE_FIELD) { - // Generate free field near-null vectors. Consistency checks on the number of - // generated vectors are done within this function. - generateFreeVectors(); - } else if (setup_type == QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE) { - // Restrict near-null vectors from the finer level - generateRestrictedVectors(); - } else if (setup_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { - - // Run the eigensolver - generateEigenvectors(); - } else if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS || - setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) { - - // multiple setup iterations only matters for test vector setup - if (param.mg_global.num_setup_iter[param.level] > 1) - errorQuda("Multiple setup iterations can only be used with test vector setup"); + // multiple setup iterations only matters for test vector setup + if (setup_type != QUDA_SETUP_NULL_VECTOR_TEST_VECTORS && + param.mg_global.num_setup_iter[param.level] > 1) + errorQuda("Multiple setup iterations can only be used with test vector setup"); + if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) { // Initializing to random vectors if (!refresh) { - int num_initialize = (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) ? param.mg_global.filter_startup_vectors[param.level] : (int)param.B.size(); - for (int i = 0; i < num_initialize; i++) { - spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - *param.B[i] = *r; - } + initializeRandomVectors(param.B); } - logQuda(QUDA_VERBOSE, "Running vectors setup on level %d\n", param.level); + generateInverseIterations(param.B, refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]); + } else { + // Other types support an inverse iterations "polish" + + if (setup_type == QUDA_SETUP_NULL_VECTOR_FREE_FIELD) { + // Generate free field near-null vectors. Consistency checks on the number of + // generated vectors are done within this function. + generateFreeVectors(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE) { + // Restrict near-null vectors from the finer level, generating extra if coarse Nvec > fine Nvec + generateRestrictedVectors(refresh); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { + + // Run the eigensolver + generateEigenvectors(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) { + + // Initializing to random vectors + if (!refresh) { + int num_initialize = param.mg_global.filter_startup_vectors[param.level]; + if (num_initialize > (int)param.B.size()) + errorQuda("Chebyshev filter num_initialize %d is greater than vectors to setup %d", num_initialize, (int)param.B.size()); + initializeRandomVectors(std::vector(param.B.begin(), param.B.begin() + num_initialize)); + } - if (setup_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) { - generateInverseIterations(param.B, refresh); - } else { // Chebyshev filter generateChebyshevFilter(param.B); - } - } else if (setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { - errorQuda("Test vector setup is currently not enabled."); - - for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); - - // can we do test vectors + chebyshev filter setup? - generateInverseIterations(param.B, refresh); - - // recurse when doing test setups, currently buggy - if (param.mg_global.setup_inv_type[param.level] == QUDA_MG_INVERTER) { - if (transfer) { - resetTransfer = true; - reset(); - if ( param.level < param.Nlevel - 2) { - coarse->constructNearNulls(); + } else if (setup_type == QUDA_SETUP_NULL_VECTOR_TEST_VECTORS) { + errorQuda("Test vector setup is currently not enabled."); + + for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); + + // can we do test vectors + chebyshev filter setup? + generateInverseIterations(param.B, refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]); + + // recurse when doing test setups, currently buggy + if (param.mg_global.setup_inv_type[param.level] == QUDA_MG_INVERTER) { + if (transfer) { + resetTransfer = true; + reset(); + if ( param.level < param.Nlevel - 2) { + coarse->constructNearNulls(); + } + } else { + reset(); } - } else { - reset(); } + } + } else { + errorQuda("Invalid setup type %d", setup_type); } - } else { - errorQuda("Invalid setup type %d", setup_type); + if (param.mg_global.setup_maxiter_inverse_iterations_polish[param.level] > 0) { + generateInverseIterations(param.B, param.mg_global.setup_maxiter_inverse_iterations_polish[param.level]); + } } } @@ -1447,14 +1464,13 @@ namespace quda popLevel(); } - void MG::generateInverseIterations(std::vector &B, bool refresh) + void MG::generateInverseIterations(std::vector &B, int iterations) { pushLevel(param.level); SolverParam solverParam(param); // Set solver field parameters: // set null-space generation options - need to expose these - solverParam.maxiter - = refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]; + solverParam.maxiter = (iterations == 0) ? param.mg_global.setup_maxiter_refresh[param.level] : iterations; solverParam.tol = param.mg_global.setup_tol[param.level]; solverParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; solverParam.delta = 1e-1; @@ -1962,20 +1978,62 @@ namespace quda popLevel(); } - void MG::generateRestrictedVectors() + void MG::generateRestrictedVectors(bool refresh) { pushLevel(param.level); if (is_fine_grid()) errorQuda("There are no fine near-null vectors to restrict on level 0"); - if (param.Nvec != param.fine->param.Nvec) errorQuda("The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match", param.fine->param.Nvec, param.Nvec); + if (param.Nvec != param.fine->param.Nvec) + logQuda(QUDA_VERBOSE, "The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match, restricting as many as possible", param.fine->param.Nvec, param.Nvec); logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); - for (int i = 0; i < param.Nvec; i++) { + for (int i = 0; i < param.fine->param.Nvec; i++) { zero(*(param.B[i])); param.fine->transfer->R(*(param.B[i]), *(param.fine->param.B[i])); } + // generate more if need be + if (param.Nvec != param.fine->param.Nvec) { + auto setup_remaining_type = param.mg_global.setup_restrict_remaining_type[param.level]; + + // TBD eigenvectors + if (setup_remaining_type != QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS && + setup_remaining_type != QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) + errorQuda("Unsupported remaining near-null vector type %d", setup_remaining_type); + + auto B_remaining = std::vector(param.B.begin() + param.fine->param.Nvec, param.B.end()); + + if (setup_remaining_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) { + if (!refresh) { + initializeRandomVectors(B_remaining); + } + if (param.mg_global.pre_orthonormalize) { + orthonormalize_vectors(param.B); + } + + generateInverseIterations(B_remaining, refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]); + } else if (setup_remaining_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) { + int num_initialize = param.mg_global.filter_startup_vectors[param.level]; + if (!refresh) { + if (num_initialize > (int)B_remaining.size()) + errorQuda("Chebyshev filter num_initialize %d is greater than vectors to setup %d", num_initialize, (int)B_remaining.size()); + + initializeRandomVectors(std::vector(B_remaining.begin(), B_remaining.begin() + num_initialize)); + } + + if (param.mg_global.pre_orthonormalize) { + orthonormalize_vectors(std::vector(param.B.begin(), param.B.begin() + param.fine->param.Nvec + num_initialize)); + } + + generateChebyshevFilter(B_remaining); + } + } + + if (param.mg_global.post_orthonormalize) { + orthonormalize_vectors(param.B); + } + popLevel(); } diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu index c09e45d31c..77713d4bca 100644 --- a/lib/staggered_kd_build_xinv.cu +++ b/lib/staggered_kd_build_xinv.cu @@ -210,6 +210,7 @@ namespace quda { // Step 3: Create the X field based on Xinv, but switch to a native ordering for a GPU setup. std::unique_ptr tmp_X(nullptr); GaugeFieldParam x_param(*xInvMilcOrder); + x_param.create = QUDA_ZERO_FIELD_CREATE; if (location == QUDA_CUDA_FIELD_LOCATION) { x_param.order = QUDA_FLOAT2_GAUGE_ORDER; x_param.setPrecision(x_param.Precision()); @@ -222,6 +223,8 @@ namespace quda { // Step 4: Calculate X from U if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU"); + printfQuda("U pre-compute %e, mass %e\n", U.norm2(), mass); + calculateStaggeredKDBlock(X, U, mass); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X.norm2(0)); @@ -268,6 +271,7 @@ namespace quda { std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const bool dagger_approximation) { GaugeFieldParam gParam(gauge); + gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.geometry = QUDA_KDINVERSE_GEOMETRY; diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index e82928614c..eefdf41cc7 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -132,12 +132,14 @@ quda::mgarray coarse_solve_type = {}; quda::mgarray smoother_solve_type = {}; quda::mgarray setup_type = {}; +quda::mgarray setup_restrict_remaining_type = {}; // Parameters for inverse iterations setup quda::mgarray num_setup_iter = {}; quda::mgarray setup_tol = {}; quda::mgarray setup_maxiter = {}; quda::mgarray setup_maxiter_refresh = {}; +quda::mgarray setup_maxiter_inverse_iterations_polish = {}; quda::mgarray setup_ca_basis = {}; quda::mgarray setup_ca_basis_size = {}; quda::mgarray setup_ca_lambda_min = {}; @@ -899,6 +901,9 @@ void add_multigrid_option_group(std::shared_ptr quda_app) quda_app->add_mgoption(opgroup, "--mg-setup-type", setup_type, CLI::QUDACheckedTransformer(setup_type_map), "The type of setup to use for the multigrid; ignored if --mg-load-vec is set (inverse-iterations (default), chebyshev-filter, eigenvectors, test-vectors, restrict-fine, free-field)"); + quda_app->add_mgoption(opgroup, "--mg-setup-restrict-remaining-type", setup_restrict_remaining_type, CLI::QUDACheckedTransformer(setup_type_map), + "The type of setup to use for remaining vectors if restricting all fine null vectors (inverse-iterations (default), chebyshev-filter)"); + quda_app->add_mgoption(opgroup, "--mg-setup-ca-basis-size", setup_ca_basis_size, CLI::PositiveNumber, "The basis size to use for CA solver setup of multigrid (default 4)"); quda_app->add_mgoption(opgroup, "--mg-setup-ca-basis-type", setup_ca_basis, CLI::QUDACheckedTransformer(ca_basis_map), @@ -923,6 +928,9 @@ void add_multigrid_option_group(std::shared_ptr quda_app) quda_app->add_mgoption( opgroup, "--mg-setup-maxiter-refresh", setup_maxiter_refresh, CLI::Validator(), "The maximum number of solver iterations to use when refreshing the pre-existing null space vectors (default 100)"); + quda_app->add_mgoption( + opgroup, "--mg-setup-maxiter-inverse-iterations-polish", setup_maxiter_inverse_iterations_polish, CLI::Validator(), + "The maximum number of solver iterations to use when polishing pre-existing null space vectors after a non-inverse-iteration setup (default 0, no polishing)"); quda_app->add_mgoption(opgroup, "--mg-setup-tol", setup_tol, CLI::Validator(), "The tolerance to use for the setup of multigrid (default 5e-6)"); diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index 53bc790041..a965e92746 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -269,12 +269,14 @@ extern quda::mgarray coarse_solve_type; extern quda::mgarray smoother_solve_type; extern quda::mgarray setup_type; +extern quda::mgarray setup_restrict_remaining_type; // Parameters for inverse iterations setup extern quda::mgarray num_setup_iter; extern quda::mgarray setup_tol; extern quda::mgarray setup_maxiter; extern quda::mgarray setup_maxiter_refresh; +extern quda::mgarray setup_maxiter_inverse_iterations_polish; extern quda::mgarray setup_ca_basis; extern quda::mgarray setup_ca_basis_size; extern quda::mgarray setup_ca_lambda_min; diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 409188343a..0783212127 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -93,6 +93,7 @@ void setQudaDefaultMgTestParams() setup_tol[i] = 5e-6; setup_maxiter[i] = 500; setup_maxiter_refresh[i] = 20; + setup_maxiter_inverse_iterations_polish[i] = 0; mu_factor[i] = 1.; coarse_solve_type[i] = QUDA_INVALID_SOLVE; smoother_solve_type[i] = QUDA_INVALID_SOLVE; @@ -129,6 +130,7 @@ void setQudaDefaultMgTestParams() mg_eig_save_prec[i] = QUDA_DOUBLE_PRECISION; setup_type[i] = QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS; + setup_restrict_remaining_type[i] = QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS; filter_startup_vectors[i] = 1; filter_startup_iterations[i] = 1000; filter_startup_rescale_frequency[i] = 50; diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index f89d2b6b1c..6606d79254 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -389,10 +389,14 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.setup_tol[i] = setup_tol[i]; mg_param.setup_maxiter[i] = setup_maxiter[i]; mg_param.setup_maxiter_refresh[i] = setup_maxiter_refresh[i]; + mg_param.setup_maxiter_inverse_iterations_polish[i] = setup_maxiter_inverse_iterations_polish[i]; // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) mg_param.setup_type[i] = setup_type[i]; + // Setup type to use to generate remaining near-null vectors when some are restricted + mg_param.setup_restrict_remaining_type[i] = setup_restrict_remaining_type[i]; + // Basis to use for CA solver setups mg_param.setup_ca_basis[i] = setup_ca_basis[i]; From 3b300ee7e08b3f7b51b472738cbc3d6fa1c157bd Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 6 May 2022 13:26:55 -0400 Subject: [PATCH 09/15] Need temporary vector for restricting half prec fine near-nulls -> coarse --- lib/multigrid.cpp | 15 ++++++++++++++- lib/staggered_kd_build_xinv.cu | 3 --- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index ef4513e564..5d62a4fc8c 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1988,9 +1988,22 @@ namespace quda logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); + // B vectors can be in half precession on the fine level, need to convert to single + ColorSpinorField Btmp; + bool need_tmp_single = param.fine->param.B[0]->Precision() != QUDA_SINGLE_PRECISION; + if (need_tmp_single) { + ColorSpinorParam b_tmp_param(*param.fine->param.B[0]); + b_tmp_param.create = QUDA_NULL_FIELD_CREATE; + b_tmp_param.setPrecision(QUDA_SINGLE_PRECISION, QUDA_INVALID_PRECISION, + b_tmp_param.location == QUDA_CUDA_FIELD_LOCATION ? true : false); + Btmp = ColorSpinorField(b_tmp_param); + } + for (int i = 0; i < param.fine->param.Nvec; i++) { zero(*(param.B[i])); - param.fine->transfer->R(*(param.B[i]), *(param.fine->param.B[i])); + if (need_tmp_single) Btmp = *(param.fine->param.B[i]); + ColorSpinorField& Bfine = need_tmp_single ? Btmp : *(param.fine->param.B[i]); + param.fine->transfer->R(*(param.B[i]), Bfine); } // generate more if need be diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu index c27fe02fcc..3441feede0 100644 --- a/lib/staggered_kd_build_xinv.cu +++ b/lib/staggered_kd_build_xinv.cu @@ -213,7 +213,6 @@ namespace quda { // Step 3: Create the X field based on Xinv, but switch to a native ordering for a GPU setup. std::unique_ptr tmp_X(nullptr); GaugeFieldParam x_param(*xInvMilcOrder); - x_param.create = QUDA_ZERO_FIELD_CREATE; if (location == QUDA_CUDA_FIELD_LOCATION) { x_param.order = QUDA_FLOAT2_GAUGE_ORDER; x_param.setPrecision(x_param.Precision()); @@ -226,8 +225,6 @@ namespace quda { // Step 4: Calculate X from U if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU"); - printfQuda("U pre-compute %e, mass %e\n", U.norm2(), mass); - calculateStaggeredKDBlock(X, U, mass); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X.norm2(0)); From 5e109fe37b989d2a2aa7ab61c920a6158abd0aaa Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 6 May 2022 18:46:31 -0400 Subject: [PATCH 10/15] Partial progress to using eigensolver to generate extra vectors after restricting finer near-nulls --- include/multigrid.h | 6 +- lib/multigrid.cpp | 111 +++++++++----------------------- tests/invert_test.cpp | 2 +- tests/staggered_invert_test.cpp | 2 +- 4 files changed, 36 insertions(+), 85 deletions(-) diff --git a/include/multigrid.h b/include/multigrid.h index 392160400d..af8b49eca4 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -431,8 +431,9 @@ namespace quda { /** @brief Generate lowest eigenvectors + @param B Generated null-space vectors */ - void generateEigenvectors(); + void generateEigenvectors(std::vector &B); /** @brief Generate near-null vectors via restricting finer near-nulls, generating extras if need be @@ -442,8 +443,9 @@ namespace quda { /** @brief Build free-field null-space vectors + @param B Generated null-space vectors */ - void generateFreeVectors(); + void generateFreeVectors(std::vector &B); /** @brief Orthonormalize a vector of ColorSpinorField, erroring out if diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 5d62a4fc8c..89f135c0e4 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1120,7 +1120,7 @@ namespace quda // Reuse the space for the Null vectors. By this point, // the coarse grid has already been constructed. - generateEigenvectors(); + generateEigenvectors(param.B); for (int i = 0; i < param.Nvec; i++) { @@ -1404,14 +1404,14 @@ namespace quda if (setup_type == QUDA_SETUP_NULL_VECTOR_FREE_FIELD) { // Generate free field near-null vectors. Consistency checks on the number of // generated vectors are done within this function. - generateFreeVectors(); + generateFreeVectors(param.B); } else if (setup_type == QUDA_SETUP_NULL_VECTOR_RESTRICT_FINE) { // Restrict near-null vectors from the finer level, generating extra if coarse Nvec > fine Nvec generateRestrictedVectors(refresh); } else if (setup_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { // Run the eigensolver - generateEigenvectors(); + generateEigenvectors(param.B); } else if (setup_type == QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) { // Initializing to random vectors @@ -1745,10 +1745,9 @@ namespace quda // generate a full span of free vectors. // FIXME: Assumes fine level is SU(3). - void MG::generateFreeVectors() + void MG::generateFreeVectors(std::vector& B) { pushLevel(param.level); - std::vector &B = param.B; const int Nvec = B.size(); // Given the number of colors and spins, figure out if the number @@ -1756,6 +1755,9 @@ namespace quda const int Ncolor = B[0]->Ncolor(); const int Nspin = B[0]->Nspin(); + // Zero the null vectors + for (auto b : B) zero(*b); + if (Ncolor == 3) // fine level { if (Nspin == 4) // Wilson or Twisted Mass (singlet) @@ -1765,9 +1767,6 @@ namespace quda logQuda(QUDA_VERBOSE, "Building %d free field vectors for Wilson-type fermions\n", Nvec); - // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); - // Create a temporary vector. ColorSpinorParam csParam(*B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; @@ -1791,9 +1790,6 @@ namespace quda logQuda(QUDA_VERBOSE, "Building %d free field vectors for Staggered-type fermions\n", Nvec); - // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); - // Create a temporary vector. ColorSpinorParam csParam(*B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; @@ -1801,55 +1797,15 @@ namespace quda // Build free null vectors. for (int c = 0; c < B[0]->Ncolor(); c++) { - // Need to pair an even+odd corner together - // since they'll get split up. - // 0000, 0001 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x0, c); - xpy(tmp, *B[8 * c + 0]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x1, c); - xpy(tmp, *B[8 * c + 0]); - - // 0010, 0011 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x2, c); - xpy(tmp, *B[8 * c + 1]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x3, c); - xpy(tmp, *B[8 * c + 1]); - - // 0100, 0101 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x4, c); - xpy(tmp, *B[8 * c + 2]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x5, c); - xpy(tmp, *B[8 * c + 2]); - - // 0110, 0111 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x6, c); - xpy(tmp, *B[8 * c + 3]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x7, c); - xpy(tmp, *B[8 * c + 3]); - - // 1000, 1001 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x8, c); - xpy(tmp, *B[8 * c + 4]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0x9, c); - xpy(tmp, *B[8 * c + 4]); - - // 1010, 1011 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xA, c); - xpy(tmp, *B[8 * c + 5]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xB, c); - xpy(tmp, *B[8 * c + 5]); - - // 1100, 1101 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xC, c); - xpy(tmp, *B[8 * c + 6]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xD, c); - xpy(tmp, *B[8 * c + 6]); - - // 1110, 1111 - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xE, c); - xpy(tmp, *B[8 * c + 7]); - tmp.Source(QUDA_CORNER_SOURCE, 1, 0xF, c); - xpy(tmp, *B[8 * c + 7]); + // Need to pair an even+odd corner together since they'll get split up + // during chiral doubling. Vector 0 is `0000`,`0001`; + // vector 1 is `0010`,`0011`, etc. + for (int corner = 0; corner < 8; corner++) { + tmp.Source(QUDA_CORNER_SOURCE, 1, 2 * corner, c); + xpy(tmp, *B[8 * c + corner]); + tmp.Source(QUDA_CORNER_SOURCE, 1, 2 * corner + 1, c); + xpy(tmp, *B[8 * c + corner]); + } } } else { @@ -1862,9 +1818,6 @@ namespace quda logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); - // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); - // Create a temporary vector. ColorSpinorParam csParam(*B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; @@ -1883,9 +1836,6 @@ namespace quda logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); - // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); - // Create a temporary vector. ColorSpinorParam csParam(*B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; @@ -1911,7 +1861,7 @@ namespace quda popLevel(); } - void MG::generateEigenvectors() + void MG::generateEigenvectors(std::vector& B) { pushLevel(param.level); @@ -1919,6 +1869,9 @@ namespace quda errorQuda("eig_param for level %d has not been populated", param.level); // Extract eigensolver params + // HACK FOR NOW for restrict then eigenvectors to patch up + param.mg_global.eig_param[param.level]->n_conv = (int)B.size(); + param.mg_global.eig_param[param.level]->n_ev_deflate = (int)B.size(); int n_conv = param.mg_global.eig_param[param.level]->n_conv; bool dagger = param.mg_global.eig_param[param.level]->use_dagger; bool normop = param.mg_global.eig_param[param.level]->use_norm_op; @@ -1927,18 +1880,14 @@ namespace quda std::vector evals(n_conv, 0.0); std::vector B_evecs; - ColorSpinorParam csParam(*param.B[0]); - csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; + ColorSpinorParam csParam(*B[0]); + csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; // FIXME: check for staggered csParam.create = QUDA_ZERO_FIELD_CREATE; // This is the vector precision used by matResidual csParam.setPrecision(param.mg_global.invert_param->cuda_prec_sloppy, QUDA_INVALID_PRECISION, true); for (int i = 0; i < n_conv; i++) B_evecs.push_back(new ColorSpinorField(csParam)); - // before entering the eigen solver, let's free the B vectors to save some memory - ColorSpinorParam bParam(*param.B[0]); - for (int i = 0; i < (int)param.B.size(); i++) delete param.B[i]; - EigenSolver *eig_solve; if (!normop && !dagger) { DiracM *mat = new DiracM(*diracResidual); @@ -1968,8 +1917,7 @@ namespace quda // now reallocate the B vectors copy in e-vectors for (int i = 0; i < (int)param.B.size(); i++) { - param.B[i] = new ColorSpinorField(bParam); - *param.B[i] = *B_evecs[i]; + *B[i] = *B_evecs[i]; } // Local clean-up @@ -1984,7 +1932,7 @@ namespace quda if (is_fine_grid()) errorQuda("There are no fine near-null vectors to restrict on level 0"); if (param.Nvec != param.fine->param.Nvec) - logQuda(QUDA_VERBOSE, "The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match, restricting as many as possible", param.fine->param.Nvec, param.Nvec); + logQuda(QUDA_VERBOSE, "The number of near-null vectors on the fine level (%d) and coarse level (%d) do not match, restricting as many as possible\n", param.fine->param.Nvec, param.Nvec); logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); @@ -2010,11 +1958,6 @@ namespace quda if (param.Nvec != param.fine->param.Nvec) { auto setup_remaining_type = param.mg_global.setup_restrict_remaining_type[param.level]; - // TBD eigenvectors - if (setup_remaining_type != QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS && - setup_remaining_type != QUDA_SETUP_NULL_VECTOR_CHEBYSHEV_FILTER) - errorQuda("Unsupported remaining near-null vector type %d", setup_remaining_type); - auto B_remaining = std::vector(param.B.begin() + param.fine->param.Nvec, param.B.end()); if (setup_remaining_type == QUDA_SETUP_NULL_VECTOR_INVERSE_ITERATIONS) { @@ -2040,6 +1983,12 @@ namespace quda } generateChebyshevFilter(B_remaining); + } else if (setup_remaining_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { + // nothing to initialize, though there may be scope to reuse with Block Lanczos + // once it's performant + generateEigenvectors(B_remaining); + } else { + errorQuda("Unsupported remaining near-null vector type %d", setup_remaining_type); } } diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index f93733fe44..8e55caf76a 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -128,7 +128,7 @@ void init(int argc, char **argv) // Set sub structures mg_param.invert_param = &mg_inv_param; for (int i = 0; i < mg_levels; i++) { - if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { + if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS || setup_restrict_remaining_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { mg_eig_param[i] = newQudaEigParam(); setMultigridEigParam(mg_eig_param[i], i); mg_param.eig_param[i] = &mg_eig_param[i]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 2ce6ea3867..33012758a7 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -185,7 +185,7 @@ int main(int argc, char **argv) mg_param.invert_param = &mg_inv_param; for (int i = 0; i < mg_levels; i++) { - if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { + if (mg_eig[i] || setup_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS || setup_restrict_remaining_type[i] == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { mg_eig_param[i] = newQudaEigParam(); setMultigridEigParam(mg_eig_param[i], i); mg_param.eig_param[i] = &mg_eig_param[i]; From 00149534a8094220470d2f853a37170775edfa01 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 May 2022 01:24:58 -0400 Subject: [PATCH 11/15] Mixed restrict/eigensolver setup is now working. --- include/multigrid.h | 5 +++- lib/eigensolve_quda.cpp | 12 +++++----- lib/multigrid.cpp | 53 +++++++++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/include/multigrid.h b/include/multigrid.h index af8b49eca4..b9c47a10a3 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -432,8 +432,11 @@ namespace quda { /** @brief Generate lowest eigenvectors @param B Generated null-space vectors + @param post_restrict whether or not we're generating extra eigenvectors after restricting + some fine eigenvectors; if true we override the requested n_conv in the multigrid + eigenvalue struct */ - void generateEigenvectors(std::vector &B); + void generateEigenvectors(std::vector &B, bool post_restrict = false); /** @brief Generate near-null vectors via restricting finer near-nulls, generating extras if need be diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index bc3f9db9c8..8f69508378 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -337,8 +337,8 @@ namespace quda // C_1 is the current 'out' vector. // Clone 'in' to two temporary vectors. - auto tmp1 = std::make_unique(in); - auto tmp2 = std::make_unique(out); + auto tmp_1 = std::make_unique(in); + auto tmp_2 = std::make_unique(out); // Using Chebyshev polynomial recursion relation, // C_{m+1}(x) = 2*x*C_{m} - C_{m-1} @@ -355,14 +355,14 @@ namespace quda // FIXME - we could introduce a fused matVec + blas kernel here, eliminating one temporary // mat*C_{m}(x) - matVec(mat, out, *tmp2); + matVec(mat, out, *tmp_2); - blas::axpbypczw(d3, *tmp1, d2, *tmp2, d1, out, *tmp1); - std::swap(tmp1, tmp2); + blas::axpbypczw(d3, *tmp_1, d2, *tmp_2, d1, out, *tmp_1); + std::swap(tmp_1, tmp_2); sigma_old = sigma; } - blas::copy(out, *tmp2); + blas::copy(out, *tmp_2); // Save Chebyshev tuning saveTuneCache(); diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 89f135c0e4..d1d836768f 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1861,63 +1861,80 @@ namespace quda popLevel(); } - void MG::generateEigenvectors(std::vector& B) + void MG::generateEigenvectors(std::vector& B, bool post_restrict) { pushLevel(param.level); if (param.mg_global.eig_param[param.level] == nullptr) errorQuda("eig_param for level %d has not been populated", param.level); - // Extract eigensolver params - // HACK FOR NOW for restrict then eigenvectors to patch up - param.mg_global.eig_param[param.level]->n_conv = (int)B.size(); - param.mg_global.eig_param[param.level]->n_ev_deflate = (int)B.size(); - int n_conv = param.mg_global.eig_param[param.level]->n_conv; - bool dagger = param.mg_global.eig_param[param.level]->use_dagger; - bool normop = param.mg_global.eig_param[param.level]->use_norm_op; + // Extract eigensolver params; we create a copy by value because we may override + // some values when we restrict first + auto eig_param = *param.mg_global.eig_param[param.level]; + + // We need to compute fewer eigenvectors if we've restricted some fine + // near-nulls first. + if (post_restrict) { + eig_param.n_conv = static_cast(B.size()); + eig_param.n_ev_deflate = static_cast(B.size()); + } + + int n_conv = eig_param.n_conv; + bool dagger = eig_param.use_dagger; + bool normop = eig_param.use_norm_op; // Dummy array to keep the eigensolver happy. std::vector evals(n_conv, 0.0); - std::vector B_evecs; ColorSpinorParam csParam(*B[0]); - csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; // FIXME: check for staggered + if (csParam.siteSubset == QUDA_PARITY_SITE_SUBSET) { + csParam.siteSubset = QUDA_FULL_SITE_SUBSET; + csParam.x[0] *= 2; + } + if (B[0]->Nspin() == 4) csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.create = QUDA_ZERO_FIELD_CREATE; // This is the vector precision used by matResidual csParam.setPrecision(param.mg_global.invert_param->cuda_prec_sloppy, QUDA_INVALID_PRECISION, true); for (int i = 0; i < n_conv; i++) B_evecs.push_back(new ColorSpinorField(csParam)); + // before entering the eigen solver, let's free the B vectors to save some memory + //ColorSpinorParam bParam(*B[0]); + //for (int i = 0; i < (int)B.size(); i++) delete B[i]; + EigenSolver *eig_solve; if (!normop && !dagger) { DiracM *mat = new DiracM(*diracResidual); - eig_solve = EigenSolver::create(param.mg_global.eig_param[param.level], *mat, profile); + eig_solve = EigenSolver::create(&eig_param, *mat, profile); (*eig_solve)(B_evecs, evals); delete eig_solve; delete mat; } else if (!normop && dagger) { DiracMdag *mat = new DiracMdag(*diracResidual); - eig_solve = EigenSolver::create(param.mg_global.eig_param[param.level], *mat, profile); + eig_solve = EigenSolver::create(&eig_param, *mat, profile); (*eig_solve)(B_evecs, evals); delete eig_solve; delete mat; } else if (normop && !dagger) { DiracMdagM *mat = new DiracMdagM(*diracResidual); - eig_solve = EigenSolver::create(param.mg_global.eig_param[param.level], *mat, profile); + eig_solve = EigenSolver::create(&eig_param, *mat, profile); (*eig_solve)(B_evecs, evals); delete eig_solve; delete mat; } else if (normop && dagger) { DiracMMdag *mat = new DiracMMdag(*diracResidual); - eig_solve = EigenSolver::create(param.mg_global.eig_param[param.level], *mat, profile); + eig_solve = EigenSolver::create(&eig_param, *mat, profile); (*eig_solve)(B_evecs, evals); delete eig_solve; delete mat; } // now reallocate the B vectors copy in e-vectors - for (int i = 0; i < (int)param.B.size(); i++) { - *B[i] = *B_evecs[i]; + for (int i = 0; i < (int)B.size(); i++) { + if (B[0]->SiteSubset() == QUDA_PARITY_SITE_SUBSET) + *B[i] = B_evecs[i]->Even(); + else + *B[i] = *B_evecs[i]; } // Local clean-up @@ -1944,7 +1961,7 @@ namespace quda b_tmp_param.create = QUDA_NULL_FIELD_CREATE; b_tmp_param.setPrecision(QUDA_SINGLE_PRECISION, QUDA_INVALID_PRECISION, b_tmp_param.location == QUDA_CUDA_FIELD_LOCATION ? true : false); - Btmp = ColorSpinorField(b_tmp_param); + Btmp = ColorSpinorField(b_tmp_param); } for (int i = 0; i < param.fine->param.Nvec; i++) { @@ -1986,7 +2003,7 @@ namespace quda } else if (setup_remaining_type == QUDA_SETUP_NULL_VECTOR_EIGENVECTORS) { // nothing to initialize, though there may be scope to reuse with Block Lanczos // once it's performant - generateEigenvectors(B_remaining); + generateEigenvectors(B_remaining, true); } else { errorQuda("Unsupported remaining near-null vector type %d", setup_remaining_type); } From 462854d1e4ca9d0d172dee26ea846eace1f8cbac Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 May 2022 02:03:52 -0400 Subject: [PATCH 12/15] Fixed type typo on chebyshev filter bounds --- include/quda.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/quda.h b/include/quda.h index 14b448d74c..e6f4a23eb3 100644 --- a/include/quda.h +++ b/include/quda.h @@ -666,10 +666,10 @@ extern "C" { int filter_iterations_between_vectors[QUDA_MAX_MG_LEVEL]; /** Conservative estimate of largest eigenvalue of operator used for Chebyshev filter setup */ - int filter_lambda_min[QUDA_MAX_MG_LEVEL]; + double filter_lambda_min[QUDA_MAX_MG_LEVEL]; /** Lower bound of eigenvalues that are not enhanced by the initial Chebyshev filter */ - int filter_lambda_max[QUDA_MAX_MG_LEVEL]; + double filter_lambda_max[QUDA_MAX_MG_LEVEL]; /** Null-space type to use in the setup phase */ QudaNullVectorSetupType setup_type[QUDA_MAX_MG_LEVEL]; From 5ac5288cf79843ba7c64c7cd48859130c48fc6e6 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 2 Jun 2022 13:42:35 -0700 Subject: [PATCH 13/15] Addressed most PR review comments ; wrote block-BLAS orthonormalization --- include/blas_quda.h | 3 +- include/multigrid.h | 5 +- include/quda.h | 4 +- lib/check_params.h | 4 +- lib/eigensolve_quda.cpp | 10 +-- lib/milc_interface.cpp | 2 +- lib/multigrid.cpp | 101 +++++++++++++++++++--------- tests/utils/command_line_params.cpp | 6 +- tests/utils/command_line_params.h | 2 +- tests/utils/host_utils.cpp | 2 +- tests/utils/host_utils.h | 11 +-- tests/utils/set_params.cpp | 2 +- 12 files changed, 90 insertions(+), 62 deletions(-) diff --git a/include/blas_quda.h b/include/blas_quda.h index e0d2f9266a..40ec730083 100644 --- a/include/blas_quda.h +++ b/include/blas_quda.h @@ -202,8 +202,7 @@ namespace quda { } /** - @brief Compute the block "caxpy" with over the s -et of + @brief Compute the block "caxpy" with over the set of ColorSpinorFields. E.g., it computes y = x * a + y diff --git a/include/multigrid.h b/include/multigrid.h index c6639f4671..1cef678352 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -449,8 +449,9 @@ namespace quda { /** @brief Load the null space vectors in from file + @param B Load null-space vectors to here */ - void loadVectors(); + void loadVectors(std::vector &B); /** @brief Save the null space vectors in from file @@ -511,7 +512,7 @@ namespace quda { @param vecs vector of ColorSpinorFields to normalize @param count number of near-null vectors to orthonormalize, default all */ - void orthonormalize_vectors(const std::vector& vecs, int count = -1) const; + void orthonormalizeVectors(const std::vector& vecs, int count = -1) const; /** @brief Return the total flops done on this and all coarser levels. diff --git a/include/quda.h b/include/quda.h index e6f4a23eb3..5b90f464ea 100644 --- a/include/quda.h +++ b/include/quda.h @@ -638,8 +638,8 @@ extern "C" { /** Maximum number of iterations for refreshing the null-space vectors */ int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; - /** Maximum number of iterations for inverse iteration polishing of null-space vectors */ - int setup_maxiter_inverse_iterations_polish[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for inverse iteration refinement of null-space vectors */ + int setup_maxiter_inverse_iterations_refinement[QUDA_MAX_MG_LEVEL]; /** Basis to use for CA solver setup */ QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; diff --git a/lib/check_params.h b/lib/check_params.h index 3e7bdfd47e..ddd995c700 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -805,12 +805,12 @@ void printQudaMultigridParam(QudaMultigridParam *param) { P(setup_tol[i], 5e-6); P(setup_maxiter[i], 500); P(setup_maxiter_refresh[i], 0); - P(setup_maxiter_inverse_iterations_polish[i], 0); + P(setup_maxiter_inverse_iterations_refinement[i], 0); #else P(setup_tol[i], INVALID_DOUBLE); P(setup_maxiter[i], INVALID_INT); P(setup_maxiter_refresh[i], INVALID_INT); - P(setup_maxiter_inverse_iterations_polish[i], INVALID_INT); + P(setup_maxiter_inverse_iterations_refinement[i], INVALID_INT); #endif #ifdef INIT_PARAM diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 8f69508378..eab8160a58 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -337,8 +337,8 @@ namespace quda // C_1 is the current 'out' vector. // Clone 'in' to two temporary vectors. - auto tmp_1 = std::make_unique(in); - auto tmp_2 = std::make_unique(out); + ColorSpinorField tmp_1(in); + ColorSpinorField tmp_2(out); // Using Chebyshev polynomial recursion relation, // C_{m+1}(x) = 2*x*C_{m} - C_{m-1} @@ -355,14 +355,14 @@ namespace quda // FIXME - we could introduce a fused matVec + blas kernel here, eliminating one temporary // mat*C_{m}(x) - matVec(mat, out, *tmp_2); + matVec(mat, out, tmp_2); - blas::axpbypczw(d3, *tmp_1, d2, *tmp_2, d1, out, *tmp_1); + blas::axpbypczw(d3, tmp_1, d2, tmp_2, d1, out, tmp_1); std::swap(tmp_1, tmp_2); sigma_old = sigma; } - blas::copy(out, *tmp_2); + blas::copy(out, tmp_2); // Save Chebyshev tuning saveTuneCache(); diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index fc2e83f09f..37142f9cb0 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -1967,7 +1967,7 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis // change this to refresh fields when mass or links change mg_param.setup_maxiter_refresh[i] = 0; // setup_maxiter_refresh[i]; - mg_param.setup_maxiter_inverse_iterations_polish[i] = 0; // setup_maxiter_inverse_iterations_polish[i]; + mg_param.setup_maxiter_inverse_iterations_refinement[i] = 0; // setup_maxiter_inverse_iterations_refinement[i]; // Basis to use for CA solver setup --- heuristic for CA-GCR is empirical if (is_ca_solver(input_struct.setup_inv[i])) { diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index d2a8cb5f53..9a5c5af27b 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include // for building the KD inverse op @@ -779,21 +780,61 @@ namespace quda popLevel(); } - void MG::orthonormalize_vectors(const std::vector& vecs, int count) const { + void MG::orthonormalizeVectors(const std::vector& vecs, int count) const { + int num_vec = (count == -1) ? static_cast(vecs.size()) : count; if (num_vec > (int)vecs.size()) errorQuda("Trying to orthonormalize %d vectors which is larger than std::vector size %ld", num_vec, vecs.size()); - for (int i = 0; i < num_vec; i++) { - for (int j = 0; j < i ; j++) { - Complex alpha = cDotProduct(*vecs[j], *vecs[i]);// - caxpy(-alpha, *vecs[j], *vecs[i]); // i-j - } - double nrm2 = norm2(*vecs[i]); - if (nrm2 > 1e-16) ax(1.0 / sqrt(nrm2), *vecs[i]);// i/ - else errorQuda("Cannot normalize %u vector", i); + typedef Matrix matrix; + + // Create std::vector of references to ColorSpinorFields we're orthonormalizing + std::vector> ortho_vecs; + for (int i = 0; i < num_vec; i++) + ortho_vecs.emplace_back(std::ref(*vecs[i])); + + // Prepare to do the Cholesky decomposition for a thin-QR + std::vector Vdagv_(num_vec * num_vec); + + // outstanding bugfix + //hDotProduct(Vdagv_, ortho_vecs, ortho_vecs); + cDotProduct(Vdagv_, ortho_vecs, ortho_vecs); + + // perform Cholesky + matrix Vdagv(num_vec, num_vec); + for (int i = 0; i < num_vec; i++) + for (int j = 0; j < num_vec; j++) + Vdagv(i, j) = Vdagv_[i * num_vec + j]; + matrix sigma = Vdagv.llt().matrixL(); + + // perform tall-skinny QR to get Q + matrix R_inv = sigma.adjoint().inverse(); + +#if 0 + // Given a working cax_U... + // Tracked at https://github.com/lattice/quda/issues/1286 + std::vector R_inv_(num_vec * num_vec); + for (int i = 0; i < num_vec; i++) + for (int j = 0; j < num_vec; j++) + R_inv_[i * num_vec + j] = R_inv(i, j); + cax_U(R_inv_, ortho_vecs); + +#else + ColorSpinorParam csParam(*vecs[0]); + csParam.create = QUDA_NULL_FIELD_CREATE; + ColorSpinorField tmp_field(csParam); + for (int i = num_vec - 1; i >= 0; i--) { + blas::zero(tmp_field); + std::vector coeff(i + 1); + for (int j = 0; j < i + 1; j++) + coeff[j] = R_inv(j, i); + std::vector > vecs_subset(ortho_vecs.begin(), ortho_vecs.begin() + i + 1); + caxpy(coeff, vecs_subset, tmp_field); + blas::copy(ortho_vecs[i], tmp_field); } +#endif + } // FIXME need to make this more robust (implement Solver::flops() for all solvers) @@ -1309,7 +1350,7 @@ namespace quda } // supports separate reading or single file read - void MG::loadVectors() + void MG::loadVectors(std::vector &B) { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot load near-null vectors for top level of staggered MG solve."); @@ -1324,7 +1365,7 @@ namespace quda vec_infile += "_nvec_"; vec_infile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_infile); - io.load(param.B); + io.load(B); popLevel(); profile_global.TPSTOP(QUDA_PROFILE_IO); if (is_running) profile_global.TPSTART(QUDA_PROFILE_INIT); @@ -1389,7 +1430,7 @@ namespace quda // if a near-null vector input file is specified, always load it and // ignore other setup information, UNLESS we're refreshing if (strcmp(param.mg_global.vec_infile[param.level], "") != 0 && !refresh) { - loadVectors(); + loadVectors(param.B); } else { // multiple setup iterations only matters for test vector setup @@ -1460,8 +1501,8 @@ namespace quda errorQuda("Invalid setup type %d", setup_type); } - if (param.mg_global.setup_maxiter_inverse_iterations_polish[param.level] > 0) { - generateInverseIterations(param.B, param.mg_global.setup_maxiter_inverse_iterations_polish[param.level]); + if (param.mg_global.setup_maxiter_inverse_iterations_refinement[param.level] > 0) { + generateInverseIterations(param.B, param.mg_global.setup_maxiter_inverse_iterations_refinement[param.level]); } } @@ -1562,7 +1603,7 @@ namespace quda // global orthonormalization of the initial null-space vectors if(param.mg_global.pre_orthonormalize) { - orthonormalize_vectors(B); + orthonormalizeVectors(B); } // launch solver for each source @@ -1591,7 +1632,7 @@ namespace quda // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - orthonormalize_vectors(B); + orthonormalizeVectors(B); } delete solve; @@ -1659,7 +1700,7 @@ namespace quda // orthonormalize if (param.mg_global.pre_orthonormalize) { - orthonormalize_vectors(B, num_starting_vectors); + orthonormalizeVectors(B, num_starting_vectors); } for (int s = 0; s < num_starting_vectors; s++) { @@ -1734,16 +1775,14 @@ namespace quda } // print norms - if (getVerbosity() >= QUDA_VERBOSE) { - printfQuda("Norms of Chebyshev filtered near-null vectors:\n"); - for (int i = 0; i < num_vec; i++) { - printfQuda("Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); - } + logQuda(QUDA_VERBOSE, "Norms of Chebyshev filtered near-null vectors:\n"); + for (int i = 0; i < num_vec; i++) { + logQuda(QUDA_VERBOSE, "Vector %d = %e\n", i, sqrt(blas::norm2(*B[i]))); } // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - orthonormalize_vectors(B); + orthonormalizeVectors(B); } popLevel(); @@ -1859,7 +1898,7 @@ namespace quda // global orthonormalization of the generated null-space vectors if(param.mg_global.post_orthonormalize) { - orthonormalize_vectors(B); + orthonormalizeVectors(B); } logQuda(QUDA_VERBOSE, "Done building free vectors\n"); @@ -1904,10 +1943,6 @@ namespace quda for (int i = 0; i < n_conv; i++) B_evecs.push_back(new ColorSpinorField(csParam)); - // before entering the eigen solver, let's free the B vectors to save some memory - //ColorSpinorParam bParam(*B[0]); - //for (int i = 0; i < (int)B.size(); i++) delete B[i]; - EigenSolver *eig_solve; if (!normop && !dagger) { DiracM *mat = new DiracM(*diracResidual); @@ -1935,7 +1970,7 @@ namespace quda delete mat; } - // now reallocate the B vectors copy in e-vectors + // copy e-vectors back in for (int i = 0; i < (int)B.size(); i++) { if (B[0]->SiteSubset() == QUDA_PARITY_SITE_SUBSET) *B[i] = B_evecs[i]->Even(); @@ -1961,7 +1996,7 @@ namespace quda // B vectors can be in half precession on the fine level, need to convert to single ColorSpinorField Btmp; - bool need_tmp_single = param.fine->param.B[0]->Precision() != QUDA_SINGLE_PRECISION; + bool need_tmp_single = param.fine->param.B[0]->Precision() < QUDA_SINGLE_PRECISION; if (need_tmp_single) { ColorSpinorParam b_tmp_param(*param.fine->param.B[0]); b_tmp_param.create = QUDA_NULL_FIELD_CREATE; @@ -1988,7 +2023,7 @@ namespace quda initializeRandomVectors(B_remaining); } if (param.mg_global.pre_orthonormalize) { - orthonormalize_vectors(param.B); + orthonormalizeVectors(param.B); } generateInverseIterations(B_remaining, refresh ? param.mg_global.setup_maxiter_refresh[param.level] : param.mg_global.setup_maxiter[param.level]); @@ -2002,7 +2037,7 @@ namespace quda } if (param.mg_global.pre_orthonormalize) { - orthonormalize_vectors(std::vector(param.B.begin(), param.B.begin() + param.fine->param.Nvec + num_initialize)); + orthonormalizeVectors(std::vector(param.B.begin(), param.B.begin() + param.fine->param.Nvec + num_initialize)); } generateChebyshevFilter(B_remaining); @@ -2016,7 +2051,7 @@ namespace quda } if (param.mg_global.post_orthonormalize) { - orthonormalize_vectors(param.B); + orthonormalizeVectors(param.B); } popLevel(); diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index eefdf41cc7..6a0764de0b 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -139,7 +139,7 @@ quda::mgarray num_setup_iter = {}; quda::mgarray setup_tol = {}; quda::mgarray setup_maxiter = {}; quda::mgarray setup_maxiter_refresh = {}; -quda::mgarray setup_maxiter_inverse_iterations_polish = {}; +quda::mgarray setup_maxiter_inverse_iterations_refinement = {}; quda::mgarray setup_ca_basis = {}; quda::mgarray setup_ca_basis_size = {}; quda::mgarray setup_ca_lambda_min = {}; @@ -929,8 +929,8 @@ void add_multigrid_option_group(std::shared_ptr quda_app) opgroup, "--mg-setup-maxiter-refresh", setup_maxiter_refresh, CLI::Validator(), "The maximum number of solver iterations to use when refreshing the pre-existing null space vectors (default 100)"); quda_app->add_mgoption( - opgroup, "--mg-setup-maxiter-inverse-iterations-polish", setup_maxiter_inverse_iterations_polish, CLI::Validator(), - "The maximum number of solver iterations to use when polishing pre-existing null space vectors after a non-inverse-iteration setup (default 0, no polishing)"); + opgroup, "--mg-setup-maxiter-inverse-iterations-refinement", setup_maxiter_inverse_iterations_refinement, CLI::Validator(), + "The maximum number of solver iterations to use when refining pre-existing null space vectors after a non-inverse-iteration setup (default 0, no refinement)"); quda_app->add_mgoption(opgroup, "--mg-setup-tol", setup_tol, CLI::Validator(), "The tolerance to use for the setup of multigrid (default 5e-6)"); diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index a965e92746..4e4322868c 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -276,7 +276,7 @@ extern quda::mgarray num_setup_iter; extern quda::mgarray setup_tol; extern quda::mgarray setup_maxiter; extern quda::mgarray setup_maxiter_refresh; -extern quda::mgarray setup_maxiter_inverse_iterations_polish; +extern quda::mgarray setup_maxiter_inverse_iterations_refinement; extern quda::mgarray setup_ca_basis; extern quda::mgarray setup_ca_basis_size; extern quda::mgarray setup_ca_lambda_min; diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 98f1c889d7..8666e42eb7 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -93,7 +93,7 @@ void setQudaDefaultMgTestParams() setup_tol[i] = 5e-6; setup_maxiter[i] = 500; setup_maxiter_refresh[i] = 20; - setup_maxiter_inverse_iterations_polish[i] = 0; + setup_maxiter_inverse_iterations_refinement[i] = 0; mu_factor[i] = 1.; coarse_solve_type[i] = QUDA_INVALID_SOLVE; smoother_solve_type[i] = QUDA_INVALID_SOLVE; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 3876eddf16..e41efa82aa 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -260,21 +260,14 @@ inline void safe_strcpy(char *cstr, const std::string &str, size_t limit, const } // MG param types -/** - @brief Set multigrid parameters that are universal across Wilson-type - and staggered-type -*/ -void setMultigridParam(QudaMultigridParam &mg_param); /** - @brief Set multigrid parameters that are specific to Wilson-type - multigrid. Calls setUniversalMultigridParam. + @brief Set multigrid parameters that are specific to Wilson-type multigrid */ void setWilsonMultigridParam(QudaMultigridParam &mg_param); /** - @brief Set multigrid parameters that are specific to staggered-type - multigrid. Calls setUniversalMultigridParam. + @brief Set multigrid parameters that are specific to staggered-type multigrid */ void setStaggeredMultigridParam(QudaMultigridParam &mg_param); diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 6606d79254..b51c0ac592 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -389,7 +389,7 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.setup_tol[i] = setup_tol[i]; mg_param.setup_maxiter[i] = setup_maxiter[i]; mg_param.setup_maxiter_refresh[i] = setup_maxiter_refresh[i]; - mg_param.setup_maxiter_inverse_iterations_polish[i] = setup_maxiter_inverse_iterations_polish[i]; + mg_param.setup_maxiter_inverse_iterations_refinement[i] = setup_maxiter_inverse_iterations_refinement[i]; // Setup type to use (inverse iterations, chebyshev filter, eigenvectors, restriction, free field) mg_param.setup_type[i] = setup_type[i]; From b4cdb6f76184f592017bd8f43428dd875d45b02d Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 6 Jun 2022 11:19:28 -0700 Subject: [PATCH 14/15] Added a more general matrix polynomial abstraction --- include/enum_quda.h | 6 +- include/enum_quda_fortran.h | 2 +- include/invert_quda.h | 42 +---- include/quda.h | 10 +- lib/inv_ca_cg.cpp | 11 +- lib/inv_ca_gcr.cpp | 11 +- lib/multigrid.cpp | 124 ++++++-------- lib/polynomial.hpp | 255 ++++++++++++++++++++++++++++ lib/quda_fortran.F90 | 4 +- lib/solver.hpp | 96 ----------- tests/utils/command_line_params.cpp | 12 +- tests/utils/command_line_params.h | 10 +- 12 files changed, 347 insertions(+), 236 deletions(-) create mode 100644 lib/polynomial.hpp delete mode 100644 lib/solver.hpp diff --git a/include/enum_quda.h b/include/enum_quda.h index c8f66c6e3f..2cde461d66 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -197,12 +197,12 @@ typedef enum QudaResidualType_s { QUDA_INVALID_RESIDUAL = QUDA_INVALID_ENUM } QudaResidualType; -// Which basis to use for CA algorithms -typedef enum QudaCABasis_s { +// Which basis to use for polynomials, CA solver basis +typedef enum QudaPolynomialBasis_s { QUDA_POWER_BASIS, QUDA_CHEBYSHEV_BASIS, QUDA_INVALID_BASIS = QUDA_INVALID_ENUM -} QudaCABasis; +} QudaPolynomialBasis; // Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo) // diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index e6313c0b6a..1fafd83091 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -181,7 +181,7 @@ #define QUDA_HEAVY_QUARK_RESIDUAL 4 #define QUDA_INVALID_RESIDUAL QUDA_INVALID_ENUM -#define QudaCABasis integer(4) +#define QudaPolynomialBasis integer(4) #define QUDA_POWER_BASIS 0 #define QUDA_CHEBYSHEV_BASIS 1 #define QUDA_INVALID_BASIS QUDA_INVALID_ENUM diff --git a/include/invert_quda.h b/include/invert_quda.h index 1c79a1b178..a9a33d8321 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -202,7 +202,7 @@ namespace quda { double omega; /** Basis for CA algorithms */ - QudaCABasis ca_basis; + QudaPolynomialBasis ca_basis; /** Minimum eigenvalue for Chebyshev CA basis */ double ca_lambda_min; @@ -211,7 +211,7 @@ namespace quda { double ca_lambda_max; // -1 -> power iter generate /** Basis for CA algorithms in a preconditioner */ - QudaCABasis ca_basis_precondition; + QudaPolynomialBasis ca_basis_precondition; /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner */ double ca_lambda_min_precondition; @@ -708,40 +708,6 @@ namespace quda { */ void setRecomputeEvals(bool flag) { recompute_evals = flag; }; - /** - @brief Compute power iterations on a Dirac matrix - @param[in] diracm Dirac matrix used for power iterations - @param[in] start Starting rhs for power iterations; value preserved unless it aliases tempvec1 or tempvec2 - @param[in,out] tempvec1 Temporary vector used for power iterations (FIXME: can become a reference when std::swap - can be used on ColorSpinorField) - @param[in,out] tempvec2 Temporary vector used for power iterations (FIXME: can become a reference when std::swap - can be used on ColorSpinorField) - @param[in] niter Total number of power iteration iterations - @param[in] normalize_freq Frequency with which intermediate vector gets normalized - @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac - @return Norm of final power iteration result - */ - template - static double performPowerIterations(const DiracMatrix &diracm, const ColorSpinorField &start, - ColorSpinorField &tempvec1, ColorSpinorField &tempvec2, int niter, - int normalize_freq, Args &&...args); - - /** - @brief Generate a Krylov space in a given basis - @param[in] diracm Dirac matrix used to generate the Krylov space - @param[out] Ap dirac matrix times the Krylov basis vectors - @param[in,out] p Krylov basis vectors; assumes p[0] is in place - @param[in] n_krylov Size of krylov space - @param[in] basis Basis type - @param[in] m_map Slope mapping for Chebyshev basis; ignored for power basis - @param[in] b_map Intercept mapping for Chebyshev basis; ignored for power basis - @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac - */ - template - static void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, - std::vector &p, int n_krylov, QudaCABasis basis, double m_map, - double b_map, Args &&...args); - /** * @brief Return flops * @return flops expended by this operator @@ -1182,7 +1148,7 @@ namespace quda { bool init; bool lambda_init; - QudaCABasis basis; + QudaPolynomialBasis basis; std::vector Q_AQandg; // Fused inner product matrix std::vector Q_AS; // inner product matrix @@ -1320,7 +1286,7 @@ namespace quda { bool init; bool lambda_init; // whether or not lambda_max has been initialized - QudaCABasis basis; // CA basis + QudaPolynomialBasis basis; // CA basis std::vector alpha; // Solution coefficient vectors diff --git a/include/quda.h b/include/quda.h index 5b90f464ea..f6ad0404f6 100644 --- a/include/quda.h +++ b/include/quda.h @@ -322,7 +322,7 @@ extern "C" { double omega; /** Basis for CA algorithms */ - QudaCABasis ca_basis; + QudaPolynomialBasis ca_basis; /** Minimum eigenvalue for Chebyshev CA basis */ double ca_lambda_min; @@ -331,7 +331,7 @@ extern "C" { double ca_lambda_max; /** Basis for CA algorithms in a preconditioned solver */ - QudaCABasis ca_basis_precondition; + QudaPolynomialBasis ca_basis_precondition; /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner solver */ double ca_lambda_min_precondition; @@ -642,7 +642,7 @@ extern "C" { int setup_maxiter_inverse_iterations_refinement[QUDA_MAX_MG_LEVEL]; /** Basis to use for CA solver setup */ - QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; + QudaPolynomialBasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; /** Basis size for CA solver setup */ int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]; @@ -693,7 +693,7 @@ extern "C" { int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; /** Basis to use for CA coarse solvers */ - QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + QudaPolynomialBasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; /** Basis size for CA coarse solvers */ int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]; @@ -717,7 +717,7 @@ extern "C" { int nu_post[QUDA_MAX_MG_LEVEL]; /** Basis to use for CA smoother solvers */ - QudaCABasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + QudaPolynomialBasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; /** Minimum eigenvalue for Chebyshev CA smoother basis */ double smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp index 9f96b093f6..e8d6f505bb 100644 --- a/lib/inv_ca_cg.cpp +++ b/lib/inv_ca_cg.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include /** @file inv_ca_cg.cpp @@ -485,7 +485,14 @@ namespace quda // Perform 100 power iterations, normalizing every 10 mat-vecs, using r as an initial seed // and Q[0]/AQ[0] as temporaries for the power iterations. tmp_sloppy/tmp2_sloppy get passed in as temporaries // for matSloppy. - lambda_max = 1.1 * Solver::performPowerIterations(matSloppy, r, Q[0], AQ[0], 100, 10, tmp_sloppy, tmp2_sloppy); + PolynomialBasisParams basis_params; + basis_params.basis = QUDA_POWER_BASIS; + basis_params.n_order = 100; + basis_params.normalize_freq = 10; + basis_params.tmp_vectors.emplace_back(std::ref(Q[0])); + basis_params.tmp_vectors.emplace_back(std::ref(AQ[1])); + lambda_max = 1.1 * performPowerIterations(matSloppy, r, basis_params, tmp_sloppy, tmp2_sloppy); + logQuda(QUDA_SUMMARIZE, "CA-CG Approximate lambda max = 1.1 x %e\n", lambda_max / 1.1); lambda_init = true; diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index 3779ad1c4d..6e593ee936 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include namespace quda { @@ -223,7 +223,14 @@ namespace quda // Perform 100 power iterations, normalizing every 10 mat-vecs, using r_ as an initial seed // and q[0]/q[1] as temporaries for the power iterations. tmpSloppy get passed in a temporary // for matSloppy. Technically illegal if n_krylov == 1, but in that case lambda_max isn't used anyway. - lambda_max = 1.1 * Solver::performPowerIterations(matSloppy, r, q[0], q[1], 100, 10, tmp_sloppy); + PolynomialBasisParams basis_params; + basis_params.basis = QUDA_POWER_BASIS; + basis_params.n_order = 100; + basis_params.normalize_freq = 10; + basis_params.tmp_vectors.emplace_back(std::ref(q[0])); + basis_params.tmp_vectors.emplace_back(std::ref(q[1])); + lambda_max = 1.1 * performPowerIterations(matSloppy, r, basis_params, tmp_sloppy); + logQuda(QUDA_SUMMARIZE, "CA-GCR Approximate lambda max = 1.1 x %e\n", lambda_max / 1.1); lambda_init = true; diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 9a5c5af27b..cba32728ca 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include // for building the KD inverse op #include @@ -790,7 +790,7 @@ namespace quda typedef Matrix matrix; // Create std::vector of references to ColorSpinorFields we're orthonormalizing - std::vector> ortho_vecs; + std::vector ortho_vecs; for (int i = 0; i < num_vec; i++) ortho_vecs.emplace_back(std::ref(*vecs[i])); @@ -829,7 +829,7 @@ namespace quda std::vector coeff(i + 1); for (int j = 0; j < i + 1; j++) coeff[j] = R_inv(j, i); - std::vector > vecs_subset(ortho_vecs.begin(), ortho_vecs.begin() + i + 1); + std::vector vecs_subset(ortho_vecs.begin(), ortho_vecs.begin() + i + 1); caxpy(coeff, vecs_subset, tmp_field); blas::copy(ortho_vecs[i], tmp_field); } @@ -1679,20 +1679,43 @@ namespace quda double lambda_max = param.mg_global.filter_lambda_max[param.level]; if (lambda_max < lambda_min) { // approximate lambda_max - lambda_max = 1.1 * Solver::performPowerIterations(dirac_mdm, *B[0], pk, pkm1, - 100, 10, tmp1, tmp2); + PolynomialBasisParams basis_params; + basis_params.basis = QUDA_POWER_BASIS; + basis_params.n_order = 100; + basis_params.normalize_freq = 10; + basis_params.tmp_vectors.emplace_back(std::ref(pk)); + basis_params.tmp_vectors.emplace_back(std::ref(pkm1)); + lambda_max = 1.1 * performPowerIterations(dirac_mdm, *B[0], basis_params, tmp1, tmp2); } logQuda(QUDA_VERBOSE, "Nums starting %d Filter iter %d Filter rescale %d Filter next %d Lambda min %e lambda max %e\n", num_starting_vectors, filter_iterations, filter_rescale_freq, iterations_for_next, lambda_min, lambda_max); - // create filter interpolators - double m_map_filter = 2. / (lambda_max - lambda_min); - double b_map_filter = - (lambda_max + lambda_min) / (lambda_max - lambda_min); - - // create interpolators for generating more near-nulls - double m_map_generate = 2. / lambda_max; - double b_map_generate = -1.; + // create filter basis info + PolynomialBasisParams filter_basis_params; + filter_basis_params.basis = QUDA_CHEBYSHEV_BASIS; + filter_basis_params.n_order = filter_iterations; + filter_basis_params.normalize_freq = filter_rescale_freq; + filter_basis_params.tmp_vectors.emplace_back(std::ref(pk)); + filter_basis_params.tmp_vectors.emplace_back(std::ref(pkm1)); + filter_basis_params.tmp_vectors.emplace_back(std::ref(pkm2)); + filter_basis_params.tmp_vectors.emplace_back(std::ref(Apkm1)); + filter_basis_params.m_map = PolynomialBasisParams::compute_m_map(lambda_min, lambda_max); + filter_basis_params.b_map = PolynomialBasisParams::compute_b_map(lambda_min, lambda_max); + + // create basis info for polynomial that generates more near-nulls + // note that n_order is intentionally left empty, since that can + // vary if the number of starting vectors does not divide evenly + // into the total number of near-nulls + PolynomialBasisParams generation_basis_params; + generation_basis_params.basis = QUDA_CHEBYSHEV_BASIS; + generation_basis_params.normalize_freq = 0; + generation_basis_params.tmp_vectors.emplace_back(std::ref(pk)); + generation_basis_params.tmp_vectors.emplace_back(std::ref(pkm1)); + generation_basis_params.tmp_vectors.emplace_back(std::ref(pkm2)); + generation_basis_params.tmp_vectors.emplace_back(std::ref(Apkm1)); + generation_basis_params.m_map = PolynomialBasisParams::compute_m_map(0., lambda_max); + generation_basis_params.b_map = PolynomialBasisParams::compute_b_map(0., lambda_max); // we create batches of near-nulls from `B.size() / vectors_per_set` starting // random vectors @@ -1705,72 +1728,21 @@ namespace quda for (int s = 0; s < num_starting_vectors; s++) { - // filter - { - // P0 - blas::copy(pk, *B[s]); - - if (filter_iterations > 0) { - // P1 = m Ap_0 + b p_0 - std::swap(pkm1, pk); // p_k -> p_{k - 1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); - blas::axpbyz(m_map_filter, Apkm1, b_map_filter, pkm1, pk); - - if (filter_iterations > 1) { - // Enter recursion relation - for (int k = 2; k <= filter_iterations; k++) { - std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} - std::swap(pkm1, pk); // p_k -> p_{k-1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} - blas::axpbypczw(2. * m_map_filter, Apkm1, 2. * b_map_filter, pkm1, -1., pkm2, pk); - - // heuristic rescale to keep norms in check... - if (k % filter_rescale_freq == 0) { - double tmp_nrm2 = blas::norm2(pk); - logQuda(QUDA_VERBOSE, "Starting vector %d heuristic rescale at %d old norm2 %e\n", s, k, tmp_nrm2); - double tmp_inv_nrm = 1. / sqrt(tmp_nrm2); - ax(tmp_inv_nrm, pk); - ax(tmp_inv_nrm, pkm1); - ax(tmp_inv_nrm, pkm2); - ax(tmp_inv_nrm, Apkm1); - } - } - } - } - - // print the norm (to check for enhancement, normalize) - double nrm2 = blas::norm2(pk); - logQuda(QUDA_VERBOSE, "Enhanced norm2 for start %d is %e\n", s, nrm2); - ax(1.0 / sqrt(nrm2), pk); - - // This is the first filtered vector - blas::copy(*B[s], pk); - } + // Apply the initial Chebyshev filter + applyMatrixPolynomial(dirac_mdm, *B[s], *B[s], filter_basis_params, tmp1, tmp2); // Now we generate further vectors by another Chebyshev filter from 0 -> lambda_max - // P0 is trivially in place - - // P1 - std::swap(pkm1, pk); // p_k -> p_{k - 1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); - blas::axpbyz(m_map_generate, Apkm1, b_map_generate, pkm1, pk); - - bool first = true; - - for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) { - - for (int k = first ? 2 : 0; k < iterations_for_next; k++) { - std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} - std::swap(pkm1, pk); // p_k -> p_{k-1} - dirac_mdm(Apkm1, pkm1, tmp1, tmp2); // compute A p_{k-1} - blas::axpbypczw(2. * m_map_generate, Apkm1, 2. * b_map_generate, pkm1, -1., pkm2, pk); - } - - blas::copy(*B[i], pk); - - if (first) first = false; - - } + // Populate the array of outputs + std::vector output_vecs; + for (int i = s + num_starting_vectors; i < num_vec; i += num_starting_vectors) + output_vecs.emplace_back(std::ref(*B[i])); + + // the number we need to save is the size of output_vecs; the order of the polynomial is + // iterations_for_next times the size + generation_basis_params.n_order = output_vecs.size() * iterations_for_next; + + // Generate additional near-null vectors + applyMatrixPolynomial(dirac_mdm, output_vecs, *B[s], generation_basis_params, iterations_for_next, tmp1, tmp2); } diff --git a/lib/polynomial.hpp b/lib/polynomial.hpp new file mode 100644 index 0000000000..a034d08656 --- /dev/null +++ b/lib/polynomial.hpp @@ -0,0 +1,255 @@ + +#include +#include +#include + +namespace quda +{ + + struct PolynomialBasisParams { + + /** What basis polynomial are we using */ + QudaPolynomialBasis basis; + + /** What order polynomial are we generating? */ + int n_order; + + /** How often are we re-normalizing the vector to avoid overflow + Set to 0 to never normalize */ + int normalize_freq; + + /** Chebyshev basis: slope of mapping onto [-1,1] */ + double m_map; + + /** Chebyshev basis: intercept of mapping onto [-1,1] */ + double b_map; + + /** Vector of temporary vectors; need 2 for power iterations, 4 for chebyshev basis */ + std::vector tmp_vectors; + + PolynomialBasisParams() : + basis(QUDA_INVALID_BASIS), + n_order(0), + normalize_freq(0), + m_map(0.0), + b_map(0.0), + tmp_vectors() { } + + static double compute_m_map(double lambda_min, double lambda_max) { return 2. / (lambda_max - lambda_min); } + static double compute_b_map(double lambda_min, double lambda_max) { return - (lambda_max + lambda_min) / (lambda_max - lambda_min); } + + static void check_params(const PolynomialBasisParams& params) { + if (params.n_order < 0) errorQuda("Invalid polynomial order %d", params.n_order); + if (params.normalize_freq < 0) errorQuda("Invalid rescale frequency %d", params.normalize_freq); + + switch (params.basis) { + case QUDA_POWER_BASIS: + if (params.tmp_vectors.size() < 2) errorQuda("Invalid temporary vector count %lu for power basis, expected 2", params.tmp_vectors.size()); + break; + case QUDA_CHEBYSHEV_BASIS: + if (params.m_map < 0) errorQuda("Invalid m_map %e, implies lambda_min >= lambda_max", params.m_map); + if (params.tmp_vectors.size() < 4) errorQuda("Invalid temporary vector count %lu for Chebyshev basis, expected 4", params.tmp_vectors.size()); + break; + default: errorQuda("Polynomial basis is unspecified"); + } + } + }; + + /** + @brief Apply a polynomial basis to a starting vector, optionally saving with some frequency + @param[in] diracm Dirac matrix used for the polynomial + @param[out] output_vecs Output vectors, which may be more than one if the save frquency is non-zero + @param[in] start_vec Starting vector for polynomial application + @param[in] poly_params Parameters for the polynomial application + @param[in] save_freq Frequency with which vectors are saved to output_vecs + @param[in] args Parameter pack of ColorSpinorFields used as temporaries passed to Dirac + */ + template + void applyMatrixPolynomial(const DiracMatrix &diracm, std::vector &output_vecs, + const ColorSpinorField &start_vec, const PolynomialBasisParams &poly_params, int save_freq, + Args &&...args) + { + PolynomialBasisParams::check_params(poly_params); + // if the save_frequency is 0, make sure output_vecs is of length 1 + if (save_freq < 0) + errorQuda("Invalid save frequency %lu", output_vecs.size()); + else if (save_freq == 0 && output_vecs.size() != 1) + errorQuda("Invalid vector length %lu", output_vecs.size()); + else if (save_freq > 0 && poly_params.n_order % save_freq != 0) + errorQuda("Saving frequency %d does not evenly divide into polynomial order %d", save_freq, poly_params.n_order); + else if (save_freq > 0 && static_cast(output_vecs.size()) != poly_params.n_order / save_freq) + errorQuda("Invalid vector length %lu, expected %d", output_vecs.size(), poly_params.n_order / save_freq); + + int save_count = 0; + + if (poly_params.basis == QUDA_POWER_BASIS) { + ColorSpinorField &tempvec1 = poly_params.tmp_vectors[0]; + ColorSpinorField &tempvec2 = poly_params.tmp_vectors[1]; + blas::copy(tempvec1, start_vec); // no-op if fieds alias + + for (int i = 1; i <= poly_params.n_order; i++) { + diracm(tempvec2, tempvec1, args...); + if (save_freq > 0 && i % save_freq == 0) + blas::copy(output_vecs[save_count++], tempvec2); + if (poly_params.normalize_freq > 0 && i % poly_params.normalize_freq == 0) { + double tmp_nrm = sqrt(blas::norm2(tempvec2)); + logQuda(QUDA_VERBOSE, "Triggered rescale during matrix polynomial application; norm at rescale is %e\n", tmp_nrm); + blas::ax(1.0 / tmp_nrm, tempvec2); + } + std::swap(tempvec1, tempvec2); + } + + // if save_freq != 0, the last vector has already been saved into output_vecs + if (save_freq == 0) blas::copy(output_vecs[0], tempvec1); + } else if (poly_params.basis == QUDA_CHEBYSHEV_BASIS) { + + ColorSpinorField &pk = poly_params.tmp_vectors[0]; + ColorSpinorField &pkm1 = poly_params.tmp_vectors[1]; + ColorSpinorField &pkm2 = poly_params.tmp_vectors[2]; + ColorSpinorField &Apkm1 = poly_params.tmp_vectors[3]; + + int save_count = 0; + + // P0 + blas::copy(pk, start_vec); + + if (poly_params.n_order > 0) { + // P1 = m Ap_0 + b p_0 + std::swap(pkm1, pk); // p_k -> p_{k - 1} + diracm(Apkm1, pkm1, args...); + blas::axpbyz(poly_params.m_map, Apkm1, poly_params.b_map, pkm1, pk); + + if (save_freq == 1) blas::copy(output_vecs[save_count++], pk); + if (poly_params.normalize_freq == 1) { + double tmp_nrm = sqrt(blas::norm2(pk)); + logQuda(QUDA_VERBOSE, "Triggered rescale during matrix polynomial application; norm at rescale is %e\n", tmp_nrm); + double tmp_inv_nrm = 1. / tmp_nrm; + blas::ax(tmp_inv_nrm, pk); + blas::ax(tmp_inv_nrm, pkm1); + blas::ax(tmp_inv_nrm, Apkm1); + } + + if (poly_params.n_order > 1) { + // Enter recursion relation + for (int k = 2; k <= poly_params.n_order; k++) { + std::swap(pkm2, pkm1); // p_{k - 1} -> p_{k-2} + std::swap(pkm1, pk); // p_k -> p_{k-1} + diracm(Apkm1, pkm1, args...); // compute A p_{k-1} + blas::axpbypczw(2. * poly_params.m_map, Apkm1, 2. * poly_params.b_map, pkm1, -1., pkm2, pk); + + if (save_freq > 0 && k % save_freq == 0) blas::copy(output_vecs[save_count++], pk); + // heuristic rescale to keep norms in check... + if (poly_params.normalize_freq > 0 && k % poly_params.normalize_freq == 0) { + double tmp_nrm = sqrt(blas::norm2(pk)); + logQuda(QUDA_VERBOSE, "Triggered rescale during matrix polynomial application; norm at rescale is %e\n", tmp_nrm); + double tmp_inv_nrm = 1. / tmp_nrm; + blas::ax(tmp_inv_nrm, pk); + blas::ax(tmp_inv_nrm, pkm1); + blas::ax(tmp_inv_nrm, pkm2); + blas::ax(tmp_inv_nrm, Apkm1); + } + } + } + } + + // if save_freq != 0, the last vector has already been saved into output_vecs + if (save_freq == 0) blas::copy(output_vecs[0], pk); + } else { + errorQuda("Invalid basis %d", poly_params.basis); + } + } + + /** + @brief Apply a polynomial basis to a starting vector + @param[in] diracm Dirac matrix used for the polynomial + @param[out] output_vec Output vector + @param[in] start_vec Starting vector for polynomial application + @param[in] poly_params Parameters for the polynomial application + @param[in] args Parameter pack of ColorSpinorFields used as temporaries passed to Dira + */ + template + void applyMatrixPolynomial(const DiracMatrix &diracm, ColorSpinorField &output_vec, + const ColorSpinorField &start_vec, const PolynomialBasisParams &poly_params, + Args &&...args) + { + std::vector output_vecs; + output_vecs.emplace_back(std::ref(output_vec)); + applyMatrixPolynomial(diracm, output_vecs, start_vec, poly_params, 0, args...); + } + + /** + @brief Compute power iterations on a Dirac matrix + @param[in] diracm Dirac matrix used for power iterations + @param[in] start Starting rhs for power iterations; value preserved unless it aliases temporary vectors + @param[in,out] poly_params Parameters for polynomial application, must correspond to power iterations + @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac + @return Norm of final power iteration result + */ + template + double performPowerIterations(const DiracMatrix &diracm, const ColorSpinorField &start, + PolynomialBasisParams &poly_params, Args &&...args) + { + PolynomialBasisParams::check_params(poly_params); + if (poly_params.basis != QUDA_POWER_BASIS) errorQuda("Invalid basis %d", poly_params.basis); + + auto tempvec1 = poly_params.tmp_vectors[0]; + auto tempvec2 = poly_params.tmp_vectors[1]; + + applyMatrixPolynomial(diracm, tempvec1, start, poly_params, args...); + + // Get Rayleigh quotient + double tmpnrm = sqrt(blas::norm2(tempvec1)); + blas::ax(1.0 / tmpnrm, tempvec1); + diracm(tempvec2, tempvec1, args...); + double lambda_max = sqrt(blas::norm2(tempvec2)); + logQuda(QUDA_VERBOSE, "Power iterations approximate max = %e\n", lambda_max); + + return lambda_max; + } + + /** + @brief Generate a Krylov space in a given basis + @param[in] diracm Dirac matrix used to generate the Krylov space + @param[out] Ap dirac matrix times the Krylov basis vectors + @param[in,out] p Krylov basis vectors; assumes p[0] is in place + @param[in] n_krylov Size of krylov space + @param[in] basis Basis type + @param[in] m_map Slope mapping for Chebyshev basis; ignored for power basis + @param[in] b_map Intercept mapping for Chebyshev basis; ignored for power basis + @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac + */ + template + void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, + std::vector &p, int n_krylov, QudaPolynomialBasis basis, double m_map, + double b_map, Args &&...args) + { + // in some cases p or Ap may be larger + if (static_cast(p.size()) < n_krylov) errorQuda("Invalid p.size() %lu < n_krylov %d", p.size(), n_krylov); + if (static_cast(Ap.size()) < n_krylov) errorQuda("Invalid Ap.size() %lu < n_krylov %d", Ap.size(), n_krylov); + + if (basis == QUDA_POWER_BASIS) { + for (int k = 0; k < n_krylov; k++) { + diracm(Ap[k], p[k], args...); + if (k < (n_krylov - 1)) blas::copy(p[k + 1], Ap[k]); // no op if fields alias, which is often the case + } + } else { // chebyshev basis + diracm(Ap[0], p[0], args...); + + if (n_krylov > 1) { + // p_1 = m Ap_0 + b p_0 + blas::axpbyz(m_map, Ap[0], b_map, p[0], p[1]); + diracm(Ap[1], p[1], args...); + + // Enter recursion relation + if (n_krylov > 2) { + // p_k = 2 m A[_{k-1} + 2 b p_{k-1} - p_{k-2} + for (int k = 2; k < n_krylov; k++) { + blas::axpbypczw(2. * m_map, Ap[k - 1], 2. * b_map, p[k - 1], -1., p[k - 2], p[k]); + diracm(Ap[k], p[k], args...); + } + } + } + } + } + +} // namespace quda diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 01a792558a..f8fd660c88 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -261,7 +261,7 @@ module quda_fortran real(8) :: omega ! Basis for CA algorithms - QudaCABasis :: ca_basis + QudaPolynomialBasis :: ca_basis ! Minimum eigenvalue for Chebyshev CA basis real(8) :: ca_lambda_min @@ -270,7 +270,7 @@ module quda_fortran real(8) :: ca_lambda_max ! Basis for CA algorithms in preconditioner solvers - QudaCABasis :: ca_basis_precondition + QudaPolynomialBasis :: ca_basis_precondition ! Minimum eigenvalue for Chebyshev CA basis in preconditioner solvers real(8) :: ca_lambda_min_precondition diff --git a/lib/solver.hpp b/lib/solver.hpp deleted file mode 100644 index ab93451aef..0000000000 --- a/lib/solver.hpp +++ /dev/null @@ -1,96 +0,0 @@ - -#include -#include -#include -#include - -namespace quda -{ - - /** - @brief Compute power iterations on a Dirac matrix - @param[in] diracm Dirac matrix used for power iterations - @param[in] start Starting rhs for power iterations; value preserved unless it aliases tempvec1 or tempvec2 - @param[in,out] tempvec1 Temporary vector used for power iterations - @param[in,out] tempvec2 Temporary vector used for power iterations - @param[in] niter Total number of power iteration iterations - @param[in] normalize_freq Frequency with which intermediate vector gets normalized - @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac - @return Norm of final power iteration result - */ - template - double Solver::performPowerIterations(const DiracMatrix &diracm, const ColorSpinorField &start, - ColorSpinorField &tempvec1, ColorSpinorField &tempvec2, int niter, - int normalize_freq, Args &&...args) - { - checkPrecision(tempvec1, tempvec2); - blas::copy(tempvec1, start); // no-op if fields alias - - // Do niter iterations, normalize every normalize_freq - for (int i = 0; i < niter; i++) { - if (normalize_freq > 0 && i % normalize_freq == 0) { - double tmpnrm = sqrt(blas::norm2(tempvec1)); - blas::ax(1.0 / tmpnrm, tempvec1); - } - diracm(tempvec2, tempvec1, args...); - if (normalize_freq > 0 && i % normalize_freq == 0) { - logQuda(QUDA_VERBOSE, "Current Rayleigh Quotient step %d is %e\n", i, sqrt(blas::norm2(tempvec2))); - } - std::swap(tempvec1, tempvec2); - } - // Get Rayleigh quotient - double tmpnrm = sqrt(blas::norm2(tempvec1)); - blas::ax(1.0 / tmpnrm, tempvec1); - diracm(tempvec2, tempvec1, args...); - double lambda_max = sqrt(blas::norm2(tempvec2)); - logQuda(QUDA_VERBOSE, "Power iterations approximate max = %e\n", lambda_max); - - return lambda_max; - } - - /** - @brief Generate a Krylov space in a given basis - @param[in] diracm Dirac matrix used to generate the Krylov space - @param[out] Ap dirac matrix times the Krylov basis vectors - @param[in,out] p Krylov basis vectors; assumes p[0] is in place - @param[in] n_krylov Size of krylov space - @param[in] basis Basis type - @param[in] m_map Slope mapping for Chebyshev basis; ignored for power basis - @param[in] b_map Intercept mapping for Chebyshev basis; ignored for power basis - @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac - */ - template - void Solver::computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, - std::vector &p, int n_krylov, QudaCABasis basis, double m_map, - double b_map, Args &&...args) - { - // in some cases p or Ap may be larger - if (static_cast(p.size()) < n_krylov) errorQuda("Invalid p.size() %lu < n_krylov %d", p.size(), n_krylov); - if (static_cast(Ap.size()) < n_krylov) errorQuda("Invalid Ap.size() %lu < n_krylov %d", Ap.size(), n_krylov); - - if (basis == QUDA_POWER_BASIS) { - for (int k = 0; k < n_krylov; k++) { - diracm(Ap[k], p[k], args...); - if (k < (n_krylov - 1)) blas::copy(p[k + 1], Ap[k]); // no op if fields alias, which is often the case - } - } else { // chebyshev basis - diracm(Ap[0], p[0], args...); - - if (n_krylov > 1) { - // p_1 = m Ap_0 + b p_0 - blas::axpbyz(m_map, Ap[0], b_map, p[0], p[1]); - diracm(Ap[1], p[1], args...); - - // Enter recursion relation - if (n_krylov > 2) { - // p_k = 2 m A[_{k-1} + 2 b p_{k-1} - p_{k-2} - for (int k = 2; k < n_krylov; k++) { - blas::axpbypczw(2. * m_map, Ap[k - 1], 2. * b_map, p[k - 1], -1., p[k - 2], p[k]); - diracm(Ap[k], p[k], args...); - } - } - } - } - } - -} // namespace quda diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index 6a0764de0b..9be2f9f557 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -50,10 +50,10 @@ int niter = 100; int maxiter_precondition = 10; QudaVerbosity verbosity_precondition = QUDA_SUMMARIZE; int gcrNkrylov = 8; -QudaCABasis ca_basis = QUDA_CHEBYSHEV_BASIS; +QudaPolynomialBasis ca_basis = QUDA_CHEBYSHEV_BASIS; double ca_lambda_min = 0.0; double ca_lambda_max = -1.0; -QudaCABasis ca_basis_precondition = QUDA_POWER_BASIS; +QudaPolynomialBasis ca_basis_precondition = QUDA_POWER_BASIS; double ca_lambda_min_precondition = 0.0; double ca_lambda_max_precondition = -1.0; int pipeline = 0; @@ -140,7 +140,7 @@ quda::mgarray setup_tol = {}; quda::mgarray setup_maxiter = {}; quda::mgarray setup_maxiter_refresh = {}; quda::mgarray setup_maxiter_inverse_iterations_refinement = {}; -quda::mgarray setup_ca_basis = {}; +quda::mgarray setup_ca_basis = {}; quda::mgarray setup_ca_basis_size = {}; quda::mgarray setup_ca_lambda_min = {}; quda::mgarray setup_ca_lambda_max = {}; @@ -159,13 +159,13 @@ double omega = 0.85; quda::mgarray coarse_solver = {}; quda::mgarray coarse_solver_tol = {}; quda::mgarray smoother_type = {}; -quda::mgarray smoother_solver_ca_basis = {}; +quda::mgarray smoother_solver_ca_basis = {}; quda::mgarray smoother_solver_ca_lambda_min = {}; quda::mgarray smoother_solver_ca_lambda_max = {}; QudaPrecision smoother_halo_prec = QUDA_INVALID_PRECISION; quda::mgarray smoother_tol = {}; quda::mgarray coarse_solver_maxiter = {}; -quda::mgarray coarse_solver_ca_basis = {}; +quda::mgarray coarse_solver_ca_basis = {}; quda::mgarray coarse_solver_ca_basis_size = {}; quda::mgarray coarse_solver_ca_lambda_min = {}; quda::mgarray coarse_solver_ca_lambda_max = {}; @@ -288,7 +288,7 @@ bool enable_testing = false; namespace { - CLI::TransformPairs ca_basis_map {{"power", QUDA_POWER_BASIS}, {"chebyshev", QUDA_CHEBYSHEV_BASIS}}; + CLI::TransformPairs ca_basis_map {{"power", QUDA_POWER_BASIS}, {"chebyshev", QUDA_CHEBYSHEV_BASIS}}; CLI::TransformPairs contract_type_map {{"open", QUDA_CONTRACT_TYPE_OPEN}, {"dr", QUDA_CONTRACT_TYPE_DR}}; diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index 4e4322868c..5871c605c5 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -189,10 +189,10 @@ extern int niter; extern int maxiter_precondition; extern QudaVerbosity verbosity_precondition; extern int gcrNkrylov; -extern QudaCABasis ca_basis; +extern QudaPolynomialBasis ca_basis; extern double ca_lambda_min; extern double ca_lambda_max; -extern QudaCABasis ca_basis_precondition; +extern QudaPolynomialBasis ca_basis_precondition; extern double ca_lambda_min_precondition; extern double ca_lambda_max_precondition; extern int pipeline; @@ -277,7 +277,7 @@ extern quda::mgarray setup_tol; extern quda::mgarray setup_maxiter; extern quda::mgarray setup_maxiter_refresh; extern quda::mgarray setup_maxiter_inverse_iterations_refinement; -extern quda::mgarray setup_ca_basis; +extern quda::mgarray setup_ca_basis; extern quda::mgarray setup_ca_basis_size; extern quda::mgarray setup_ca_lambda_min; extern quda::mgarray setup_ca_lambda_max; @@ -297,13 +297,13 @@ extern double omega; extern quda::mgarray coarse_solver; extern quda::mgarray coarse_solver_tol; extern quda::mgarray smoother_type; -extern quda::mgarray smoother_solver_ca_basis; +extern quda::mgarray smoother_solver_ca_basis; extern quda::mgarray smoother_solver_ca_lambda_min; extern quda::mgarray smoother_solver_ca_lambda_max; extern QudaPrecision smoother_halo_prec; extern quda::mgarray smoother_tol; extern quda::mgarray coarse_solver_maxiter; -extern quda::mgarray coarse_solver_ca_basis; +extern quda::mgarray coarse_solver_ca_basis; extern quda::mgarray coarse_solver_ca_basis_size; extern quda::mgarray coarse_solver_ca_lambda_min; extern quda::mgarray coarse_solver_ca_lambda_max; From e9fba2179c99b23d1da66df6e522e54de7af690a Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 6 Jun 2022 11:39:48 -0700 Subject: [PATCH 15/15] Updated CA basis generation to use polynomial basis struct --- lib/inv_ca_cg.cpp | 12 ++++++---- lib/inv_ca_gcr.cpp | 12 ++++++---- lib/polynomial.hpp | 55 +++++++++++++++++++++++++--------------------- 3 files changed, 46 insertions(+), 33 deletions(-) diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp index e8d6f505bb..2b3cbe40dd 100644 --- a/lib/inv_ca_cg.cpp +++ b/lib/inv_ca_cg.cpp @@ -548,9 +548,13 @@ namespace quda double maxrr = rNorm; // The maximum residual norm since the last reliable update. double maxr_deflate = rNorm; // The maximum residual since the last deflation - // Factors which map linear operator onto [-1,1] - double m_map = 2. / (lambda_max - lambda_min); - double b_map = -(lambda_max + lambda_min) / (lambda_max - lambda_min); + // Parameters for the basis polynomial + PolynomialBasisParams ca_params; + ca_params.basis = basis; + ca_params.n_order = n_krylov; + ca_params.normalize_freq = 0; + ca_params.m_map = PolynomialBasisParams::compute_m_map(lambda_min, lambda_max); + ca_params.b_map = PolynomialBasisParams::compute_b_map(lambda_min, lambda_max); blas::copy(S[0], r); // no op if uni-precision @@ -558,7 +562,7 @@ namespace quda while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) { // build up a space of size n_krylov, assumes S[0] is in place - computeCAKrylovSpace(matSloppy, AS, S, n_krylov, basis, m_map, b_map, tmp_sloppy, tmp2_sloppy); + computeCAKrylovSpace(matSloppy, AS, S, ca_params, tmp_sloppy, tmp2_sloppy); // we can greatly simplify the workflow for fixed iterations if (!fixed_iteration) { diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index 6e593ee936..e9b0a143ed 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -241,9 +241,13 @@ namespace quda } } - // Factors which map linear operator onto [-1,1] - double m_map = 2. / (lambda_max - lambda_min); - double b_map = -(lambda_max + lambda_min) / (lambda_max - lambda_min); + // Parameters for the basis polynomial + PolynomialBasisParams ca_params; + ca_params.basis = basis; + ca_params.n_order = n_krylov; + ca_params.normalize_freq = 0; + ca_params.m_map = PolynomialBasisParams::compute_m_map(lambda_min, lambda_max); + ca_params.b_map = PolynomialBasisParams::compute_b_map(lambda_min, lambda_max); // Check to see that we're not trying to invert on a zero-field source if (b2 == 0) { @@ -291,7 +295,7 @@ namespace quda while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && total_iter < param.maxiter) { // build up a space of size n_krylov - computeCAKrylovSpace(matSloppy, q, p, n_krylov, basis, m_map, b_map, tmp_sloppy); + computeCAKrylovSpace(matSloppy, q, p, ca_params, tmp_sloppy); solve(alpha, q, p[0]); diff --git a/lib/polynomial.hpp b/lib/polynomial.hpp index a034d08656..aff3386096 100644 --- a/lib/polynomial.hpp +++ b/lib/polynomial.hpp @@ -38,19 +38,22 @@ namespace quda static double compute_m_map(double lambda_min, double lambda_max) { return 2. / (lambda_max - lambda_min); } static double compute_b_map(double lambda_min, double lambda_max) { return - (lambda_max + lambda_min) / (lambda_max - lambda_min); } - static void check_params(const PolynomialBasisParams& params) { + static void check_params(const PolynomialBasisParams& params, bool skip_temporaries_check = false) { if (params.n_order < 0) errorQuda("Invalid polynomial order %d", params.n_order); if (params.normalize_freq < 0) errorQuda("Invalid rescale frequency %d", params.normalize_freq); - switch (params.basis) { - case QUDA_POWER_BASIS: - if (params.tmp_vectors.size() < 2) errorQuda("Invalid temporary vector count %lu for power basis, expected 2", params.tmp_vectors.size()); - break; - case QUDA_CHEBYSHEV_BASIS: - if (params.m_map < 0) errorQuda("Invalid m_map %e, implies lambda_min >= lambda_max", params.m_map); - if (params.tmp_vectors.size() < 4) errorQuda("Invalid temporary vector count %lu for Chebyshev basis, expected 4", params.tmp_vectors.size()); - break; - default: errorQuda("Polynomial basis is unspecified"); + // the temporary check does not need to be done for CA basis generation + if (!skip_temporaries_check) { + switch (params.basis) { + case QUDA_POWER_BASIS: + if (params.tmp_vectors.size() < 2) errorQuda("Invalid temporary vector count %lu for power basis, expected 2", params.tmp_vectors.size()); + break; + case QUDA_CHEBYSHEV_BASIS: + if (params.m_map < 0) errorQuda("Invalid m_map %e, implies lambda_min >= lambda_max", params.m_map); + if (params.tmp_vectors.size() < 4) errorQuda("Invalid temporary vector count %lu for Chebyshev basis, expected 4", params.tmp_vectors.size()); + break; + default: errorQuda("Polynomial basis is unspecified"); + } } } }; @@ -208,47 +211,49 @@ namespace quda } /** - @brief Generate a Krylov space in a given basis - @param[in] diracm Dirac matrix used to generate the Krylov space - @param[out] Ap dirac matrix times the Krylov basis vectors - @param[in,out] p Krylov basis vectors; assumes p[0] is in place - @param[in] n_krylov Size of krylov space - @param[in] basis Basis type - @param[in] m_map Slope mapping for Chebyshev basis; ignored for power basis - @param[in] b_map Intercept mapping for Chebyshev basis; ignored for power basis - @param[in] args Parameter pack of ColorSpinorFields used as temporary passed to Dirac + @brief Apply a polynomial basis to a starting vector, optionally saving with some frequency + @param[in] diracm Dirac matrix used for the polynomial + @param[out] Ap dirac matrix times the Krylov basis vectors + @param[in,out] p Krylov basis vectors; assumes p[0] is in place + @param[in] poly_params Parameters for the polynomial application + @param[in] args Parameter pack of ColorSpinorFields used as temporaries passed to Dirac */ template void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, - std::vector &p, int n_krylov, QudaPolynomialBasis basis, double m_map, - double b_map, Args &&...args) + std::vector &p, PolynomialBasisParams poly_params, Args &&...args) { + PolynomialBasisParams::check_params(poly_params, true); + + auto n_krylov = poly_params.n_order; + // in some cases p or Ap may be larger if (static_cast(p.size()) < n_krylov) errorQuda("Invalid p.size() %lu < n_krylov %d", p.size(), n_krylov); if (static_cast(Ap.size()) < n_krylov) errorQuda("Invalid Ap.size() %lu < n_krylov %d", Ap.size(), n_krylov); - if (basis == QUDA_POWER_BASIS) { + if (poly_params.basis == QUDA_POWER_BASIS) { for (int k = 0; k < n_krylov; k++) { diracm(Ap[k], p[k], args...); if (k < (n_krylov - 1)) blas::copy(p[k + 1], Ap[k]); // no op if fields alias, which is often the case } - } else { // chebyshev basis + } else if (poly_params.basis == QUDA_CHEBYSHEV_BASIS) { diracm(Ap[0], p[0], args...); if (n_krylov > 1) { // p_1 = m Ap_0 + b p_0 - blas::axpbyz(m_map, Ap[0], b_map, p[0], p[1]); + blas::axpbyz(poly_params.m_map, Ap[0], poly_params.b_map, p[0], p[1]); diracm(Ap[1], p[1], args...); // Enter recursion relation if (n_krylov > 2) { // p_k = 2 m A[_{k-1} + 2 b p_{k-1} - p_{k-2} for (int k = 2; k < n_krylov; k++) { - blas::axpbypczw(2. * m_map, Ap[k - 1], 2. * b_map, p[k - 1], -1., p[k - 2], p[k]); + blas::axpbypczw(2. * poly_params.m_map, Ap[k - 1], 2. * poly_params.b_map, p[k - 1], -1., p[k - 2], p[k]); diracm(Ap[k], p[k], args...); } } } + } else { + errorQuda("Invalid basis %d", poly_params.basis); } }