@@ -737,6 +737,273 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
737737 return i;
738738}
739739
740+ template <class ScalarType >
741+ void BCGSTABpre_parallel (const CSysVector<ScalarType>& a, CSysVector<ScalarType>& b_in,
742+ const CMatrixVectorProduct<ScalarType>& mat_vec, const CPreconditioner<ScalarType>& precond, const CConfig* config) {
743+
744+ ScalarType norm_r_in = 0.0 , norm0_in = 0.0 ;
745+ unsigned long ii = 0 ;
746+
747+ CSysVector<ScalarType> A_z_i;
748+ CSysVector<ScalarType> r_0_in;
749+ CSysVector<ScalarType> r_in;
750+ CSysVector<ScalarType> p;
751+ CSysVector<ScalarType> v;
752+ CSysVector<ScalarType> z_i;
753+
754+ /* --- Allocate if not allocated yet ---*/
755+ BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
756+ auto nVar = a.GetNVar ();
757+ auto nBlk = a.GetNBlk ();
758+ auto nBlkDomain = a.GetNBlkDomain ();
759+
760+ A_z_i.Initialize (nBlk, nBlkDomain, nVar, nullptr );
761+ r_0_in.Initialize (nBlk, nBlkDomain, nVar, nullptr );
762+ r_in.Initialize (nBlk, nBlkDomain, nVar, nullptr );
763+ p.Initialize (nBlk, nBlkDomain, nVar, nullptr );
764+ v.Initialize (nBlk, nBlkDomain, nVar, nullptr );
765+ z_i.Initialize (nBlk, nBlkDomain, nVar, nullptr );
766+ }
767+ END_SU2_OMP_SAFE_GLOBAL_ACCESS
768+
769+ /* --- Calculate the initial residual, compute norm, and check if system is already solved ---*/
770+ mat_vec (b_in, A_z_i);
771+ r_in = a - A_z_i;
772+
773+ /* --- Only compute the residuals in full communication mode. ---*/
774+ if (config->GetComm_Level () == COMM_FULL) {
775+ norm_r_in = r_in.norm ();
776+ norm0_in = a.norm ();
777+ /* --- Set the norm to the initial residual value ---*/
778+ /* --- if (tol_type == LinearToleranceType::RELATIVE) norm0_in = norm_r_in; ---*/
779+ }
780+
781+ /* --- Initialization ---*/
782+ ScalarType alpha = 1.0 , omega = 1.0 , rho = 1.0 , rho_prime = 1.0 ;
783+ p = ScalarType (0.0 );
784+ v = ScalarType (0.0 );
785+ r_0_in = r_in;
786+
787+ ScalarType tolerance = 1e-5 ; // Tolerance for the residual norm
788+
789+ /* --- Loop over all search directions ---*/
790+ while (ii < 1000 ) { // Arbitrary high iteration limit for safety
791+ /* --- Compute rho_prime ---*/
792+ rho_prime = rho;
793+
794+ /* --- Compute rho_i ---*/
795+ rho = r_in.dot (r_0_in);
796+
797+ /* --- Compute beta ---*/
798+ ScalarType beta_in = (rho / rho_prime) * (alpha / omega);
799+
800+ /* --- Update p ---*/
801+ p = beta_in * (p - omega * v) + r_in;
802+
803+ /* --- Preconditioning step ---*/
804+ precond (p, z_i);
805+ mat_vec (z_i, v);
806+
807+ /* --- Calculate step-length alpha ---*/
808+ ScalarType r_0_v = r_0_in.dot (v);
809+ alpha = rho / r_0_v;
810+
811+ /* --- Update solution and residual ---*/
812+ b_in += alpha * z_i;
813+ r_in -= alpha * v;
814+
815+ /* --- Preconditioning step ---*/
816+ precond (r_in, z_i);
817+ mat_vec (z_i, A_z_i);
818+
819+ /* --- Calculate step-length omega, avoid division by 0. ---*/
820+ omega = A_z_i.squaredNorm ();
821+ if (omega == ScalarType (0 )) break ;
822+ omega = A_z_i.dot (r_in) / omega;
823+
824+ /* --- Update solution and residual ---*/
825+ b_in += omega * z_i;
826+ r_in -= omega * A_z_i;
827+
828+ /* --- Update the residual norm ---*/
829+ norm_r_in = r_in.norm ();
830+
831+ /* --- Check if residual norm is below tolerance ---*/
832+ if (norm_r_in < tolerance) {
833+ break ; // Stop if the residual norm is below the desired tolerance
834+ }
835+
836+ ii++; // Increment iteration counter
837+ }
838+ }
839+
840+
841+
842+
843+ template <class ScalarType >
844+ unsigned long CSysSolve<ScalarType>::FGMRESandBCGSTAB2_LinSolver(const CSysVector<ScalarType>& b, CSysVector<ScalarType>& x,
845+ const CMatrixVectorProduct<ScalarType>& mat_vec,
846+ const CPreconditioner<ScalarType>& precond, ScalarType tol,
847+ unsigned long m, ScalarType& residual, bool monitoring,
848+ const CConfig* config) const {
849+
850+
851+ const bool masterRank = (SU2_MPI::GetRank () == MASTER_NODE);
852+ const bool flexible = !precond.IsIdentity ();
853+ /* --- If we call the solver outside of a parallel region, but the number of threads allows,
854+ * we still want to parallelize some of the expensive operations. ---*/
855+ const bool nestedParallel = !omp_in_parallel () && omp_get_max_threads () > 1 ;
856+
857+ /* --- Check the subspace size ---*/
858+
859+ if (m < 1 ) {
860+ SU2_MPI::Error (" Number of linear solver iterations must be greater than 0." , CURRENT_FUNCTION);
861+ }
862+
863+ if (m > 5000 ) {
864+ SU2_MPI::Error (" FGMRES subspace is too large." , CURRENT_FUNCTION);
865+ }
866+
867+ /* --- Allocate if not allocated yet ---*/
868+
869+ if (W.size () <= m || (flexible && Z.size () <= m)) {
870+ BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
871+ W.resize (m + 1 );
872+ for (auto & w : W) w.Initialize (x.GetNBlk (), x.GetNBlkDomain (), x.GetNVar (), nullptr );
873+ if (flexible) {
874+ Z.resize (m + 1 );
875+ for (auto & z : Z) z.Initialize (x.GetNBlk (), x.GetNBlkDomain (), x.GetNVar (), nullptr );
876+ }
877+ }
878+ END_SU2_OMP_SAFE_GLOBAL_ACCESS
879+ }
880+
881+ /* --- Define various arrays. In parallel, each thread of each rank has and works
882+ on its own thread, since calculations on these arrays are based on dot products
883+ (reduced across all threads and ranks) all threads do the same computations. ---*/
884+
885+ su2vector<ScalarType> g (m + 1 ), sn (m + 1 ), cs (m + 1 ), y (m);
886+ g = ScalarType (0 );
887+ sn = ScalarType (0 );
888+ cs = ScalarType (0 );
889+ y = ScalarType (0 );
890+ su2matrix<ScalarType> H (m + 1 , m);
891+ H = ScalarType (0 );
892+
893+ /* --- Calculate the norm of the rhs vector. ---*/
894+
895+ ScalarType norm0 = b.norm ();
896+
897+ /* --- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/
898+
899+ if (!xIsZero) {
900+ mat_vec (x, W[0 ]);
901+ W[0 ] -= b;
902+ } else {
903+ W[0 ] = -b;
904+ }
905+
906+ ScalarType beta = W[0 ].norm ();
907+
908+ if (tol_type == LinearToleranceType::RELATIVE) norm0 = beta;
909+
910+ if ((beta < tol * norm0) || (beta < eps)) {
911+ if (masterRank) {
912+ SU2_OMP_MASTER
913+ cout << " CSysSolve::FGMRES(): system solved by initial guess." << endl;
914+ END_SU2_OMP_MASTER
915+ }
916+ residual = beta;
917+ return 0 ;
918+ }
919+
920+ W[0 ] /= -beta;
921+
922+ g[0 ] = beta;
923+
924+ unsigned long i = 0 ;
925+ if ((monitoring) && (masterRank)) {
926+ SU2_OMP_MASTER {
927+ WriteHeader (" FGMRES" , tol, beta);
928+ WriteHistory (i, beta / norm0);
929+ }
930+ END_SU2_OMP_MASTER
931+ }
932+
933+ for (i = 0 ; i < m; i++) {
934+ if (beta < tol * norm0) break ;
935+
936+ if (flexible) {
937+ /* --- Use BCGSTAB as inner iteration ---*/
938+ BCGSTABpre_parallel (W[i], Z[i], mat_vec, precond, config);
939+
940+ mat_vec (Z[i], W[i + 1 ]);
941+ } else {
942+ mat_vec (W[i], W[i + 1 ]);
943+ }
944+
945+ if (nestedParallel) {
946+ SU2_OMP_PARALLEL
947+ ModGramSchmidt (true , i, H, W);
948+ END_SU2_OMP_PARALLEL
949+ } else {
950+ ModGramSchmidt (false , i, H, W);
951+ }
952+
953+ for (unsigned long k = 0 ; k < i; k++) ApplyGivens (sn[k], cs[k], H[k][i], H[k + 1 ][i]);
954+ GenerateGivens (H[i][i], H[i + 1 ][i], sn[i], cs[i]);
955+ ApplyGivens (sn[i], cs[i], g[i], g[i + 1 ]);
956+
957+ beta = fabs (g[i + 1 ]);
958+
959+ if ((((monitoring) && (masterRank)) && ((i + 1 ) % monitorFreq == 0 ))) {
960+ SU2_OMP_MASTER
961+ WriteHistory (i + 1 , beta / norm0);
962+ END_SU2_OMP_MASTER
963+ }
964+ }
965+
966+ SolveReduced (i, H, g, y);
967+
968+ const auto & basis = flexible ? Z : W;
969+
970+ if (nestedParallel) {
971+ SU2_OMP_PARALLEL
972+ for (unsigned long k = 0 ; k < i; k++) x += y[k] * basis[k];
973+ END_SU2_OMP_PARALLEL
974+ } else {
975+ for (unsigned long k = 0 ; k < i; k++) x += y[k] * basis[k];
976+ }
977+
978+ if ((monitoring) && (config->GetComm_Level () == COMM_FULL)) {
979+ if (masterRank) {
980+ SU2_OMP_MASTER
981+ WriteFinalResidual (" FGMRES" , i, beta / norm0);
982+ END_SU2_OMP_MASTER
983+ }
984+
985+ if (recomputeRes) {
986+ mat_vec (x, W[0 ]);
987+ W[0 ] -= b;
988+ ScalarType res = W[0 ].norm ();
989+
990+ if (fabs (res - beta) > tol * 10 ) {
991+ if (masterRank) {
992+ SU2_OMP_MASTER
993+ WriteWarning (beta, res, tol);
994+ END_SU2_OMP_MASTER
995+ }
996+ }
997+ }
998+ }
999+
1000+ residual = beta / norm0;
1001+ return i;
1002+ }
1003+
1004+
1005+
1006+
7401007template <class ScalarType >
7411008unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarType>& b, CSysVector<ScalarType>& x,
7421009 const CMatrixVectorProduct<ScalarType>& mat_vec,
@@ -973,6 +1240,10 @@ unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType>& Jacobian, con
9731240 IterLinSol = RFGMRES_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
9741241 ScreenOutput, config);
9751242 break ;
1243+ case FGMRESandBCGSTAB2:
1244+ IterLinSol = FGMRESandBCGSTAB2_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
1245+ ScreenOutput, config);
1246+ break ;
9761247 case CONJUGATE_GRADIENT:
9771248 IterLinSol = CG_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
9781249 ScreenOutput, config);
@@ -1132,6 +1403,10 @@ unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType>& Jacobian, c
11321403 IterLinSol = BCGSTAB_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
11331404 ScreenOutput, config);
11341405 break ;
1406+ case FGMRESandBCGSTAB2:
1407+ IterLinSol = FGMRESandBCGSTAB2_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
1408+ ScreenOutput, config);
1409+ break ;
11351410 case CONJUGATE_GRADIENT:
11361411 IterLinSol = CG_LinSolver (*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual,
11371412 ScreenOutput, config);
0 commit comments