From 0f8c298f2f12eb64f2088e2f974abf9923f5d871 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Fri, 5 Aug 2022 16:15:40 -0400
Subject: [PATCH 1/3] Start adding OpenMP threading to mixed-contraction kernels

---
 btas/generic/element_wise_contract.h | 137 +++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 9 deletions(-)

diff --git a/btas/generic/element_wise_contract.h b/btas/generic/element_wise_contract.h
index 9b162e89..34d6dc69 100644
--- a/btas/generic/element_wise_contract.h
+++ b/btas/generic/element_wise_contract.h
@@ -6,7 +6,6 @@
 #define BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 
 namespace btas{
-
   // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C){
+  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
@@ -35,32 +34,86 @@
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     BTAS_ASSERT(A.extent(2) == B.extent(1))
 
-    if(!C.empty()){
+    if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
-    } else{
+    } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; i++, i_times_rank += rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       auto *C_ptr = C.data() + i_times_rank;
       ord_t j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; j++, j_times_rank += rank) {
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
-        for (ind_t r = 0; r < rank; r++) {
-          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r))
-                          + beta * *(C_ptr + r);
+        for (ind_t r = 0; r < rank; ++r) {
+          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
       i_times_rank_idx2 += j_times_rank;
     }
   }
 
+  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
+  template<
+    typename _T,
+    class _TensorA, class _TensorB, class _TensorC,
+    class = typename std::enable_if<
+      is_boxtensor<_TensorA>::value &
+      is_boxtensor<_TensorB>::value &
+      is_boxtensor<_TensorC>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
+    >::type
+  >
+  void middle_contract_parallel(const _T alpha, const _TensorA& A, const _TensorB& B, const _T beta, _TensorC& C) {
+    static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
+                  boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
+                  "btas::middle_contract does not support mixed storage order");
+    static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
+                  "btas::middle_contract does not support non-major storage order");
+
+    typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+    // user-defined reduction: element-wise sum of two storage objects; each
+    // thread reduces into a zero-initialized private copy of C's storage
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+    using ind_t = typename _TensorA::range_type::index_type::value_type;
+    using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+    BTAS_ASSERT(A.rank() == 3)
+    BTAS_ASSERT(B.rank() == 2)
+    BTAS_ASSERT(A.extent(1) == B.extent(0))
+    BTAS_ASSERT(A.extent(2) == B.extent(1))
+
+    if (!C.empty()) {
+      BTAS_ASSERT(C.rank() == 2);
+      BTAS_ASSERT(C.extent(0) == A.extent(0))
+      BTAS_ASSERT(C.extent(1) == A.extent(2))
+    } else {
+      C = _TensorC(A.extent(0), A.extent(2));
+      NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
+    }
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
+
+    // rows C(i, :) are independent, so the i loop is distributed over threads
+    storage_type& c_storage = C.storage();
+    auto A_ptr = A.data(), B_ptr = B.data();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_offset_c = i * rank, i_offset_a = i * rank * idx2;
+      for (ind_t r = 0; r < rank; ++r) {
+        for (ind_t j = 0; j < idx2; ++j) {
+          auto& val = c_storage[i_offset_c + r];
+          val += alpha * (*(A_ptr + i_offset_a + j * rank + r) * *(B_ptr + j * rank + r)) + beta * val;
+        }
+      }
+    }
+  }
+
   // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
@@ -122,6 +175,72 @@
     }
   }
 
+  // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
+  template<
+    typename _T,
+    class _TensorA, class _TensorB, class _TensorC,
+    class = typename std::enable_if<
+      is_boxtensor<_TensorA>::value &
+      is_boxtensor<_TensorB>::value &
+      is_boxtensor<_TensorC>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
+    >::type
+  >
+  void middle_contract_with_pseudorank_parallel(const _T alpha, const _TensorA & A,
+                                                const _TensorB& B, const _T beta, _TensorC& C){
+    static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
+                  boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
+                  "btas::middle_contract does not support mixed storage order");
+    static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
+                  "btas::middle_contract does not support non-major storage order");
+
+    typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+    using ind_t = typename _TensorA::range_type::index_type::value_type;
+    using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+
+    BTAS_ASSERT(A.rank() == 3)
+    BTAS_ASSERT(B.rank() == 2)
+    BTAS_ASSERT(A.extent(1) == B.extent(0))
+    ind_t rank = B.extent(1),
+          idx3 = A.extent(2) / rank;
+    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
+
+    if (!C.empty()) {
+      BTAS_ASSERT(C.rank() == 2);
+      BTAS_ASSERT(C.extent(0) == A.extent(0))
+      BTAS_ASSERT(C.extent(1) == A.extent(2))
+    } else {
+      C = _TensorC(A.extent(0), A.extent(2));
+      NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
+    }
+
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
+    storage_type& c_storage = C.storage();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+      for (ind_t j = 0; j < idx2; ++j) {
+        ord_t j_times_prank = j * pseudo_rank, j_times_rank = j * rank;
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+        const auto *B_ptr = B.data() + j_times_rank;
+        ord_t k_times_rank = 0;
+        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
+          for (ind_t r = 0; r < rank; ++r) {
+            // offset by i_times_rank so each i writes its own row of C
+            auto& val = c_storage[i_times_rank + k_times_rank + r];
+            val += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r)) + beta * val;
+          }
+        }
+      }
+    }
+  }
+
   // this computes \alpha A(i,j,r) * B(i,r) + \beta C(j,r) = C(j,r)
   template<
     typename _T,
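The core device in this patch is the `btas_tensor_plus` user-defined OpenMP reduction: `reduction(btas_tensor_plus : c_storage)` gives every thread a private, zero-initialized copy of C's storage, and the declared combiner merges the partial results element-wise when the parallel region ends. Below is a minimal, self-contained sketch of the same pattern using std::vector<double> in place of a BTAS storage type (the file name, the `vec_plus` identifier, and the build line are illustrative assumptions, not part of the patch):

    // vec_reduce.cpp -- user-defined OpenMP reduction over a container,
    // mirroring the btas_tensor_plus reduction declared in the patch above.
    // Assumed build: g++ -std=c++14 -fopenmp vec_reduce.cpp
    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main() {
      const int n = 8, trips = 100;
      std::vector<double> acc(n, 0.0);

      // combiner: element-wise sum of two partial vectors;
      // initializer: each thread starts from a zero vector of matching size
    #pragma omp declare reduction(vec_plus : std::vector<double> : \
        std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<double>())) \
        initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))

    #pragma omp parallel for reduction(vec_plus : acc) schedule(static)
      for (int t = 0; t < trips; ++t)
        for (int e = 0; e < n; ++e) acc[e] += 1.0;

      std::printf("acc[0] = %.1f (expect %d)\n", acc[0], trips);
      return 0;
    }

Note that with a reduction each thread's accumulator starts at zero, so the in-loop `beta * val` term in the kernels above only matches the serial update when beta is 0, which is how cp_als.h invokes them.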
From b5e6e13a7e02e30e6729f1cb063e894bac8ca66c Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Fri, 5 Aug 2022 16:16:07 -0400
Subject: [PATCH 2/3] Add `middle_contract_parallel` to `cp_als.h`

---
 btas/generic/cp_als.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 417165c6..8da208e4 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -818,7 +818,8 @@
       // over the middle dimension and sum over the rank.
       else if (contract_dim > n) {
-        middle_contract(1.0, temp, a, 0.0, contract_tensor);
+//        middle_contract(1.0, temp, a, 0.0, contract_tensor);
+        middle_contract_parallel(1.0, temp, a, 0.0, contract_tensor);
         temp = contract_tensor;
       }
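The threaded kernel is a drop-in replacement at the call site: both overloads take (alpha, A, B, beta, C) and allocate C when it arrives empty. A usage sketch on a free-standing tensor (extents and fill values are invented for illustration; `btas::Tensor<double>` is BTAS's dense tensor and its `fill` member is assumed):

    // contract A(i,j,r) with B(j,r) into C(i,r), accumulating over j
    #include <btas/btas.h>

    int main() {
      btas::Tensor<double> A(4, 3, 5), B(3, 5), C;  // C left empty on purpose
      A.fill(1.0);
      B.fill(2.0);
      // C(i,r) = 1.0 * sum_j A(i,j,r) * B(j,r) + 0.0 * C(i,r)
      btas::middle_contract_parallel(1.0, A, B, 0.0, C);
      return (C.extent(0) == 4 && C.extent(1) == 5) ? 0 : 1;  // C is 4 x 5
    }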
From 94f7b434a045d2b0ffe28a818c7f7ce81a0c400f Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Mon, 8 Aug 2022 09:52:12 -0400
Subject: [PATCH 3/3] Create `OMP_PARALLEL` definition to enable/disable OpenMP parallelization

---
 btas/generic/cp_als.h                |   3 +-
 btas/generic/element_wise_contract.h | 123 ++++++++++++++-------------
 2 files changed, 66 insertions(+), 60 deletions(-)

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 52bc1457..9e859841 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -818,8 +818,7 @@
       // over the middle dimension and sum over the rank.
       else if (contract_dim > n) {
-//        middle_contract(1.0, temp, a, 0.0, contract_tensor);
-        middle_contract_parallel(1.0, temp, a, 0.0, contract_tensor);
+        middle_contract(1.0, temp, a, 0.0, contract_tensor);
         temp = contract_tensor;
       }
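With the guard in place the public name reverts to `middle_contract`; whether the serial or the OpenMP body stands behind it is decided when the header is preprocessed. Condensed, the layout this patch produces in element_wise_contract.h looks like the following (a sketch, not the literal header; overriding with `-DOMP_PARALLEL=0` on the compile line is the assumed way to opt out):

    #ifndef OMP_PARALLEL
    #define OMP_PARALLEL 1  // default: build the OpenMP kernels
    #endif

    namespace btas {
    #if OMP_PARALLEL
      // OpenMP implementations of middle_contract and
      // middle_contract_with_pseudorank go here ...
    #else
      // ... and the original serial implementations here,
      // under the same names and signatures
    #endif
    }  // namespace btas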
diff --git a/btas/generic/element_wise_contract.h b/btas/generic/element_wise_contract.h
index 34d6dc69..2628b69f 100644
--- a/btas/generic/element_wise_contract.h
+++ b/btas/generic/element_wise_contract.h
@@ -1,11 +1,16 @@
 //
 // Created by Karl Pierce on 2/17/22.
 //
+#ifndef OMP_PARALLEL
+#define OMP_PARALLEL 1
+#endif  // OMP_PARALLEL
 
 #ifndef BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 #define BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 
 namespace btas{
+
+#if OMP_PARALLEL
   // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
   void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
-
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     BTAS_ASSERT(A.extent(2) == B.extent(1))
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
-    ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
-      auto *C_ptr = C.data() + i_times_rank;
+    storage_type& c_storage = C.storage();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_times_rank = i * rank, i_times_rank_idx2 = i * rank * idx2;
+      auto *C_ptr = c_storage.data() + i_times_rank;
       ord_t j_times_rank = 0;
       for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
         for (ind_t r = 0; r < rank; ++r) {
           *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
-      i_times_rank_idx2 += j_times_rank;
     }
   }
 
-  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
+  // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_parallel(const _T alpha, const _TensorA& A, const _TensorB& B, const _T beta, _TensorC& C) {
+  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
+                                       const _TensorB& B, _T beta, _TensorC& C){
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
     typedef typename _TensorA::storage_type storage_type;
     // user-defined reduction: element-wise sum of two storage objects; each
     // thread reduces into a zero-initialized private copy of C's storage
 #pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
     initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
-    BTAS_ASSERT(A.extent(2) == B.extent(1))
+    ind_t rank = B.extent(1),
+          idx3 = A.extent(2) / rank;
+    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
-    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
 
     // rows C(i, :) are independent, so the i loop is distributed over threads
     storage_type& c_storage = C.storage();
-    auto A_ptr = A.data(), B_ptr = B.data();
 #pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
     for (ind_t i = 0; i < idx1; ++i) {
-      ord_t i_offset_c = i * rank, i_offset_a = i * rank * idx2;
-      for (ind_t r = 0; r < rank; ++r) {
-        for (ind_t j = 0; j < idx2; ++j) {
-          auto& val = c_storage[i_offset_c + r];
-          val += alpha * (*(A_ptr + i_offset_a + j * rank + r) * *(B_ptr + j * rank + r)) + beta * val;
+      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+      auto *C_ptr = c_storage.data() + i_times_rank;
+      ord_t j_times_prank = 0, j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+        const auto *B_ptr = B.data() + j_times_rank;
+        ord_t k_times_rank = 0;
+        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
+          for (ind_t r = 0; r < rank; ++r) {
+            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
+                                           + beta * *(C_ptr + k_times_rank + r);
+          }
         }
       }
     }
   }
 
+#else
+  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
-                                       const _TensorB& B, _T beta, _TensorC& C){
+  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
 
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
-    ind_t rank = B.extent(1),
-          idx3 = A.extent(2) / rank;
-    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
+    BTAS_ASSERT(A.extent(2) == B.extent(1))
 
-    if(!C.empty()){
+    if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
-    } else{
+    } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
-    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; ++i, i_times_rank += pseudo_rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       auto *C_ptr = C.data() + i_times_rank;
-      ord_t j_times_prank = 0, j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
-        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+      ord_t j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
-        ord_t k_times_rank = 0;
-        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
-          for (ind_t r = 0; r < rank; ++r) {
-            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
-                                           + beta * *(C_ptr + k_times_rank + r);
-          }
+        for (ind_t r = 0; r < rank; ++r) {
+          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
-      i_times_rank_idx2 += j_times_prank;
+      i_times_rank_idx2 += j_times_rank;
     }
   }
 
   // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_with_pseudorank_parallel(const _T alpha, const _TensorA & A,
-                                                const _TensorB& B, const _T beta, _TensorC& C){
+  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
+                                       const _TensorB& B, _T beta, _TensorC& C){
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
-    typedef typename _TensorA::storage_type storage_type;
-#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
-    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
 
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     ind_t rank = B.extent(1),
           idx3 = A.extent(2) / rank;
     BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
 
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
-    storage_type& c_storage = C.storage();
-#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
-    for (ind_t i = 0; i < idx1; ++i) {
-      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+    ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += pseudo_rank) {
       auto *C_ptr = C.data() + i_times_rank;
-      for (ind_t j = 0; j < idx2; ++j) {
-        ord_t j_times_prank = j * pseudo_rank, j_times_rank = j * rank;
+      ord_t j_times_prank = 0, j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
         const auto *B_ptr = B.data() + j_times_rank;
         ord_t k_times_rank = 0;
         for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
           for (ind_t r = 0; r < rank; ++r) {
-            // offset by i_times_rank so each i writes its own row of C
-            auto& val = c_storage[i_times_rank + k_times_rank + r];
-            val += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r)) + beta * val;
+            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
+                                           + beta * *(C_ptr + k_times_rank + r);
           }
         }
       }
+      i_times_rank_idx2 += j_times_prank;
     }
   }
+#endif  // OMP_PARALLEL
 
   // this computes \alpha A(i,j,r) * B(i,r) + \beta C(j,r) = C(j,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; i++, i_times_rank += rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       const auto *B_ptr = B.data() + i_times_rank;
       ord_t j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; j++, j_times_rank += rank) {
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         auto *C_ptr = C.data() + j_times_rank;
-        for (ind_t r = 0; r < rank; r++) {
+        for (ind_t r = 0; r < rank; ++r) {
           *(C_ptr + r) += *(B_ptr + r) * *(A_ptr + r);
         }
       }
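Whichever branch is compiled in, the numerics of `middle_contract` can be spot-checked against a naive loop nest. A small, self-contained harness (illustrative only, not part of the patch set; it assumes the dense `btas::Tensor<double>` container with `operator()` element access):

    // check_middle_contract.cpp -- compare btas::middle_contract with a
    // reference triple loop; building once with -fopenmp and once with
    // -DOMP_PARALLEL=0 should print an error near machine epsilon both times.
    #include <algorithm>
    #include <btas/btas.h>
    #include <cmath>
    #include <cstdio>
    #include <random>

    int main() {
      const int I = 6, J = 4, R = 5;
      btas::Tensor<double> A(I, J, R), B(J, R), C;
      std::mt19937 gen(42);
      std::uniform_real_distribution<double> dist(-1.0, 1.0);
      for (auto& x : A) x = dist(gen);
      for (auto& x : B) x = dist(gen);

      btas::middle_contract(1.0, A, B, 0.0, C);

      double max_err = 0.0;
      for (int i = 0; i < I; ++i)
        for (int r = 0; r < R; ++r) {
          double ref = 0.0;  // ref(i,r) = sum_j A(i,j,r) * B(j,r)
          for (int j = 0; j < J; ++j) ref += A(i, j, r) * B(j, r);
          max_err = std::max(max_err, std::fabs(C(i, r) - ref));
        }
      std::printf("max |C - ref| = %.3e\n", max_err);
      return max_err < 1e-12 ? 0 : 1;
    }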