From 0f8c298f2f12eb64f2088e2f974abf9923f5d871 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Fri, 5 Aug 2022 16:15:40 -0400
Subject: [PATCH 1/3] Start adding OpenMP threading to mixed-contraction kernels

---
 btas/generic/element_wise_contract.h | 137 +++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 9 deletions(-)

diff --git a/btas/generic/element_wise_contract.h b/btas/generic/element_wise_contract.h
index 9b162e89..34d6dc69 100644
--- a/btas/generic/element_wise_contract.h
+++ b/btas/generic/element_wise_contract.h
@@ -6,7 +6,6 @@
 #define BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 
 namespace btas{
-
   // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C){
+  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
@@ -35,32 +34,86 @@
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     BTAS_ASSERT(A.extent(2) == B.extent(1))
 
-    if(!C.empty()){
+    if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
-    } else{
+    } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; i++, i_times_rank += rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       auto *C_ptr = C.data() + i_times_rank;
       ord_t j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; j++, j_times_rank += rank) {
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
-        for (ind_t r = 0; r < rank; r++) {
-          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r))
-                          + beta * *(C_ptr + r);
+        for (ind_t r = 0; r < rank; ++r) {
+          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
       i_times_rank_idx2 += j_times_rank;
     }
   }
 
+  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
+  template<
+    typename _T,
+    class _TensorA, class _TensorB, class _TensorC,
+    class = typename std::enable_if<
+      is_boxtensor<_TensorA>::value &
+      is_boxtensor<_TensorB>::value &
+      is_boxtensor<_TensorC>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
+    >::type
+  >
+  void middle_contract_parallel(const _T alpha, const _TensorA& A, const _TensorB& B, const _T beta, _TensorC& C) {
+    static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
+                  boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
+                  "btas::middle_contract does not support mixed storage order");
+    static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
+                  "btas::middle_contract does not support non-major storage order");
+
+    typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+    // user-defined reduction: element-wise sum of two storage objects; each
+    // thread reduces into a zero-initialized private copy of C's storage
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+    using ind_t = typename _TensorA::range_type::index_type::value_type;
+    using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+    BTAS_ASSERT(A.rank() == 3)
+    BTAS_ASSERT(B.rank() == 2)
+    BTAS_ASSERT(A.extent(1) == B.extent(0))
+    BTAS_ASSERT(A.extent(2) == B.extent(1))
+
+    if (!C.empty()) {
+      BTAS_ASSERT(C.rank() == 2);
+      BTAS_ASSERT(C.extent(0) == A.extent(0))
+      BTAS_ASSERT(C.extent(1) == A.extent(2))
+    } else {
+      C = _TensorC(A.extent(0), A.extent(2));
+      NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
+    }
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
+
+    // rows C(i, :) are independent, so the i loop is distributed over threads
+    storage_type& c_storage = C.storage();
+    auto A_ptr = A.data(), B_ptr = B.data();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_offset_c = i * rank, i_offset_a = i * rank * idx2;
+      for (ind_t r = 0; r < rank; ++r) {
+        for (ind_t j = 0; j < idx2; ++j) {
+          auto& val = c_storage[i_offset_c + r];
+          val += alpha * (*(A_ptr + i_offset_a + j * rank + r) * *(B_ptr + j * rank + r)) + beta * val;
+        }
+      }
+    }
+  }
+
   // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
@@ -122,6 +175,72 @@
     }
   }
 
+  // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
+  template<
+    typename _T,
+    class _TensorA, class _TensorB, class _TensorC,
+    class = typename std::enable_if<
+      is_boxtensor<_TensorA>::value &
+      is_boxtensor<_TensorB>::value &
+      is_boxtensor<_TensorC>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
+      std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
+    >::type
+  >
+  void middle_contract_with_pseudorank_parallel(const _T alpha, const _TensorA & A,
+                                                const _TensorB& B, const _T beta, _TensorC& C){
+    static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
+                  boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
+                  "btas::middle_contract does not support mixed storage order");
+    static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
+                  "btas::middle_contract does not support non-major storage order");
+
+    typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
+    using ind_t = typename _TensorA::range_type::index_type::value_type;
+    using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+
+    BTAS_ASSERT(A.rank() == 3)
+    BTAS_ASSERT(B.rank() == 2)
+    BTAS_ASSERT(A.extent(1) == B.extent(0))
+    ind_t rank = B.extent(1),
+          idx3 = A.extent(2) / rank;
+    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
+
+    if (!C.empty()) {
+      BTAS_ASSERT(C.rank() == 2);
+      BTAS_ASSERT(C.extent(0) == A.extent(0))
+      BTAS_ASSERT(C.extent(1) == A.extent(2))
+    } else {
+      C = _TensorC(A.extent(0), A.extent(2));
+      NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
+    }
+
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
+    storage_type& c_storage = C.storage();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+      for (ind_t j = 0; j < idx2; ++j) {
+        ord_t j_times_prank = j * pseudo_rank, j_times_rank = j * rank;
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+        const auto *B_ptr = B.data() + j_times_rank;
+        ord_t k_times_rank = 0;
+        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
+          for (ind_t r = 0; r < rank; ++r) {
+            // offset by i_times_rank so each i writes its own row of C
+            auto& val = c_storage[i_times_rank + k_times_rank + r];
+            val += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r)) + beta * val;
+          }
+        }
+      }
+    }
+  }
+
   // this computes \alpha A(i,j,r) * B(i,r) + \beta C(j,r) = C(j,r)
   template<
     typename _T,
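The core device in this patch is the `btas_tensor_plus` user-defined OpenMP reduction: `reduction(btas_tensor_plus : c_storage)` gives every thread a private, zero-initialized copy of C's storage, and the declared combiner merges the partial results element-wise when the parallel region ends. Below is a minimal, self-contained sketch of the same pattern using std::vector<double> in place of a BTAS storage type (the file name, the `vec_plus` identifier, and the build line are illustrative assumptions, not part of the patch):

    // vec_reduce.cpp -- user-defined OpenMP reduction over a container,
    // mirroring the btas_tensor_plus reduction declared in the patch above.
    // Assumed build: g++ -std=c++14 -fopenmp vec_reduce.cpp
    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main() {
      const int n = 8, trips = 100;
      std::vector<double> acc(n, 0.0);

      // combiner: element-wise sum of two partial vectors;
      // initializer: each thread starts from a zero vector of matching size
    #pragma omp declare reduction(vec_plus : std::vector<double> : \
        std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<double>())) \
        initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))

    #pragma omp parallel for reduction(vec_plus : acc) schedule(static)
      for (int t = 0; t < trips; ++t)
        for (int e = 0; e < n; ++e) acc[e] += 1.0;

      std::printf("acc[0] = %.1f (expect %d)\n", acc[0], trips);
      return 0;
    }

Note that with a reduction each thread's accumulator starts at zero, so the in-loop `beta * val` term in the kernels above only matches the serial update when beta is 0, which is how cp_als.h invokes them.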
From b5e6e13a7e02e30e6729f1cb063e894bac8ca66c Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Fri, 5 Aug 2022 16:16:07 -0400
Subject: [PATCH 2/3] Add `middle_contract_parallel` to `cp_als.h`

---
 btas/generic/cp_als.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 417165c6..8da208e4 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -818,7 +818,8 @@
       // over the middle dimension and sum over the rank.
       else if (contract_dim > n) {
-        middle_contract(1.0, temp, a, 0.0, contract_tensor);
+//        middle_contract(1.0, temp, a, 0.0, contract_tensor);
+        middle_contract_parallel(1.0, temp, a, 0.0, contract_tensor);
         temp = contract_tensor;
       }
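The threaded kernel is a drop-in replacement at the call site: both overloads take (alpha, A, B, beta, C) and allocate C when it arrives empty. A usage sketch on a free-standing tensor (extents and fill values are invented for illustration; `btas::Tensor<double>` is BTAS's dense tensor and its `fill` member is assumed):

    // contract A(i,j,r) with B(j,r) into C(i,r), accumulating over j
    #include <btas/btas.h>

    int main() {
      btas::Tensor<double> A(4, 3, 5), B(3, 5), C;  // C left empty on purpose
      A.fill(1.0);
      B.fill(2.0);
      // C(i,r) = 1.0 * sum_j A(i,j,r) * B(j,r) + 0.0 * C(i,r)
      btas::middle_contract_parallel(1.0, A, B, 0.0, C);
      return (C.extent(0) == 4 && C.extent(1) == 5) ? 0 : 1;  // C is 4 x 5
    }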
From 94f7b434a045d2b0ffe28a818c7f7ce81a0c400f Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Mon, 8 Aug 2022 09:52:12 -0400
Subject: [PATCH 3/3] Create `OMP_PARALLEL` definition to enable/disable OpenMP parallelization

---
 btas/generic/cp_als.h                |   3 +-
 btas/generic/element_wise_contract.h | 123 ++++++++++++++-------------
 2 files changed, 66 insertions(+), 60 deletions(-)

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 52bc1457..9e859841 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -818,8 +818,7 @@
       // over the middle dimension and sum over the rank.
       else if (contract_dim > n) {
-//        middle_contract(1.0, temp, a, 0.0, contract_tensor);
-        middle_contract_parallel(1.0, temp, a, 0.0, contract_tensor);
+        middle_contract(1.0, temp, a, 0.0, contract_tensor);
         temp = contract_tensor;
       }
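With the guard in place the public name reverts to `middle_contract`; whether the serial or the OpenMP body stands behind it is decided when the header is preprocessed. Condensed, the layout this patch produces in element_wise_contract.h looks like the following (a sketch, not the literal header; overriding with `-DOMP_PARALLEL=0` on the compile line is the assumed way to opt out):

    #ifndef OMP_PARALLEL
    #define OMP_PARALLEL 1  // default: build the OpenMP kernels
    #endif

    namespace btas {
    #if OMP_PARALLEL
      // OpenMP implementations of middle_contract and
      // middle_contract_with_pseudorank go here ...
    #else
      // ... and the original serial implementations here,
      // under the same names and signatures
    #endif
    }  // namespace btas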
diff --git a/btas/generic/element_wise_contract.h b/btas/generic/element_wise_contract.h
index 34d6dc69..2628b69f 100644
--- a/btas/generic/element_wise_contract.h
+++ b/btas/generic/element_wise_contract.h
@@ -1,11 +1,16 @@
 //
 // Created by Karl Pierce on 2/17/22.
 //
+#ifndef OMP_PARALLEL
+#define OMP_PARALLEL 1
+#endif  // OMP_PARALLEL
 
 #ifndef BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 #define BTAS_GENERIC_ELEMENT_WISE_CONTRACT_H
 
 namespace btas{
+
+#if OMP_PARALLEL
   // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
   void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
+    typedef typename _TensorA::storage_type storage_type;
+#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
+    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
-
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     BTAS_ASSERT(A.extent(2) == B.extent(1))
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
-    ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
-      auto *C_ptr = C.data() + i_times_rank;
+    storage_type& c_storage = C.storage();
+#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
+    for (ind_t i = 0; i < idx1; ++i) {
+      ord_t i_times_rank = i * rank, i_times_rank_idx2 = i * rank * idx2;
+      auto *C_ptr = c_storage.data() + i_times_rank;
       ord_t j_times_rank = 0;
       for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
         for (ind_t r = 0; r < rank; ++r) {
           *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
-      i_times_rank_idx2 += j_times_rank;
     }
   }
 
-  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
+  // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_parallel(const _T alpha, const _TensorA& A, const _TensorB& B, const _T beta, _TensorC& C) {
+  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
+                                       const _TensorB& B, _T beta, _TensorC& C){
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
     typedef typename _TensorA::storage_type storage_type;
     // user-defined reduction: element-wise sum of two storage objects; each
     // thread reduces into a zero-initialized private copy of C's storage
 #pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
     initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
+
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
-    BTAS_ASSERT(A.extent(2) == B.extent(1))
+    ind_t rank = B.extent(1),
+          idx3 = A.extent(2) / rank;
+    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
-    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
 
     // rows C(i, :) are independent, so the i loop is distributed over threads
     storage_type& c_storage = C.storage();
-    auto A_ptr = A.data(), B_ptr = B.data();
 #pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
     for (ind_t i = 0; i < idx1; ++i) {
-      ord_t i_offset_c = i * rank, i_offset_a = i * rank * idx2;
-      for (ind_t r = 0; r < rank; ++r) {
-        for (ind_t j = 0; j < idx2; ++j) {
-          auto& val = c_storage[i_offset_c + r];
-          val += alpha * (*(A_ptr + i_offset_a + j * rank + r) * *(B_ptr + j * rank + r)) + beta * val;
+      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+      auto *C_ptr = c_storage.data() + i_times_rank;
+      ord_t j_times_prank = 0, j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+        const auto *B_ptr = B.data() + j_times_rank;
+        ord_t k_times_rank = 0;
+        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
+          for (ind_t r = 0; r < rank; ++r) {
+            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
+                                           + beta * *(C_ptr + k_times_rank + r);
+          }
         }
       }
     }
   }
 
+#else
+  // This computes \alpha A(i, j, r) * B(j, r) + \beta C(i,r) = C(i,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
-                                       const _TensorB& B, _T beta, _TensorC& C){
+  void middle_contract(_T alpha, const _TensorA& A, const _TensorB& B, _T beta, _TensorC& C) {
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
 
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
-    ind_t rank = B.extent(1),
-          idx3 = A.extent(2) / rank;
-    BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
+    BTAS_ASSERT(A.extent(2) == B.extent(1))
 
-    if(!C.empty()){
+    if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
-    } else{
+    } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
-    ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
+    ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; ++i, i_times_rank += pseudo_rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       auto *C_ptr = C.data() + i_times_rank;
-      ord_t j_times_prank = 0, j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
-        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
+      ord_t j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
+        const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         const auto *B_ptr = B.data() + j_times_rank;
-        ord_t k_times_rank = 0;
-        for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
-          for (ind_t r = 0; r < rank; ++r) {
-            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
-                                           + beta * *(C_ptr + k_times_rank + r);
-          }
+        for (ind_t r = 0; r < rank; ++r) {
+          *(C_ptr + r) += alpha * (*(A_ptr + r) * *(B_ptr + r)) + beta * *(C_ptr + r);
         }
       }
-      i_times_rank_idx2 += j_times_prank;
+      i_times_rank_idx2 += j_times_rank;
     }
   }
 
   // this does the elementwise contraction \alpha A(i,j,k,r) * B(j, r) + \beta C(i,k,r) = C(i,k,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
-  void middle_contract_with_pseudorank_parallel(const _T alpha, const _TensorA & A,
-                                                const _TensorB& B, const _T beta, _TensorC& C){
+  void middle_contract_with_pseudorank(_T alpha, const _TensorA & A,
+                                       const _TensorB& B, _T beta, _TensorC& C){
     static_assert(boxtensor_storage_order<_TensorA>::value == boxtensor_storage_order<_TensorC>::value &&
                   boxtensor_storage_order<_TensorB>::value == boxtensor_storage_order<_TensorC>::value,
                   "btas::middle_contract does not support mixed storage order");
     static_assert(boxtensor_storage_order<_TensorC>::value != boxtensor_storage_order<_TensorC>::other,
                   "btas::middle_contract does not support non-major storage order");
 
     typedef typename _TensorA::value_type value_type;
-    typedef typename _TensorA::storage_type storage_type;
-#pragma omp declare reduction(btas_tensor_plus : storage_type : std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), omp_out.begin(), std::plus<value_type>())) \
-    initializer(omp_priv = decltype(omp_orig)(omp_orig.size()))
     using ind_t = typename _TensorA::range_type::index_type::value_type;
     using ord_t = typename range_traits<typename _TensorA::range_type>::ordinal_type;
 
     BTAS_ASSERT(A.rank() == 3)
     BTAS_ASSERT(B.rank() == 2)
     BTAS_ASSERT(A.extent(1) == B.extent(0))
     ind_t rank = B.extent(1),
           idx3 = A.extent(2) / rank;
     BTAS_ASSERT(A.extent(2) / idx3 == B.extent(1));
 
     if (!C.empty()) {
       BTAS_ASSERT(C.rank() == 2);
       BTAS_ASSERT(C.extent(0) == A.extent(0))
       BTAS_ASSERT(C.extent(1) == A.extent(2))
     } else {
       C = _TensorC(A.extent(0), A.extent(2));
       NumericType<value_type>::fill(std::begin(C), std::end(C), NumericType<value_type>::zero());
     }
 
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), pseudo_rank = A.extent(2);
-    storage_type& c_storage = C.storage();
-#pragma omp parallel for reduction(btas_tensor_plus : c_storage) schedule(static) default(shared)
-    for (ind_t i = 0; i < idx1; ++i) {
-      ord_t i_times_rank = i * pseudo_rank, i_times_rank_idx2 = i * pseudo_rank * idx2;
+    ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += pseudo_rank) {
       auto *C_ptr = C.data() + i_times_rank;
-      for (ind_t j = 0; j < idx2; ++j) {
-        ord_t j_times_prank = j * pseudo_rank, j_times_rank = j * rank;
+      ord_t j_times_prank = 0, j_times_rank = 0;
+      for (ind_t j = 0; j < idx2; ++j, j_times_prank += pseudo_rank, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_prank;
         const auto *B_ptr = B.data() + j_times_rank;
         ord_t k_times_rank = 0;
         for (ind_t k = 0; k < idx3; ++k, k_times_rank += rank) {
           for (ind_t r = 0; r < rank; ++r) {
-            // offset by i_times_rank so each i writes its own row of C
-            auto& val = c_storage[i_times_rank + k_times_rank + r];
-            val += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r)) + beta * val;
+            *(C_ptr + k_times_rank + r) += alpha * (*(A_ptr + k_times_rank + r) * *(B_ptr + r))
+                                           + beta * *(C_ptr + k_times_rank + r);
           }
         }
       }
+      i_times_rank_idx2 += j_times_prank;
     }
   }
+#endif  // OMP_PARALLEL
 
   // this computes \alpha A(i,j,r) * B(i,r) + \beta C(j,r) = C(j,r)
   template<
     typename _T,
     class _TensorA, class _TensorB, class _TensorC,
     class = typename std::enable_if<
       is_boxtensor<_TensorA>::value &
       is_boxtensor<_TensorB>::value &
       is_boxtensor<_TensorC>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorB::value_type>::value &
       std::is_same<typename _TensorA::value_type, typename _TensorC::value_type>::value
     >::type
   >
     ind_t idx1 = A.extent(0), idx2 = A.extent(1), rank = A.extent(2);
 
     ord_t i_times_rank = 0, i_times_rank_idx2 = 0;
-    for (ind_t i = 0; i < idx1; i++, i_times_rank += rank) {
+    for (ind_t i = 0; i < idx1; ++i, i_times_rank += rank) {
       const auto *B_ptr = B.data() + i_times_rank;
       ord_t j_times_rank = 0;
-      for (ind_t j = 0; j < idx2; j++, j_times_rank += rank) {
+      for (ind_t j = 0; j < idx2; ++j, j_times_rank += rank) {
         const auto *A_ptr = A.data() + i_times_rank_idx2 + j_times_rank;
         auto *C_ptr = C.data() + j_times_rank;
-        for (ind_t r = 0; r < rank; r++) {
+        for (ind_t r = 0; r < rank; ++r) {
           *(C_ptr + r) += *(B_ptr + r) * *(A_ptr + r);
         }
       }
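Whichever branch is compiled in, the numerics of `middle_contract` can be spot-checked against a naive loop nest. A small, self-contained harness (illustrative only, not part of the patch set; it assumes the dense `btas::Tensor<double>` container with `operator()` element access):

    // check_middle_contract.cpp -- compare btas::middle_contract with a
    // reference triple loop; building once with -fopenmp and once with
    // -DOMP_PARALLEL=0 should print an error near machine epsilon both times.
    #include <algorithm>
    #include <btas/btas.h>
    #include <cmath>
    #include <cstdio>
    #include <random>

    int main() {
      const int I = 6, J = 4, R = 5;
      btas::Tensor<double> A(I, J, R), B(J, R), C;
      std::mt19937 gen(42);
      std::uniform_real_distribution<double> dist(-1.0, 1.0);
      for (auto& x : A) x = dist(gen);
      for (auto& x : B) x = dist(gen);

      btas::middle_contract(1.0, A, B, 0.0, C);

      double max_err = 0.0;
      for (int i = 0; i < I; ++i)
        for (int r = 0; r < R; ++r) {
          double ref = 0.0;  // ref(i,r) = sum_j A(i,j,r) * B(j,r)
          for (int j = 0; j < J; ++j) ref += A(i, j, r) * B(j, r);
          max_err = std::max(max_err, std::fabs(C(i, r) - ref));
        }
      std::printf("max |C - ref| = %.3e\n", max_err);
      return max_err < 1e-12 ? 0 : 1;
    }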