From 4ac3dc3513c81e7f7d8fc80ea727f54cf6e73844 Mon Sep 17 00:00:00 2001
From: Aditya Goel <48102515+adityagoel4512@users.noreply.github.com>
Date: Wed, 28 Feb 2024 23:09:55 +0100
Subject: [PATCH] CategoricalMatrix A.Tb reproducibility. (#348)

* transpose matmul categorical bit reproducibility

* Remove reproducibility test

* Drop redundant atomic

* changelog

---------

Co-authored-by: Marc-Antoine Schmidt <marc-antoine.schmidt@quantco.com>
---
 CHANGELOG.rst                             |  2 +-
 src/tabmat/ext/cat_split_helpers-tmpl.cpp | 25 ++++++++++++++---------
 2 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3d6e251c..048b9190 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -14,10 +14,10 @@ Unreleased
 
 - Added cython compiler directive legacy_implicit_noexcept = True to fix performance regression with cython 3.
 
-
 **Other changes:**
 
 - Refactored the pre-commit hooks to use ruff.
+- Refactored CategoricalMatrix's transpose_matvec to be deterministic when using OpenMP.
 
 3.1.13 - 2023-10-17
 -------------------
diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp
index c40f851c..d70960b5 100644
--- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp
+++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp
@@ -1,5 +1,5 @@
 #include <vector>
-
+#include <omp.h>
 
 <%def name="transpose_matvec(dropfirst)">
 template <typename Int, typename F>
@@ -10,24 +10,29 @@ void _transpose_matvec_${dropfirst}(
     F* res,
     Int res_size
 ) {
-    #pragma omp parallel
+    int num_threads = omp_get_max_threads();
+    std::vector<F> all_res(num_threads * res_size, 0.0);
+    #pragma omp parallel shared(all_res) 
     {
-        std::vector<F> restemp(res_size, 0.0);
-        #pragma omp for
+	int tid = omp_get_thread_num();
+	F* res_slice = &all_res[tid * res_size]; 
+	#pragma omp for
         for (Py_ssize_t i = 0; i < n_rows; i++) {
             % if dropfirst == 'all_rows_drop_first':
                 Py_ssize_t col_idx = indices[i] - 1;
                 if (col_idx != -1) {
-                    restemp[col_idx] += other[i];
+                    res_slice[col_idx] += other[i];
                 }
             % else:
-                restemp[indices[i]] += other[i];
+  	        res_slice[indices[i]] += other[i];
             % endif
         }
-        for (Py_ssize_t i = 0; i < res_size; i++) {
-            # pragma omp atomic
-            res[i] += restemp[i];
-        }
+	#pragma omp for
+	for (Py_ssize_t i = 0; i < res_size; ++i) {
+	    for (int tid = 0; tid < num_threads; ++tid) {
+	        res[i] += all_res[tid * res_size + i];	    
+	    }
+	}	
     }
 }
 </%def>