From 4ac3dc3513c81e7f7d8fc80ea727f54cf6e73844 Mon Sep 17 00:00:00 2001 From: Aditya Goel <48102515+adityagoel4512@users.noreply.github.com> Date: Wed, 28 Feb 2024 23:09:55 +0100 Subject: [PATCH] CategoricalMatrix A.Tb reproducibility. (#348) * transpose matmul categorical bit reproducibility * Remove reproducibility test * Drop redundant atomic * changelog --------- Co-authored-by: Marc-Antoine Schmidt --- CHANGELOG.rst | 2 +- src/tabmat/ext/cat_split_helpers-tmpl.cpp | 25 ++++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3d6e251c..048b9190 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,10 +14,10 @@ Unreleased - Added cython compiler directive legacy_implicit_noexcept = True to fix performance regression with cython 3. - **Other changes:** - Refactored the pre-commit hooks to use ruff. +- Refactored CategoricalMatrix's transpose_matvec to be deterministic when using OpenMP. 3.1.13 - 2023-10-17 ------------------- diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp index c40f851c..d70960b5 100644 --- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp +++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp @@ -1,5 +1,5 @@ #include - +#include <%def name="transpose_matvec(dropfirst)"> template @@ -10,24 +10,29 @@ void _transpose_matvec_${dropfirst}( F* res, Int res_size ) { - #pragma omp parallel + int num_threads = omp_get_max_threads(); + std::vector all_res(num_threads * res_size, 0.0); + #pragma omp parallel shared(all_res) { - std::vector restemp(res_size, 0.0); - #pragma omp for + int tid = omp_get_thread_num(); + F* res_slice = &all_res[tid * res_size]; + #pragma omp for for (Py_ssize_t i = 0; i < n_rows; i++) { % if dropfirst == 'all_rows_drop_first': Py_ssize_t col_idx = indices[i] - 1; if (col_idx != -1) { - restemp[col_idx] += other[i]; + res_slice[col_idx] += other[i]; } % else: - restemp[indices[i]] += other[i]; + res_slice[indices[i]] += other[i]; % endif } - for (Py_ssize_t i = 0; i < res_size; i++) { - # pragma omp atomic - res[i] += restemp[i]; - } + #pragma omp for + for (Py_ssize_t i = 0; i < res_size; ++i) { + for (int tid = 0; tid < num_threads; ++tid) { + res[i] += all_res[tid * res_size + i]; + } + } } }