diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3d6e251c..048b9190 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,10 +14,10 @@ Unreleased - Added cython compiler directive legacy_implicit_noexcept = True to fix performance regression with cython 3. - **Other changes:** - Refactored the pre-commit hooks to use ruff. +- Refactored CategoricalMatrix's transpose_matvec to be deterministic when using OpenMP. 3.1.13 - 2023-10-17 ------------------- diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp index c40f851c..d70960b5 100644 --- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp +++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp @@ -1,5 +1,5 @@ #include - +#include <%def name="transpose_matvec(dropfirst)"> template @@ -10,24 +10,29 @@ void _transpose_matvec_${dropfirst}( F* res, Int res_size ) { - #pragma omp parallel + int num_threads = omp_get_max_threads(); + std::vector all_res(num_threads * res_size, 0.0); + #pragma omp parallel shared(all_res) { - std::vector restemp(res_size, 0.0); - #pragma omp for + int tid = omp_get_thread_num(); + F* res_slice = &all_res[tid * res_size]; + #pragma omp for for (Py_ssize_t i = 0; i < n_rows; i++) { % if dropfirst == 'all_rows_drop_first': Py_ssize_t col_idx = indices[i] - 1; if (col_idx != -1) { - restemp[col_idx] += other[i]; + res_slice[col_idx] += other[i]; } % else: - restemp[indices[i]] += other[i]; + res_slice[indices[i]] += other[i]; % endif } - for (Py_ssize_t i = 0; i < res_size; i++) { - # pragma omp atomic - res[i] += restemp[i]; - } + #pragma omp for + for (Py_ssize_t i = 0; i < res_size; ++i) { + for (int tid = 0; tid < num_threads; ++tid) { + res[i] += all_res[tid * res_size + i]; + } + } } }