Skip to content

Commit

Permalink
Add support for architectures not supported by xsimd (#262)
Browse files Browse the repository at this point in the history
  • Loading branch information
xhochy authored Jun 15, 2023
1 parent 0d3835d commit e2462cc
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 2 deletions.
16 changes: 16 additions & 0 deletions .ci_support/linux_aarch64_python3.10_default.____cpython.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# conda-build variant file for the linux-aarch64 CI job (CPython 3.10,
# numpy 1.21, gcc/gxx 12). The CI build script passes it to the build as
# `-m .ci_support/<name>.yaml`.
# NOTE(review): BUILD looks like the toolchain triplet for this target — confirm.
BUILD:
- aarch64-conda_cos7-linux-gnu
c_compiler:
- gcc
c_compiler_version:
- '12'
cxx_compiler:
- gxx
cxx_compiler_version:
- '12'
numpy:
- '1.21'
python:
- 3.10.* *_cpython
# The CI build script greps this file for this platform string and, when it is
# present, adds --no-test to the build arguments.
target_platform:
- linux-aarch64
14 changes: 14 additions & 0 deletions .ci_support/linux_ppc64le_python3.10_default.____cpython.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# conda-build variant file for the linux-ppc64le CI job (CPython 3.10,
# numpy 1.21, gcc/gxx 12). The CI build script passes it to the build as
# `-m .ci_support/<name>.yaml`.
c_compiler:
- gcc
c_compiler_version:
- '12'
cxx_compiler:
- gxx
cxx_compiler_version:
- '12'
numpy:
- '1.21'
python:
- 3.10.* *_cpython
# The CI build script greps this file for this platform string and, when it is
# present, adds --no-test to the build arguments.
target_platform:
- linux-ppc64le
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ jobs:
matrix:
CONDA_BUILD_YML:
- linux_64_python3.7_default.____cpython
- linux_aarch64_python3.10_default.____cpython
- linux_ppc64le_python3.10_default.____cpython
steps:
- name: Pull image
run: docker pull condaforge/mambaforge:latest
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/conda-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,10 @@ export CONDA_BUILD_YML=$1
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source ${SCRIPT_DIR}/base.sh $*
conda activate base
mamba install -y conda-build
conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe
mamba install -y boa

if grep -q "linux-aarch64\|linux-ppc64le" .ci_support/${CONDA_BUILD_YML}.yaml; then
CONDA_BUILD_ARGS="${CONDA_BUILD_ARGS:-} --no-test"
fi

conda mambabuild ${CONDA_BUILD_ARGS:-} -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
Changelog
=========

unreleased
----------

**Other changes:**

- Support building on architectures that are unsupported by xsimd.

3.1.8 - 2023-06-13
------------------

Expand Down
28 changes: 28 additions & 0 deletions src/tabmat/ext/dense_helpers-tmpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ namespace xs = xsimd;
// setup simd accumulators
% for ir in range(IBLOCK):
% for jr in range(JBLOCK):
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
auto accumsimd${ir}_${jr} = (F)0.0;
#else
auto accumsimd${ir}_${jr} = xs::XSIMD_BROADCAST(((F)0.0));
#endif
% endfor
% endfor

Expand Down Expand Up @@ -78,10 +82,18 @@ namespace xs = xsimd;
% endfor
) {
% for ir in range(IBLOCK):
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
auto Xtd${ir} = *Lptr${ir};
#else
auto Xtd${ir} = xs::load_aligned(Lptr${ir});
#endif
% for jr in range(JBLOCK):
{
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
auto Xsimd = *Rptr${jr};
#else
auto Xsimd = xs::load_aligned(Rptr${jr});
#endif
accumsimd${ir}_${jr} = xs::fma(Xtd${ir}, Xsimd, accumsimd${ir}_${jr});
}
% endfor
Expand All @@ -91,7 +103,11 @@ namespace xs = xsimd;
// horizontal sum of the simd blocks
% for ir in range(IBLOCK):
% for jr in range(JBLOCK):
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
F accum${ir}_${jr} = accumsimd${ir}_${jr};
#else
F accum${ir}_${jr} = xs::XSIMD_REDUCE_ADD(accumsimd${ir}_${jr});
#endif
% endfor
% endfor

Expand Down Expand Up @@ -150,7 +166,11 @@ void dense_base${kparallel}(F* R, F* L, F* d, F* out,
Py_ssize_t jmin2, Py_ssize_t jmax2,
Py_ssize_t kmin, Py_ssize_t kmax, Int innerblock, Int kstep)
{
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
constexpr std::size_t simd_size = 1;
#else
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
#endif
for (Py_ssize_t imin = imin2; imin < imax2; imin+=innerblock) {
Py_ssize_t imax = imin + innerblock;
if (imax > imax2) {
Expand Down Expand Up @@ -248,7 +268,11 @@ template <typename Int, typename F>
void _dense${order}_sandwich(Int* rows, Int* cols, F* X, F* d, F* out,
Int in_n, Int out_m, Int m, Int n, Int thresh1d, Int kratio, Int innerblock)
{
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
constexpr std::size_t simd_size = 1;
#else
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
#endif
constexpr auto alignment = simd_size * sizeof(F);

bool kparallel = (in_n / (kratio*thresh1d)) > (out_m / thresh1d);
Expand Down Expand Up @@ -292,7 +316,11 @@ template <typename Int, typename F>
void _dense${order}_rmatvec(Int* rows, Int* cols, F* X, F* v, F* out,
Int n_rows, Int n_cols, Int m, Int n)
{
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
constexpr std::size_t simd_size = 1;
#else
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
#endif
constexpr std::size_t alignment = simd_size * sizeof(F);

auto outglobal = make_aligned_unique<F>(omp_get_max_threads()*n_cols, alignment);
Expand Down
18 changes: 18 additions & 0 deletions src/tabmat/ext/sparse_helpers-tmpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ void _csr_dense${order}_sandwich(
Int nrows, Int nA_cols, Int nB_cols
)
{
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
constexpr Int simd_size = 1;
#else
constexpr Int simd_size = xsimd::simd_type<F>::size;
#endif

constexpr auto alignment = simd_size*sizeof(F);

int kblock = 128;
Expand Down Expand Up @@ -95,15 +100,28 @@ void _csr_dense${order}_sandwich(
}

F Q = Adata[A_idx];
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
auto Qsimd = Q;
#else
auto Qsimd = xs::XSIMD_BROADCAST(Q);
#endif

Py_ssize_t Cj = Cjj;
Py_ssize_t Cjmax2 = Cjj + ((Cjmax - Cjj) / simd_size) * simd_size;
for (; Cj < Cjmax2; Cj+=simd_size) {
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
auto Bsimd = R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)];
auto outsimd = outtemp.get()[Ci * nB_cols_rounded + Cj];
#else
auto Bsimd = xs::load_aligned(&R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]);
auto outsimd = xs::load_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]);
#endif
outsimd = xs::fma(Qsimd, Bsimd, outsimd);
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
outtemp.get()[Ci * nB_cols_rounded + Cj] = outsimd;
#else
outsimd.store_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]);
#endif
}

for (; Cj < Cjmax; Cj++) {
Expand Down

0 comments on commit e2462cc

Please sign in to comment.