Skip to content

Commit e2462cc

Browse files
authored
Add support for architectures not supported by xsimd (#262)
1 parent 0d3835d commit e2462cc

File tree

7 files changed: 92 additions and 2 deletions

.ci_support/linux_aarch64_python3.10_default.____cpython.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
BUILD:
2+
- aarch64-conda_cos7-linux-gnu
3+
c_compiler:
4+
- gcc
5+
c_compiler_version:
6+
- '12'
7+
cxx_compiler:
8+
- gxx
9+
cxx_compiler_version:
10+
- '12'
11+
numpy:
12+
- '1.21'
13+
python:
14+
- 3.10.* *_cpython
15+
target_platform:
16+
- linux-aarch64

.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
c_compiler:
2+
- gcc
3+
c_compiler_version:
4+
- '12'
5+
cxx_compiler:
6+
- gxx
7+
cxx_compiler_version:
8+
- '12'
9+
numpy:
10+
- '1.21'
11+
python:
12+
- 3.10.* *_cpython
13+
target_platform:
14+
- linux-ppc64le

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ jobs:
4848
matrix:
4949
CONDA_BUILD_YML:
5050
- linux_64_python3.7_default.____cpython
51+
- linux_aarch64_python3.10_default.____cpython
52+
- linux_ppc64le_python3.10_default.____cpython
5153
steps:
5254
- name: Pull image
5355
run: docker pull condaforge/mambaforge:latest

.github/workflows/conda-build.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,10 @@ export CONDA_BUILD_YML=$1
77
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
88
source ${SCRIPT_DIR}/base.sh $*
99
conda activate base
10-
mamba install -y conda-build
11-
conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe
10+
mamba install -y boa
11+
12+
if grep -q "linux-aarch64\|linux-ppc64le" .ci_support/${CONDA_BUILD_YML}.yaml; then
13+
CONDA_BUILD_ARGS="${CONDA_BUILD_ARGS:-} --no-test"
14+
fi
15+
16+
conda mambabuild ${CONDA_BUILD_ARGS:-} -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@
77
Changelog
88
=========
99

10+
unreleased
11+
----------
12+
13+
**Other changes:**
14+
15+
- Support building on architectures that are unsupported by xsimd.
16+
1017
3.1.8 - 2023-06-13
1118
------------------
1219

src/tabmat/ext/dense_helpers-tmpl.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,11 @@ namespace xs = xsimd;
4545
// setup simd accumulators
4646
% for ir in range(IBLOCK):
4747
% for jr in range(JBLOCK):
48+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
49+
auto accumsimd${ir}_${jr} = (F)0.0;
50+
#else
4851
auto accumsimd${ir}_${jr} = xs::XSIMD_BROADCAST(((F)0.0));
52+
#endif
4953
% endfor
5054
% endfor
5155

@@ -78,10 +82,18 @@ namespace xs = xsimd;
7882
% endfor
7983
) {
8084
% for ir in range(IBLOCK):
85+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
86+
auto Xtd${ir} = *Lptr${ir};
87+
#else
8188
auto Xtd${ir} = xs::load_aligned(Lptr${ir});
89+
#endif
8290
% for jr in range(JBLOCK):
8391
{
92+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
93+
auto Xsimd = *Rptr${jr};
94+
#else
8495
auto Xsimd = xs::load_aligned(Rptr${jr});
96+
#endif
8597
accumsimd${ir}_${jr} = xs::fma(Xtd${ir}, Xsimd, accumsimd${ir}_${jr});
8698
}
8799
% endfor
@@ -91,7 +103,11 @@ namespace xs = xsimd;
91103
// horizontal sum of the simd blocks
92104
% for ir in range(IBLOCK):
93105
% for jr in range(JBLOCK):
106+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
107+
F accum${ir}_${jr} = accumsimd${ir}_${jr};
108+
#else
94109
F accum${ir}_${jr} = xs::XSIMD_REDUCE_ADD(accumsimd${ir}_${jr});
110+
#endif
95111
% endfor
96112
% endfor
97113

@@ -150,7 +166,11 @@ void dense_base${kparallel}(F* R, F* L, F* d, F* out,
150166
Py_ssize_t jmin2, Py_ssize_t jmax2,
151167
Py_ssize_t kmin, Py_ssize_t kmax, Int innerblock, Int kstep)
152168
{
169+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
170+
constexpr std::size_t simd_size = 1;
171+
#else
153172
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
173+
#endif
154174
for (Py_ssize_t imin = imin2; imin < imax2; imin+=innerblock) {
155175
Py_ssize_t imax = imin + innerblock;
156176
if (imax > imax2) {
@@ -248,7 +268,11 @@ template <typename Int, typename F>
248268
void _dense${order}_sandwich(Int* rows, Int* cols, F* X, F* d, F* out,
249269
Int in_n, Int out_m, Int m, Int n, Int thresh1d, Int kratio, Int innerblock)
250270
{
271+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
272+
constexpr std::size_t simd_size = 1;
273+
#else
251274
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
275+
#endif
252276
constexpr auto alignment = simd_size * sizeof(F);
253277

254278
bool kparallel = (in_n / (kratio*thresh1d)) > (out_m / thresh1d);
@@ -292,7 +316,11 @@ template <typename Int, typename F>
292316
void _dense${order}_rmatvec(Int* rows, Int* cols, F* X, F* v, F* out,
293317
Int n_rows, Int n_cols, Int m, Int n)
294318
{
319+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
320+
constexpr std::size_t simd_size = 1;
321+
#else
295322
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
323+
#endif
296324
constexpr std::size_t alignment = simd_size * sizeof(F);
297325

298326
auto outglobal = make_aligned_unique<F>(omp_get_max_threads()*n_cols, alignment);

src/tabmat/ext/sparse_helpers-tmpl.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,12 @@ void _csr_dense${order}_sandwich(
3030
Int nrows, Int nA_cols, Int nB_cols
3131
)
3232
{
33+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
34+
constexpr Int simd_size = 1;
35+
#else
3336
constexpr Int simd_size = xsimd::simd_type<F>::size;
37+
#endif
38+
3439
constexpr auto alignment = simd_size*sizeof(F);
3540

3641
int kblock = 128;
@@ -95,15 +100,28 @@ void _csr_dense${order}_sandwich(
95100
}
96101

97102
F Q = Adata[A_idx];
103+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
104+
auto Qsimd = Q;
105+
#else
98106
auto Qsimd = xs::XSIMD_BROADCAST(Q);
107+
#endif
99108

100109
Py_ssize_t Cj = Cjj;
101110
Py_ssize_t Cjmax2 = Cjj + ((Cjmax - Cjj) / simd_size) * simd_size;
102111
for (; Cj < Cjmax2; Cj+=simd_size) {
112+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
113+
auto Bsimd = R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)];
114+
auto outsimd = outtemp.get()[Ci * nB_cols_rounded + Cj];
115+
#else
103116
auto Bsimd = xs::load_aligned(&R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]);
104117
auto outsimd = xs::load_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]);
118+
#endif
105119
outsimd = xs::fma(Qsimd, Bsimd, outsimd);
120+
#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE
121+
outtemp.get()[Ci * nB_cols_rounded + Cj] = outsimd;
122+
#else
106123
outsimd.store_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]);
124+
#endif
107125
}
108126

109127
for (; Cj < Cjmax; Cj++) {

0 commit comments

Comments
 (0)