diff --git a/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml b/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml new file mode 100644 index 00000000..1c3d120f --- /dev/null +++ b/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml @@ -0,0 +1,16 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +c_compiler: +- gcc +c_compiler_version: +- '12' +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +numpy: +- '1.21' +python: +- 3.10.* *_cpython +target_platform: +- linux-aarch64 diff --git a/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml b/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml new file mode 100644 index 00000000..d1e7b0f7 --- /dev/null +++ b/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml @@ -0,0 +1,14 @@ +c_compiler: +- gcc +c_compiler_version: +- '12' +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +numpy: +- '1.21' +python: +- 3.10.* *_cpython +target_platform: +- linux-ppc64le diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 604e6244..0ee0c664 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,6 +48,8 @@ jobs: matrix: CONDA_BUILD_YML: - linux_64_python3.7_default.____cpython + - linux_aarch64_python3.10_default.____cpython + - linux_ppc64le_python3.10_default.____cpython steps: - name: Pull image run: docker pull condaforge/mambaforge:latest diff --git a/.github/workflows/conda-build.sh b/.github/workflows/conda-build.sh index 71ec92f4..c6c6d28b 100755 --- a/.github/workflows/conda-build.sh +++ b/.github/workflows/conda-build.sh @@ -7,5 +7,10 @@ export CONDA_BUILD_YML=$1 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source ${SCRIPT_DIR}/base.sh $* conda activate base -mamba install -y conda-build -conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe +mamba install -y boa + +if grep -q "linux-aarch64\|linux-ppc64le" .ci_support/${CONDA_BUILD_YML}.yaml; then + CONDA_BUILD_ARGS="${CONDA_BUILD_ARGS:-} --no-test" +fi + +conda mambabuild ${CONDA_BUILD_ARGS:-} -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a0fd5c5b..0ae862e7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Changelog ========= +unreleased +---------- + +**Other changes:** + +- Support building on architectures that are unsupported by xsimd. + 3.1.8 - 2023-06-13 ------------------ diff --git a/src/tabmat/ext/dense_helpers-tmpl.cpp b/src/tabmat/ext/dense_helpers-tmpl.cpp index 00af6375..430d028d 100644 --- a/src/tabmat/ext/dense_helpers-tmpl.cpp +++ b/src/tabmat/ext/dense_helpers-tmpl.cpp @@ -45,7 +45,11 @@ namespace xs = xsimd; // setup simd accumulators % for ir in range(IBLOCK): % for jr in range(JBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto accumsimd${ir}_${jr} = (F)0.0; +#else auto accumsimd${ir}_${jr} = xs::XSIMD_BROADCAST(((F)0.0)); +#endif % endfor % endfor @@ -78,10 +82,18 @@ namespace xs = xsimd; % endfor ) { % for ir in range(IBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Xtd${ir} = *Lptr${ir}; +#else auto Xtd${ir} = xs::load_aligned(Lptr${ir}); +#endif % for jr in range(JBLOCK): { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Xsimd = *Rptr${jr}; +#else auto Xsimd = xs::load_aligned(Rptr${jr}); +#endif accumsimd${ir}_${jr} = xs::fma(Xtd${ir}, Xsimd, accumsimd${ir}_${jr}); } % endfor @@ -91,7 +103,11 @@ namespace xs = xsimd; // horizontal sum of the simd blocks % for ir in range(IBLOCK): % for jr in range(JBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + F accum${ir}_${jr} = accumsimd${ir}_${jr}; +#else F accum${ir}_${jr} = xs::XSIMD_REDUCE_ADD(accumsimd${ir}_${jr}); +#endif % endfor % endfor @@ -150,7 +166,11 @@ void dense_base${kparallel}(F* R, F* L, F* d, F* out, Py_ssize_t jmin2, Py_ssize_t jmax2, Py_ssize_t kmin, Py_ssize_t kmax, Int innerblock, Int kstep) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif for (Py_ssize_t imin = imin2; imin < imax2; imin+=innerblock) { Py_ssize_t imax = imin + innerblock; if (imax > imax2) { @@ -248,7 +268,11 @@ template void _dense${order}_sandwich(Int* rows, Int* cols, F* X, F* d, F* out, Int in_n, Int out_m, Int m, Int n, Int thresh1d, Int kratio, Int innerblock) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif constexpr auto alignment = simd_size * sizeof(F); bool kparallel = (in_n / (kratio*thresh1d)) > (out_m / thresh1d); @@ -292,7 +316,11 @@ template void _dense${order}_rmatvec(Int* rows, Int* cols, F* X, F* v, F* out, Int n_rows, Int n_cols, Int m, Int n) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif constexpr std::size_t alignment = simd_size * sizeof(F); auto outglobal = make_aligned_unique(omp_get_max_threads()*n_cols, alignment); diff --git a/src/tabmat/ext/sparse_helpers-tmpl.cpp b/src/tabmat/ext/sparse_helpers-tmpl.cpp index 9704d4cc..bdf535fd 100644 --- a/src/tabmat/ext/sparse_helpers-tmpl.cpp +++ b/src/tabmat/ext/sparse_helpers-tmpl.cpp @@ -30,7 +30,12 @@ void _csr_dense${order}_sandwich( Int nrows, Int nA_cols, Int nB_cols ) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr Int simd_size = 1; +#else constexpr Int simd_size = xsimd::simd_type::size; +#endif + constexpr auto alignment = simd_size*sizeof(F); int kblock = 128; @@ -95,15 +100,28 @@ void _csr_dense${order}_sandwich( } F Q = Adata[A_idx]; +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Qsimd = Q; +#else auto Qsimd = xs::XSIMD_BROADCAST(Q); +#endif Py_ssize_t Cj = Cjj; Py_ssize_t Cjmax2 = Cjj + ((Cjmax - Cjj) / simd_size) * simd_size; for (; Cj < Cjmax2; Cj+=simd_size) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Bsimd = R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]; + auto outsimd = outtemp.get()[Ci * nB_cols_rounded + Cj]; +#else auto Bsimd = xs::load_aligned(&R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]); auto outsimd = xs::load_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]); +#endif outsimd = xs::fma(Qsimd, Bsimd, outsimd); +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + outtemp.get()[Ci * nB_cols_rounded + Cj] = outsimd; +#else outsimd.store_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]); +#endif } for (; Cj < Cjmax; Cj++) {