Skip to content

Commit d0d77f0

Browse files
committed
config: add SVE detection alongside NEON in aarch64 op component
- Introduce AC_CACHE_CHECK probes for the ARM Scalable Vector Extension (SVE), using both a default compile test and a second test with __attribute__((__target__("+sve"))).
- Define the result variables op_cv_sve_support and op_cv_sve_add_flags.
- Update AM_CONDITIONAL and AC_DEFINE to expose SVE support macros (OMPI_MCA_OP_HAVE_SVE, OMPI_MCA_OP_SVE_EXTRA_FLAGS).
- Extend the final AS_IF to enable the component when either NEON or SVE is available.
- Add a preprocessor guard around SVE-specific function attributes.
- Encapsulate the +sve attribute behind OMPI_MCA_OP_SVE_EXTRA_FLAGS, ensuring that only builds which detected and enabled compiler SVE support will compile with SVE-targeted code paths.
- Simplify later code by using the SVE_ATTR macro in function declarations instead of repeating the attribute clause.
- Apply the SVE_ATTR macro in the C source for conditional +sve targeting.

Signed-off-by: Marco Vogel <[email protected]>
1 parent f6fe1d4 commit d0d77f0

File tree

4 files changed

+108
-57
lines changed

4 files changed

+108
-57
lines changed

ompi/mca/op/aarch64/configure.m4

+63-20
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
#
1414

1515
# MCA_ompi_op_arm_CONFIG([action-if-can-compile],
16-
# [action-if-cant-compile])
16+
# [action-if-cant-compile])
1717
# ------------------------------------------------
1818
AC_DEFUN([MCA_ompi_op_aarch64_CONFIG],[
1919
AC_CONFIG_FILES([ompi/mca/op/aarch64/Makefile])
20+
2021
case "${host}" in
2122
aarch64*|arm64*)
2223
op_aarch64_check="yes";;
@@ -71,36 +72,74 @@ AC_DEFUN([MCA_ompi_op_aarch64_CONFIG],[
7172
[op_cv_neon_fp_support=yes],
7273
[op_cv_neon_fp_support=no])])])
7374

74-
#
75+
76+
#
7577
# Check for SVE support
7678
#
77-
AC_CACHE_CHECK([for SVE support], op_cv_sve_support,
78-
[AS_IF([test "$op_cv_neon_support" = "yes"],
79-
[
80-
AC_LINK_IFELSE(
81-
[AC_LANG_PROGRAM([[
79+
AC_CACHE_CHECK([for SVE support], [op_cv_sve_support], [
80+
AC_MSG_RESULT([])
81+
# initialize result variables
82+
op_cv_sve_support=no
83+
op_cv_sve_add_flags=no
84+
85+
# first attempt: no extra flags
86+
AC_MSG_CHECKING([for SVE support (no additional flags)])
87+
AC_LINK_IFELSE(
88+
[AC_LANG_SOURCE([[
8289
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
83-
#include <arm_sve.h>
90+
#include <arm_sve.h>
8491
#else
85-
#error "No support for __aarch64__ or SVE"
92+
#error "No support for __aarch64__ or SVE"
8693
#endif
87-
]],
88-
[[
89-
#if defined(__aarch64__) && defined(_ARM_FEATURE_SVE)
90-
svfloat32_t vA;
91-
vA = svdup_n_f32(0)
94+
95+
int main(void) {
96+
svfloat32_t vA;
97+
vA = svdup_n_f32(0);
98+
return 0;
99+
}
100+
]])],
101+
[ op_cv_sve_support=yes
102+
AC_MSG_RESULT([yes]) ],
103+
[ AC_MSG_RESULT([no ]) ]
104+
)
105+
106+
# second attempt: use +sve attribute
107+
AS_IF([test "$op_cv_sve_support" = "no"],[
108+
AC_MSG_CHECKING([for SVE support (with +sve)])
109+
AC_LINK_IFELSE(
110+
[AC_LANG_SOURCE([[
111+
#if defined(__aarch64__)
112+
#include <arm_sve.h>
113+
#else
114+
#error "not on aarch64"
92115
#endif
93-
]])],
94-
[op_cv_sve_support=yes],
95-
[op_cv_sve_support=no])])])
96-
])
97116

117+
__attribute__((__target__("+sve")))
118+
int main(void) {
119+
svbool_t pg = svptrue_b32();
120+
svuint32_t a = svdup_u32(0);
121+
svuint32_t b = svdup_u32(0);
122+
svuint32_t c = svadd_u32_m(pg, a, b);
123+
return (int)svaddv_u32(pg, c);
124+
}
125+
]])],
126+
[ op_cv_sve_support=yes
127+
op_cv_sve_add_flags=yes
128+
AC_MSG_RESULT([yes]) ],
129+
[ AC_MSG_RESULT([no ]) ]
130+
)
131+
])
132+
])
133+
134+
AC_LANG_POP
135+
])
98136
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_neon_support],
99137
[test "$op_cv_neon_support" = "yes"])
100138
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_neon_fp_support],
101139
[test "$op_cv_neon_fp_support" = "yes"])
102140
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sve_support],
103141
[test "$op_cv_sve_support" = "yes"])
142+
104143
AC_SUBST(MCA_BUILD_ompi_op_has_neon_support)
105144
AC_SUBST(MCA_BUILD_ompi_op_has_neon_fp_support)
106145
AC_SUBST(MCA_BUILD_ompi_op_has_sve_support)
@@ -111,9 +150,13 @@ AC_DEFUN([MCA_ompi_op_aarch64_CONFIG],[
111150
[AC_DEFINE([OMPI_MCA_OP_HAVE_NEON_FP], [1],[NEON FP supported in the current build])])
112151
AS_IF([test "$op_cv_sve_support" = "yes"],
113152
[AC_DEFINE([OMPI_MCA_OP_HAVE_SVE], [1],[SVE supported in the current build])])
153+
AS_IF([test "$op_cv_sve_add_flags" = "yes"],
154+
[AC_DEFINE([OMPI_MCA_OP_SVE_EXTRA_FLAGS], [1],[SVE supported with additional compile attributes])])
114155

115-
# If we have at least support for Neon
116-
AS_IF([test "$op_cv_neon_support" = "yes"],
156+
157+
# If we have at least support for Neon or SVE
158+
AS_IF([test "$op_cv_neon_support" = "yes" || test "$op_cv_sve_support" = "yes" ],
117159
[$1],
118160
[$2])
161+
119162
])dnl

ompi/mca/op/aarch64/op_aarch64.h

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424

2525
BEGIN_C_DECLS
2626

27+
#if defined(OMPI_MCA_OP_SVE_EXTRA_FLAGS)
28+
#define SVE_ATTR __attribute__ ((__target__ ("+sve")))
29+
#else
30+
#define SVE_ATTR
31+
#endif
32+
2733
/**
2834
* Derive a struct from the base op component struct, allowing us to
2935
* cache some component-specific information on our well-known

ompi/mca/op/aarch64/op_aarch64_component.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ static int mca_op_aarch64_component_close(void)
101101
/*
102102
* Register MCA params.
103103
*/
104-
static int mca_op_aarch64_component_register(void)
104+
SVE_ATTR static int mca_op_aarch64_component_register(void)
105105
{
106106

107107
mca_op_aarch64_component.hardware_available = 1; /* Check for Neon */

ompi/mca/op/aarch64/op_aarch64_functions.c

+38-36
Original file line numberDiff line numberDiff line change
@@ -136,24 +136,25 @@ _Generic((*(out)), \
136136
} \
137137
}
138138
#elif defined(GENERATE_SVE_CODE)
139-
#define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op) \
139+
#define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op) \
140+
SVE_ATTR \
140141
static void OP_CONCAT(ompi_op_aarch64_2buff_##name##_##type##type_size##_t, APPEND) \
141-
(const void *_in, void *_out, int *count, \
142-
struct ompi_datatype_t **dtype, \
143-
struct ompi_op_base_module_1_0_0_t *module) \
144-
{ \
145-
const int types_per_step = svcnt(*((type##type_size##_t *) _in)); \
146-
const int cnt = *count; \
147-
type##type_size##_t *in = (type##type_size##_t *) _in, \
148-
*out = (type##type_size##_t *) _out; \
149-
OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
150-
for (int idx=0; idx < cnt; idx += types_per_step) { \
151-
svbool_t pred = svwhilelt_b##type_size(idx, cnt); \
152-
vsrc = svld1(pred, &in[idx]); \
153-
vdst = svld1(pred, &out[idx]); \
154-
vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
155-
OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
156-
} \
142+
(const void *_in, void *_out, int *count, \
143+
struct ompi_datatype_t **dtype, \
144+
struct ompi_op_base_module_1_0_0_t *module) \
145+
{ \
146+
const int types_per_step = svcnt(*((type##type_size##_t *) _in)); \
147+
const int cnt = *count; \
148+
type##type_size##_t *in = (type##type_size##_t *) _in, \
149+
*out = (type##type_size##_t *) _out; \
150+
OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
151+
for (int idx=0; idx < cnt; idx += types_per_step) { \
152+
svbool_t pred = svwhilelt_b##type_size(idx, cnt); \
153+
vsrc = svld1(pred, &in[idx]); \
154+
vdst = svld1(pred, &out[idx]); \
155+
vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
156+
OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
157+
} \
157158
}
158159
#endif
159160

@@ -302,25 +303,26 @@ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPE
302303
} \
303304
}
304305
#elif defined(GENERATE_SVE_CODE)
305-
#define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op) \
306-
static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND) \
307-
(const void *_in1, const void *_in2, void *_out, int *count, \
308-
struct ompi_datatype_t **dtype, \
309-
struct ompi_op_base_module_1_0_0_t *module) \
310-
{ \
311-
const int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \
312-
type##type_size##_t *in1 = (type##type_size##_t *) _in1, \
313-
*in2 = (type##type_size##_t *) _in2, \
314-
*out = (type##type_size##_t *) _out; \
315-
const int cnt = *count; \
316-
OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
317-
for (int idx=0; idx < cnt; idx += types_per_step) { \
318-
svbool_t pred = svwhilelt_b##type_size(idx, cnt); \
319-
vsrc = svld1(pred, &in1[idx]); \
320-
vdst = svld1(pred, &in2[idx]); \
321-
vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
322-
OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
323-
} \
306+
#define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op) \
307+
SVE_ATTR \
308+
static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND) \
309+
(const void *_in1, const void *_in2, void *_out, int *count, \
310+
struct ompi_datatype_t **dtype, \
311+
struct ompi_op_base_module_1_0_0_t *module) \
312+
{ \
313+
const int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \
314+
type##type_size##_t *in1 = (type##type_size##_t *) _in1, \
315+
*in2 = (type##type_size##_t *) _in2, \
316+
*out = (type##type_size##_t *) _out; \
317+
const int cnt = *count; \
318+
OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \
319+
for (int idx=0; idx < cnt; idx += types_per_step) { \
320+
svbool_t pred = svwhilelt_b##type_size(idx, cnt); \
321+
vsrc = svld1(pred, &in1[idx]); \
322+
vdst = svld1(pred, &in2[idx]); \
323+
vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \
324+
OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \
325+
} \
324326
}
325327
#endif /* defined(GENERATE_SVE_CODE) */
326328

0 commit comments

Comments
 (0)