Skip to content

Commit f395da4

Browse files
committed Feb 13, 2025
Merge remote-tracking branch 'origin/main' into abadams/zen4_natural_vector_size
2 parents 8030aab + f770beb commit f395da4

14 files changed

+126
-131
lines changed
 

‎CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ option(THREADS_PREFER_PTHREAD_FLAG "When enabled, prefer to use the -pthread fla
196196
find_package(Threads REQUIRED)
197197

198198
## LLVM
199-
find_package(Halide_LLVM 18...20 REQUIRED
199+
find_package(Halide_LLVM 18...99 REQUIRED # Use 99 to fake a minimum-only constraint
200200
COMPONENTS WebAssembly X86
201201
OPTIONAL_COMPONENTS AArch64 ARM Hexagon NVPTX PowerPC RISCV)
202202

‎src/CodeGen_LLVM.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,7 @@ Value *CodeGen_LLVM::register_destructor(llvm::Function *destructor_fn, Value *o
748748
IRBuilderBase::InsertPoint here = builder->saveIP();
749749
BasicBlock *dtors = get_destructor_block();
750750

751-
builder->SetInsertPoint(dtors->getFirstNonPHI());
751+
builder->SetInsertPoint(dtors->getFirstNonPHIIt());
752752

753753
PHINode *error_code = dyn_cast<PHINode>(dtors->begin());
754754
internal_assert(error_code) << "The destructor block is supposed to start with a phi node\n";

‎src/CodeGen_PTX_Dev.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
156156
}
157157
}
158158

159+
function->setCallingConv(llvm::CallingConv::PTX_Kernel);
160+
159161
// Make the initial basic block
160162
entry_block = BasicBlock::Create(*context, "entry", function);
161163
builder->SetInsertPoint(entry_block);

‎src/IROperator.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,6 @@ Expr lossless_cast(Type t,
496496
Expr a = lossless_cast(t, op->a, scope, cache);
497497
Expr b = lossless_cast(t, op->b, scope, cache);
498498
if (a.defined() && b.defined()) {
499-
debug(0) << a << " " << b << "\n";
500499
return Min::make(a, b);
501500
}
502501
} else if (const Max *op = e.as<Max>()) {

‎src/runtime/aarch64_cpu_features.cpp

+22-23
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,24 @@ extern "C" unsigned long getauxval(unsigned long type);
2121

2222
namespace {
2323

24-
void set_platform_features(CpuFeatures &features) {
24+
void set_platform_features(CpuFeatures *features) {
2525
unsigned long hwcaps = getauxval(AT_HWCAP);
2626
unsigned long hwcaps2 = getauxval(AT_HWCAP2);
2727

2828
if (hwcaps & HWCAP_ASIMDDP) {
29-
features.set_available(halide_target_feature_arm_dot_prod);
29+
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
3030
}
3131

3232
if (hwcaps & HWCAP_ASIMDHP) {
33-
features.set_available(halide_target_feature_arm_fp16);
33+
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
3434
}
3535

3636
if (hwcaps & HWCAP_SVE) {
37-
features.set_available(halide_target_feature_sve);
37+
halide_set_available_cpu_feature(features, halide_target_feature_sve);
3838
}
3939

4040
if (hwcaps2 & HWCAP2_SVE2) {
41-
features.set_available(halide_target_feature_sve2);
41+
halide_set_available_cpu_feature(features, halide_target_feature_sve2);
4242
}
4343
}
4444

@@ -56,13 +56,13 @@ bool sysctl_is_set(const char *name) {
5656
return sysctlbyname(name, &enabled, &enabled_len, nullptr, 0) == 0 && enabled;
5757
}
5858

59-
void set_platform_features(CpuFeatures &features) {
59+
void set_platform_features(CpuFeatures *features) {
6060
if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
61-
features.set_available(halide_target_feature_arm_dot_prod);
61+
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
6262
}
6363

6464
if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
65-
features.set_available(halide_target_feature_arm_fp16);
65+
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
6666
}
6767
}
6868

@@ -84,20 +84,20 @@ extern "C" BOOL IsProcessorFeaturePresent(DWORD feature);
8484

8585
namespace {
8686

87-
void set_platform_features(CpuFeatures &features) {
87+
void set_platform_features(CpuFeatures *features) {
8888
// This is the strategy used by Google's cpuinfo library for
8989
// detecting fp16 arithmetic support on Windows.
9090
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
9191
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
92-
features.set_available(halide_target_feature_arm_fp16);
92+
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
9393
}
9494

9595
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
96-
features.set_available(halide_target_feature_arm_dot_prod);
96+
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
9797
}
9898

9999
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
100-
features.set_available(halide_target_feature_sve);
100+
halide_set_available_cpu_feature(features, halide_target_feature_sve);
101101
}
102102
}
103103

@@ -107,28 +107,27 @@ void set_platform_features(CpuFeatures &features) {
107107

108108
namespace {
109109

110-
void set_platform_features(CpuFeatures &) {
110+
void set_platform_features(CpuFeatures *) {
111111
}
112112

113113
} // namespace
114114

115115
#endif
116116

117-
WEAK CpuFeatures halide_get_cpu_features() {
118-
CpuFeatures features;
119-
features.set_known(halide_target_feature_arm_dot_prod);
120-
features.set_known(halide_target_feature_arm_fp16);
121-
features.set_known(halide_target_feature_armv7s);
122-
features.set_known(halide_target_feature_no_neon);
123-
features.set_known(halide_target_feature_sve);
124-
features.set_known(halide_target_feature_sve2);
117+
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
118+
halide_set_known_cpu_feature(features, halide_target_feature_arm_dot_prod);
119+
halide_set_known_cpu_feature(features, halide_target_feature_arm_fp16);
120+
halide_set_known_cpu_feature(features, halide_target_feature_armv7s);
121+
halide_set_known_cpu_feature(features, halide_target_feature_no_neon);
122+
halide_set_known_cpu_feature(features, halide_target_feature_sve);
123+
halide_set_known_cpu_feature(features, halide_target_feature_sve2);
125124

126125
// All ARM architectures support "No Neon".
127-
features.set_available(halide_target_feature_no_neon);
126+
halide_set_available_cpu_feature(features, halide_target_feature_no_neon);
128127

129128
set_platform_features(features);
130129

131-
return features;
130+
return halide_error_code_success;
132131
}
133132

134133
} // namespace Internal

‎src/runtime/arm_cpu_features.cpp

+17-18
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ extern "C" unsigned long getauxval(unsigned long type);
1818

1919
namespace {
2020

21-
void set_platform_features(CpuFeatures &features) {
21+
void set_platform_features(CpuFeatures *features) {
2222
unsigned long hwcaps = getauxval(AT_HWCAP);
2323

2424
if (hwcaps & HWCAP_ASIMDDP) {
25-
features.set_available(halide_target_feature_arm_dot_prod);
25+
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
2626
}
2727

2828
if (hwcaps & HWCAP_ASIMDHP) {
29-
features.set_available(halide_target_feature_arm_fp16);
29+
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
3030
}
3131
}
3232

@@ -68,17 +68,17 @@ bool is_armv7s() {
6868
return type == CPU_TYPE_ARM && subtype == CPU_SUBTYPE_ARM_V7S;
6969
}
7070

71-
void set_platform_features(CpuFeatures &features) {
71+
void set_platform_features(CpuFeatures *features) {
7272
if (is_armv7s()) {
73-
features.set_available(halide_target_feature_armv7s);
73+
halide_set_available_cpu_feature(features, halide_target_feature_armv7s);
7474
}
7575

7676
if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
77-
features.set_available(halide_target_feature_arm_dot_prod);
77+
halide_set_available_cpu_feature(features, halide_target_feature_arm_dot_prod);
7878
}
7979

8080
if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
81-
features.set_available(halide_target_feature_arm_fp16);
81+
halide_set_available_cpu_feature(features, halide_target_feature_arm_fp16);
8282
}
8383
}
8484

@@ -88,28 +88,27 @@ void set_platform_features(CpuFeatures &features) {
8888

8989
namespace {
9090

91-
void set_platform_features(CpuFeatures &) {
91+
void set_platform_features(CpuFeatures *) {
9292
}
9393

9494
} // namespace
9595

9696
#endif
9797

98-
WEAK CpuFeatures halide_get_cpu_features() {
99-
CpuFeatures features;
100-
features.set_known(halide_target_feature_arm_dot_prod);
101-
features.set_known(halide_target_feature_arm_fp16);
102-
features.set_known(halide_target_feature_armv7s);
103-
features.set_known(halide_target_feature_no_neon);
104-
features.set_known(halide_target_feature_sve);
105-
features.set_known(halide_target_feature_sve2);
98+
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
99+
halide_set_known_cpu_feature(features, halide_target_feature_arm_dot_prod);
100+
halide_set_known_cpu_feature(features, halide_target_feature_arm_fp16);
101+
halide_set_known_cpu_feature(features, halide_target_feature_armv7s);
102+
halide_set_known_cpu_feature(features, halide_target_feature_no_neon);
103+
halide_set_known_cpu_feature(features, halide_target_feature_sve);
104+
halide_set_known_cpu_feature(features, halide_target_feature_sve2);
106105

107106
// All ARM architectures support "No Neon".
108-
features.set_available(halide_target_feature_no_neon);
107+
halide_set_available_cpu_feature(features, halide_target_feature_no_neon);
109108

110109
set_platform_features(features);
111110

112-
return features;
111+
return halide_error_code_success;
113112
}
114113

115114
} // namespace Internal

‎src/runtime/can_use_target.cpp

+6-4
Original file line numberDiff line numberDiff line change
@@ -40,21 +40,23 @@ WEAK int halide_default_can_use_target_features(int count, const uint64_t *featu
4040

4141
static_assert(sizeof(halide_cpu_features_storage) == sizeof(CpuFeatures), "CpuFeatures Mismatch");
4242
if (!halide_cpu_features_initialized) {
43-
CpuFeatures tmp = halide_get_cpu_features();
43+
CpuFeatures tmp;
44+
int error = halide_get_cpu_features(&tmp);
45+
halide_abort_if_false(nullptr, error == halide_error_code_success);
4446
memcpy(&halide_cpu_features_storage, &tmp, sizeof(tmp));
4547
halide_cpu_features_initialized = true;
4648
}
4749
}
4850

49-
if (count != CpuFeatures::kWordCount) {
51+
if (count != cpu_feature_mask_size) {
5052
// This should not happen unless our runtime is out of sync with the rest of libHalide.
5153
#ifdef DEBUG_RUNTIME
52-
debug(nullptr) << "count " << count << " CpuFeatures::kWordCount " << CpuFeatures::kWordCount << "\n";
54+
debug(nullptr) << "count " << count << " cpu_feature_mask_size " << cpu_feature_mask_size << "\n";
5355
#endif
5456
halide_error(nullptr, "Internal error: wrong structure size passed to halide_can_use_target_features()\n");
5557
}
5658
const CpuFeatures *cpu_features = reinterpret_cast<const CpuFeatures *>(&halide_cpu_features_storage[0]);
57-
for (int i = 0; i < CpuFeatures::kWordCount; ++i) {
59+
for (int i = 0; i < cpu_feature_mask_size; ++i) {
5860
uint64_t m;
5961
if ((m = (features[i] & cpu_features->known[i])) != 0) {
6062
if ((m & cpu_features->available[i]) != m) {

‎src/runtime/cpu_features.h

+26-31
Original file line numberDiff line numberDiff line change
@@ -8,42 +8,37 @@ namespace Halide {
88
namespace Runtime {
99
namespace Internal {
1010

11-
// Return two masks:
11+
// Size of CPU feature mask large enough to cover all Halide target features
12+
static constexpr int cpu_feature_mask_size = (halide_target_feature_end + 63) / (sizeof(uint64_t) * 8);
13+
14+
// Contains two masks:
1215
// One with all the CPU-specific features that might possibly be available on this architecture ('known'),
1316
// and one with the subset that are actually present ('available').
1417
struct CpuFeatures {
15-
static const int kWordCount = (halide_target_feature_end + 63) / (sizeof(uint64_t) * 8);
16-
17-
ALWAYS_INLINE void set_known(int i) {
18-
known[i >> 6] |= ((uint64_t)1) << (i & 63);
19-
}
20-
21-
ALWAYS_INLINE void set_available(int i) {
22-
available[i >> 6] |= ((uint64_t)1) << (i & 63);
23-
}
24-
25-
ALWAYS_INLINE bool test_known(int i) const {
26-
return (known[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
27-
}
28-
29-
ALWAYS_INLINE bool test_available(int i) const {
30-
return (available[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
31-
}
32-
33-
ALWAYS_INLINE
34-
CpuFeatures() {
35-
for (int i = 0; i < kWordCount; ++i) {
36-
known[i] = 0;
37-
available[i] = 0;
38-
}
39-
}
40-
41-
uint64_t known[kWordCount]; // mask of the CPU features we know how to detect
42-
uint64_t available[kWordCount]; // mask of the CPU features that are available
43-
// (always a subset of 'known')
18+
uint64_t known[cpu_feature_mask_size] = {0}; // mask of the CPU features we know how to detect
19+
uint64_t available[cpu_feature_mask_size] = {0}; // mask of the CPU features that are available
20+
// (always a subset of 'known')
4421
};
4522

46-
extern WEAK CpuFeatures halide_get_cpu_features();
23+
ALWAYS_INLINE void halide_set_known_cpu_feature(CpuFeatures *features, int i) {
24+
features->known[i >> 6] |= ((uint64_t)1) << (i & 63);
25+
}
26+
27+
ALWAYS_INLINE void halide_set_available_cpu_feature(CpuFeatures *features, int i) {
28+
features->available[i >> 6] |= ((uint64_t)1) << (i & 63);
29+
}
30+
31+
ALWAYS_INLINE bool halide_test_known_cpu_feature(CpuFeatures *features, int i) {
32+
return (features->known[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
33+
}
34+
35+
ALWAYS_INLINE bool halide_test_available_cpu_feature(CpuFeatures *features, int i) {
36+
return (features->available[i >> 6] & ((uint64_t)1) << (i & 63)) != 0;
37+
}
38+
39+
// NOTE: This method is not part of the public API, but we push it into extern "C" to
40+
// avoid name mangling mismatches between platforms. See: https://github.com/halide/Halide/issues/8565
41+
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features);
4742

4843
} // namespace Internal
4944
} // namespace Runtime

‎src/runtime/hexagon_cpu_features.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ namespace Halide {
55
namespace Runtime {
66
namespace Internal {
77

8-
WEAK CpuFeatures halide_get_cpu_features() {
8+
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
99
// Hexagon has no CPU-specific Features.
10-
return CpuFeatures();
10+
return halide_error_code_success;
1111
}
1212

1313
} // namespace Internal

‎src/runtime/powerpc_cpu_features.cpp

+10-8
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,29 @@
88

99
#define PPC_FEATURE2_ARCH_2_07 0x80000000
1010

11-
extern "C" unsigned long int getauxval(unsigned long int);
11+
extern "C" {
12+
13+
unsigned long int getauxval(unsigned long int);
14+
}
1215

1316
namespace Halide {
1417
namespace Runtime {
1518
namespace Internal {
1619

17-
WEAK CpuFeatures halide_get_cpu_features() {
18-
CpuFeatures features;
19-
features.set_known(halide_target_feature_vsx);
20-
features.set_known(halide_target_feature_power_arch_2_07);
20+
extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) {
21+
halide_set_known_cpu_feature(features, halide_target_feature_vsx);
22+
halide_set_known_cpu_feature(features, halide_target_feature_power_arch_2_07);
2123

2224
const unsigned long hwcap = getauxval(AT_HWCAP);
2325
const unsigned long hwcap2 = getauxval(AT_HWCAP2);
2426

2527
if (hwcap & PPC_FEATURE_HAS_VSX) {
26-
features.set_available(halide_target_feature_vsx);
28+
halide_set_available_cpu_feature(features, halide_target_feature_vsx);
2729
}
2830
if (hwcap2 & PPC_FEATURE2_ARCH_2_07) {
29-
features.set_available(halide_target_feature_power_arch_2_07);
31+
halide_set_available_cpu_feature(features, halide_target_feature_power_arch_2_07);
3032
}
31-
return features;
33+
return halide_error_code_success;
3234
}
3335

3436
} // namespace Internal

‎src/runtime/riscv_cpu_features.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ namespace Halide {
55
namespace Runtime {
66
namespace Internal {
77

8-
WEAK CpuFeatures halide_get_cpu_features() {
8+
extern "C" WEAK int halide_get_cpu_features(Halide::Runtime::Internal::CpuFeatures *features) {
99
// For now, no version specific features, though RISCV promises to have many.
10-
return CpuFeatures();
10+
return halide_error_code_success;
1111
}
1212

1313
} // namespace Internal

0 commit comments

Comments
 (0)
Please sign in to comment.