LLVM Versions Tested: 18, 19, 20, current tip
Issue Summary
LLVM components incorrectly detect which CPU instructions are supported, causing them to emit illegal instructions and produce binaries that fault at runtime.
Detailed Description
The compiler infrastructure appears to rely on static CPU-model-to-feature mappings to determine the available instruction sets, instead of querying what the CPU and kernel actually report (via /proc/cpuinfo or the HWCAP auxiliary vector on AArch64). This causes particular problems on:
Armv9 CPUs in Qualcomm SoCs that do not implement SVE, despite the Armv9 specification requiring it
Potentially any system where the Linux kernel is not configured with CONFIG_ARM64_SVE=y
However, the problem is not necessarily limited to AArch64 or the cases above.
This affects instruction selection, code generation, and runtime dispatching across LLVM, including Clang, Flang, OpenMP, ORC JIT, etc.
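The kind of runtime query meant above can be illustrated with the HWCAP auxiliary vector on Linux/AArch64. The following sketch is not taken from LLVM's sources; it assumes glibc/bionic getauxval() and the HWCAP_SVE bit from <asm/hwcap.h>:

// Sketch: ask the kernel whether SVE may be used, instead of inferring it
// from the CPU model name. Assumes Linux/AArch64; illustrative only.
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <cstdio>

int main() {
  unsigned long hwcap = getauxval(AT_HWCAP);
#ifdef HWCAP_SVE
  bool HasSVE = (hwcap & HWCAP_SVE) != 0;
#else
  bool HasSVE = false; // headers predate SVE support
#endif
  std::printf("kernel reports SVE: %s\n", HasSVE ? "yes" : "no");
  return 0;
}

Either situation listed above (hardware without SVE, or a kernel built without CONFIG_ARM64_SVE) leaves the HWCAP bit clear, which is precisely the signal a static CPU-model mapping ignores.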
Reproduction Steps
Below is a relatively minimal test case using ORC JIT that demonstrates the issue. A Termux environment on an Android device with a Qualcomm SoC is likely the easiest target for reproduction. The issue can also be reproduced by compiling a vectorizable loop in C with Clang and the "-march=native" flag.
#include <iostream>
#include <vector>
#include <string>
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::orc;

// Diagnostic handler to suppress remarks
class SilenceRemarksHandler : public DiagnosticHandler {
public:
bool handleDiagnostics(const DiagnosticInfo &DI) override {
// Ignore remarks, pass through other diagnostics
if (DI.getSeverity() == DS_Remark) {
return true;
}
return false;
}
};
std::unique_ptr<Module> createVectorModule(LLVMContext &Context) {
auto M = std::make_unique<Module>("VecTest", Context);
auto *FloatTy = Type::getFloatTy(Context);
auto *FloatPtrTy = PointerType::get(FloatTy, 0);
auto *Int32Ty = Type::getInt32Ty(Context);
FunctionType *FT = FunctionType::get(
Type::getVoidTy(Context),
{FloatPtrTy, FloatPtrTy, FloatPtrTy, Int32Ty},
false);
Function *F = Function::Create(FT, Function::ExternalLinkage, "vector_op", M.get());
F->addFnAttr(Attribute::NoUnwind);
auto Args = F->arg_begin();
Value *A = &*Args++;
Value *B = &*Args++;
Value *Result = &*Args++;
Value *Length = &*Args++;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *LoopBB = BasicBlock::Create(Context, "loop", F);
BasicBlock *ExitBB = BasicBlock::Create(Context, "exit", F);
IRBuilder<> Builder(Context);
Builder.SetInsertPoint(EntryBB);
Value *IndexAlloca = Builder.CreateAlloca(Int32Ty, nullptr, "i");
Builder.CreateStore(ConstantInt::get(Int32Ty, 0), IndexAlloca);
Builder.CreateBr(LoopBB);
Builder.SetInsertPoint(LoopBB);
Value *Index = Builder.CreateLoad(Int32Ty, IndexAlloca, "idx");
Value *APtr = Builder.CreateGEP(FloatTy, A, Index, "a_ptr");
Value *BPtr = Builder.CreateGEP(FloatTy, B, Index, "b_ptr");
Value *ResultPtr = Builder.CreateGEP(FloatTy, Result, Index, "result_ptr");
MDNode *AccessGroup = MDNode::get(Context, {});
Value *AVal = Builder.CreateLoad(FloatTy, APtr, "a_val");
Value *BVal = Builder.CreateLoad(FloatTy, BPtr, "b_val");
cast<Instruction>(AVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
cast<Instruction>(BVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
Value *Square = Builder.CreateFMul(AVal, AVal, "square");
Value *AddResult = Builder.CreateFAdd(Square, BVal, "add");
auto *StoreInst = Builder.CreateStore(AddResult, ResultPtr);
StoreInst->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
Value *NextIndex = Builder.CreateAdd(Index, ConstantInt::get(Int32Ty, 1), "next_idx");
Builder.CreateStore(NextIndex, IndexAlloca);
// Compute the latch condition from the incremented index so the body never
// touches element Length (checking the pre-increment index here would read
// and write one element past the end on the final iteration).
Value *LoopCond = Builder.CreateICmpSLT(NextIndex, Length, "cond");
// Loop metadata to force vectorization
MDNode *ForcedVec = MDNode::get(Context, {
MDString::get(Context, "llvm.loop.vectorize.enable"),
ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 1))
});
MDNode *LoopID = MDNode::get(Context, {MDNode::get(Context, {}), ForcedVec});
LoopID->replaceOperandWith(0, LoopID);
Builder.CreateCondBr(LoopCond, LoopBB, ExitBB)->setMetadata("llvm.loop", LoopID);
Builder.SetInsertPoint(ExitBB);
Builder.CreateRetVoid();
verifyFunction(*F);
return M;
}
// Apply optimization passes to force vectorization
void optimizeModule(Module &M, TargetMachine *TM) {
PassBuilder PB;
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
CGSCCAnalysisManager CGAM;
ModuleAnalysisManager MAM;
FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
PB.registerModuleAnalyses(MAM);
PB.registerCGSCCAnalyses(CGAM);
PB.registerFunctionAnalyses(FAM);
PB.registerLoopAnalyses(LAM);
PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(OptimizationLevel::O3);
MPM.run(M, MAM);
}
int main(int argc, char** argv) {
// Parse command line arguments
bool useNoSVE = false;
for (int i = 1; i < argc; i++) {
if (std::string(argv[i]) == "--use-nosve") {
useNoSVE = true;
}
}
InitializeNativeTarget();
InitializeNativeTargetAsmPrinter();
InitializeNativeTargetAsmParser();
// Silence remarks
LLVMContext Context;
Context.setDiagnosticHandler(std::make_unique<SilenceRemarksHandler>());
auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
JTMB.setCodeGenOptLevel(CodeGenOptLevel::Aggressive);
if (useNoSVE) {
JTMB.addFeatures(std::vector<std::string>{"-sve"});
}
std::unique_ptr<TargetMachine> TM(cantFail(JTMB.createTargetMachine()));
auto M = createVectorModule(Context);
M->setDataLayout(TM->createDataLayout());
// Apply optimization passes to ensure and force vectorization
optimizeModule(*M, TM.get());
// Set up the JIT-compiled function
auto JIT = cantFail(LLJITBuilder().setJITTargetMachineBuilder(std::move(JTMB)).create());
cantFail(JIT->addIRModule(ThreadSafeModule(std::move(M), std::make_unique<LLVMContext>())));
auto VecOpAddr = cantFail(JIT->lookup("vector_op"));
auto *VectorOp = (void(*)(float*, float*, float*, int))VecOpAddr.getValue();
const int Length = 1024;
std::vector<float> A(Length), B(Length), Result(Length);
for (int i = 0; i < Length; i++) {
A[i] = i;
B[i] = i * 2;
}
// Execute the JIT-compiled function
// It should fault with an illegal instruction on such devices
VectorOp(A.data(), B.data(), Result.data(), Length);
// Will only reach here if execution succeeds
std::cout << "Result[10]: " << Result[10] << std::endl;
return 0;
}
When executed normally, the program generates illegal instructions on hardware that meets the conditions described above. It also accepts a --use-nosve argument, which adds -sve to the JIT's feature list and should prevent the crash.
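For completeness, the non-JIT reproduction path mentioned earlier can be sketched as follows. The file name, function, and build line are illustrative assumptions, not taken from the report; building with -O3 -march=native on an affected device and running the binary should hit the same illegal-instruction fault:

// repro.cpp (illustrative): a trivially vectorizable loop mirroring the
// a*a + b pattern of the JIT test case above.
// Suggested build line (an assumption, not from the report):
//   clang++ -O3 -march=native repro.cpp -o repro && ./repro
#include <cstdio>
#include <vector>

void vector_op(const float *a, const float *b, float *result, int length) {
  for (int i = 0; i < length; i++)
    result[i] = a[i] * a[i] + b[i];
}

int main() {
  const int Length = 1024;
  std::vector<float> A(Length), B(Length), Result(Length);
  for (int i = 0; i < Length; i++) {
    A[i] = static_cast<float>(i);
    B[i] = static_cast<float>(i * 2);
  }
  // With -march=native, the auto-vectorized loop may contain SVE instructions
  // the CPU cannot execute.
  vector_op(A.data(), B.data(), Result.data(), Length);
  std::printf("Result[10]: %f\n", Result[10]);
  return 0;
}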
Additional Context
Attempting to work around this issue locally revealed frustrating inconsistencies in how CPU features are specified across different LLVM interfaces (illustrated after the list below):
-march=
-mcpu=
-Xclang -target-feature
llvm::orc::JITTargetMachineBuilder::addFeatures()
Each of these accepts a different set of feature flags with inconsistent naming conventions and limited documentation.
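As an illustration of the divergence, the same "no SVE" request is spelled differently depending on the interface. The command lines below are common forms and version-dependent assumptions rather than quotes from the report; the last line mirrors the JIT test case above:

// Driver arch-extension syntax (architecture string plus "+no<feature>"):
//   clang++ -O3 -march=armv8-a+nosve ...
// Driver CPU selection (features implied by the model name):
//   clang++ -O3 -mcpu=native ...
// cc1 backend feature names (signed "+"/"-" spellings, each arg via -Xclang):
//   clang++ -O3 -Xclang -target-feature -Xclang -sve ...
// ORC JIT takes the backend spelling directly:
JTMB.addFeatures(std::vector<std::string>{"-sve"});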
See #95694 for context. At the time, the discussion only mentioned crypto, but I guess sve/sve2 have a similar issue. The goal is for getHostCPUName+getHostCPUFeatures to accurately characterize what features are available, so stuff like the JIT just works.
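For anyone who wants to see what those two APIs report on their machine, a small diagnostic along these lines can help. This is a sketch only: it assumes the LLVM 19+ signature where sys::getHostCPUFeatures() returns a StringMap, whereas older releases fill a StringMap passed by reference:

// Print what LLVM's host detection believes about this machine. If the CPU
// name implies features (e.g. SVE) that the feature map or kernel does not
// actually provide, JIT clients relying on detectHost() can crash.
#include "llvm/TargetParser/Host.h"
#include <cstdio>

int main() {
  std::printf("host CPU name: %s\n", llvm::sys::getHostCPUName().str().c_str());
  for (const auto &F : llvm::sys::getHostCPUFeatures()) // LLVM 19+ API shape
    std::printf("  %c%s\n", F.second ? '+' : '-', F.first().str().c_str());
  return 0;
}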