
Commit d3aa05b

Committed by liuzhenya

[CIR] X86 vector fcmp-sse vector builtins

1 parent 36c1273

File tree

7 files changed: +220 -5 lines changed

clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h

Lines changed: 18 additions & 0 deletions
@@ -131,6 +131,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::IntType::get(getContext(), n, false);
   }

+  static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) {
+    if (auto intType = mlir::dyn_cast<cir::IntTypeInterface>(eltTy))
+      return intType.getWidth();
+    if (auto floatType = mlir::dyn_cast<cir::FPTypeInterface>(eltTy))
+      return floatType.getWidth();
+
+    llvm_unreachable("Wrong type passed in or Non-CIR type passed in");
+  }
   cir::IntType getSIntNTy(int n) {
     return cir::IntType::get(getContext(), n, true);
   }
@@ -575,6 +583,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs);
   }

+  cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind,
+                                 mlir::Value lhs, mlir::Value rhs) {
+    VectorType vecCast = mlir::cast<VectorType>(lhs.getType());
+    IntType integralTy =
+        getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType()));
+    VectorType integralVecTy =
+        VectorType::get(context, integralTy, vecCast.getSize());
+    return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs);
+  }
+
   mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) {
     return createCompare(loc, cir::CmpOpKind::ne, operand, operand);
   }
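
For orientation (not part of the diff): createVecCompare derives its result type from the left operand, producing a vector of signed integers whose element width matches the compared element type, so comparing two !cir.vector<4 x !cir.float> values yields a !cir.vector<4 x !s32i> mask. A minimal sketch of a hypothetical caller, assuming a builder, a location, and two float-vector values are already in scope:

// Hypothetical helper, for illustration only; not part of the commit.
// Emits an element-wise ordered less-than and returns the integer mask
// vector that createVecCompare derives (!cir.vector<4 x !s32i> here).
mlir::Value emitLtMask(cir::CIRBaseBuilderTy &builder, mlir::Location loc,
                       mlir::Value lhs, mlir::Value rhs) {
  mlir::Value mask = builder.createVecCompare(loc, cir::CmpOpKind::lt, lhs, rhs);
  return mask;
}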

clang/include/clang/CIR/MissingFeatures.h

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ struct MissingFeatures {
   static bool emitBranchThroughCleanup() { return false; }
   static bool emitCheckedInBoundsGEP() { return false; }
   static bool emitCondLikelihoodViaExpectIntrinsic() { return false; }
+  static bool emitConstrainedFPCall() { return false; }
   static bool emitLifetimeMarkers() { return false; }
   static bool emitLValueAlignmentAssumption() { return false; }
   static bool emitNullCheckForDeleteCalls() { return false; }

clang/lib/CIR/CodeGen/CIRGenBuilder.h

Lines changed: 13 additions & 0 deletions
@@ -27,13 +27,26 @@ namespace clang::CIRGen {

 class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   const CIRGenTypeCache &typeCache;
+  bool isFPConstrained = false;
   llvm::StringMap<unsigned> recordNames;
   llvm::StringMap<unsigned> globalsVersioning;

 public:
   CIRGenBuilderTy(mlir::MLIRContext &mlirContext, const CIRGenTypeCache &tc)
       : CIRBaseBuilderTy(mlirContext), typeCache(tc) {}

+  //
+  // Floating point specific helpers
+  // -------------------------------
+  //
+
+  /// Query for the use of constrained floating point math
+  bool getisFPConstrained() {
+    if (isFPConstrained)
+      llvm_unreachable("Constrained FP NYI");
+    return isFPConstrained;
+  }
+
   /// Get a cir::ConstArrayAttr for a string literal.
   /// Note: This is different from what is returned by
   /// mlir::Builder::getStringAttr() which is an mlir::StringAttr.

clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp

Lines changed: 21 additions & 0 deletions
@@ -625,6 +625,27 @@ CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID, const CallExpr *e,
                                       getTarget().getTriple().getArch());
 }

+// Handle immediate-constant (ICE) requirements for builtin args.
+// `iceArguments` is a bitmask: if bit `idx` is set, arg `idx` must be an
+// integer constant expression; we constant-fold it so the intrinsic sees
+// a ConstantInt. Otherwise we emit it as a normal scalar value.
+mlir::Value CIRGenFunction::emitScalarOrConstFoldImmArg(unsigned iceArguments,
+                                                        unsigned idx,
+                                                        const CallExpr *expr) {
+  mlir::Value arg = {};
+  if ((iceArguments & (1 << idx)) == 0) {
+    arg = emitScalarExpr(expr->getArg(idx));
+  } else {
+    // If this is required to be a constant, constant fold it so that we
+    // know that the generated intrinsic gets a ConstantInt.
+    std::optional<llvm::APSInt> result =
+        expr->getArg(idx)->getIntegerConstantExpr(getContext());
+    assert(result && "Expected argument to be a constant");
+    arg = builder.getConstInt(getLoc(expr->getSourceRange()), *result);
+  }
+  return arg;
+}
+
 /// Given a builtin id for a function like "__builtin_fabsf", return a Function*
 /// for "fabsf".
 cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd,
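
To make the bitmask convention concrete, here is an illustration with made-up values (not taken from the commit): bit `idx` of `iceArguments` decides which of the two branches above a given argument takes.

#include <cassert>

int main() {
  // Hypothetical mask: bit 2 set means argument 2 must be an integer
  // constant expression and is constant-folded; bits 0 and 1 clear mean
  // those arguments go through the ordinary emitScalarExpr path.
  unsigned iceArguments = 0b100;
  assert((iceArguments & (1u << 2)) != 0); // arg 2: constant-fold branch
  assert((iceArguments & (1u << 0)) == 0); // arg 0: scalar-emit branch
  assert((iceArguments & (1u << 1)) == 0); // arg 1: scalar-emit branch
  return 0;
}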

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 60 additions & 5 deletions
@@ -22,17 +22,17 @@ using namespace clang;
 using namespace clang::CIRGen;

 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
-                                               const CallExpr *e) {
+                                               const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is");
     return {};
   }
   if (builtinID == Builtin::BI__builtin_cpu_supports) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports");
     return {};
   }
   if (builtinID == Builtin::BI__builtin_cpu_init) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init");
     return {};
   }

@@ -43,6 +43,53 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   // Find out if any arguments are required to be integer constant expressions.
   assert(!cir::MissingFeatures::handleBuiltinICEArguments());

+  llvm::SmallVector<mlir::Value> ops;
+
+  // Find out if any arguments are required to be integer constant expressions.
+  unsigned iceArguments = 0;
+  ASTContext::GetBuiltinTypeError error;
+  getContext().GetBuiltinType(builtinID, error, &iceArguments);
+  assert(error == ASTContext::GE_None &&
+         "builtinID should be checked before calling emitX86BuiltinExpr");
+  for (auto [idx, arg] : llvm::enumerate(expr->arguments())) {
+    ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, expr));
+  }
+
+  // OG has unordered comparison as a form of optimization in addition to
+  // ordered comparison, while CIR doesn't.
+  //
+  // This means that we can't encode the comparison code of UGT (unordered
+  // greater than), at least not at the CIR level.
+  //
+  // The boolean shouldInvert compensates for this.
+  // For example: to get the comparison code UGT, we pass in
+  // emitVectorFCmp(OLE, shouldInvert = true), since OLE is the inverse of UGT.
+
+  // There are several other ways this could be supported:
+  // - Register extra CmpOpKinds for the unordered comparison types and add the
+  //   translation code to go from CIR -> LLVM dialect. Note that we get this
+  //   naturally with shouldInvert, benefiting from existing infrastructure,
+  //   albeit at the cost of an extra `not` at the CIR level.
+  // - Add the extra comparison codes to a new VecCmpOpKind instead of
+  //   cluttering CmpOpKind.
+  // - Add a boolean to VecCmpOp to indicate whether the comparison is
+  //   unordered or ordered.
+  // - Emit the intrinsic call directly instead of calling this helper; see how
+  //   the LLVM lowering handles this.
+  auto emitVectorFCmp = [this, &ops, &expr](cir::CmpOpKind pred,
+                                            bool shouldInvert,
+                                            bool isSignaling) {
+    assert(!cir::MissingFeatures::cgFPOptionsRAII());
+    auto loc = getLoc(expr->getExprLoc());
+    mlir::Value cmp;
+    assert(cir::MissingFeatures::emitConstrainedFPCall());
+    cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]);
+    mlir::Value bitCast = builder.createBitcast(
+        shouldInvert ? builder.createNot(cmp) : cmp, ops[0].getType());
+    return bitCast;
+  };
+
   switch (builtinID) {
   default:
     return {};
@@ -710,10 +757,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_cmpunordpd:
   case X86::BI__builtin_ia32_cmpneqps:
   case X86::BI__builtin_ia32_cmpneqpd:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_cmpnltps:
   case X86::BI__builtin_ia32_cmpnltpd:
+    return emitVectorFCmp(cir::CmpOpKind::lt, /*shouldInvert=*/true,
+                          /*isSignaling=*/true);
   case X86::BI__builtin_ia32_cmpnleps:
   case X86::BI__builtin_ia32_cmpnlepd:
+    return emitVectorFCmp(cir::CmpOpKind::le, /*shouldInvert=*/true,
+                          /*isSignaling=*/true);
   case X86::BI__builtin_ia32_cmpordps:
   case X86::BI__builtin_ia32_cmpordpd:
   case X86::BI__builtin_ia32_cmpph128_mask:
@@ -798,7 +853,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_prefetchi:
-    cgm.errorNYI(e->getSourceRange(),
+    cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented X86 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return {};
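
The shouldInvert trick relies on IEEE-754 semantics: for floating point, !(a <= b) is true exactly when a > b or when either operand is NaN, which is the UGT predicate that __builtin_ia32_cmpnleps needs (likewise !(a < b) gives UGE for cmpnltps). A small standalone check of that equivalence, for illustration only and not part of the commit:

#include <cmath>
#include <cstdio>

int main() {
  // !(a <= b) matches LLVM's `fcmp ugt`: unordered (NaN involved) or greater.
  float pairs[][2] = {{NAN, 1.0f}, {2.0f, 1.0f}, {0.5f, 1.0f}};
  for (auto &p : pairs) {
    bool invertedOle = !(p[0] <= p[1]); // what the shouldInvert path computes
    bool ugt = std::isnan(p[0]) || std::isnan(p[1]) || p[0] > p[1]; // fcmp ugt
    std::printf("a=%g b=%g  !(a<=b)=%d  ugt=%d\n", p[0], p[1], invertedOle, ugt);
  }
  return 0;
}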

clang/lib/CIR/CodeGen/CIRGenFunction.h

Lines changed: 3 additions & 0 deletions
@@ -1699,6 +1699,9 @@ class CIRGenFunction : public CIRGenTypeCache {
   void emitScalarInit(const clang::Expr *init, mlir::Location loc,
                       LValue lvalue, bool capturedByInit = false);

+  mlir::Value emitScalarOrConstFoldImmArg(unsigned iceArguments, unsigned idx,
+                                          const CallExpr *expr);
+
   void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage);

   void emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest,
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
+
+__m128 test_cmpnleps(__m128 A, __m128 B) {
+  // CIR-LABEL: @test_cmpnleps
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(le, [[A:%.*]], [[B:%.*]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+  // CIR: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast bitcast [[NOTCMP]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: test_cmpnleps
+  // LLVM: [[CMP:%.*]] = fcmp ugt <4 x float> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // LLVM-NEXT: ret <4 x float> [[CAST]]
+
+  // OGCG-LABEL: test_cmpnleps
+  // OGCG: [[CMP:%.*]] = fcmp ugt <4 x float> {{.*}}, {{.*}}
+  // OGCG-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // OGCG-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // OGCG-NEXT: ret <4 x float> [[CAST]]
+  return __builtin_ia32_cmpnleps(A, B);
+}
+
+__m128d test_cmpnlepd(__m128d A, __m128d B) {
+  // CIR-LABEL: @test_cmpnlepd
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(le, [[A:%.*]], [[B:%.*]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+  // CIR-NEXT: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast bitcast [[NOTCMP]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<2 x !cir.double>
+
+  // LLVM-LABEL: test_cmpnlepd
+  // LLVM: [[CMP:%.*]] = fcmp ugt <2 x double> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // LLVM-NEXT: ret <2 x double> [[CAST]]
+
+  // OGCG-LABEL: test_cmpnlepd
+  // OGCG: [[CMP:%.*]] = fcmp ugt <2 x double> {{.*}}, {{.*}}
+  // OGCG-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // OGCG-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // OGCG-NEXT: ret <2 x double> [[CAST]]
+  return __builtin_ia32_cmpnlepd(A, B);
+}
+
+__m128 test_cmpnltps(__m128 A, __m128 B) {
+  // CIR-LABEL: @test_cmpnltps
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(lt, [[A:%.*]], [[B:%.*]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+  // CIR: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast bitcast [[NOTCMP]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: test_cmpnltps
+  // LLVM: [[CMP:%.*]] = fcmp uge <4 x float> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // LLVM-NEXT: ret <4 x float> [[CAST]]
+
+  // OGCG-LABEL: test_cmpnltps
+  // OGCG: [[CMP:%.*]] = fcmp uge <4 x float> {{.*}}, {{.*}}
+  // OGCG-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // OGCG-NEXT: [[CAST:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // OGCG-NEXT: ret <4 x float> [[CAST]]
+  return __builtin_ia32_cmpnltps(A, B);
+}
+
+__m128d test_cmpnltpd(__m128d A, __m128d B) {
+  // CIR-LABEL: @test_cmpnltpd
+  // CIR: [[CMP:%.*]] = cir.vec.cmp(lt, [[A:%.*]], [[B:%.*]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+  // CIR-NEXT: [[NOTCMP:%.*]] = cir.unary(not, [[CMP]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+  // CIR-NEXT: [[CAST:%.*]] = cir.cast bitcast [[NOTCMP]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+  // CIR-NEXT: cir.store [[CAST]], [[ALLOCA:%.*]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR-NEXT: [[LD:%.*]] = cir.load [[ALLOCA]] :
+  // CIR-NEXT: cir.return [[LD]] : !cir.vector<2 x !cir.double>
+
+  // LLVM-LABEL: test_cmpnltpd
+  // LLVM: [[CMP:%.*]] = fcmp uge <2 x double> {{.*}}, {{.*}}
+  // LLVM-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // LLVM-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // LLVM-NEXT: ret <2 x double> [[CAST]]
+
+  // OGCG-LABEL: test_cmpnltpd
+  // OGCG: [[CMP:%.*]] = fcmp uge <2 x double> {{.*}}, {{.*}}
+  // OGCG-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // OGCG-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // OGCG-NEXT: ret <2 x double> [[CAST]]
+  return __builtin_ia32_cmpnltpd(A, B);
+}
