Test vectorized support for math functions in correctness/math.cpp

mcourteaux · mcourteaux · commit 58e4a7fe53cb · 2025-03-17T14:44:00.000+01:00
diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -67,6 +67,40 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
         CodeGen_D3D12Compute_C(std::ostream &s, const Target &t)
             : CodeGen_GPU_C(s, t) {
             integer_suffix_style = IntegerSuffixStyle::HLSL;
+            // alias(fn, hlsl): map the extern names fn_f16, fn_f32 and fn_f64 to the single HLSL name hlsl.
+            const auto alias = [this](const std::string &fn, const char *hlsl) {
+                for (const char *suffix : {"_f16", "_f32", "_f64"}) {
+                    extern_function_name_map[fn + suffix] = hlsl;
+                }
+            };
+            alias("sqrt", "sqrt");
+            alias("sin", "sin");
+            alias("cos", "cos");
+            alias("exp", "exp");
+            alias("log", "log");
+            alias("abs", "abs");
+            alias("floor", "floor");
+            alias("ceil", "ceil");
+            alias("trunc", "trunc");
+            alias("pow", "pow");  // NOTE(review): init_module's preamble says HLSL pow() only matches C for x > 0 and emulates the rest; confirm this alias does not bypass that.
+            alias("asin", "asin");
+            alias("acos", "acos");
+            alias("tan", "tan");
+            alias("atan", "atan");
+            alias("atan2", "atan2");
+            alias("sinh", "sinh");
+            alias("asinh", "asinh");
+            alias("cosh", "cosh");
+            alias("acosh", "acosh");
+            alias("tanh", "tanh");
+            alias("atanh", "atanh");
+
+            alias("is_nan", "isnan");
+            alias("is_inf", "isinf");
+            alias("is_finite", "isfinite");
+
+            alias("fast_inverse", "rcp");
+            alias("fast_inverse_sqrt", "rsqrt");
         }
         void add_kernel(Stmt stmt,
                         const std::string &name,
@@ -79,7 +113,6 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
         std::string print_storage_type(Type type);
         std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
         std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
 
         std::string print_vanilla_cast(Type type, const std::string &value_expr);
         std::string print_reinforced_cast(Type type, const std::string &value_expr);
@@ -247,18 +280,6 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Evaluate *op)
     print_expr(op->value);
 }
 
-string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_extern_call(const Call *op) {
-    internal_assert(!function_takes_user_context(op->name)) << op->name;
-
-    vector<string> args(op->args.size());
-    for (size_t i = 0; i < op->args.size(); i++) {
-        args[i] = print_expr(op->args[i]);
-    }
-    ostringstream rhs;
-    rhs << op->name << "(" << with_commas(args) << ")";
-    return rhs.str();
-}
-
 void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Max *op) {
     print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::Extern));
 }
@@ -1290,19 +1311,7 @@ void CodeGen_D3D12Compute_Dev::init_module() {
         << "float nan_f32()     { return  1.#IND; } \n"  // Quiet NaN with minimum fractional value.
         << "float neg_inf_f32() { return -1.#INF; } \n"
         << "float inf_f32()     { return +1.#INF; } \n"
-        << "#define is_inf_f32     isinf    \n"
-        << "#define is_finite_f32  isfinite \n"
-        << "#define is_nan_f32     isnan    \n"
         << "#define float_from_bits asfloat \n"
-        << "#define sqrt_f32    sqrt   \n"
-        << "#define sin_f32     sin    \n"
-        << "#define cos_f32     cos    \n"
-        << "#define exp_f32     exp    \n"
-        << "#define log_f32     log    \n"
-        << "#define abs_f32     abs    \n"
-        << "#define floor_f32   floor  \n"
-        << "#define ceil_f32    ceil   \n"
-        << "#define trunc_f32   trunc  \n"
         // pow() in HLSL has the same semantics as C if
         // x > 0.  Otherwise, we need to emulate C
         // behavior.
@@ -1322,19 +1331,9 @@ void CodeGen_D3D12Compute_Dev::init_module() {
         << "    return nan_f32();             \n"
         << "  }                               \n"
         << "}                                 \n"
-        << "#define asin_f32    asin   \n"
-        << "#define acos_f32    acos   \n"
-        << "#define tan_f32     tan    \n"
-        << "#define atan_f32    atan   \n"
-        << "#define atan2_f32   atan2  \n"
-        << "#define sinh_f32    sinh   \n"
-        << "#define cosh_f32    cosh   \n"
-        << "#define tanh_f32    tanh   \n"
-        << "#define asinh_f32(x) (log_f32(x + sqrt_f32(x*x + 1))) \n"
-        << "#define acosh_f32(x) (log_f32(x + sqrt_f32(x*x - 1))) \n"
-        << "#define atanh_f32(x) (log_f32((1+x)/(1-x))/2) \n"
-        << "#define fast_inverse_f32      rcp   \n"
-        << "#define fast_inverse_sqrt_f32 rsqrt \n"
+        << "#define asinh(x) (log((x) + sqrt((x)*(x) + 1))) \n"
+        << "#define acosh(x) (log((x) + sqrt((x)*(x) - 1))) \n"
+        << "#define atanh(x) (log((1+(x))/(1-(x)))/2) \n"
         << "\n";
     //<< "}\n"; // close namespace
 
diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp
@@ -1,4 +1,5 @@
 #include "CodeGen_GPU_Dev.h"
+#include "CodeGen_Internal.h"
 #include "CanonicalizeGPUVars.h"
 #include "Deinterleave.h"
 #include "ExprUsesVar.h"
@@ -252,5 +253,31 @@ void CodeGen_GPU_C::visit(const Call *op) {
     }
 }
 
+
+std::string CodeGen_GPU_C::print_extern_call(const Call *op) {
+    // Prints an extern call as `name(arg0, arg1, ...)`, using the replacement
+    // from extern_function_name_map when op->name has an entry there.
+    // Vector arguments are intentionally not scalarized: backends are expected
+    // to provide vector versions of these functions, so a missing one remains
+    // an error, as it was before.
+    internal_assert(!function_takes_user_context(op->name)) << op->name;
+
+    std::vector<std::string> printed_args;
+    printed_args.reserve(op->args.size());
+    for (const Expr &arg : op->args) {
+        printed_args.push_back(print_expr(arg));
+    }
+    const auto mapped = extern_function_name_map.find(op->name);
+    const bool renamed = mapped != extern_function_name_map.end();
+    const std::string &callee = renamed ? mapped->second : op->name;
+    if (renamed) {
+        debug(3) << "Rewriting " << op->name << " as " << callee << "\n";
+    }
+    debug(3) << "Writing out call to " << callee << "\n";
+    std::ostringstream call;
+    call << callee << "(" << with_commas(printed_args) << ")";
+    return call.str();
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h
@@ -100,6 +100,8 @@ class CodeGen_GPU_C : public CodeGen_C {
     void visit(const Shuffle *op) override;
     void visit(const Call *op) override;
 
+    std::string print_extern_call(const Call *op) override;
+
     VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax;
 };
 
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
@@ -111,7 +111,6 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
         std::string print_storage_type(Type type);
         std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
         std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
 
         std::string get_memory_space(const std::string &);
 
@@ -242,11 +241,6 @@ string simt_intrinsic(const string &name) {
 }
 }  // namespace
 
-string CodeGen_Metal_Dev::CodeGen_Metal_C::print_extern_call(const Call *op) {
-    internal_assert(!function_takes_user_context(op->name)) << op->name;
-    return CodeGen_GPU_C::print_extern_call(op);
-}
-
 void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Max *op) {
     print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::Extern));
 }
diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
@@ -105,7 +105,6 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev {
         using CodeGen_GPU_C::visit;
         std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
         std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
         std::string print_array_access(const std::string &name,
                                        const Type &type,
                                        const std::string &id_index);
@@ -488,11 +487,6 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
     }
 }
 
-string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_extern_call(const Call *op) {
-    internal_assert(!function_takes_user_context(op->name)) << op->name;
-    return CodeGen_GPU_C::print_extern_call(op);
-}
-
 string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_array_access(const string &name,
                                                                 const Type &type,
                                                                 const string &id_index) {
diff --git a/src/CodeGen_WebGPU_Dev.cpp b/src/CodeGen_WebGPU_Dev.cpp
@@ -102,7 +102,6 @@ class CodeGen_WebGPU_Dev : public CodeGen_GPU_Dev {
                                AppendSpaceIfNeeded append_space =
                                    DoNotAppendSpace) override;
         std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
         std::string print_assignment(Type t, const std::string &rhs) override;
         std::string print_const(Type t, const std::string &rhs);
         std::string print_assignment_or_const(Type t, const std::string &rhs,
@@ -299,11 +298,6 @@ string CodeGen_WebGPU_Dev::CodeGen_WGSL::print_reinterpret(Type type,
     return oss.str();
 }
 
-string CodeGen_WebGPU_Dev::CodeGen_WGSL::print_extern_call(const Call *op) {
-    internal_assert(!function_takes_user_context(op->name)) << op->name;
-    return CodeGen_GPU_C::print_extern_call(op);
-}
-
 void CodeGen_WebGPU_Dev::CodeGen_WGSL::add_kernel(
     const Stmt &s, const string &name, const vector<DeviceArgument> &args) {
     debug(2) << "Adding WGSL shader " << name << "\n";
diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp
@@ -137,7 +137,7 @@ struct TestArgs {
         Var x("x"), xi("xi");                                                                              \
         test_##name(x) = name(in(x));                                                                      \
         if (target.has_gpu_feature()) {                                                                    \
-            test_##name.gpu_tile(x, xi, 8);                                                                \
+            test_##name.gpu_tile(x, xi, 16).vectorize(xi, 2);                                              \
         } else if (target.has_feature(Target::HVX)) {                                                      \
             test_##name.hexagon();                                                                         \
         }                                                                                                  \
@@ -168,7 +168,7 @@ struct TestArgs {
         Var x("x"), xi("xi");                                                                              \
         test_##name(x) = name(in(0, x), in(1, x));                                                         \
         if (target.has_gpu_feature()) {                                                                    \
-            test_##name.gpu_tile(x, xi, 8);                                                                \
+            test_##name.gpu_tile(x, xi, 16).vectorize(xi, 2);                                              \
         } else if (target.has_feature(Target::HVX)) {                                                      \
             test_##name.hexagon();                                                                         \
         }                                                                                                  \