Split tables for sin and cos, as Metal has odd precision for sin. Add support for fast_tanh on all backends.

mcourteaux · mcourteaux · commit f4ebe0998f9d · 2025-02-09T19:06:04.000+01:00
diff --git a/src/FastMathFunctions.cpp b/src/FastMathFunctions.cpp
@@ -307,6 +307,32 @@ Expr fast_log(const Expr &x, ApproximationPrecision prec) {
     return result;
 }
 
+Expr fast_tanh(const Expr &x, ApproximationPrecision prec) {
+    // Approximate tanh(x) in terms of fast_exp, using the definition:
+    // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+    //         = (1 - exp(-2x)) / (1 + exp(-2x))
+    // The formula is evaluated on abs(x), and the result is negated when x is negative.
+    Type type = x.type();
+    Expr abs_x = abs(x);
+    Expr flip_sign = x < 0;
+    if (prec.optimized_for == ApproximationPrecision::MULPE) {
+        // Positive arguments to exp() give better precision in terms of ULP.
+        // So, we rewrite the expression to use exp(2*abs(x)) instead of
+        // exp(-2*abs(x)) when we are close to zero (abs(x) <= 4).
+        Expr flip_exp = abs_x > constant(type, 4);
+        Expr arg_exp = select(flip_exp, -abs_x, abs_x);
+        Expr exp2x = Halide::fast_exp(2 * arg_exp, prec);
+        Expr tanh = (exp2x - constant(type, 1.0)) / (exp2x + constant(type, 1));  // = tanh(arg_exp)
+        tanh = select(flip_exp ^ flip_sign, -tanh, tanh);  // Negate when exactly one of flip_exp / flip_sign holds.
+        return common_subexpression_elimination(tanh, true);
+    } else {
+        Expr exp2x = Halide::fast_exp(-2 * abs_x, prec);
+        Expr tanh = (constant(type, 1) - exp2x) / (constant(type, 1) + exp2x);  // = tanh(abs_x)
+        tanh = select(flip_sign, -tanh, tanh);  // Restore the sign of x.
+        return common_subexpression_elimination(tanh, true);
+    }
+}
+
 }  // namespace ApproxImpl
 
 using OO = ApproximationPrecision::OptimizationObjective;
@@ -341,11 +367,20 @@ struct IntrinsicsInfoPerDeviceAPI {
 };
 
 // clang-format off
-IntrinsicsInfoPerDeviceAPI ii_sin_cos{
+IntrinsicsInfoPerDeviceAPI ii_sin{  // Per-device-API info for fast_sin; passed to resolve_precision() when lowering Call::fast_sin. Kept separate from ii_cos because the Metal entries differ.
+    OO::MAE, 1e-5f, 0, {
+      {DeviceAPI::Vulkan, {true}, {}},
+      {DeviceAPI::CUDA, {false}, {OO::MAE, 5e-7f, 1'000'000}},
+      {DeviceAPI::Metal, {true}, {OO::MAE, 6e-5f,   400'000}},
+      {DeviceAPI::WebGPU, {true}, {}},
+      {DeviceAPI::OpenCL, {false}, {OO::MAE, 5e-7f, 1'000'000}},
+}};
+
+IntrinsicsInfoPerDeviceAPI ii_cos{
     OO::MAE, 1e-5f, 0, {
       {DeviceAPI::Vulkan, {true}, {}},
       {DeviceAPI::CUDA, {false}, {OO::MAE, 5e-7f, 1'000'000}},
-      {DeviceAPI::Metal, {true}, {OO::MAE, 5e-7f, 1'000'000}},
+      {DeviceAPI::Metal, {true}, {OO::MAE, 7e-7f,     5'000}},
       {DeviceAPI::WebGPU, {true}, {}},
       {DeviceAPI::OpenCL, {false}, {OO::MAE, 5e-7f, 1'000'000}},
 }};
@@ -622,24 +657,30 @@ class LowerFastMathFunctions : public IRMutator {
     }
 
     Expr visit(const Call *op) override {
-        if (op->is_intrinsic(Call::fast_sin) || op->is_intrinsic(Call::fast_cos)) {
-            // Handle fast_sin and fast_cos together!
+        if (op->is_intrinsic(Call::fast_sin)) {
             ApproximationPrecision prec = extract_approximation_precision(op);
-            IntrinsicsInfo ii = resolve_precision(prec, ii_sin_cos, for_device_api);
+            IntrinsicsInfo ii = resolve_precision(prec, ii_sin, for_device_api);
             if (op->type == Float(32) && intrinsic_satisfies_precision(ii, prec)) {
                 return append_type_suffix(op);
             }
             if (ii.native_func.is_fast && native_func_satisfies_precision(ii, prec)) {
-                // The native sine and cosine are fast: fall back to native and continue lowering.
                 return to_native_func(op);
             }
 
             // No known fast version available, we will expand our own approximation.
-            if (op->is_intrinsic(Call::fast_sin)) {
-                return ApproxImpl::fast_sin(mutate(op->args[0]), prec);
-            } else {
-                return ApproxImpl::fast_cos(mutate(op->args[0]), prec);
+            return ApproxImpl::fast_sin(mutate(op->args[0]), prec);
+        } else if (op->is_intrinsic(Call::fast_cos)) {
+            ApproximationPrecision prec = extract_approximation_precision(op);
+            IntrinsicsInfo ii = resolve_precision(prec, ii_cos, for_device_api);
+            if (op->type == Float(32) && intrinsic_satisfies_precision(ii, prec)) {
+                return append_type_suffix(op);
             }
+            if (ii.native_func.is_fast && native_func_satisfies_precision(ii, prec)) {
+                return to_native_func(op);
+            }
+
+            // No known fast version available, we will expand our own approximation.
+            return ApproxImpl::fast_cos(mutate(op->args[0]), prec);
         } else if (op->is_intrinsic(Call::fast_atan) || op->is_intrinsic(Call::fast_atan2)) {
             // Handle fast_atan and fast_atan2 together!
             ApproximationPrecision prec = extract_approximation_precision(op);
@@ -722,8 +763,8 @@ class LowerFastMathFunctions : public IRMutator {
                 return append_type_suffix(op);
             }
 
-            // Unfortunately, no fast_tanh approximation implemented yet!
-            return to_native_func(op);
+            // Expand using the definition in terms of exp(2x), and recurse.
+            return mutate(ApproxImpl::fast_tanh(op->args[0], prec));
         } else if (op->is_intrinsic(Call::fast_pow)) {
             ApproximationPrecision prec = extract_approximation_precision(op);
             IntrinsicsInfo ii = resolve_precision(prec, ii_pow, for_device_api);
diff --git a/test/correctness/fast_function_approximations.cpp b/test/correctness/fast_function_approximations.cpp
@@ -87,7 +87,7 @@ struct FunctionToTest {
         {
             { "-pi/3 to pi/3", {{-pi * 0.333f, pi * 0.333f}}, true, 40, 0 },
             { "-pi/2 to pi/2", {{-pi * 0.5f, pi * 0.5f}}, true, 0, 0 },
-            { "-3pi to 3pi",   {{-pi * 3.0f, pi * 3.0f}}, false, 0, 0 },
+            { "-3pi to 3pi",   {{-pi * 3.0f, pi * 3.0f}}, true, 0, 0 },
         }
     },
     {
@@ -133,8 +133,8 @@ struct FunctionToTest {
         [](Expr x, Expr y) { return Halide::tanh(x); },
         [](Expr x, Expr y, Halide::ApproximationPrecision prec) { return Halide::fast_tanh(x, prec); },
         {
-            { "precise"  , {{ -10.0f , 10.0f }}, true, 70, 20 },
-            { "extended" , {{ -100.0f, 100.0f}}, true, 70, 20 },
+            { "precise"     , {{  -8.0f ,  8.0f }}, true, 2500, 20 },
+            { "extended"    , {{ -100.0f, 100.0f}}, true, 2500, 20 },
         }
     },
     // clang-format on
@@ -372,7 +372,8 @@ int main(int argc, char **argv) {
                     if (&rat == &ftt.ranged_tests[0]) {
                         // On the first (typically precise) range.
                         num_tests++;
-                        if (em.max_abs_error < 1e-5 || em.max_ulp_error < 20'000 || em.max_rel_error < 1e-2) {
+                        if ((em.max_abs_error < 1e-5 || em.max_ulp_error < 20'000 || em.max_rel_error < 1e-2) ||
+                            (em.max_abs_error < 1e-4 && em.mean_abs_error < 1e-5 && em.mean_ulp_error < 400)) {
                             num_tests_passed++;
                             print_ok();
                         } else {
diff --git a/tools/polynomial_optimizer.py b/tools/polynomial_optimizer.py
@@ -106,6 +106,11 @@ def optimize_approximation(loss, order):
         func = lambda x: np.log(x + 1.0)
         exponents = np.arange(1, order + 1)
         lower, upper = -0.25, 0.5
+    elif args.func == "tanh":
+        func_fixed_part = lambda x: x
+        func = lambda x: np.tanh(x)
+        exponents = np.arange(1, order + 1)
+        lower, upper = 0.0, 4.0
     else:
         print("Unknown function:", args.func)
         exit(1)

Original file line number	Diff line number	Diff line change
`@@ -87,7 +87,7 @@ struct FunctionToTest {`
`87`	`87`	`{`
`88`	`88`	`{ "-pi/3 to pi/3", {{-pi * 0.333f, pi * 0.333f}}, true, 40, 0 },`
`89`	`89`	`{ "-pi/2 to pi/2", {{-pi * 0.5f, pi * 0.5f}}, true, 0, 0 },`
`90`		`- { "-3pi to 3pi", {{-pi * 3.0f, pi * 3.0f}}, false, 0, 0 },`
	`90`	`+ { "-3pi to 3pi", {{-pi * 3.0f, pi * 3.0f}}, true, 0, 0 },`
`91`	`91`	`}`
`92`	`92`	`},`
`93`	`93`	`{`
`@@ -133,8 +133,8 @@ struct FunctionToTest {`
`133`	`133`	`[](Expr x, Expr y) { return Halide::tanh(x); },`
`134`	`134`	`[](Expr x, Expr y, Halide::ApproximationPrecision prec) { return Halide::fast_tanh(x, prec); },`
`135`	`135`	`{`
`136`		`- { "precise" , {{ -10.0f , 10.0f }}, true, 70, 20 },`
`137`		`- { "extended" , {{ -100.0f, 100.0f}}, true, 70, 20 },`
	`136`	`+ { "precise" , {{ -8.0f , 8.0f }}, true, 2500, 20 },`
	`137`	`+ { "extended" , {{ -100.0f, 100.0f}}, true, 2500, 20 },`
`138`	`138`	`}`
`139`	`139`	`},`
`140`	`140`	`// clang-format on`
`@@ -372,7 +372,8 @@ int main(int argc, char **argv) {`
`372`	`372`	`if (&rat == &ftt.ranged_tests[0]) {`
`373`	`373`	`// On the first (typically precise) range.`
`374`	`374`	`num_tests++;`
`375`		`- if (em.max_abs_error < 1e-5 \|\| em.max_ulp_error < 20'000 \|\| em.max_rel_error < 1e-2) {`
	`375`	`+ if ((em.max_abs_error < 1e-5 \|\| em.max_ulp_error < 20'000 \|\| em.max_rel_error < 1e-2) \|\|`
	`376`	`+ (em.max_abs_error < 1e-4 && em.mean_abs_error < 1e-5 && em.mean_ulp_error < 400)) {`
`376`	`377`	`num_tests_passed++;`
`377`	`378`	`print_ok();`
`378`	`379`	`} else {`