@@ -343,8 +343,35 @@ Expr fast_exp(const Expr &x_full, ApproximationPrecision prec) {
 
     // Shift the bits up into the exponent field and reinterpret this
     // thing as float.
-    Expr two_to_the_n = reinterpret<float>(biased << 23);
-    result *= two_to_the_n;
+    Expr two_to_the_k = reinterpret<float>(biased << 23);
+    result *= two_to_the_k;
+    result = common_subexpression_elimination(result, true);
+    return result;
+}
+
+Expr fast_expm1(const Expr &x_full, ApproximationPrecision prec) {
+    Type type = x_full.type();
+    user_assert(x_full.type() == Float(32)) << "fast_expm1 only works for Float(32)";
+
+    Expr log2 = make_const(type, std::log(2.0));
+
+    Expr scaled = x_full / log2;
+    Expr k_real = round(scaled);  // Here we round instead of floor, to reduce to [-log(2)/2, log(2)/2].
+    Expr k = cast<int>(k_real);
+    Expr x = x_full - k_real * log2;
+
+    const Internal::Approximation *approx = Internal::ApproximationTables::best_expm1_approximation(prec, type);
+    Expr result = eval_approx(approx, x);
+
+    // Compute 2^k.
+    int fpbias = 127;
+    Expr biased = clamp(k + fpbias, 0, 255);
+
+    // Shift the bits up into the exponent field and reinterpret this
+    // thing as float.
+    Expr two_to_the_k = reinterpret<float>(biased << 23);
+
+    result = select(k == 0, result, (result + 1) * two_to_the_k - 1);
     result = common_subexpression_elimination(result, true);
     return result;
 }
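
For reference, the reconstruction above uses the identity expm1(x) = 2^k * (expm1(r) + 1) - 1 for x = k*ln(2) + r. The following is a minimal scalar sketch (plain C++, not Halide IR; the name `sketch_expm1` is illustrative), with `std::expm1` on the reduced argument standing in for the fitted polynomial that `eval_approx` evaluates:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

float sketch_expm1(float x) {
    const float ln2 = std::log(2.0f);
    // Round (not floor) so the reduced argument r lands in [-ln(2)/2, ln(2)/2],
    // where the polynomial approximation is accurate.
    float k_real = std::round(x / ln2);
    int k = static_cast<int>(k_real);
    float r = x - k_real * ln2;

    float p = std::expm1(r);  // stand-in for eval_approx(approx, x)

    // Build 2^k by writing the biased exponent straight into the IEEE-754
    // exponent field, which is what reinterpret<float>(biased << 23) does above.
    uint32_t biased = static_cast<uint32_t>(std::clamp(k + 127, 0, 255));
    uint32_t bits = biased << 23;
    float two_to_the_k;
    std::memcpy(&two_to_the_k, &bits, sizeof(bits));

    // expm1(x) = 2^k * (expm1(r) + 1) - 1. The k == 0 branch is kept
    // separate so tiny arguments keep the full precision of p.
    return k == 0 ? p : (p + 1.0f) * two_to_the_k - 1.0f;
}
```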
@@ -370,26 +397,37 @@ Expr fast_tanh(const Expr &x, ApproximationPrecision prec) {
     // Rewrite with definition:
     // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
     //         = (1 - exp(-2x)) / (1 + exp(-2x))
+    //         = expm1(2x) / (expm1(2x) + 2)
     // But take abs(x) as the argument, and flip the sign when negative.
     Type type = x.type();
     Expr abs_x = abs(x);
     Expr flip_sign = x < 0;
     if (prec.optimized_for == ApproximationPrecision::MULPE) {
+#if 0
         // Positive arguments to exp() have better ULP precision.
         // So, we will rewrite the expression to always use exp(2*x)
         // instead of exp(-2*x) when we are close to zero.
         // Rewriting it like this is slightly more expensive, hence the branch
         // to only pay this extra cost in case we need MULPE-optimized approximations.
         Expr flip_exp = abs_x > make_const(type, 4);
         Expr arg_exp = select(flip_exp, -abs_x, abs_x);
-        Expr exp2x = Halide::fast_exp(2 * arg_exp, prec);
-        Expr tanh = (exp2x - make_const(type, 1.0)) / (exp2x + make_const(type, 1));
+        Expr exp2xm1 = Halide::fast_expm1(2 * arg_exp, prec);
+        Expr tanh = exp2xm1 / (exp2xm1 + make_const(type, 2));
         tanh = select(flip_exp ^ flip_sign, -tanh, tanh);
         return common_subexpression_elimination(tanh, true);
+#else
+        // expm1 is developed around 0 and is ULP accurate in [-ln(2)/2, ln(2)/2].
+        Expr exp2xm1 = Halide::fast_expm1(-2 * abs_x, prec);
+        Expr tanh = exp2xm1 / (exp2xm1 + make_const(type, 2));
+        tanh = select(flip_sign, tanh, -tanh);
+        return common_subexpression_elimination(tanh, true);
+#endif
     } else {
         // Even if we are optimizing for MAE, the nested call to exp()
         // should be MULPE optimized for accuracy, as we are taking ratios.
-        prec.optimized_for = ApproximationPrecision::MULPE;
+        if (prec.optimized_for == ApproximationPrecision::MAE) {
+            prec.optimized_for = ApproximationPrecision::MULPE;
+        }  // else it's on AUTO, and we want to keep that (AUTO tanh uses AUTO exp).
         Expr exp2x = Halide::fast_exp(-2 * abs_x, prec);
         Expr tanh = (make_const(type, 1) - exp2x) / (make_const(type, 1) + exp2x);
         tanh = select(flip_sign, -tanh, tanh);
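
The rewrite matters numerically: substituting exp(2x) = expm1(2x) + 1 into (exp(2x) - 1) / (exp(2x) + 1) gives expm1(2x) / (expm1(2x) + 2), which avoids the catastrophic cancellation of exp(2x) - 1 near zero. A small self-contained check, using the standard library in place of Halide's fast_* approximations (illustrative only):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    for (float x : {1e-8f, 1e-4f, 0.5f}) {
        float e = std::exp(2 * x);
        float via_exp = (e - 1) / (e + 1);  // catastrophic cancellation for small x
        float m = std::expm1(2 * x);
        float via_expm1 = m / (m + 2);      // no cancellation near zero
        std::printf("x=%g  exp form=%.9g  expm1 form=%.9g  std::tanh=%.9g\n",
                    x, via_exp, via_expm1, std::tanh(x));
    }
}
```

At x = 1e-8 the exp form evaluates to 0 in float32, because exp(2e-8f) rounds to 1.0f, while the expm1 form returns roughly 1e-8, in agreement with std::tanh.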
@@ -466,6 +504,10 @@ IntrinsicsInfoPerDeviceAPI ii_tan{
         {DeviceAPI::OpenCL, {false}, {OO::MAE, 2e-6f, 1'000'000}},
 }};
 
+IntrinsicsInfoPerDeviceAPI ii_expm1{
+    OO::MULPE, 0.0f, 50, {/* No intrinsics on any backend. */
+    }};
+
 IntrinsicsInfoPerDeviceAPI ii_exp{
     OO::MULPE, 0.0f, 50, {
         {DeviceAPI::Vulkan, {true}, {}},
@@ -478,10 +520,10 @@ IntrinsicsInfoPerDeviceAPI ii_exp{
 IntrinsicsInfoPerDeviceAPI ii_log{
     OO::MAE, 1e-5f, 1000, {
         {DeviceAPI::Vulkan, {true}, {}},
-        {DeviceAPI::CUDA, {false}, {OO::MULPE, 0.0f, 3'800'000}},
+        {DeviceAPI::CUDA, {false}, {OO::MAE, 0.0f, 3'800'000}},
         {DeviceAPI::Metal, {false}, {OO::MAE, 0.0f, 3'800'000}},  // slow log() on metal
         {DeviceAPI::WebGPU, {true}, {}},
-        {DeviceAPI::OpenCL, {true}, {OO::MULPE, 0.0f, 3'800'000}},
+        {DeviceAPI::OpenCL, {true}, {OO::MAE, 0.0f, 3'800'000}},
 }};
 
 IntrinsicsInfoPerDeviceAPI ii_pow{
@@ -519,6 +561,9 @@ bool fast_math_func_has_intrinsic_based_implementation(Call::IntrinsicOp op, Dev
     case Call::fast_cos:
         iipda = &ii_cos;
         break;
+    case Call::fast_expm1:
+        iipda = &ii_expm1;
+        break;
     case Call::fast_exp:
         iipda = &ii_exp;
         break;
@@ -563,14 +608,17 @@ bool fast_math_func_has_intrinsic_based_implementation(Call::IntrinsicOp op, Dev
     return false;
 }
 
-IntrinsicsInfo resolve_precision(ApproximationPrecision &prec, const IntrinsicsInfoPerDeviceAPI &iida, DeviceAPI api) {
-    IntrinsicsInfo ii{};
+IntrinsicsInfo find_intrinsics_info_for_device_api(const IntrinsicsInfoPerDeviceAPI &iida, DeviceAPI api) {
     for (const auto &cand : iida.device_apis) {
         if (cand.device_api == api) {
-            ii = cand;
-            break;
+            return cand;
         }
     }
+    return {};
+}
+
+IntrinsicsInfo resolve_precision(ApproximationPrecision &prec, const IntrinsicsInfoPerDeviceAPI &iida, DeviceAPI api) {
+    IntrinsicsInfo ii = find_intrinsics_info_for_device_api(iida, api);
 
     if (prec.optimized_for == ApproximationPrecision::AUTO) {
         if (!ii.intrinsic.defined()) {
@@ -690,18 +738,6 @@ class LowerFastMathFunctions : public IRMutator {
         return for_device_api == DeviceAPI::CUDA && target.get_cuda_capability_lower_bound() >= 75;
     }
 
-    void adjust_precision_for_target(ApproximationPrecision &prec) {
-        if (for_device_api == DeviceAPI::None) {
-            if (target.arch == Target::Arch::X86) {
-                // If we do not have fused-multiply-add, we lose some precision.
-                if (target.bits == 32 || !target.has_feature(Target::Feature::FMA)) {
-                    prec.constraint_max_absolute_error *= 0.5f;
-                    prec.constraint_max_ulp_error /= 2;
-                }
-            }
-        }
-    }
-
     /** Strips the fast_ prefix, appends the type suffix, and
      * drops the precision argument from the end. */
     Expr to_native_func(const Call *op) {
@@ -720,7 +756,7 @@ class LowerFastMathFunctions : public IRMutator {
         std::vector<Expr> args;
         for (size_t i = 0; i < op->args.size() - 1; ++i) {
             const Expr &arg = op->args[i];
-            args.push_back(IRMutator::mutate(arg));
+            args.push_back(mutate(arg));
         }
         return Call::make(op->type, new_name, args, Call::PureExtern);
     }
@@ -738,7 +774,7 @@ class LowerFastMathFunctions : public IRMutator {
         std::vector<Expr> args;
         for (size_t i = 0; i < op->args.size() - 1; ++i) {
             const Expr &arg = op->args[i];
-            args.push_back(IRMutator::mutate(arg));
+            args.push_back(mutate(arg));
         }
         return Call::make(op->type, new_name, args, Call::PureExtern);
     }
@@ -792,7 +828,6 @@ class LowerFastMathFunctions : public IRMutator {
            }
 
            // No known fast version available, we will expand our own approximation.
-            adjust_precision_for_target(prec);
            return ApproxImpl::fast_sin(mutate(op->args[0]), prec);
        } else if (op->is_intrinsic(Call::fast_cos)) {
            ApproximationPrecision prec = extract_approximation_precision(op);
@@ -805,7 +840,6 @@ class LowerFastMathFunctions : public IRMutator {
            }
 
            // No known fast version available, we will expand our own approximation.
-            adjust_precision_for_target(prec);
            return ApproxImpl::fast_cos(mutate(op->args[0]), prec);
        } else if (op->is_intrinsic(Call::fast_atan) || op->is_intrinsic(Call::fast_atan2)) {
            // Handle fast_atan and fast_atan2 together!
@@ -816,7 +850,6 @@ class LowerFastMathFunctions : public IRMutator {
                return to_native_func(op);
            }
 
-            adjust_precision_for_target(prec);
            if (op->is_intrinsic(Call::fast_atan)) {
                return ApproxImpl::fast_atan(mutate(op->args[0]), prec);
            } else {
@@ -841,10 +874,12 @@ class LowerFastMathFunctions : public IRMutator {
                return to_native_func(op);
            }
 
-            adjust_precision_for_target(prec);
            return ApproxImpl::fast_tan(mutate(op->args[0]), prec);
+        } else if (op->is_intrinsic(Call::fast_expm1)) {
+            ApproximationPrecision prec = extract_approximation_precision(op);
+            resolve_precision(prec, ii_expm1, for_device_api);
+            return ApproxImpl::fast_expm1(mutate(op->args[0]), prec);
        } else if (op->is_intrinsic(Call::fast_exp)) {
-            // Handle fast_exp and fast_log together!
            ApproximationPrecision prec = extract_approximation_precision(op);
            IntrinsicsInfo ii = resolve_precision(prec, ii_exp, for_device_api);
            if (op->type == Float(32) && is_cuda_cc20() && intrinsic_satisfies_precision(ii, prec)) {
@@ -865,7 +900,6 @@ class LowerFastMathFunctions : public IRMutator {
                return to_native_func(op);
            }
 
-            adjust_precision_for_target(prec);
            return ApproxImpl::fast_exp(mutate(op->args[0]), prec);
        } else if (op->is_intrinsic(Call::fast_log)) {
            // Handle fast_exp and fast_log together!
@@ -887,10 +921,24 @@ class LowerFastMathFunctions : public IRMutator {
                return to_native_func(op);
            }
 
-            adjust_precision_for_target(prec);
            return ApproxImpl::fast_log(mutate(op->args[0]), prec);
        } else if (op->is_intrinsic(Call::fast_tanh)) {
            ApproximationPrecision prec = extract_approximation_precision(op);
+            // Special treatment for tanh(): on CUDA, tanh() can be rewritten to exp(), but
+            // that behaves like MAE instead of MULPE. MULPE is the default behavior of the
+            // tanh.approx.f32 intrinsic, so resolve_precision() would set the objective to
+            // MULPE to be able to use that intrinsic, but the intrinsic requires CC7.5.
+            // So we first check whether we are on CC < 7.5 with the objective on AUTO and
+            // no precision requirements; if so, we keep AUTO and rewrite immediately.
+            if (op->type == Float(32) && is_cuda_cc20() && !is_cuda_cc75()) {
+                if (prec.optimized_for == ApproximationPrecision::AUTO &&
+                    prec.constraint_max_absolute_error == 0 &&
+                    prec.constraint_max_ulp_error == 0 &&
+                    prec.force_halide_polynomial == 0) {
+                    return mutate(ApproxImpl::fast_tanh(op->args[0], prec));
+                }
+            }
+            // Now we know we're not in that case; proceed as usual.
            IntrinsicsInfo ii = resolve_precision(prec, ii_tanh, for_device_api);
            // We have a fast version on PTX with CC7.5
            if (op->type == Float(32) && is_cuda_cc75() && intrinsic_satisfies_precision(ii, prec)) {
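
The four clauses in the inner condition above together encode "AUTO objective with no precision requirements". If one wanted to name that predicate, it could be factored into a small helper along these lines (hypothetical sketch, not part of this change):

```cpp
// Hypothetical helper, not in the PR: true when the caller expressed no
// precision preference at all, so lowering is free to pick any strategy.
bool is_fully_auto(const ApproximationPrecision &prec) {
    return prec.optimized_for == ApproximationPrecision::AUTO &&
           prec.constraint_max_absolute_error == 0 &&
           prec.constraint_max_ulp_error == 0 &&
           prec.force_halide_polynomial == 0;
}
```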