Skip to content

Commit f598106

Browse files
committed
Feedback from Steven.
1 parent 735c431 commit f598106

File tree

5 files changed

+41
-27
lines changed

5 files changed

+41
-27
lines changed

src/ApproximationTables.cpp

+29-7
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@ namespace {
77

88
using OO = ApproximationPrecision::OptimizationObjective;
99

10+
// clang-format off
1011
// Generate this table with:
1112
// python3 src/polynomial_optimizer.py atan --order 1 2 3 4 5 6 7 8 --loss mse mae mulpe mulpe_mae --no-gui --format table
12-
std::vector<Approximation> table_atan = {
13+
//
14+
// Note that the maximal errors are computed with numpy with double precision.
15+
// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
16+
// Also note that ULP distances which are not units are bogus, but this is because this error
17+
// was again measured with double precision, so the actual reconstruction had more bits of
18+
// precision than the actual float32 target value. So in practice the MaxULP Error
19+
// will be close to round(MaxUlpE).
20+
const std::vector<Approximation> table_atan = {
1321
{OO::MSE, 9.249650e-04, 7.078984e-02, 2.411e+06, {+8.56188008e-01}},
1422
{OO::MSE, 1.026356e-05, 9.214909e-03, 3.985e+05, {+9.76213454e-01, -2.00030200e-01}},
1523
{OO::MSE, 1.577588e-07, 1.323851e-03, 6.724e+04, {+9.95982073e-01, -2.92278128e-01, +8.30180680e-02}},
@@ -46,21 +54,28 @@ std::vector<Approximation> table_atan = {
4654
{OO::MULPE_MAE, 3.053218e-14, 3.784868e-07, 4.181e+01, {+9.99997480e-01, -3.33205127e-01, +1.98309644e-01, -1.33094430e-01, +8.08643094e-02, -3.45859503e-02, +7.11261604e-03}},
4755
{OO::MULPE_MAE, 7.018877e-16, 5.862915e-08, 6.942e+00, {+9.99999581e-01, -3.33306326e-01, +1.99542180e-01, -1.39433369e-01, +9.72462857e-02, -5.69734398e-02, +2.25639390e-02, -4.24074590e-03}},
4856
};
57+
// clang-format on
4958
} // namespace
5059

51-
const Approximation *find_best_approximation(const std::vector<Approximation> &table, ApproximationPrecision precision) {
60+
const Approximation *find_best_approximation(const std::vector<Approximation> &table,
61+
ApproximationPrecision precision) {
62+
#define DEBUG_APPROXIMATION_SEARCH 0
5263
const Approximation *best = nullptr;
5364
constexpr int term_cost = 20;
5465
constexpr int extra_term_cost = 200;
5566
double best_score = 0;
56-
// std::printf("Looking for min_terms=%d, max_absolute_error=%f\n", precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
67+
#if DEBUG_APPROXIMATION_SEARCH
68+
std::printf("Looking for min_terms=%d, max_absolute_error=%f\n",
69+
precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
70+
#endif
5771
for (size_t i = 0; i < table.size(); ++i) {
5872
const Approximation &e = table[i];
5973

6074
double penalty = 0.0;
6175

6276
int obj_score = e.objective == precision.optimized_for ? 100 * term_cost : 0;
63-
if (precision.optimized_for == ApproximationPrecision::MULPE_MAE && e.objective == ApproximationPrecision::MULPE) {
77+
if (precision.optimized_for == ApproximationPrecision::MULPE_MAE &&
78+
e.objective == ApproximationPrecision::MULPE) {
6479
obj_score = 50 * term_cost; // When MULPE_MAE is not available, prefer MULPE.
6580
}
6681

@@ -87,19 +102,26 @@ const Approximation *find_best_approximation(const std::vector<Approximation> &t
87102
break;
88103
}
89104

90-
if (precision.constraint_max_absolute_error > 0.0 && precision.constraint_max_absolute_error < e.mae) {
105+
if (precision.constraint_max_absolute_error > 0.0 &&
106+
precision.constraint_max_absolute_error < e.mae) {
91107
float error_ratio = e.mae / precision.constraint_max_absolute_error;
92108
penalty += 20 * error_ratio * extra_term_cost; // penalty for not getting the required precision.
93109
}
94110

95111
double score = obj_score + term_count_score + precision_score - penalty;
96-
// std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n", i, e.coefficients.size(), score, obj_score, term_count_score, precision_score, penalty);
112+
#if DEBUG_APPROXIMATION_SEARCH
113+
std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n",
114+
i, e.coefficients.size(), score, obj_score, term_count_score,
115+
precision_score, penalty);
116+
#endif
97117
if (score > best_score || best == nullptr) {
98118
best = &e;
99119
best_score = score;
100120
}
101121
}
102-
// std::printf("Best score: %f\n", best_score);
122+
#if DEBUG_APPROXIMATION_SEARCH
123+
std::printf("Best score: %f\n", best_score);
124+
#endif
103125
return best;
104126
}
105127

src/ApproximationTables.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#pragma once
1+
#ifndef HALIDE_APPROXIMATION_TABLES_H
2+
#define HALIDE_APPROXIMATION_TABLES_H
23

34
#include <vector>
45

@@ -19,3 +20,5 @@ const Approximation *best_atan_approximation(Halide::ApproximationPrecision prec
1920

2021
} // namespace Internal
2122
} // namespace Halide
23+
24+
#endif

src/IROperator.cpp

+1-11
Original file line numberDiff line numberDiff line change
@@ -1424,19 +1424,8 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
14241424
} else {
14251425
x = select(x_gt_1, 1.0f / x_full, x_full);
14261426
}
1427-
1428-
// Coefficients obtained using src/polynomial_optimizer.py
1429-
// Note that the maximal errors are computed with numpy with double precision.
1430-
// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
1431-
// Also note that ULP distances which are not units are bogus, but this is because this error
1432-
// was again measured with double precision, so the actual reconstruction had more bits of precision
1433-
// than the actual float32 target value. So in practice the MaxULP Error will be close to round(MaxUlpE).
1434-
1435-
// The table is huge, so let's put clang-format off and handle the layout manually:
1436-
// clang-format off
14371427
const Internal::Approximation *approx = Internal::best_atan_approximation(precision);
14381428
const std::vector<double> &c = approx->coefficients;
1439-
14401429
Expr x2 = x * x;
14411430
Expr result = float(c.back());
14421431
for (size_t i = 1; i < c.size(); ++i) {
@@ -1449,6 +1438,7 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
14491438
}
14501439
return common_subexpression_elimination(result);
14511440
}
1441+
14521442
Expr fast_atan(const Expr &x_full, ApproximationPrecision precision) {
14531443
return fast_atan_approximation(x_full, precision, false);
14541444
}

src/IROperator.h

+6-7
Original file line numberDiff line numberDiff line change
@@ -983,18 +983,17 @@ Expr fast_sin(const Expr &x);
983983
Expr fast_cos(const Expr &x);
984984
// @}
985985

986-
/**
987-
* Struct that allows the user to specify several requirements for functions
986+
/** Struct that allows the user to specify several requirements for functions
988987
* that are approximated by polynomial expansions. These polynomials can be
989988
* optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
990989
* Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
991990
*
992991
* Orthogonally to the optimization objective, these polynomials can vary
993992
* in degree. Higher degree polynomials will give more precise results.
994993
* Note that instead of specifying the degree, the number of terms is used instead.
995-
* E.g., even symmetric functions may be implemented using only even powers, for which
996-
* A number of terms of 4 would actually mean that terms in [1, x^2, x^4, x^6] are used,
997-
* which is degree 6.
994+
* E.g., even (i.e., symmetric) functions may be implemented using only even powers,
995+
* for which a number of terms of 4 would actually mean that terms
996+
* in [1, x^2, x^4, x^6] are used, which is degree 6.
998997
*
999998
* Additionally, if you don't care about number of terms in the polynomial
1000999
* and you do care about the maximal absolute error the approximation may have
@@ -1025,8 +1024,8 @@ struct ApproximationPrecision {
10251024
* For more info on the available approximations and their precisions, see the table in ApproximationTables.cpp.
10261025
*
10271026
* Note: the polynomial uses odd powers, so the number of terms is not the degree of the polynomial.
1028-
* Note: Poly8 is only useful to increase precision for atan, and not for atan2.
1029-
* Note: The performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
1027+
* Note: the polynomial with 8 terms is only useful to increase precision for fast_atan, and not for fast_atan2.
1028+
* Note: the performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
10301029
*/
10311030
// @{
10321031
Expr fast_atan(const Expr &x, ApproximationPrecision precision = {ApproximationPrecision::MULPE, 6});

test/performance/fast_arctan.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
2626

2727
Expr t0 = x / float(test_w);
2828
Expr t1 = y / float(test_h);
29-
// To make sure we time mostely the computation of the arctan, and not memory bandwidth,
29+
// To make sure we time mostly the computation of the arctan, and not memory bandwidth,
3030
// we will compute many arctans per output and sum them. In my testing, GPUs suffer more
3131
// from bandwith with this test, so we give it more arctangents to compute per output.
3232
const int test_d = target.has_gpu_feature() ? 1024 : 64;

0 commit comments

Comments
 (0)