diff --git a/src/ApproximationTables.cpp b/src/ApproximationTables.cpp new file mode 100644 index 000000000000..e376621b22d6 --- /dev/null +++ b/src/ApproximationTables.cpp @@ -0,0 +1,108 @@ +#include "ApproximationTables.h" + +namespace Halide { +namespace Internal { + +// clang-format off +// Generate this table with: +// python3 src/polynomial_optimizer.py atan --order 1 2 3 4 5 6 7 8 --loss mse mae mulpe mulpe_mae --no-gui --format table +static std::vector table_atan = { + {ApproximationPrecision::MSE, 9.249650e-04, 7.078984e-02, 2.411547e+06, {+8.56188008e-01}}, + {ApproximationPrecision::MSE, 1.026356e-05, 9.214909e-03, 3.985505e+05, {+9.76213454e-01, -2.00030200e-01}}, + {ApproximationPrecision::MSE, 1.577588e-07, 1.323851e-03, 6.724566e+04, {+9.95982073e-01, -2.92278128e-01, +8.30180680e-02}}, + {ApproximationPrecision::MSE, 2.849011e-09, 1.992218e-04, 1.142204e+04, {+9.99316541e-01, -3.22286501e-01, +1.49032461e-01, -4.08635592e-02}}, + {ApproximationPrecision::MSE, 5.667504e-11, 3.080100e-05, 1.945614e+03, {+9.99883373e-01, -3.30599535e-01, +1.81451316e-01, -8.71733830e-02, +2.18671936e-02}}, + {ApproximationPrecision::MSE, 1.202662e-12, 4.846916e-06, 3.318677e+02, {+9.99980065e-01, -3.32694393e-01, +1.94019697e-01, -1.17694732e-01, +5.40822080e-02, -1.22995279e-02}}, + {ApproximationPrecision::MSE, 2.672889e-14, 7.722732e-07, 5.664632e+01, {+9.99996589e-01, -3.33190090e-01, +1.98232868e-01, -1.32941469e-01, +8.07623712e-02, -3.46124853e-02, +7.15115276e-03}}, + {ApproximationPrecision::MSE, 6.147315e-16, 1.245768e-07, 9.764224e+00, {+9.99999416e-01, -3.33302229e-01, +1.99511173e-01, -1.39332647e-01, +9.70944891e-02, -5.68823386e-02, +2.25679012e-02, -4.25772648e-03}}, + + {ApproximationPrecision::MAE, 1.097847e-03, 4.801638e-02, 2.793645e+06, {+8.33414544e-01}}, + {ApproximationPrecision::MAE, 1.209593e-05, 4.968992e-03, 4.623251e+05, {+9.72410454e-01, -1.91981283e-01}}, + {ApproximationPrecision::MAE, 1.839382e-07, 6.107084e-04, 7.766697e+04, 
{+9.95360080e-01, -2.88702052e-01, +7.93508437e-02}}, + {ApproximationPrecision::MAE, 3.296902e-09, 8.164167e-05, 1.313615e+04, {+9.99214108e-01, -3.21178073e-01, +1.46272006e-01, -3.89915187e-02}}, + {ApproximationPrecision::MAE, 6.523525e-11, 1.147459e-05, 2.229646e+03, {+9.99866373e-01, -3.30305517e-01, +1.80162434e-01, -8.51611537e-02, +2.08475020e-02}}, + {ApproximationPrecision::MAE, 1.378842e-12, 1.667328e-06, 3.792091e+02, {+9.99977226e-01, -3.32622991e-01, +1.93541452e-01, -1.16429278e-01, +5.26504600e-02, -1.17203722e-02}}, + {ApproximationPrecision::MAE, 3.055131e-14, 2.480947e-07, 6.457187e+01, {+9.99996113e-01, -3.33173716e-01, +1.98078484e-01, -1.32334692e-01, +7.96260166e-02, -3.36062649e-02, +6.81247117e-03}}, + {ApproximationPrecision::MAE, 7.013215e-16, 3.757868e-08, 1.102324e+01, {+9.99999336e-01, -3.33298615e-01, +1.99465749e-01, -1.39086791e-01, +9.64233077e-02, -5.59142254e-02, +2.18643190e-02, -4.05495427e-03}}, + + {ApproximationPrecision::MULPE, 1.355602e-03, 1.067325e-01, 1.808493e+06, {+8.92130617e-01}}, + {ApproximationPrecision::MULPE, 2.100588e-05, 1.075508e-02, 1.822095e+05, {+9.89111122e-01, -2.14468039e-01}}, + {ApproximationPrecision::MULPE, 3.573985e-07, 1.316370e-03, 2.227347e+04, {+9.98665077e-01, -3.02990987e-01, +9.10404434e-02}}, + {ApproximationPrecision::MULPE, 6.474958e-09, 1.548508e-04, 2.619892e+03, {+9.99842198e-01, -3.26272641e-01, +1.56294460e-01, -4.46207045e-02}}, + {ApproximationPrecision::MULPE, 1.313474e-10, 2.533532e-05, 4.294794e+02, {+9.99974110e-01, -3.31823782e-01, +1.85886095e-01, -9.30024008e-02, +2.43894760e-02}}, + {ApproximationPrecision::MULPE, 3.007880e-12, 3.530685e-06, 5.983830e+01, {+9.99996388e-01, -3.33036463e-01, +1.95959706e-01, -1.22068745e-01, +5.83403647e-02, -1.37966171e-02}}, + {ApproximationPrecision::MULPE, 6.348880e-14, 4.882649e-07, 8.276351e+00, {+9.99999499e-01, -3.33273408e-01, +1.98895454e-01, -1.35153794e-01, +8.43185278e-02, -3.73434598e-02, +7.95583230e-03}}, + 
{ApproximationPrecision::MULPE, 1.369569e-15, 7.585036e-08, 1.284979e+00, {+9.99999922e-01, -3.33320840e-01, +1.99708563e-01, -1.40257063e-01, +9.93094012e-02, -5.97138046e-02, +2.44056181e-02, -4.73371006e-03}}, + + + {ApproximationPrecision::MULPE_MAE, 9.548909e-04, 6.131488e-02, 2.570520e+06, {+8.46713042e-01}}, + {ApproximationPrecision::MULPE_MAE, 1.159917e-05, 6.746680e-03, 3.778023e+05, {+9.77449762e-01, -1.98798279e-01}}, + {ApproximationPrecision::MULPE_MAE, 1.783646e-07, 8.575388e-04, 6.042236e+04, {+9.96388826e-01, -2.92591679e-01, +8.24585555e-02}}, + {ApproximationPrecision::MULPE_MAE, 3.265269e-09, 1.190548e-04, 9.505190e+03, {+9.99430906e-01, -3.22774535e-01, +1.49370817e-01, -4.07480795e-02}}, + {ApproximationPrecision::MULPE_MAE, 6.574962e-11, 1.684690e-05, 1.515116e+03, {+9.99909079e-01, -3.30795737e-01, +1.81810037e-01, -8.72860225e-02, +2.17776539e-02}}, + {ApproximationPrecision::MULPE_MAE, 1.380489e-12, 2.497538e-06, 2.510721e+02, {+9.99984893e-01, -3.32748885e-01, +1.94193211e-01, -1.17865932e-01, +5.40633775e-02, -1.22309990e-02}}, + {ApproximationPrecision::MULPE_MAE, 3.053218e-14, 3.784868e-07, 4.181995e+01, {+9.99997480e-01, -3.33205127e-01, +1.98309644e-01, -1.33094430e-01, +8.08643094e-02, -3.45859503e-02, +7.11261604e-03}}, + {ApproximationPrecision::MULPE_MAE, 7.018877e-16, 5.862915e-08, 6.942196e+00, {+9.99999581e-01, -3.33306326e-01, +1.99542180e-01, -1.39433369e-01, +9.72462857e-02, -5.69734398e-02, +2.25639390e-02, -4.24074590e-03}}, +}; +// clang-format on + +const Approximation *find_best_approximation(const std::vector &table, ApproximationPrecision precision) { + const Approximation *best = nullptr; + constexpr int term_cost = 20; + constexpr int extra_term_cost = 200; + double best_score = 0; + //std::printf("Looking for min_terms=%d, max_absolute_error=%f\n", precision.constraint_min_poly_terms, precision.constraint_max_absolute_error); + for (size_t i = 0; i < table.size(); ++i) { + const Approximation &e = table[i]; + + 
double penalty = 0.0; + + int obj_score = e.objective == precision.optimized_for ? 100 * term_cost : 0; + if (precision.optimized_for == ApproximationPrecision::MULPE_MAE && e.objective == ApproximationPrecision::MULPE) { + obj_score = 50 * term_cost; // When MULPE_MAE is not available, prefer MULPE. + } + + int num_terms = int(e.coefficients.size()); + int term_count_score = (12 - num_terms) * term_cost; + if (num_terms < precision.constraint_min_poly_terms) { + penalty += (precision.constraint_min_poly_terms - num_terms) * extra_term_cost; + } + + double precision_score = 0; + // If we don't care about the maximum number of terms, we maximize precision. + switch (precision.optimized_for) { + case ApproximationPrecision::MSE: + precision_score = -std::log(e.mse); + break; + case ApproximationPrecision::MAE: + precision_score = -std::log(e.mae); + break; + case ApproximationPrecision::MULPE: + precision_score = -std::log(e.mulpe); + break; + case ApproximationPrecision::MULPE_MAE: + precision_score = -0.5 * std::log(e.mulpe * e.mae); + break; + } + + if (precision.constraint_max_absolute_error > 0.0 && precision.constraint_max_absolute_error < e.mae) { + penalty += 20 * extra_term_cost; // penalty for not getting the required precision. 
+ } + + double score = obj_score + term_count_score + precision_score - penalty; + //std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n", i, e.coefficients.size(), score, obj_score, term_count_score, precision_score, penalty); + if (score > best_score) { + best = &e; + best_score = score; + } + } + //std::printf("Best score: %f\n", best_score); + return best; +} + +const Approximation *best_atan_approximation(Halide::ApproximationPrecision precision) { + return find_best_approximation(table_atan, precision); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/ApproximationTables.h b/src/ApproximationTables.h new file mode 100644 index 000000000000..ddf38ca9bf41 --- /dev/null +++ b/src/ApproximationTables.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "IROperator.h" + +namespace Halide { +namespace Internal { + +struct Approximation { + ApproximationPrecision::OptimizationObjective objective; + double mse; + double mae; + double mulpe; + std::vector coefficients; +}; + +const Approximation *best_atan_approximation(Halide::ApproximationPrecision precision); + +} // namespace Internal +} // namespace Halide diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f5eb3e64f97f..c7a71ffbfc61 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -219,8 +219,7 @@ target_sources( WrapCalls.h ) -# The sources that go into libHalide. For the sake of IDE support, headers that -# exist in src/ but are not public should be included here. +# The sources that go into libHalide. 
target_sources( Halide PRIVATE @@ -232,6 +231,7 @@ target_sources( AlignLoads.cpp AllocationBoundsInference.cpp ApplySplit.cpp + ApproximationTables.cpp Argument.cpp AssociativeOpsTable.cpp Associativity.cpp diff --git a/src/IROperator.cpp b/src/IROperator.cpp index db9b3bbb0d8f..ce80bf9dfa87 100644 --- a/src/IROperator.cpp +++ b/src/IROperator.cpp @@ -15,6 +15,7 @@ #include "IRPrinter.h" #include "Interval.h" #include "Util.h" +#include "ApproximationTables.h" #include "Var.h" using namespace Halide::Internal; @@ -1388,7 +1389,7 @@ Expr fast_sin_cos(const Expr &x_full, bool is_sin) { Expr sin_usecos = is_sin ? ((k_mod4 == 1) || (k_mod4 == 3)) : ((k_mod4 == 0) || (k_mod4 == 2)); Expr flip_sign = is_sin ? (k_mod4 > 1) : ((k_mod4 == 1) || (k_mod4 == 2)); - // Reduce the angle modulo pi/2. + // Reduce the angle modulo pi/2: i.e., to the angle within the quadrant. Expr x = x_full - k_real * pi_over_two; const float sin_c2 = -0.16666667163372039794921875f; @@ -1447,106 +1448,13 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio // The table is huge, so let's put clang-format off and handle the layout manually: // clang-format off - std::vector c; - switch (precision) { - // == MSE Optimized == // - case ApproximationPrecision::MSE_Poly2: // (MSE=1.0264e-05, MAE=9.2149e-03, MaxUlpE=3.9855e+05) - c = {+9.762134539879e-01f, -2.000301999499e-01f}; - break; - case ApproximationPrecision::MSE_Poly3: // (MSE=1.5776e-07, MAE=1.3239e-03, MaxUlpE=6.7246e+04) - c = {+9.959820734941e-01f, -2.922781275652e-01f, +8.301806798764e-02f}; - break; - case ApproximationPrecision::MSE_Poly4: // (MSE=2.8490e-09, MAE=1.9922e-04, MaxUlpE=1.1422e+04) - c = {+9.993165406918e-01f, -3.222865011143e-01f, +1.490324612527e-01f, -4.086355921512e-02f}; - break; - case ApproximationPrecision::MSE_Poly5: // (MSE=5.6675e-11, MAE=3.0801e-05, MaxUlpE=1.9456e+03) - c = {+9.998833730470e-01f, -3.305995351168e-01f, +1.814513158372e-01f, -8.717338298570e-02f, - 
+2.186719361787e-02f}; - break; - case ApproximationPrecision::MSE_Poly6: // (MSE=1.2027e-12, MAE=4.8469e-06, MaxUlpE=3.3187e+02) - c = {+9.999800646964e-01f, -3.326943930673e-01f, +1.940196968486e-01f, -1.176947321238e-01f, - +5.408220801540e-02f, -1.229952788751e-02f}; - break; - case ApproximationPrecision::MSE_Poly7: // (MSE=2.6729e-14, MAE=7.7227e-07, MaxUlpE=5.6646e+01) - c = {+9.999965889517e-01f, -3.331900904961e-01f, +1.982328680483e-01f, -1.329414694644e-01f, - +8.076237117606e-02f, -3.461248530394e-02f, +7.151152759080e-03f}; - break; - case ApproximationPrecision::MSE_Poly8: // (MSE=6.1506e-16, MAE=1.2419e-07, MaxUlpE=9.6914e+00) - c = {+9.999994159669e-01f, -3.333022219271e-01f, +1.995110884308e-01f, -1.393321817395e-01f, - +9.709319573480e-02f, -5.688043380309e-02f, +2.256648487698e-02f, -4.257308331872e-03f}; - break; - - // == MAE Optimized == // - case ApproximationPrecision::MAE_1e_2: - case ApproximationPrecision::MAE_Poly2: // (MSE=1.2096e-05, MAE=4.9690e-03, MaxUlpE=4.6233e+05) - c = {+9.724104536788e-01f, -1.919812827495e-01f}; - break; - case ApproximationPrecision::MAE_1e_3: - case ApproximationPrecision::MAE_Poly3: // (MSE=1.8394e-07, MAE=6.1071e-04, MaxUlpE=7.7667e+04) - c = {+9.953600796593e-01f, -2.887020515559e-01f, +7.935084373856e-02f}; - break; - case ApproximationPrecision::MAE_1e_4: - case ApproximationPrecision::MAE_Poly4: // (MSE=3.2969e-09, MAE=8.1642e-05, MaxUlpE=1.3136e+04) - c = {+9.992141075707e-01f, -3.211780734117e-01f, +1.462720063085e-01f, -3.899151874271e-02f}; - break; - case ApproximationPrecision::MAE_Poly5: // (MSE=6.5235e-11, MAE=1.1475e-05, MaxUlpE=2.2296e+03) - c = {+9.998663727249e-01f, -3.303055171903e-01f, +1.801624340886e-01f, -8.516115366058e-02f, - +2.084750202717e-02f}; - break; - case ApproximationPrecision::MAE_1e_5: - case ApproximationPrecision::MAE_Poly6: // (MSE=1.3788e-12, MAE=1.6673e-06, MaxUlpE=3.7921e+02) - c = {+9.999772256973e-01f, -3.326229914097e-01f, +1.935414518077e-01f, 
-1.164292778405e-01f, - +5.265046001895e-02f, -1.172037220425e-02f}; - break; - case ApproximationPrecision::MAE_1e_6: - case ApproximationPrecision::MAE_Poly7: // (MSE=3.0551e-14, MAE=2.4809e-07, MaxUlpE=6.4572e+01) - c = {+9.999961125922e-01f, -3.331737159104e-01f, +1.980784841430e-01f, -1.323346922675e-01f, - +7.962601662878e-02f, -3.360626486524e-02f, +6.812471171209e-03f}; - break; - case ApproximationPrecision::MAE_Poly8: // (MSE=7.0132e-16, MAE=3.7579e-08, MaxUlpE=1.1023e+01) - c = {+9.999993357462e-01f, -3.332986153129e-01f, +1.994657492754e-01f, -1.390867909988e-01f, - +9.642330770840e-02f, -5.591422536378e-02f, +2.186431903729e-02f, -4.054954273090e-03f}; - break; - - - // == Max ULP Optimized == // - case ApproximationPrecision::MULPE_Poly2: // (MSE=2.1006e-05, MAE=1.0755e-02, MaxUlpE=1.8221e+05) - c = {+9.891111216318e-01f, -2.144680385336e-01f}; - break; - case ApproximationPrecision::MULPE_1e_2: - case ApproximationPrecision::MULPE_Poly3: // (MSE=3.5740e-07, MAE=1.3164e-03, MaxUlpE=2.2273e+04) - c = {+9.986650768126e-01f, -3.029909865833e-01f, +9.104044335898e-02f}; - break; - case ApproximationPrecision::MULPE_1e_3: - case ApproximationPrecision::MULPE_Poly4: // (MSE=6.4750e-09, MAE=1.5485e-04, MaxUlpE=2.6199e+03) - c = {+9.998421981586e-01f, -3.262726405770e-01f, +1.562944595469e-01f, -4.462070448745e-02f}; - break; - case ApproximationPrecision::MULPE_1e_4: - case ApproximationPrecision::MULPE_Poly5: // (MSE=1.3135e-10, MAE=2.5335e-05, MaxUlpE=4.2948e+02) - c = {+9.999741103798e-01f, -3.318237821017e-01f, +1.858860952571e-01f, -9.300240079057e-02f, - +2.438947597681e-02f}; - break; - case ApproximationPrecision::MULPE_1e_5: - case ApproximationPrecision::MULPE_Poly6: // (MSE=3.0079e-12, MAE=3.5307e-06, MaxUlpE=5.9838e+01) - c = {+9.999963876702e-01f, -3.330364633925e-01f, +1.959597060284e-01f, -1.220687452250e-01f, - +5.834036471395e-02f, -1.379661708254e-02f}; - break; - case ApproximationPrecision::MULPE_1e_6: - case 
ApproximationPrecision::MULPE_Poly7: // (MSE=6.3489e-14, MAE=4.8826e-07, MaxUlpE=8.2764e+00) - c = {+9.999994992400e-01f, -3.332734078379e-01f, +1.988954540598e-01f, -1.351537940907e-01f, - +8.431852775558e-02f, -3.734345976535e-02f, +7.955832300869e-03f}; - break; - case ApproximationPrecision::MULPE_Poly8: // (MSE=1.3696e-15, MAE=7.5850e-08, MaxUlpE=1.2850e+00) - c = {+9.999999220612e-01f, -3.333208398432e-01f, +1.997085632112e-01f, -1.402570625577e-01f, - +9.930940122930e-02f, -5.971380457112e-02f, +2.440561807586e-02f, -4.733710058459e-03f}; - break; - } - // clang-format on + const Internal::Approximation *approx = Internal::best_atan_approximation(precision); + const std::vector &c = approx->coefficients; Expr x2 = x * x; - Expr result = c.back(); + Expr result = float(c.back()); for (size_t i = 1; i < c.size(); ++i) { - result = x2 * result + c[c.size() - i - 1]; + result = x2 * result + float(c[c.size() - i - 1]); } result *= x; diff --git a/src/IROperator.h b/src/IROperator.h index 179061c06c0d..dbdb5adf75d2 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -973,75 +973,32 @@ Expr fast_cos(const Expr &x); // @} /** - * Enum that declares several options for functions that are approximated - * by polynomial expansions. These polynomials can be optimized for three - * different metrics: Mean Squared Error, Maximum Absolute Error, or - * Maximum Units in Last Place (ULP) Error. + * Struct that allows the user to specify several requirements for functions + * that are approximated by polynomial expansions. These polynomials can be + * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error, + * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE. * * Orthogonally to the optimization objective, these polynomials can vary * in degree. Higher degree polynomials will give more precise results. 
- * Note that the `X` in the `PolyX` enum values refer to the number of terms - * in the polynomial, and not the degree of the polynomial. E.g., even - * symmetric functions may be implemented using only even powers, for which - * `Poly3` would actually mean that terms in [1, x^2, x^4] are used. + * Note that instead of specifying the degree, the number of terms is used instead. + * E.g., even symmetric functions may be implemented using only even powers, for which + * A number of terms of 4 would actually mean that terms in [1, x^2, x^4, x^6] are used, + * which is degree 6. * * Additionally, if you don't care about number of terms in the polynomial * and you do care about the maximal absolute error the approximation may have - * over the domain, you may use the `MAE_1e_x` values and the implementation + * over the domain, you may specify values and the implementation * will decide the appropriate polynomial degree that achieves this precision. */ -enum class ApproximationPrecision { - /** Mean Squared Error Optimized. */ - // @{ - MSE_Poly2, - MSE_Poly3, - MSE_Poly4, - MSE_Poly5, - MSE_Poly6, - MSE_Poly7, - MSE_Poly8, - // @} - - /** Number of terms in polynomial -- Optimized for Max Absolute Error. */ - // @{ - MAE_Poly2, - MAE_Poly3, - MAE_Poly4, - MAE_Poly5, - MAE_Poly6, - MAE_Poly7, - MAE_Poly8, - // @} - - /** Number of terms in polynomial -- Optimized for Max ULP Error. - * ULP is "Units in Last Place", measured in IEEE 32-bit floats. */ - // @{ - MULPE_Poly2, - MULPE_Poly3, - MULPE_Poly4, - MULPE_Poly5, - MULPE_Poly6, - MULPE_Poly7, - MULPE_Poly8, - // @} - - /* Maximum Absolute Error Optimized with given Maximal Absolute Error. */ - // @{ - MAE_1e_2, - MAE_1e_3, - MAE_1e_4, - MAE_1e_5, - MAE_1e_6, - // @} - - /* Maximum ULP Error Optimized with given Maximal Absolute Error. 
*/ - // @{ - MULPE_1e_2, - MULPE_1e_3, - MULPE_1e_4, - MULPE_1e_5, - MULPE_1e_6, - // @} +struct ApproximationPrecision { + enum OptimizationObjective { + MSE, //< Mean Squared Error Optimized. + MAE, //< Optimized for Max Absolute Error. + MULPE, //< Optimized for Max ULP Error. ULP is "Units in Last Place", measured in IEEE 32-bit floats. + MULPE_MAE, //< Optimized for simultaneously Max ULP Error, and Max Absolute Error, each with a weight of 50%. + } optimized_for; + int constraint_min_poly_terms{0}; //< Number of terms in polynomial (zero for no constraint). + float constraint_max_absolute_error{0.0f}; //< Max absolute error (zero for no constraint). }; /** Fast vectorizable approximations for arctan and arctan2 for Float(32). @@ -1053,16 +1010,16 @@ enum class ApproximationPrecision { * - MAE (Maximum Absolute Error) * - MULPE (Maximum Units in Last Place Error). * - * The default (Max ULP Error Polynomial 6) has a MAE of 3.53e-6. - * For more info on the precision, see the table in IROperator.cpp. + * The default (Max ULP Error Polynomial of 6 terms) has a MAE of 3.53e-6. + * For more info on the available approximations and their precisions, see the table in ApproximationTables.cpp. * * Note: the polynomial uses odd powers, so the number of terms is not the degree of the polynomial. * Note: Poly8 is only useful to increase precision for atan, and not for atan2. * Note: The performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024). 
*/ // @{ -Expr fast_atan(const Expr &x, ApproximationPrecision precision = ApproximationPrecision::MULPE_Poly6); -Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision = ApproximationPrecision::MULPE_Poly6); +Expr fast_atan(const Expr &x, ApproximationPrecision precision = {ApproximationPrecision::MULPE, 6}); +Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision = {ApproximationPrecision::MULPE, 6}); // @} /** Fast approximate cleanly vectorizable log for Float(32). Returns diff --git a/src/polynomial_optimizer.py b/src/polynomial_optimizer.py index 78d1b9655445..41c4655416ba 100644 --- a/src/polynomial_optimizer.py +++ b/src/polynomial_optimizer.py @@ -38,8 +38,8 @@ def _split_lines(self, text, width): parser = argparse.ArgumentParser(formatter_class=SmartFormatter) parser.add_argument("func") -parser.add_argument("order", type=int) -parser.add_argument("loss", +parser.add_argument("--order", type=int, nargs='+', required=True) +parser.add_argument("--loss", nargs='+', required=True, choices=["mse", "mae", "mulpe", "mulpe_mae"], default="mulpe", help="R|What to optimize for.\n" @@ -50,231 +50,241 @@ def _split_lines(self, text, width): parser.add_argument("--no-gui", action='store_true', help="Do not produce plots.k") parser.add_argument("--print", action='store_true', help="Print while optimizing.") parser.add_argument("--pbar", action='store_true', help="Create a progress bar while optimizing.") -parser.add_argument("--format", default="all", choices=["all", "switch", "array", "consts"], +parser.add_argument("--format", default="all", choices=["all", "switch", "array", "table", "consts"], help="Output format for copy-pastable coefficients. 
(default: all)") args = parser.parse_args() -order = args.order -if args.func == "atan": - if hasattr(np, "atan"): - func = np.atan - elif hasattr(np, "arctan"): - func = np.arctan +loss_power = 500 + +def optimize_approximation(loss, order): + if args.func == "atan": + if hasattr(np, "atan"): + func = np.atan + elif hasattr(np, "arctan"): + func = np.arctan + else: + print("Your numpy version doesn't support arctan.") + exit(1) + exponents = 1 + np.arange(order) * 2 + lower, upper = 0.0, 1.0 + elif args.func == "sin": + func = np.sin + exponents = 1 + np.arange(order) * 2 + lower, upper = 0.0, np.pi / 2 + elif args.func == "cos": + func = np.cos + exponents = np.arange(order) * 2 + lower, upper = 0.0, np.pi / 2 + elif args.func == "exp": + func = lambda x: np.exp(x) + exponents = np.arange(order) + lower, upper = 0, np.log(2) + elif args.func == "log": + func = lambda x: np.log(x + 1.0) + exponents = np.arange(order) + lower, upper = 0, np.log(2) else: - print("Your numpy version doesn't support arctan.") + print("Unknown function:", args.func) exit(1) - exponents = 1 + np.arange(order) * 2 - lower, upper = 0.0, 1.0 -elif args.func == "sin": - func = np.sin - exponents = 1 + np.arange(order) * 2 - lower, upper = 0.0, np.pi / 2 -elif args.func == "cos": - func = np.cos - exponents = np.arange(order) * 2 - lower, upper = 0.0, np.pi / 2 -elif args.func == "exp": - func = lambda x: np.exp(x) - exponents = np.arange(order) - lower, upper = 0, np.log(2) -elif args.func == "log": - func = lambda x: np.log(x + 1.0) - exponents = np.arange(order) - lower, upper = 0, np.log(2) -else: - print("Unknown function:", args.func) - exit(1) - -X = np.linspace(lower, upper, 2048 * 8) -target = func(X) - -target_spacing = np.spacing(np.abs(target).astype(np.float32)).astype(np.float64) # Precision (i.e., ULP) -# We will optimize everything using double precision, which means we will obtain more bits of -# precision than the actual target values in float32, which means that our 
reconstruction and -# ideal target value can be a non-integer number of float32-ULPs apart. - -print("exponent:", exponents) -coeffs = np.zeros(len(exponents)) -powers = np.power(X[:,None], exponents) + X = np.linspace(lower, upper, 2048 * 8) + target = func(X) + + target_spacing = np.spacing(np.abs(target).astype(np.float32)).astype(np.float64) # Precision (i.e., ULP) + # We will optimize everything using double precision, which means we will obtain more bits of + # precision than the actual target values in float32, which means that our reconstruction and + # ideal target value can be a non-integer number of float32-ULPs apart. + + if args.print: print("exponent:", exponents) + coeffs = np.zeros(len(exponents)) + powers = np.power(X[:,None], exponents) + + + + + # If the loss is MSE, then this is just a linear system we can solve for. + # We will iteratively adjust the weights to put more focus on the parts where it goes wrong. + weight = np.ones_like(target) + + lstsq_iterations = loss_power * 10 + if loss == "mse": + lstsq_iterations = 1 + + loss_history = np.zeros((lstsq_iterations, 3)) + + iterator = range(lstsq_iterations) + if args.pbar: + import tqdm + iterator = tqdm.trange(lstsq_iterations) + + try: + for i in iterator: + norm_weight = weight / np.mean(weight) + coeffs, residuals, rank, s = np.linalg.lstsq(powers * norm_weight[:,None], target * norm_weight, rcond=None) + + y_hat = np.sum((powers * coeffs)[:,::-1], axis=-1) + diff = y_hat - target + abs_diff = np.abs(diff) + + # MSE metric + mean_squared_error = np.mean(np.square(diff)) + # MAE metric + max_abs_error = np.amax(abs_diff) + loss_history[i, 1] = max_abs_error + # MaxULP metric + ulp_error = diff / target_spacing + abs_ulp_error = np.abs(ulp_error) + max_ulp_error = np.amax(abs_ulp_error) + loss_history[i, 2] = max_ulp_error + + if args.print and i % 10 == 0: + print(f"[{((i+1) / lstsq_iterations * 100.0):3.0f}%] coefficients:", coeffs, + f" MaxAE: {max_abs_error:20.17f} MaxULPs: 
{max_ulp_error:20.0f} mean weight: {weight.mean():.4e}") + + if loss == "mae": + norm_error_metric = abs_diff / np.amax(abs_diff) + elif loss == "mulpe": + norm_error_metric = abs_ulp_error / max_ulp_error + elif loss == "mulpe_mae": + norm_error_metric = 0.5 * (abs_ulp_error / max_ulp_error + abs_diff / max_abs_error) + elif loss == "mse": + norm_error_metric = np.square(abs_diff) + + p = i / lstsq_iterations + p = min(p * 1.25, 1.0) + raised_error = np.power(norm_error_metric, 2 + loss_power * p) + weight += raised_error + + mean_loss = np.mean(np.power(abs_diff, loss_power)) + loss_history[i, 0] = mean_loss + + if i == 0: + init_coeffs = coeffs.copy() + init_ulp_error = ulp_error.copy() + init_abs_ulp_error = abs_ulp_error.copy() + init_abs_error = abs_diff.copy() + init_y_hat = y_hat.copy() + + except KeyboardInterrupt: + print("Interrupted") + + if not args.no_gui: + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(2, 4, figsize=(12, 6)) + ax = ax.flatten() + ax[0].set_title("Comparison of exact\nand approximate " + args.func) + ax[0].plot(X, target, label=args.func) + ax[0].plot(X, y_hat, label='approx') + ax[0].grid() + ax[0].set_xlim(lower, upper) + ax[0].legend() + + ax[1].set_title("Error") + ax[1].axhline(0, linestyle='-', c='k', linewidth=1) + ax[1].plot(X, init_y_hat - target, label='init') + ax[1].plot(X, y_hat - target, label='final') + ax[1].grid() + ax[1].set_xlim(lower, upper) + ax[1].legend() + + ax[2].set_title("Absolute error\n(log-scale)") + ax[2].semilogy(X, init_abs_error, label='init') + ax[2].semilogy(X, abs_diff, label='final') + ax[2].axhline(np.amax(init_abs_error), linestyle=':', c='C0') + ax[2].axhline(np.amax(abs_diff), linestyle=':', c='C1') + ax[2].grid() + ax[2].set_xlim(lower, upper) + ax[2].legend() + + ax[3].set_title("Maximal Absolute Error\nprogression during\noptimization") + ax[3].semilogx(1 + np.arange(loss_history.shape[0]), loss_history[:,1]) + ax[3].set_xlim(1, loss_history.shape[0] + 1) + 
ax[3].axhline(y=loss_history[0,1], linestyle=':', color='k') + ax[3].grid() + + ax[5].set_title("ULP distance") + ax[5].axhline(0, linestyle='-', c='k', linewidth=1) + ax[5].plot(X, init_ulp_error, label='init') + ax[5].plot(X, ulp_error, label='final') + ax[5].grid() + ax[5].set_xlim(lower, upper) + ax[5].legend() + + + ax[6].set_title("Absolute ULP distance\n(log-scale)") + ax[6].semilogy(X, init_abs_ulp_error, label='init') + ax[6].semilogy(X, abs_ulp_error, label='final') + ax[6].axhline(np.amax(init_abs_ulp_error), linestyle=':', c='C0') + ax[6].axhline(np.amax(abs_ulp_error), linestyle=':', c='C1') + ax[6].grid() + ax[6].set_xlim(lower, upper) + ax[6].legend() + + ax[7].set_title("Maximal ULP Error\nprogression during\noptimization") + ax[7].loglog(1 + np.arange(loss_history.shape[0]), loss_history[:,2]) + ax[7].set_xlim(1, loss_history.shape[0] + 1) + ax[7].axhline(y=loss_history[0,2], linestyle=':', color='k') + ax[7].grid() + + ax[4].set_title("LstSq Weight\n(log-scale)") + ax[4].semilogy(X, norm_weight, label='weight') + ax[4].grid() + ax[4].set_xlim(lower, upper) + ax[4].legend() + + plt.tight_layout() + plt.show() + + return init_coeffs, coeffs, mean_squared_error, max_abs_error, max_ulp_error, loss_history + + +for loss in args.loss: + for order in args.order: + if args.print: print("Optimizing {loss} with {order} terms...") + init_coeffs, coeffs, mean_squared_error, max_abs_error, max_ulp_error, loss_history = optimize_approximation(loss, order) + + + if args.print: + print("Init coeffs:", init_coeffs) + print("Final coeffs:", coeffs) + print(f"mse: {mean_loss:40.27f} max abs error: {max_abs_error:20.17f} max ulp error: {max_ulp_error:e}") + + def print_comment(indent=""): + print(indent + "// " + + {"mae": "Max Absolute Error", + "mse": "Mean Squared Error", + "mulpe": "Max ULP Error", + "mulpe_mae": "MaxUlpAE" + }[loss] + + f" optimized (MSE={mean_squared_error:.4e}, MAE={max_abs_error:.4e}, MaxUlpE={max_ulp_error:.4e})") + + + if args.format in 
["all", "consts"]: + print_comment() + for i, (e, c) in enumerate(zip(exponents, coeffs)): + print(f"const float c_{e}({c:+.12e}f);") + print() + + + if args.format in ["all", "array"]: + print_comment() + print("const float coef[] = {"); + for i, (e, c) in enumerate(reversed(list(zip(exponents, coeffs)))): + print(f" {c:+.12e}, // * x^{e}") + print("};\n") + + if args.format in ["all", "switch"]: + print("case ApproximationPrecision::" + loss.upper() + "_Poly" + str(order) + ":" + + f" // (MSE={mean_squared_error:.4e}, MAE={max_abs_error:.4e}, MaxUlpE={max_ulp_error:.4e})") + print(" c = {" + (", ".join([f"{c:+.12e}f" for c in coeffs])) + "}; break;") + print() + + if args.format in ["all", "table"]: + print("{ApproximationPrecision::" + loss.upper() + f", {mean_squared_error:.6e}, {max_abs_error:.6e}, {max_ulp_error:.6e}, " + + "{" + ", ".join([f"{c:+.8e}" for c in coeffs]) + "}},") + print() + + + if args.print: print("exponent:", exponents) -loss_power = 500 - -lstsq_iterations = loss_power * 10 - -# If the loss is MSE, then this is just a linear system we can solve for. -# We will iteratively adjust the weights to put more focus on the parts where it goes wrong. 
-weight = np.ones_like(target) - -if args.loss == "mse": - lstsq_iterations = 1 - -loss_history = np.zeros((lstsq_iterations, 3)) - -iterator = range(lstsq_iterations) -if args.pbar: - import tqdm - iterator = tqdm.trange(lstsq_iterations) - -try: - for i in iterator: - norm_weight = weight / np.mean(weight) - coeffs, residuals, rank, s = np.linalg.lstsq(powers * norm_weight[:,None], target * norm_weight, rcond=None) - - y_hat = np.sum((powers * coeffs)[:,::-1], axis=-1) - diff = y_hat - target - abs_diff = np.abs(diff) - - # MSE metric - mean_squared_error = np.mean(np.square(diff)) - # MAE metric - max_abs_error = np.amax(abs_diff) - loss_history[i, 1] = max_abs_error - # MaxULP metric - ulp_error = diff / target_spacing - abs_ulp_error = np.abs(ulp_error) - max_ulp_error = np.amax(abs_ulp_error) - loss_history[i, 2] = max_ulp_error - - if args.print and i % 10 == 0: - print(f"[{((i+1) / lstsq_iterations * 100.0):3.0f}%] coefficients:", coeffs, - f" MaxAE: {max_abs_error:20.17f} MaxULPs: {max_ulp_error:20.0f} mean weight: {weight.mean():.4e}") - - if args.loss == "mae": - norm_error_metric = abs_diff / np.amax(abs_diff) - elif args.loss == "mulpe": - norm_error_metric = abs_ulp_error / max_ulp_error - elif args.loss == "mulpe_mae": - norm_error_metric = 0.5 * (abs_ulp_error / max_ulp_error + abs_diff / max_abs_error) - elif args.loss == "mse": - norm_error_metric = np.square(abs_diff) - - p = i / lstsq_iterations - p = min(p * 1.25, 1.0) - raised_error = np.power(norm_error_metric, 2 + loss_power * p) - weight += raised_error - - mean_loss = np.mean(np.power(abs_diff, loss_power)) - loss_history[i, 0] = mean_loss - - if i == 0: - init_coeffs = coeffs.copy() - init_ulp_error = ulp_error.copy() - init_abs_ulp_error = abs_ulp_error.copy() - init_abs_error = abs_diff.copy() - init_y_hat = y_hat.copy() - -except KeyboardInterrupt: - print("Interrupted") - - -print("Init coeffs:", init_coeffs) -print("Final coeffs:", coeffs) -print(f"mse: {mean_loss:40.27f} max abs 
error: {max_abs_error:20.17f} max ulp error: {max_ulp_error:e}") - -def print_comment(indent=""): - print(indent + "// " - + {"mae": "Max Absolute Error", - "mse": "Mean Squared Error", - "mulpe": "Max ULP Error", - "mulpe_mae": "MaxUlpAE" - }[args.loss] - + f" optimized (MSE={mean_squared_error:.4e}, MAE={max_abs_error:.4e}, MaxUlpE={max_ulp_error:.4e})") - - -if args.format in ["all", "consts"]: - print() - print_comment() - for i, (e, c) in enumerate(zip(exponents, coeffs)): - print(f"const float c_{e}({c:+.12e}f);") - print() - - -if args.format in ["all", "array"]: - print() - print_comment() - print("const float coef[] = {"); - for i, (e, c) in enumerate(reversed(list(zip(exponents, coeffs)))): - print(f" {c:+.12e}, // * x^{e}") - print("};\n") - -if args.format in ["all", "switch"]: - print() - print("case ApproximationPrecision::" + args.loss.upper() + "_Poly" + str(args.order) + ":" + - f" // (MSE={mean_squared_error:.4e}, MAE={max_abs_error:.4e}, MaxUlpE={max_ulp_error:.4e})") - print(" c = {" + (", ".join([f"{c:+.12e}f" for c in coeffs])) + "}; break;") - print() - - -print() -print("exponent:", exponents) - -if args.no_gui: - exit() - -import matplotlib.pyplot as plt - -fig, ax = plt.subplots(2, 4, figsize=(12, 6)) -ax = ax.flatten() -ax[0].set_title("Comparison of exact\nand approximate " + args.func) -ax[0].plot(X, target, label=args.func) -ax[0].plot(X, y_hat, label='approx') -ax[0].grid() -ax[0].set_xlim(lower, upper) -ax[0].legend() - -ax[1].set_title("Error") -ax[1].axhline(0, linestyle='-', c='k', linewidth=1) -ax[1].plot(X, init_y_hat - target, label='init') -ax[1].plot(X, y_hat - target, label='final') -ax[1].grid() -ax[1].set_xlim(lower, upper) -ax[1].legend() - -ax[2].set_title("Absolute error\n(log-scale)") -ax[2].semilogy(X, init_abs_error, label='init') -ax[2].semilogy(X, abs_diff, label='final') -ax[2].axhline(np.amax(init_abs_error), linestyle=':', c='C0') -ax[2].axhline(np.amax(abs_diff), linestyle=':', c='C1') -ax[2].grid() 
-ax[2].set_xlim(lower, upper) -ax[2].legend() - -ax[3].set_title("Maximal Absolute Error\nprogression during\noptimization") -ax[3].semilogx(1 + np.arange(loss_history.shape[0]), loss_history[:,1]) -ax[3].set_xlim(1, loss_history.shape[0] + 1) -ax[3].axhline(y=loss_history[0,1], linestyle=':', color='k') -ax[3].grid() - -ax[5].set_title("ULP distance") -ax[5].axhline(0, linestyle='-', c='k', linewidth=1) -ax[5].plot(X, init_ulp_error, label='init') -ax[5].plot(X, ulp_error, label='final') -ax[5].grid() -ax[5].set_xlim(lower, upper) -ax[5].legend() - - -ax[6].set_title("Absolute ULP distance\n(log-scale)") -ax[6].semilogy(X, init_abs_ulp_error, label='init') -ax[6].semilogy(X, abs_ulp_error, label='final') -ax[6].axhline(np.amax(init_abs_ulp_error), linestyle=':', c='C0') -ax[6].axhline(np.amax(abs_ulp_error), linestyle=':', c='C1') -ax[6].grid() -ax[6].set_xlim(lower, upper) -ax[6].legend() - -ax[7].set_title("Maximal ULP Error\nprogression during\noptimization") -ax[7].loglog(1 + np.arange(loss_history.shape[0]), loss_history[:,2]) -ax[7].set_xlim(1, loss_history.shape[0] + 1) -ax[7].axhline(y=loss_history[0,2], linestyle=':', color='k') -ax[7].grid() - -ax[4].set_title("LstSq Weight\n(log-scale)") -ax[4].semilogy(X, norm_weight, label='weight') -ax[4].grid() -ax[4].set_xlim(lower, upper) -ax[4].legend() - -plt.tight_layout() -plt.show() diff --git a/test/correctness/fast_arctan.cpp b/test/correctness/fast_arctan.cpp index a86849f7df3b..0c7003c97e86 100644 --- a/test/correctness/fast_arctan.cpp +++ b/test/correctness/fast_arctan.cpp @@ -20,35 +20,34 @@ int bits_diff(float fa, float fb) { int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); - struct Prec { + struct Test { ApproximationPrecision precision; - float epsilon; const char *objective; } precisions_to_test[] = { // MAE - {ApproximationPrecision::MAE_1e_2, 1e-2f, "MAE"}, - {ApproximationPrecision::MAE_1e_3, 1e-3f, "MAE"}, - {ApproximationPrecision::MAE_1e_4, 1e-4f, "MAE"}, - 
{ApproximationPrecision::MAE_1e_5, 1e-5f, "MAE"}, - {ApproximationPrecision::MAE_1e_6, 1e-6f, "MAE"}, + {{ApproximationPrecision::MAE, 0, 1e-2f}, "MAE"}, + {{ApproximationPrecision::MAE, 0, 1e-3f}, "MAE"}, + {{ApproximationPrecision::MAE, 0, 1e-4f}, "MAE"}, + {{ApproximationPrecision::MAE, 0, 1e-5f}, "MAE"}, + {{ApproximationPrecision::MAE, 0, 1e-6f}, "MAE"}, // MULPE - {ApproximationPrecision::MULPE_1e_2, 1e-2f, "MULPE"}, - {ApproximationPrecision::MULPE_1e_3, 1e-3f, "MULPE"}, - {ApproximationPrecision::MULPE_1e_4, 1e-4f, "MULPE"}, - {ApproximationPrecision::MULPE_1e_5, 1e-5f, "MULPE"}, - {ApproximationPrecision::MULPE_1e_6, 1e-6f, "MULPE"}, + {{ApproximationPrecision::MULPE, 0, 1e-2f}, "MULPE"}, + {{ApproximationPrecision::MULPE, 0, 1e-3f}, "MULPE"}, + {{ApproximationPrecision::MULPE, 0, 1e-4f}, "MULPE"}, + {{ApproximationPrecision::MULPE, 0, 1e-5f}, "MULPE"}, + {{ApproximationPrecision::MULPE, 0, 1e-6f}, "MULPE"}, }; - for (Prec precision : precisions_to_test) { - printf("\nTesting for precision %.1e (%s optimized)...\n", precision.epsilon, precision.objective); + for (Test test : precisions_to_test) { + printf("\nTesting for precision %.1e (%s optimized)...\n", test.precision.constraint_max_absolute_error, test.objective); Func atan_f, atan2_f; Var x, y; const int steps = 1000; Expr vx = (x - steps / 2) / float(steps / 8); Expr vy = (y - steps / 2) / float(steps / 8); - atan_f(x) = fast_atan(vx, precision.precision); + atan_f(x) = fast_atan(vx, test.precision); if (target.has_gpu_feature()) { Var xo, xi; Var yo, yi; @@ -70,14 +69,14 @@ int main(int argc, char **argv) { int mantissa_error = bits_diff(atan_x, atan_x_ref); max_error = std::max(max_error, abs_error); max_mantissa_error = std::max(max_mantissa_error, mantissa_error); - if (abs_error > precision.epsilon) { + if (abs_error > test.precision.constraint_max_absolute_error) { fprintf(stderr, "fast_atan(%.6f) = %.20f not equal to %.20f (error=%.5e)\n", x, atan_x, atan_x_ref, atan_x_ref - atan_x); exit(1); } }
printf("Passed: max abs error: %.5e max mantissa bits wrong: %d\n", max_error, max_mantissa_error); - atan2_f(x, y) = fast_atan2(vx, vy, precision.precision); + atan2_f(x, y) = fast_atan2(vx, vy, test.precision); if (target.has_gpu_feature()) { Var xo, xi; Var yo, yi; @@ -100,7 +99,7 @@ int main(int argc, char **argv) { int mantissa_error = bits_diff(atan2_x_y, atan2_x_y_ref); max_error = std::max(max_error, abs_error); max_mantissa_error = std::max(max_mantissa_error, mantissa_error); - if (abs_error > precision.epsilon) { + if (abs_error > test.precision.constraint_max_absolute_error) { fprintf(stderr, "fast_atan2(%.6f, %.6f) = %.20f not equal to %.20f (error=%.5e)\n", x, y, atan2_x_y, atan2_x_y_ref, atan2_x_y_ref - atan2_x_y); exit(1); } diff --git a/test/performance/fast_arctan.cpp b/test/performance/fast_arctan.cpp index 20dce642005f..2012f906ff5e 100644 --- a/test/performance/fast_arctan.cpp +++ b/test/performance/fast_arctan.cpp @@ -14,12 +14,10 @@ int main(int argc, char **argv) { if (target.has_feature(Target::WebGPU)) { printf("WebGPU seems to perform bad, and fast_atan is not always faster (won't error if it's not faster).\n"); performance_is_expected_to_be_poor = true; - return 0; } if (target.has_feature(Target::Metal)) { printf("fast_atan is not always faster on Metal (won't error if it's not faster).\n"); performance_is_expected_to_be_poor = true; - return 0; } Var x, y; @@ -68,13 +66,13 @@ int main(int argc, char **argv) { double atan_time{0.0f}; double atan2_time{0.0f}; } precisions_to_test[] = { - {ApproximationPrecision::MULPE_Poly2, "Poly2"}, - {ApproximationPrecision::MULPE_Poly3, "Poly3"}, - {ApproximationPrecision::MULPE_Poly4, "Poly4"}, - {ApproximationPrecision::MULPE_Poly5, "Poly5"}, - {ApproximationPrecision::MULPE_Poly6, "Poly6"}, - {ApproximationPrecision::MULPE_Poly7, "Poly7"}, - {ApproximationPrecision::MULPE_Poly8, "Poly8"}, + {{ApproximationPrecision::MULPE, 2}, "Poly2"}, + {{ApproximationPrecision::MULPE, 3}, "Poly3"}, + 
{{ApproximationPrecision::MULPE, 4}, "Poly4"}, + {{ApproximationPrecision::MULPE, 5}, "Poly5"}, + {{ApproximationPrecision::MULPE, 6}, "Poly6"}, + {{ApproximationPrecision::MULPE, 7}, "Poly7"}, + {{ApproximationPrecision::MULPE, 8}, "Poly8"}, }; for (Prec &precision : precisions_to_test) {