#ifndef NETGEN_CORE_SIMD_MATH_HPP #define NETGEN_CORE_SIMD_MATH_HPP #include #ifndef M_PI #define M_PI 3.14159265358979323846 #endif namespace ngcore { /* based on: Stephen L. Moshier: Methods and Programs For Mathematical Functions https://www.moshier.net/methprog.pdf CEPHES MATHEMATICAL FUNCTION LIBRARY https://www.netlib.org/cephes/ */ static constexpr double sincof[] = { 1.58962301576546568060E-10, -2.50507477628578072866E-8, 2.75573136213857245213E-6, -1.98412698295895385996E-4, 8.33333333332211858878E-3, -1.66666666666666307295E-1, }; static constexpr double coscof[6] = { -1.13585365213876817300E-11, 2.08757008419747316778E-9, -2.75573141792967388112E-7, 2.48015872888517045348E-5, -1.38888888888730564116E-3, 4.16666666666665929218E-2, }; // highly accurate on [-pi/4, pi/4] template auto sincos_reduced (SIMD x) { auto x2 = x*x; auto s = ((((( sincof[0]*x2 + sincof[1]) * x2 + sincof[2]) * x2 + sincof[3]) * x2 + sincof[4]) * x2 + sincof[5]); s = x + x*x*x * s; auto c = ((((( coscof[0]*x2 + coscof[1]) * x2 + coscof[2]) * x2 + coscof[3]) * x2 + coscof[4]) * x2 + coscof[5]); c = 1.0 - 0.5*x2 + x2*x2*c; return std::tuple{ s, c }; } template auto sincos (SIMD x) { auto y = round((2/M_PI) * x); auto q = lround(y); auto [s1,c1] = sincos_reduced(x - y * (M_PI/2)); auto s2 = If((q & SIMD(1)) == SIMD(0), s1, c1); auto s = If((q & SIMD(2)) == SIMD(0), s2, -s2); auto c2 = If((q & SIMD(1)) == SIMD(0), c1, -s1); auto c = If((q & SIMD(2)) == SIMD(0), c2, -c2); return std::tuple{ s, c }; } template SIMD exp_reduced (SIMD x) { static constexpr double P[] = { 1.26177193074810590878E-4, 3.02994407707441961300E-2, 9.99999999999999999910E-1, }; static constexpr double Q[] = { 3.00198505138664455042E-6, 2.52448340349684104192E-3, 2.27265548208155028766E-1, 2.00000000000000000009E0, }; /* // from: https://www.netlib.org/cephes/ rational approximation for exponential * of the fractional part: * e**x = 1 + 2x P(x**2)/( Q(x**2) - x P(x**2) ) xx = x * x; px = x * polevl( xx, P, 2 ); x = px/( polevl( xx, Q, 3 ) - px ); x = 1.0 + 2.0 * x; */ auto xx = x*x; auto px = (P[0]*xx + P[1]) * xx + P[2]; auto qx = ((Q[0]*xx+Q[1])*xx+Q[2])*xx+Q[3]; return 1.0 + 2.0*x * px / (qx- x * px); } template SIMD pow2_int64_to_float64(SIMD n) { // thx to deepseek // Step 1: Clamp the input to valid exponent range [-1022, 1023] // (We use saturated operations to handle out-of-range values) SIMD max_exp(1023); SIMD min_exp(-1022); n = If(n > max_exp, max_exp, n); n = If(min_exp > n, min_exp, n); // Step 2: Add exponent bias (1023) n = n + SIMD(1023); // Step 3: Shift to exponent bit position (bit 52) auto shifted_exp = (n << IC<52>()); // Step 4: Reinterpret as double return Reinterpret (shifted_exp); } template SIMD myexp (SIMD x) { constexpr double log2 = 0.693147180559945286; // log(2.0); auto r = round(1/log2 * x); auto rI = lround(r); r *= log2; SIMD pow2 = pow2_int64_to_float64 (rI); return exp_reduced(x-r) * pow2; // maybe better: // x = ldexp( x, n ); } /* inline auto Test1 (SIMD x) { return myexp(x); } inline auto Test2 (SIMD x) { return sincos(x); } inline auto Test3 (SIMD x) { return myexp(x); } inline auto Test4 (SIMD x) { return sincos(x); } */ } #endif