Files
netgen/libsrc/core/simd_math.hpp
2025-11-02 13:14:08 +01:00

179 lines
3.9 KiB
C++

#ifndef NETGEN_CORE_SIMD_MATH_HPP
#define NETGEN_CORE_SIMD_MATH_HPP
#include <tuple>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
namespace ngcore
{
/*
based on:
Stephen L. Moshier: Methods and Programs For Mathematical Functions
https://www.moshier.net/methprog.pdf
CEPHES MATHEMATICAL FUNCTION LIBRARY
https://www.netlib.org/cephes/
*/
static constexpr double sincof[] = {
1.58962301576546568060E-10,
-2.50507477628578072866E-8,
2.75573136213857245213E-6,
-1.98412698295895385996E-4,
8.33333333332211858878E-3,
-1.66666666666666307295E-1,
};
static constexpr double coscof[6] = {
-1.13585365213876817300E-11,
2.08757008419747316778E-9,
-2.75573141792967388112E-7,
2.48015872888517045348E-5,
-1.38888888888730564116E-3,
4.16666666666665929218E-2,
};
// highly accurate on [-pi/4, pi/4]
template <int N>
auto sincos_reduced (SIMD<double,N> x)
{
auto x2 = x*x;
auto s = ((((( sincof[0]*x2 + sincof[1]) * x2 + sincof[2]) * x2 + sincof[3]) * x2 + sincof[4]) * x2 + sincof[5]);
s = x + x*x*x * s;
auto c = ((((( coscof[0]*x2 + coscof[1]) * x2 + coscof[2]) * x2 + coscof[3]) * x2 + coscof[4]) * x2 + coscof[5]);
c = 1.0 - 0.5*x2 + x2*x2*c;
return std::tuple{ s, c };
}
template <int N>
auto sincos (SIMD<double,N> x)
{
auto y = round((2/M_PI) * x);
auto q = lround(y);
auto [s1,c1] = sincos_reduced(x - y * (M_PI/2));
auto s2 = If((q & SIMD<int64_t,N>(1)) == SIMD<int64_t,N>(0), s1, c1);
auto s = If((q & SIMD<int64_t,N>(2)) == SIMD<int64_t,N>(0), s2, -s2);
auto c2 = If((q & SIMD<int64_t,N>(1)) == SIMD<int64_t,N>(0), c1, -s1);
auto c = If((q & SIMD<int64_t,N>(2)) == SIMD<int64_t,N>(0), c2, -c2);
return std::tuple{ s, c };
}
template <int N>
SIMD<double,N> exp_reduced (SIMD<double,N> x)
{
static constexpr double P[] = {
1.26177193074810590878E-4,
3.02994407707441961300E-2,
9.99999999999999999910E-1,
};
static constexpr double Q[] = {
3.00198505138664455042E-6,
2.52448340349684104192E-3,
2.27265548208155028766E-1,
2.00000000000000000009E0,
};
/*
// from: https://www.netlib.org/cephes/
rational approximation for exponential
* of the fractional part:
* e**x = 1 + 2x P(x**2)/( Q(x**2) - x P(x**2) )
xx = x * x;
px = x * polevl( xx, P, 2 );
x = px/( polevl( xx, Q, 3 ) - px );
x = 1.0 + 2.0 * x;
*/
auto xx = x*x;
auto px = (P[0]*xx + P[1]) * xx + P[2];
auto qx = ((Q[0]*xx+Q[1])*xx+Q[2])*xx+Q[3];
return 1.0 + 2.0*x * px / (qx- x * px);
}
template <int N>
SIMD<double,N> pow2_int64_to_float64(SIMD<int64_t,N> n)
{
// thx to deepseek
// Step 1: Clamp the input to valid exponent range [-1022, 1023]
// (We use saturated operations to handle out-of-range values)
SIMD<int64_t,N> max_exp(1023);
SIMD<int64_t,N> min_exp(-1022);
n = If(n > max_exp, max_exp, n);
n = If(min_exp > n, min_exp, n);
// Step 2: Add exponent bias (1023)
n = n + SIMD<int64_t,N>(1023);
// Step 3: Shift to exponent bit position (bit 52)
auto shifted_exp = (n << IC<52>());
// Step 4: Reinterpret as double
return Reinterpret<double> (shifted_exp);
}
template <int N>
SIMD<double,N> myexp (SIMD<double,N> x)
{
constexpr double log2 = 0.693147180559945286; // log(2.0);
auto r = round(1/log2 * x);
auto rI = lround(r);
r *= log2;
SIMD<double,N> pow2 = pow2_int64_to_float64 (rI);
return exp_reduced(x-r) * pow2;
// maybe better:
// x = ldexp( x, n );
}
/*
inline auto Test1 (SIMD<double> x)
{
return myexp(x);
}
inline auto Test2 (SIMD<double> x)
{
return sincos(x);
}
inline auto Test3 (SIMD<double,4> x)
{
return myexp(x);
}
inline auto Test4 (SIMD<double,4> x)
{
return sincos(x);
}
*/
}
#endif