serenity/AK/Math.h

/*
 * Copyright (c) 2021, Leon Albrecht <leon2002.la@gmail.com>.
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/BuiltinWrappers.h>
#include <AK/Concepts.h>
#include <AK/FloatingPoint.h>
#include <AK/NumericLimits.h>
#include <AK/StdLibExtraDetails.h>
#include <AK/Types.h>
#include <math.h>

#ifdef KERNEL
#    error "Including AK/Math.h from the Kernel is never correct! Floating point is disabled."
#endif

namespace AK {

template<FloatingPoint T>
constexpr T NaN = __builtin_nan("");
template<FloatingPoint T>
constexpr T Infinity = __builtin_huge_vall();
template<FloatingPoint T>
constexpr T Pi = 3.141592653589793238462643383279502884L;
template<FloatingPoint T>
constexpr T E = 2.718281828459045235360287471352662498L;
template<FloatingPoint T>
constexpr T Sqrt2 = 1.414213562373095048801688724209698079L;
template<FloatingPoint T>
constexpr T Sqrt1_2 = 0.707106781186547524400844362104849039L;

template<FloatingPoint T>
constexpr T L2_10 = 3.321928094887362347870319429489390175864L;
template<FloatingPoint T>
constexpr T L2_E = 1.442695040888963407359924681001892137L;

namespace Details {
template<size_t>
constexpr size_t product_even();
template<>
constexpr size_t product_even<2>() { return 2; }
template<size_t value>
constexpr size_t product_even() { return value * product_even<value - 2>(); }

template<size_t>
constexpr size_t product_odd();
template<>
constexpr size_t product_odd<1>() { return 1; }
template<size_t value>
constexpr size_t product_odd() { return value * product_odd<value - 2>(); }
}

template<FloatingPoint T>
constexpr T to_radians(T degrees)
{
    return degrees * AK::Pi<T> / 180;
}

template<FloatingPoint T>
constexpr T to_degrees(T radians)
{
    return radians * 180 / AK::Pi<T>;
}

template<FloatingPoint FloatT>
FloatT copysign(FloatT x, FloatT y)
{
    using Extractor = FloatExtractor<FloatT>;
    auto ex = Extractor::from_float(x);
    auto ey = Extractor::from_float(y);
    ex.sign = ey.sign;
    return ex.to_float();
}

#define CALL_BUILTIN(function, args...)           \
    do {                                          \
        if constexpr (IsSame<T, long double>)     \
            return __builtin_##function##l(args); \
        if constexpr (IsSame<T, double>)          \
            return __builtin_##function(args);    \
        if constexpr (IsSame<T, float>)           \
            return __builtin_##function##f(args); \
    } while (0)

#define CONSTEXPR_STATE(function, args...) \
    if (is_constant_evaluated())           \
        CALL_BUILTIN(function, args);

#define AARCH64_INSTRUCTION(instruction, arg) \
    if constexpr (IsSame<T, long double>)     \
        TODO();                               \
    if constexpr (IsSame<T, double>) {        \
        double res;                           \
        asm(#instruction " %d0, %d1"          \
            : "=w"(res)                       \
            : "w"(arg));                      \
        return res;                           \
    }                                         \
    if constexpr (IsSame<T, float>) {         \
        float res;                            \
        asm(#instruction " %s0, %s1"          \
            : "=w"(res)                       \
            : "w"(arg));                      \
        return res;                           \
    }

template<FloatingPoint T>
constexpr T fabs(T x)
{
    // Both GCC and Clang inline fabs by default, so this is just a cmath like wrapper
    CALL_BUILTIN(fabs, x);
}

namespace Rounding {
template<FloatingPoint T>
constexpr T ceil(T num)
{
    // FIXME: SSE4.1 rounds[sd] num, res, 0b110
    if (is_constant_evaluated()) {
        if (num < NumericLimits<i64>::min() || num > NumericLimits<i64>::max())
            return num;
        return (static_cast<T>(static_cast<i64>(num)) == num)
            ? static_cast<i64>(num)
            : static_cast<i64>(num) + ((num > 0) ? 1 : 0);
    }
#if ARCH(AARCH64)
    AARCH64_INSTRUCTION(frintp, num);
#else
    CALL_BUILTIN(ceil, num);
#endif
}

template<FloatingPoint T>
constexpr T floor(T num)
{
    // FIXME: SSE4.1 rounds[sd] num, res, 0b101
    if (is_constant_evaluated()) {
        if (num < NumericLimits<i64>::min() || num > NumericLimits<i64>::max())
            return num;
        return (static_cast<T>(static_cast<i64>(num)) == num)
            ? static_cast<i64>(num)
            : static_cast<i64>(num) - ((num > 0) ? 0 : 1);
    }
#if ARCH(AARCH64)
    AARCH64_INSTRUCTION(frintm, num);
#else
    CALL_BUILTIN(floor, num);
#endif
}

template<FloatingPoint T>
constexpr T trunc(T num)
{
#if ARCH(AARCH64)
    if (is_constant_evaluated()) {
        if (num < NumericLimits<i64>::min() || num > NumericLimits<i64>::max())
            return num;
        return static_cast<T>(static_cast<i64>(num));
    }
    AARCH64_INSTRUCTION(frintz, num);
#endif
    // FIXME: Use dedicated instruction in the non constexpr case
    //        SSE4.1: rounds[sd] %num, %res, 0b111
    if (num < NumericLimits<i64>::min() || num > NumericLimits<i64>::max())
        return num;
    return static_cast<T>(static_cast<i64>(num));
}

template<FloatingPoint T>
constexpr T rint(T x)
{
    CONSTEXPR_STATE(rint, x);
    // Note: This does break tie to even
    //       But the behavior of frndint/rounds[ds]/frintx can be configured
    //       through the floating point control registers.
    // FIXME: We should decide if we rename this to allow us to get away from
    //        the configurability "burden" rint has
    //        this would make us use `rounds[sd] %num, %res, 0b100`
    //        and `frintn` respectively,
    //        no such guaranteed round exists for x87 `frndint`
#if ARCH(X86_64)
#    ifdef __SSE4_1__
    if constexpr (IsSame<T, double>) {
        T r;
        asm(
            "roundsd %1, %0"
            : "=x"(r)
            : "x"(x));
        return r;
    }
    if constexpr (IsSame<T, float>) {
        T r;
        asm(
            "roundss %1, %0"
            : "=x"(r)
            : "x"(x));
        return r;
    }
#    else
    asm(
        "frndint"
        : "+t"(x));
    return x;
#    endif
#elif ARCH(AARCH64)
    AARCH64_INSTRUCTION(frintx, x);
#elif ARCH(RISCV64)
    if (__builtin_isnan(x))
        return x;

    // Floating point values have a gap size of >= 1 for values above 2^mantissa_bits - 1.
    if (fabs(x) > FloatExtractor<T>::mantissa_max)
        return x;

    if constexpr (IsSame<T, float>) {
        i64 r;
        asm("fcvt.l.s %0, %1, dyn"
            : "=r"(r)
            : "f"(x));
        return copysign(static_cast<float>(r), x);
    }
    if constexpr (IsSame<T, double>) {
        i64 r;
        asm("fcvt.l.d %0, %1, dyn"
            : "=r"(r)
            : "f"(x));
        return copysign(static_cast<double>(r), x);
    }
    if constexpr (IsSame<T, long double>)
        TODO_RISCV64();
#endif
    TODO();
}

template<FloatingPoint T>
constexpr T round(T x)
{
    CONSTEXPR_STATE(round, x);
    // Note: This is break-tie-away-from-zero, so not the hw's understanding of
    //       "nearest", which would be towards even.
    if (x == 0.)
        return x;
    if (x > 0.)
        return floor(x + .5);
    return ceil(x - .5);
}

template<Integral I, FloatingPoint P>
ALWAYS_INLINE I round_to(P value);

#if ARCH(X86_64)
template<Integral I>
ALWAYS_INLINE I round_to(long double value)
{
    // Note: fistps outputs into a signed integer location (i16, i32, i64),
    //       so lets be nice and tell the compiler that.
    Conditional<sizeof(I) >= sizeof(i16), MakeSigned<I>, i16> ret;
    if constexpr (sizeof(I) == sizeof(i64)) {
        asm("fistpll %0"
            : "=m"(ret)
            : "t"(value)
            : "st");
    } else if constexpr (sizeof(I) == sizeof(i32)) {
        asm("fistpl %0"
            : "=m"(ret)
            : "t"(value)
            : "st");
    } else {
        asm("fistps %0"
            : "=m"(ret)
            : "t"(value)
            : "st");
    }
    return static_cast<I>(ret);
}

template<Integral I>
ALWAYS_INLINE I round_to(float value)
{
    // FIXME: round_to<u64> might will cause issues, aka the indefinite value being set,
    //        if the value surpasses the i64 limit, even if the result could fit into an u64
    //        To solve this we would either need to detect that value or do a range check and
    //        then do a more specialized conversion, which might include a division (which is expensive)
    if constexpr (sizeof(I) == sizeof(i64) || IsSame<I, u32>) {
        i64 ret;
        asm("cvtss2si %1, %0"
            : "=r"(ret)
            : "xm"(value));
        return static_cast<I>(ret);
    }
    i32 ret;
    asm("cvtss2si %1, %0"
        : "=r"(ret)
        : "xm"(value));
    return static_cast<I>(ret);
}

template<Integral I>
ALWAYS_INLINE I round_to(double value)
{
    // FIXME: round_to<u64> might will cause issues, aka the indefinite value being set,
    //        if the value surpasses the i64 limit, even if the result could fit into an u64
    //        To solve this we would either need to detect that value or do a range check and
    //        then do a more specialized conversion, which might include a division (which is expensive)
    if constexpr (sizeof(I) == sizeof(i64) || IsSame<I, u32>) {
        i64 ret;
        asm("cvtsd2si %1, %0"
            : "=r"(ret)
            : "xm"(value));
        return static_cast<I>(ret);
    }
    i32 ret;
    asm("cvtsd2si %1, %0"
        : "=r"(ret)
        : "xm"(value));
    return static_cast<I>(ret);
}

#elif ARCH(AARCH64)
template<SignedIntegral I>
ALWAYS_INLINE I round_to(float value)
{
    if constexpr (sizeof(I) <= sizeof(u32)) {
        i32 res;
        asm("fcvtns %w0, %s1"
            : "=r"(res)
            : "w"(value));
        return static_cast<I>(res);
    }
    i64 res;
    asm("fcvtns %0, %s1"
        : "=r"(res)
        : "w"(value));
    return static_cast<I>(res);
}

template<SignedIntegral I>
ALWAYS_INLINE I round_to(double value)
{
    if constexpr (sizeof(I) <= sizeof(u32)) {
        i32 res;
        asm("fcvtns %w0, %d1"
            : "=r"(res)
            : "w"(value));
        return static_cast<I>(res);
    }
    i64 res;
    asm("fcvtns %0, %d1"
        : "=r"(res)
        : "w"(value));
    return static_cast<I>(res);
}

template<UnsignedIntegral U>
ALWAYS_INLINE U round_to(float value)
{
    if constexpr (sizeof(U) <= sizeof(u32)) {
        u32 res;
        asm("fcvtnu %w0, %s1"
            : "=r"(res)
            : "w"(value));
        return static_cast<U>(res);
    }
    i64 res;
    asm("fcvtnu %0, %s1"
        : "=r"(res)
        : "w"(value));
    return static_cast<U>(res);
}

template<UnsignedIntegral U>
ALWAYS_INLINE U round_to(double value)
{
    if constexpr (sizeof(U) <= sizeof(u32)) {
        u32 res;
        asm("fcvtns %w0, %d1"
            : "=r"(res)
            : "w"(value));
        return static_cast<U>(res);
    }
    i64 res;
    asm("fcvtns %0, %d1"
        : "=r"(res)
        : "w"(value));
    return static_cast<U>(res);
}

#else
template<Integral I, FloatingPoint P>
ALWAYS_INLINE I round_to(P value)
{
    if constexpr (IsSame<P, long double>)
        return static_cast<I>(__builtin_llrintl(value));
    if constexpr (IsSame<P, double>)
        return static_cast<I>(__builtin_llrint(value));
    if constexpr (IsSame<P, float>)
        return static_cast<I>(__builtin_llrintf(value));
}
#endif

}

using Rounding::ceil;
using Rounding::floor;
using Rounding::rint;
using Rounding::round;
using Rounding::round_to;
using Rounding::trunc;

namespace Division {
template<FloatingPoint T>
constexpr T fmod(T x, T y)
{
    CONSTEXPR_STATE(fmod, x, y);

#if ARCH(X86_64)
    u16 fpu_status;
    do {
        asm(
            "fprem\n"
            "fnstsw %%ax\n"
            : "+t"(x), "=a"(fpu_status)
            : "u"(y));
    } while (fpu_status & 0x400);
    return x;
#else
#    if defined(AK_OS_SERENITY)
    // FIXME: This is a very naive implementation.

    if (__builtin_isnan(x))
        return x;
    if (__builtin_isnan(y))
        return y;

    // SPECIAL VALUES
    //      fmod(±0, y) returns ±0 if y is neither 0 nor NaN.
    if (x == 0 && y != 0)
        return x;

    //      fmod(x, y) returns a NaN and raises the "invalid" floating-point exception for x infinite or y zero.
    // FIXME: Exception.
    if (__builtin_isinf(x) || y == 0)
        return NaN<T>;

    //      fmod(x, ±infinity) returns x for x not infinite.
    if (__builtin_isinf(y))
        return x;

    // Range reduction: handle negative x and y.
    if (y < 0)
        return fmod(x, -y);
    if (x < 0)
        return -fmod(-x, y);

    // x is x_mantissa * 2**x_exponent, y is y_mantissa * 2**y_exponent.
    // (Both are positive at this point.)
    // If y_exponent > x_exponent, we are done and can return x.
    // If y_exponent == x_exponent, we can return (x_mantissa % y_mantissa) * 2**x_exponent.
    // If y_exponent < x_exponent, we'll iteratively reduce x_exponent by shifting from
    // the exponent into the mantissa.

    auto x_bits = FloatExtractor<T>::from_float(x);
    typename FloatExtractor<T>::ComponentType x_exponent = x_bits.exponent;

    auto y_bits = FloatExtractor<T>::from_float(y);
    typename FloatExtractor<T>::ComponentType y_exponent = y_bits.exponent;

    // FIXME: Handle denormals. For now, treat them as 0.
    if (x_exponent == 0 && y_exponent != 0)
        return 0;
    if (y_exponent == 0)
        return NaN<T>;

    if (y_exponent > x_exponent)
        return x;

    // FIXME: This is wrong for f80.
    typename FloatExtractor<T>::ComponentType implicit_mantissa_bit = 1;
    implicit_mantissa_bit <<= FloatExtractor<T>::mantissa_bits;

    typename FloatExtractor<T>::ComponentType x_mantissa = x_bits.mantissa | implicit_mantissa_bit;
    typename FloatExtractor<T>::ComponentType y_mantissa = y_bits.mantissa | implicit_mantissa_bit;

    while (y_exponent < x_exponent) {
        // This is ok because (x % (y * 2**n)) divides (x % y) for all n > 0.
        x_mantissa %= y_mantissa;

        x_mantissa <<= 1;
        --x_exponent;
    }

    x_mantissa %= y_mantissa;

    if (x_mantissa == 0)
        return 0;

    // We're done and want to return x_mantissa * 2 ** x_exponent.
    // But x_mantissa might not have a leading 1 bit, so we have to realign first.
    // Mantissa is mantissa_bits long, count_leading_zeroes() counts in ComponentType, adjust:
    auto const always_zero_bits = sizeof(typename FloatExtractor<T>::ComponentType) * 8 - (FloatExtractor<T>::mantissa_bits + 1); // +1 for implicit 1 bit
    auto shift = count_leading_zeroes(x_mantissa) - always_zero_bits;

    if (x_exponent < shift) {
        // FIXME: Make a real denormal.
        return 0;
    }

    x_mantissa <<= shift;
    x_exponent -= shift;

    x_bits.exponent = x_exponent;
    x_bits.mantissa = x_mantissa;
    return x_bits.to_float();
#    else
    CALL_BUILTIN(fmod, x, y);
#    endif
#endif
}

template<FloatingPoint T>
constexpr T remainder(T x, T y)
{
    CONSTEXPR_STATE(remainder, x, y);

#if ARCH(X86_64)
    u16 fpu_status;
    do {
        asm(
            "fprem1\n"
            "fnstsw %%ax\n"
            : "+t"(x), "=a"(fpu_status)
            : "u"(y));
    } while (fpu_status & 0x400);
    return x;
#else
#    if defined(AK_OS_SERENITY)
    // TODO: Add implementation for this function.
    TODO();
#    endif
    CALL_BUILTIN(remainder, x, y);
#endif
}
}

using Division::fmod;
using Division::remainder;

template<FloatingPoint T>
constexpr T sqrt(T x)
{
    CONSTEXPR_STATE(sqrt, x);

#if ARCH(X86_64)
    if constexpr (IsSame<T, float>) {
        float res;
        asm("sqrtss %1, %0"
            : "=x"(res)
            : "x"(x));
        return res;
    }
    if constexpr (IsSame<T, double>) {
        double res;
        asm("sqrtsd %1, %0"
            : "=x"(res)
            : "x"(x));
        return res;
    }
    T res;
    asm("fsqrt"
        : "=t"(res)
        : "0"(x));
    return res;
#elif ARCH(AARCH64)
    AARCH64_INSTRUCTION(fsqrt, x);
#elif ARCH(RISCV64)
    if constexpr (IsSame<T, float>) {
        float res;
        asm("fsqrt.s %0, %1"
            : "=f"(res)
            : "f"(x));
        return res;
    }
    if constexpr (IsSame<T, double>) {
        double res;
        asm("fsqrt.d %0, %1"
            : "=f"(res)
            : "f"(x));
        return res;
    }
    if constexpr (IsSame<T, long double>)
        TODO_RISCV64();
#else
#    if defined(AK_OS_SERENITY)
    // TODO: Add implementation for this function.
    TODO();
#    endif
    CALL_BUILTIN(sqrt, x);
#endif
}

template<FloatingPoint T>
constexpr T cbrt(T x)
{
    CONSTEXPR_STATE(cbrt, x);
    if (__builtin_isinf(x) || x == 0)
        return x;
    if (x < 0)
        return -cbrt(-x);

    T r = x;
    T ex = 0;

    while (r < 0.125l) {
        r *= 8;
        ex--;
    }
    while (r > 1.0l) {
        r *= 0.125l;
        ex++;
    }

    r = (-0.46946116l * r + 1.072302l) * r + 0.3812513l;

    while (ex < 0) {
        r *= 0.5l;
        ex++;
    }
    while (ex > 0) {
        r *= 2.0l;
        ex--;
    }

    r = (2.0l / 3.0l) * r + (1.0l / 3.0l) * x / (r * r);
    r = (2.0l / 3.0l) * r + (1.0l / 3.0l) * x / (r * r);
    r = (2.0l / 3.0l) * r + (1.0l / 3.0l) * x / (r * r);
    r = (2.0l / 3.0l) * r + (1.0l / 3.0l) * x / (r * r);

    return r;
}

namespace Trigonometry {

template<FloatingPoint T>
constexpr T hypot(T x, T y)
{
    return sqrt(x * x + y * y);
}

template<FloatingPoint T>
constexpr T sin(T angle)
{
    CONSTEXPR_STATE(sin, angle);

#if ARCH(X86_64)
    T ret;
    asm(
        "fsin"
        : "=t"(ret)
        : "0"(angle));
    return ret;
#else
#    if defined(AK_OS_SERENITY)
    T sign = 1;
    if (angle < 0) {
        angle = -angle;
        sign = -1;
    }

    if (angle >= 2 * Pi<T>)
        angle = fmod(angle, 2 * Pi<T>);

    if (angle >= Pi<T>) {
        angle = angle - Pi<T>;
        sign = -sign;
    }

    if (angle > Pi<T> / 2)
        angle = Pi<T> - angle;

    // https://github.com/samhocevar/lolremez/wiki/Tutorial-4-of-5%3A-fixing-lower-order-parameters
    auto f = [](T x) {
        if constexpr (IsSame<T, f32>) {
            // lolremez --float --degree 3 --range "1e-50:pi*pi/4" "(sin(sqrt(x))-sqrt(x))/(x*sqrt(x))" "1/(x*sqrt(x))"
            // "Estimated max error: 4.618689e-9"
            f32 u = 2.6000548e-06f;
            u = u * x + -0.00019806615f;
            u = u * x + 0.0083330171f;
            return u * x + -0.16666657f;
        } else {
            // FIXME: Could do something custom for long double.
            // lolremez --degree 6 --range "1e-50:pi*pi/4" "(sin(sqrt(x))-sqrt(x))/(x*sqrt(x))" "1/(x*sqrt(x))"
            // "Estimated max error: 1.1015766629825144e-16"
            T u = -7.3646464502210486e-13;
            u = u * x + 1.6047301196685753e-10;
            u = u * x + -2.5051851497012596e-08;
            u = u * x + 2.7557316077007725e-06;
            u = u * x + -0.00019841269820094207;
            u = u * x + 0.0083333333332628792;
            return u * x + -0.16666666666665811;
        }
    };
    T angle_squared = angle * angle;
    return sign * (angle + angle * angle_squared * f(angle_squared));
#    else
    CALL_BUILTIN(sin, angle);
#    endif
#endif
}

template<FloatingPoint T>
constexpr T cos(T angle)
{
    CONSTEXPR_STATE(cos, angle);

#if ARCH(X86_64)
    T ret;
    asm(
        "fcos"
        : "=t"(ret)
        : "0"(angle));
    return ret;
#else
#    if defined(AK_OS_SERENITY)
    if (angle < 0)
        angle = -angle;

    if (angle >= 2 * Pi<T>)
        angle = fmod(angle, 2 * Pi<T>);

    T sign = 1;
    if (angle >= Pi<T>) {
        angle = angle - Pi<T>;
        sign = -1;
    }

    if (angle > Pi<T> / 2) {
        angle = Pi<T> - angle;
        sign = -sign;
    }

    // https://github.com/samhocevar/lolremez/wiki/Tutorial-3-of-5:-changing-variables-for-simpler-polynomials
    // for cos(x): cos(x) - 1 is a function of x^2 terms, so we do the substitution on that page like
    // max | cos(x) - 1 - x^2 Q(x^2) | and then set y = x^2. That yields:
    // max | (cos(sqrt(y)) - 1) - y Q(y) |. Dividing through with y (instead of sqrt(y) as in the sin() case) gives us:
    auto f = [](T x) {
        if constexpr (IsSame<T, f32>) {
            // lolremez --float --degree 3 --range "1e-50:pi*pi/4" "(cos(sqrt(x)) - 1)/x"  "1/x"
            // "Estimated max error: 5.2720126e-8"
            f32 u = 2.3194387e-05f;
            u = u * x + -0.0013855927f;
            u = u * x + 0.041663989f;
            return u * x + -0.49999931f;
        } else {
            // lolremez --degree 6 --range "1e-50:pi*pi/4" "(cos(sqrt(x)) - 1)/x"  "1/x"
            // "Estimated max error: 2.0880269759116624e-15"
            T u = -1.101249182846601e-11;
            u = u * x + 2.0858735176345955e-09;
            u = u * x + -2.7556950755056579e-07;
            u = u * x + 2.4801583156341611e-05;
            u = u * x + -0.001388888886393419;
            u = u * x + 0.041666666665954213;
            return u * x + -0.49999999999993117;
        }
    };
    T angle_squared = angle * angle;
    return sign * (1 + angle_squared * f(angle_squared));
#    else
    CALL_BUILTIN(cos, angle);
#    endif
#endif
}

template<FloatingPoint T>
constexpr void sincos(T angle, T& sin_val, T& cos_val)
{
    if (is_constant_evaluated()) {
        sin_val = sin(angle);
        cos_val = cos(angle);
        return;
    }
#if ARCH(X86_64)
    asm(
        "fsincos"
        : "=t"(cos_val), "=u"(sin_val)
        : "0"(angle));
#else
    sin_val = sin(angle);
    cos_val = cos(angle);
#endif
}

template<FloatingPoint T>
constexpr T tan(T angle)
{
    CONSTEXPR_STATE(tan, angle);

#if ARCH(X86_64)
    T ret, one;
    asm(
        "fptan"
        : "=t"(one), "=u"(ret)
        : "0"(angle));

    return ret;
#else
#    if defined(AK_OS_SERENITY)
    return sin(angle) / cos(angle);
#    else
    CALL_BUILTIN(tan, angle);
#    endif
#endif
}

template<FloatingPoint T>
constexpr T asin(T x)
{
    CONSTEXPR_STATE(asin, x);

    if (x < 0)
        return -asin(-x);

    if (x > 1)
        return NaN<T>;

    if (x > (T)0.5) {
        // asin(x) = pi/2 - 2 * asin(sqrt((1 - x) / 2))
        // If 0.5 < x <= 1, then sqrt((1 - x) / 2 ) < 0.5,
        // and the recursion will go down the `<= 0.5` branch.
        return Pi<T> / 2 - 2 * asin(sqrt((1 - x) / 2));
    }

    T squared = x * x;
    T value = x;
    T i = x * squared;
    value += i * Details::product_odd<1>() / Details::product_even<2>() / 3;
    i *= squared;
    value += i * Details::product_odd<3>() / Details::product_even<4>() / 5;
    i *= squared;
    value += i * Details::product_odd<5>() / Details::product_even<6>() / 7;
    i *= squared;
    value += i * Details::product_odd<7>() / Details::product_even<8>() / 9;
    i *= squared;
    value += i * Details::product_odd<9>() / Details::product_even<10>() / 11;
    i *= squared;
    value += i * Details::product_odd<11>() / Details::product_even<12>() / 13;
    i *= squared;
    value += i * Details::product_odd<13>() / Details::product_even<14>() / 15;
    i *= squared;
    value += i * Details::product_odd<15>() / Details::product_even<16>() / 17;
    return value;
}

template<FloatingPoint T>
constexpr T acos(T value)
{
    CONSTEXPR_STATE(acos, value);

    // FIXME: I am naive
    return static_cast<T>(0.5) * Pi<T> - asin<T>(value);
}

template<FloatingPoint T>
constexpr T atan(T value)
{
    CONSTEXPR_STATE(atan, value);

#if ARCH(X86_64)
    T ret;
    asm(
        "fld1\n"
        "fpatan\n"
        : "=t"(ret)
        : "0"(value));
    return ret;
#else
#    if defined(AK_OS_SERENITY)
    if (value < 0)
        return -atan(-value);

    if (value > 1)
        return Pi<T> / 2 - atan(1 / value);

    // atan is an odd function a leading factor of 1, so this uses the same substitutions as sin().
    // See there for a description.
    auto f = [](T x) {
        if constexpr (IsSame<T, f32>) {
            // lolremez --float --degree 7 --range "1e-50:1" "(atan(sqrt(x))-sqrt(x))/(x*sqrt(x))" "1/(x*sqrt(x))"
            // "Estimated max error: 7.351572e-9"
            float u = 0.0026222446f;
            u = u * x + -0.015132537f;
            u = u * x + 0.041121863f;
            u = u * x + -0.073667064f;
            u = u * x + 0.10573932f;
            u = u * x + -0.14185975f;
            u = u * x + 0.19990396f;
            return u * x + -0.33332986f;
        } else {
            // FIXME: Could do something custom for long double.
            // lolremez --degree 15 --range "1e-50:1" "(atan(sqrt(x))-sqrt(x))/(x*sqrt(x))" "1/(x*sqrt(x))"
            // "Estimated max error: 2.695747111292741e-15"
            T u = 6.4855700791782353e-05;
            u = u * x + -0.00062980993515420608;
            u = u * x + 0.0028877745234626882;
            u = u * x + -0.0083913659122280861;
            u = u * x + 0.01759373283992496;
            u = u * x + -0.028943865588822337;
            u = u * x + 0.04001711781175539;
            u = u * x + -0.049493567473208426;
            u = u * x + 0.05782092815821073;
            u = u * x + -0.066423996784058609;
            u = u * x + 0.076879768543915233;
            u = u * x + -0.090903598304650779;
            u = u * x + 0.11111064186237864;
            u = u * x + -0.14285711801574916;
            u = u * x + 0.19999999929660117;
            return u * x + -0.33333333332571796;
        }
    };
    T squared = value * value;
    return value + value * squared * f(squared);
#    endif
    CALL_BUILTIN(atan, value);
#endif
}

template<FloatingPoint T>
constexpr T atan2(T y, T x)
{
    CONSTEXPR_STATE(atan2, y, x);

#if ARCH(X86_64)
    T ret;
    asm("fpatan"
        : "=t"(ret)
        : "0"(x), "u"(y)
        : "st(1)");
    return ret;
#else
#    if defined(AK_OS_SERENITY)
    if (__builtin_isnan(y))
        return y;
    if (__builtin_isnan(x))
        return x;

    // SPECIAL VALUES
    //      atan2(±0, -0) returns ±pi.
    if (y == 0 && x == 0 && signbit(x))
        return copysign(Pi<T>, y);

    //      atan2(±0, +0) returns ±0.
    if (y == 0 && x == 0 && !signbit(x))
        return y;

    //      atan2(±0, x) returns ±pi for x < 0.
    if (y == 0 && x < 0)
        return copysign(Pi<T>, y);

    //      atan2(±0, x) returns ±0 for x > 0.
    if (y == 0 && x > 0)
        return y;

    //      atan2(y, ±0) returns +pi/2 for y > 0.
    if (y > 0 && x == 0)
        return Pi<T> / 2;

    //      atan2(y, ±0) returns -pi/2 for y < 0.
    if (y < 0 && x == 0)
        return -Pi<T> / 2;

    //      atan2(±y, -infinity) returns ±pi for finite y > 0.
    if (!__builtin_isinf(y) && y > 0 && __builtin_isinf(x) && signbit(x))
        return copysign(Pi<T>, y);

    //      atan2(±y, +infinity) returns ±0 for finite y > 0.
    if (!__builtin_isinf(y) && y > 0 && __builtin_isinf(x) && !signbit(x))
        return copysign(static_cast<T>(0), y);

    //      atan2(±infinity, x) returns ±pi/2 for finite x.
    if (__builtin_isinf(y) && !__builtin_isinf(x))
        return copysign(Pi<T> / 2, y);

    //      atan2(±infinity, -infinity) returns ±3*pi/4.
    if (__builtin_isinf(y) && __builtin_isinf(x) && signbit(x))
        return copysign(3 * Pi<T> / 4, y);

    //      atan2(±infinity, +infinity) returns ±pi/4.
    if (__builtin_isinf(y) && __builtin_isinf(x) && !signbit(x))
        return copysign(Pi<T> / 4, y);

    // Check quadrant, going counterclockwise.
    if (y > 0 && x > 0)
        return atan(y / x);
    if (y > 0 && x < 0)
        return atan(y / x) + Pi<T>;
    if (y < 0 && x < 0)
        return atan(y / x) - Pi<T>;
    // y < 0 && x > 0
    return atan(y / x);
#    else
    CALL_BUILTIN(atan2, y, x);
#    endif
#endif
}

}

using Trigonometry::acos;
using Trigonometry::asin;
using Trigonometry::atan;
using Trigonometry::atan2;
using Trigonometry::cos;
using Trigonometry::hypot;
using Trigonometry::sin;
using Trigonometry::sincos;
using Trigonometry::tan;

namespace Exponentials {

template<FloatingPoint T>
constexpr T log2(T x)
{
    CONSTEXPR_STATE(log2, x);

#if ARCH(X86_64)
    if constexpr (IsSame<T, long double>) {
        T ret;
        asm(
            "fld1\n"
            "fxch %%st(1)\n"
            "fyl2x\n"
            : "=t"(ret)
            : "0"(x));
        return ret;
    }
#endif
    // References:
    // Gist comparing some implementations
    // * https://gist.github.com/Hendiadyoin1/f58346d66637deb9156ef360aa158bf9

    if (x == 0)
        return -Infinity<T>;
    if (x <= 0 || __builtin_isnan(x))
        return NaN<T>;

    auto ext = FloatExtractor<T>::from_float(x);
    T exponent = ext.exponent - FloatExtractor<T>::exponent_bias;

    // When the mantissa shows 0b00 (implicitly 1.0) we are on a power of 2
    if (ext.mantissa == 0)
        return exponent;

    // FIXME: Handle denormalized numbers separately

    FloatExtractor<T> mantissa_ext {
        .mantissa = ext.mantissa,
        .exponent = FloatExtractor<T>::exponent_bias,
        .sign = ext.sign
    };

    // (1 <= mantissa < 2)
    T m = mantissa_ext.to_float();

    // This is a reconstruction of one of Sun's algorithms
    // They use a transformation to lower the problem space,
    // while keeping the precision, and a 14th degree polynomial,
    // which is mirrored at sqrt(2)
    // TODO: Sun has some more algorithms for this and other math functions,
    //       leveraging LUTs, investigate those, if they are better in performance and/or precision.
    //       These seem to be related to crLibM's implementations, for which papers and references
    //       are available.
    // FIXME: Dynamically adjust the amount of precision between floats and doubles
    //        AKA floats might need less accuracy here, in comparison to doubles

    bool inverted = false;
    // m > sqrt(2)
    if (m > Sqrt2<T>) {
        inverted = true;
        m = 2 / m;
    }
    T s = (m - 1) / (m + 1);
    // ln((1 + s) / (1 - s)) == ln(m)
    T s2 = s * s;
    // clang-format off
    T high_approx = s2 * (static_cast<T>(0.6666666666666735130)
                  + s2 * (static_cast<T>(0.3999999999940941908)
                  + s2 * (static_cast<T>(0.2857142874366239149)
                  + s2 * (static_cast<T>(0.2222219843214978396)
                  + s2 * (static_cast<T>(0.1818357216161805012)
                  + s2 * (static_cast<T>(0.1531383769920937332)
                  + s2 *  static_cast<T>(0.1479819860511658591)))))));
    // clang-format on

    // ln(m) == 2 * s + s * high_approx
    // log2(m) == log2(e) * ln(m)
    T log2_mantissa = L2_E<T> * (2 * s + s * high_approx);
    if (inverted)
        log2_mantissa = 1 - log2_mantissa;
    return exponent + log2_mantissa;
}

template<FloatingPoint T>
constexpr T log(T x)
{
    CONSTEXPR_STATE(log, x);

#if ARCH(X86_64)
    T ret;
    asm(
        "fldln2\n"
        "fxch %%st(1)\n"
        "fyl2x\n"
        : "=t"(ret)
        : "0"(x));
    return ret;
#elif defined(AK_OS_SERENITY)
    // FIXME: Adjust the polynomial and formula in log2 to fit this
    return log2<T>(x) / L2_E<T>;
#else
    CALL_BUILTIN(log, x);
#endif
}

template<FloatingPoint T>
constexpr T log10(T x)
{
    CONSTEXPR_STATE(log10, x);

#if ARCH(X86_64)
    T ret;
    asm(
        "fldlg2\n"
        "fxch %%st(1)\n"
        "fyl2x\n"
        : "=t"(ret)
        : "0"(x));
    return ret;
#elif defined(AK_OS_SERENITY)
    // FIXME: Adjust the polynomial and formula in log2 to fit this
    return log2<T>(x) / L2_10<T>;
#else
    CALL_BUILTIN(log10, x);
#endif
}

template<FloatingPoint T>
constexpr T exp2(T exponent)
{
    CONSTEXPR_STATE(exp2, exponent);

#if ARCH(X86_64)
    T res;
    asm("fld1\n"
        "fld %%st(1)\n"
        "fprem\n"
        "f2xm1\n"
        "faddp\n"
        "fscale\n"
        "fstp %%st(1)"
        : "=t"(res)
        : "0"(exponent));
    return res;
#else
    // TODO: Add better implementation of this function.
    // This is just fast exponentiation for the integer part and
    // the first couple terms of the taylor series for the fractional part.

    if (exponent < 0)
        return 1 / exp2(-exponent);

    if (exponent >= log2(NumericLimits<T>::max()))
        return Infinity<T>;

    // Integer exponentiation part.
    int int_exponent = static_cast<int>(exponent);
    T exponent_fraction = exponent - int_exponent;

    T int_result = 1;
    T base = 2;
    for (;;) {
        if (int_exponent & 1)
            int_result *= base;
        int_exponent >>= 1;
        if (!int_exponent)
            break;
        base *= base;
    }

    // Fractional part.
    // Uses:
    // exp(x) = sum(n, 0, \infty, x ** n / n!)
    // 2**x = exp(log2(e) * x)
    // FIXME: Pick better step size (and make it dependent on T).
    T result = 0;
    T power = 1;
    T factorial = 1;
    for (int i = 1; i < 16; ++i) {
        result += power / factorial;
        power *= exponent_fraction / L2_E<T>;
        factorial *= i;
    }

    return int_result * result;
#endif
}

template<FloatingPoint T>
constexpr T exp(T exponent)
{
    CONSTEXPR_STATE(exp, exponent);

#if ARCH(X86_64)
    T res;
    asm("fldl2e\n"
        "fmulp\n"
        "fld1\n"
        "fld %%st(1)\n"
        "fprem\n"
        "f2xm1\n"
        "faddp\n"
        "fscale\n"
        "fstp %%st(1)"
        : "=t"(res)
        : "0"(exponent));
    return res;
#else
    // TODO: Add better implementation of this function.
    return exp2(exponent * L2_E<T>);
#endif
}

}

using Exponentials::exp;
using Exponentials::exp2;
using Exponentials::log;
using Exponentials::log10;
using Exponentials::log2;

namespace Hyperbolic {

template<FloatingPoint T>
constexpr T sinh(T x)
{
    T exponentiated = exp<T>(x);
    if (x > 0)
        return (exponentiated * exponentiated - 1) / 2 / exponentiated;
    return (exponentiated - 1 / exponentiated) / 2;
}

template<FloatingPoint T>
constexpr T cosh(T x)
{
    CONSTEXPR_STATE(cosh, x);

    T exponentiated = exp(-x);
    if (x < 0)
        return (1 + exponentiated * exponentiated) / 2 / exponentiated;
    return (1 / exponentiated + exponentiated) / 2;
}

template<FloatingPoint T>
constexpr T tanh(T x)
{
    if (x > 0) {
        T exponentiated = exp<T>(2 * x);
        return (exponentiated - 1) / (exponentiated + 1);
    }
    T plusX = exp<T>(x);
    T minusX = 1 / plusX;
    return (plusX - minusX) / (plusX + minusX);
}

template<FloatingPoint T>
constexpr T asinh(T x)
{
    return log<T>(x + sqrt<T>(x * x + 1));
}

template<FloatingPoint T>
constexpr T acosh(T x)
{
    return log<T>(x + sqrt<T>(x * x - 1));
}

template<FloatingPoint T>
constexpr T atanh(T x)
{
    return log<T>((1 + x) / (1 - x)) / (T)2.0l;
}

}

using Hyperbolic::acosh;
using Hyperbolic::asinh;
using Hyperbolic::atanh;
using Hyperbolic::cosh;
using Hyperbolic::sinh;
using Hyperbolic::tanh;

// Calculate x^y with fast exponentiation when the power is a natural number.
template<FloatingPoint F, UnsignedIntegral U>
constexpr F pow_int(F x, U y)
{
    auto result = static_cast<F>(1);
    while (y > 0) {
        if (y % 2 == 1) {
            result *= x;
        }
        x = x * x;
        y >>= 1;
    }
    return result;
}

template<FloatingPoint T>
constexpr T pow(T x, T y)
{
    CONSTEXPR_STATE(pow, x, y);
    if (__builtin_isnan(y))
        return y;
    if (y == 0)
        return 1;
    if (x == 0)
        return 0;
    if (y == 1)
        return x;

    // Take an integer fast path as long as the value fits within a 64-bit integer.
    if (y >= static_cast<T>(NumericLimits<i64>::min()) && y < static_cast<T>(NumericLimits<i64>::max())) {
        i64 y_as_int = static_cast<i64>(y);
        if (y == static_cast<T>(y_as_int)) {
            T result = pow_int(x, static_cast<u64>(fabs<T>(y)));
            if (y < 0)
                result = static_cast<T>(1.0l) / result;
            return result;
        }
    }

    // FIXME: This formula suffers from error magnification.
    return exp2<T>(y * log2<T>(x));
}

template<Integral I, typename T>
constexpr I clamp_to(T value)
{
    constexpr auto max = static_cast<T>(NumericLimits<I>::max());
    if constexpr (max > 0) {
        if (value >= static_cast<T>(NumericLimits<I>::max()))
            return NumericLimits<I>::max();
    }

    constexpr auto min = static_cast<T>(NumericLimits<I>::min());
    if constexpr (min <= 0) {
        if (value <= static_cast<T>(NumericLimits<I>::min()))
            return NumericLimits<I>::min();
    }

    if constexpr (IsFloatingPoint<T>)
        return round_to<I>(value);

    return value;
}

// Wrap a to keep it in the range [-b, b].
template<typename T>
constexpr T wrap_to_range(T a, IdentityType<T> b)
{
    return fmod(fmod(a + b, 2 * b) + 2 * b, 2 * b) - b;
}

#undef CALL_BUILTIN
#undef CONSTEXPR_STATE
#undef AARCH64_INSTRUCTION
}

#if USING_AK_GLOBALLY
using AK::round_to;
#endif