inline unsigned int mask(int x) { return (1U << x) - 1; }
template <class T>
inline const T *cast_to(const void *p) {
  return reinterpret_cast<const T *>(p);
}
template <class T, size_t N>
size_t NumOfArray(const T (&)[N]) { return N; }
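// ExpVar<N>: constants and lookup table for the single-precision exp; its
// constructor fills tbl[] from 2^(i / n), and exp() later ORs those entries
// into the mantissa of a reconstructed float.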
template <size_t N = EXP_TABLE_SIZE>
struct ExpVar {
  float log_2 = ::logf(2.0f);
  for (int i = 0; i < 8; i++) {
  for (int i = 0; i < n; i++) {
    float y = pow(2.0f, (float)i / n);
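// ExpdVar<sbit_>: constants and lookup table for the double-precision exp;
// the constructor picks one of two C2/C3 coefficient sets and fills tbl[]
// from 2^(i / s).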
template <size_t sbit_ = EXPD_TABLE_SIZE>
struct ExpdVar {
  for (int i = 0; i < 2; i++) {
    C2[i] = 0.16667794882310216;
    C3[i] = 2.9997969303278795;
    C2[i] = 0.16666666685227835064;
    C3[i] = 3.0000000027955394;
  for (int i = 0; i < s; i++) {
    di.d = ::pow(2.0, i * (1.0 / s));
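// LogVar<N>: lookup table for the packed log over x in [1, 2); tbl[i].rev is
// a per-entry slope prescaled by 1 / (1 << 23) so it can be applied directly
// to raw mantissa bits.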
template <size_t N = LOG_TABLE_SIZE>
struct LogVar {
  const double e = 1 / double(1 << 24);
  const double h = 1 / double(1 << LEN);
  const size_t n = 1U << LEN;
  for (size_t i = 0; i < n; i++) {
    double x = 1 + double(i) / n;
    tbl[i].app = (float)a;
    double b = ::log(x + h - e);
    tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
    tbl[i].rev = (float)(1 / (x * (1 << 23)));
  for (int i = 0; i < 4; i++) {
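// Optional JIT backend: with FMATH_USE_XBYAK defined, ExpCode generates the
// scalar and packed exp kernels at runtime via Xbyak and reports generation
// failures on stderr.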
#ifdef FMATH_USE_XBYAK
struct ExpCode : public Xbyak::CodeGenerator {
  float (*exp_)(float);
  __m128 (*exp_ps_)(__m128);
  template <size_t N> ExpCode(const ExpVar<N> *self) {
    Xbyak::util::Cpu cpu;
    exp_ = getCode<float (*)(float)>();
    exp_ps_ = getCurr<__m128 (*)(__m128)>();
    makeExpPs(self, cpu);
  } catch (std::exception &e) {
    fprintf(stderr, "ExpCode ERR:%s\n", e.what());
    fprintf(stderr, "ExpCode ERR:unknown error\n");
  void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu &) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
    mov(base, (size_t)self);
    movss(xm0, ptr[esp + 4]);
    mulss(xm1, ptr[base + offsetof(Self, a)]);
    and_(edx, 0x7fffffff);
    cmp(edx, ExpVar<N>::f88);
    lea(edx, ptr[eax + (127 << self->s)]);
    and_(eax, mask(self->s));
    mov(eax, ptr[base + a * 4 + offsetof(Self, tbl)]);
    mulss(xm1, ptr[base + offsetof(Self, b)]);
    addss(xm0, ptr[base + offsetof(Self, f1)]);
    movss(ptr[esp + 4], xm0);
    minss(xm0, ptr[base + offsetof(Self, maxX)]);
    maxss(xm0, ptr[base + offsetof(Self, minX)]);
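  // makeExpPs: emits the 4-wide packed exp kernel; gathers the four table
  // entries with pinsrd when SSE4.1 is available (useSSE41) and with
  // movd/shufps otherwise.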
  void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu &cpu) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
    const Reg64 &d = rdx;
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
    const Reg32 &d = edx;
    const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
    movaps(xm0, ptr[rcx]);
    mov(base, (size_t)self);
    andps(xm5, ptr[base + offsetof(Self, i7fffffff)]);
    movaps(xm3, ptr[base + offsetof(Self, a)]);
    movaps(xm4, ptr[base + offsetof(Self, b)]);
    pcmpgtd(xm5, ptr[base + offsetof(Self, maxX)]);
    movaps(xm1, ptr[base + offsetof(Self, i127s)]);
    movaps(xm5, ptr[base + offsetof(Self, mask_s)]);
    movd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)]);
    addps(xm0, ptr[base + offsetof(Self, f1)]);
    pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 1);
    movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
    pinsrd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)], 2);
    pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 3);
    movd(xm2, ptr[base + a * 4 + offsetof(Self, tbl)]);
    movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
    shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
    minps(xm0, ptr[base + offsetof(Self, maxX)]);
    maxps(xm0, ptr[base + offsetof(Self, minX)]);
#ifdef FMATH_USE_XBYAK
  static const ExpCode &getInstance() {
    static const ExpCode expCode(&expVar);
template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;
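// Single-precision exp body: the SSE path clamps x to [minX, maxX], splits
// x * a into a table index v and a biased exponent u, and rebuilds a float
// whose mantissa comes from tbl[v]; the fallback path does the same using the
// round-to-int magic constant (1 << 23) + (1 << 22).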
  using namespace local;
  __m128 x1 = _mm_set_ss(x);
  int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
  x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
  x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
  int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
  unsigned int v = r & mask(expVar.s);
  float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
  int u = r >> expVar.s;
  fi.i = ((u + 127) << 23) | expVar.tbl[v];
  return (1 + t) * fi.f;
  x = std::min(x, expVar.maxX[0]);
  x = std::max(x, expVar.minX[0]);
  float t = x * expVar.a[0];
  const float magic = (1 << 23) + (1 << 22);
  t = x - (t - magic) * expVar.b[0];
  int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
  unsigned int v = fi.i & mask(expVar.s);
  fi.i = u | expVar.tbl[v];
  return (1 + t) * fi.f;
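// expd(double): rejects out-of-range input up front (+inf above ~709.78),
// then uses the double(3ULL << 51) rounding trick to split x * a into a table
// index and an exponent in one add, finishing with the C1..C3 cubic
// correction.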
  if (x <= -708.39641853226408) return 0; // underflows to 0
  if (x >= 709.78271289338397)
    return std::numeric_limits<double>::infinity();
  using namespace local;
  const double _b = double(uint64_t(3) << 51);
  __m128d b = _mm_load_sd(&_b);
  __m128d xx = _mm_load_sd(&x);
  __m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
  uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
  __m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
  uint64_t u = ((di + c.adj) >> c.sbit) << 52;
  _mm_store_sd(&t, _t);
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  memcpy(&did, &u, sizeof(did));
  double t = (di.d - b) * c.ra - x;
  uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  memcpy(buf, &x, sizeof(buf));
  buf[0] = expd(buf[0]);
  buf[1] = expd(buf[1]);
  memcpy(&y, buf, sizeof(buf));
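// Packed double exp on __m128d: broadcasts a, ra, adj and the C1..C3
// coefficients, clamps both lanes to [expMin, expMax], loads two table
// entries with movsd/unpacklo, and applies the same per-lane polynomial.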
  using namespace local;
  const double b = double(3ULL << 51);
  const __m128d mC1 = *cast_to<__m128d>(c.C1);
  const __m128d mC2 = *cast_to<__m128d>(c.C2);
  const __m128d mC3 = *cast_to<__m128d>(c.C3);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
  const __m128i madj = _mm_set1_epi32(c.adj);
  const double expMax[2] = {709.78271289338397, 709.78271289338397};
  const double expMin[2] = {-708.39641853226408, -708.39641853226408};
  x = _mm_min_pd(x, *(const __m128d *)expMax);
  x = _mm_max_pd(x, *(const __m128d *)expMin);
  __m128d d = _mm_mul_pd(x, ma);
  d = _mm_add_pd(d, _mm_set1_pd(b));
  int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
  int adr1 =
      _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
  __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
  __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
  iax = _mm_unpacklo_epi64(iaxL, iax);
  __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
  __m128i u = _mm_castpd_si128(d);
  u = _mm_add_epi64(u, madj);
  u = _mm_srli_epi64(u, c.sbit);
  u = _mm_slli_epi64(u, 52);
  u = _mm_or_si128(u, iax);
  __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
  y = _mm_mul_pd(y, mC2);
  y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
  y = _mm_mul_pd(y, _mm_castsi128_pd(u));
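// expd_v: exp over an array of doubles; the AVX2 path below handles four
// lanes per iteration with a gathered table load, the SSE2 path two lanes,
// and a tail loop covers the remaining elements.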
inline void expd_v(double *px, size_t n) {
  using namespace local;
  const double b = double(3ULL << 51);
  const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
  const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
  const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
  const __m256d ma = _mm256_set1_pd(c.a);
  const __m256d mra = _mm256_set1_pd(c.ra);
  const __m256i madj = _mm256_set1_epi64x(c.adj);
  const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
  const __m256d expMax = _mm256_set1_pd(709.78272569338397);
  const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 4) {
    __m256d x = _mm256_load_pd(px);
    x = _mm256_min_pd(x, expMax);
    x = _mm256_max_pd(x, expMin);
    __m256d d = _mm256_mul_pd(x, ma);
    d = _mm256_add_pd(d, _mm256_set1_pd(b));
    __m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
    __m256i iax = _mm256_i64gather_epi64((const long long *)c.tbl, adr, 8);
    __m256d t = _mm256_sub_pd(
        _mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
    __m256i u = _mm256_castpd_si256(d);
    u = _mm256_add_epi64(u, madj);
    u = _mm256_srli_epi64(u, c.sbit);
    u = _mm256_slli_epi64(u, 52);
    u = _mm256_or_si256(u, iax);
    __m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
    y = _mm256_mul_pd(y, mC2);
    y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
    _mm256_store_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
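  // SSE2 variant of the same loop: two lanes per iteration, with the table
  // entries fetched individually through adr0/adr1 instead of a gather.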
  const __m128d mC1 = _mm_set1_pd(c.C1[0]);
  const __m128d mC2 = _mm_set1_pd(c.C2[0]);
  const __m128d mC3 = _mm_set1_pd(c.C3[0]);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
#if defined(__x86_64__) || defined(_WIN64)
  const __m128i madj = _mm_set1_epi64x(c.adj);
  const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
  const __m128d expMax = _mm_set1_pd(709.78272569338397);
  const __m128d expMin = _mm_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 2) {
    __m128d x = _mm_load_pd(px);
    x = _mm_min_pd(x, expMax);
    x = _mm_max_pd(x, expMin);
    __m128d d = _mm_mul_pd(x, ma);
    d = _mm_add_pd(d, _mm_set1_pd(b));
    int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
    int adr1 =
        _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
    __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
    __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
    iax = _mm_unpacklo_epi64(iaxL, iax);
    __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
    __m128i u = _mm_castpd_si128(d);
    u = _mm_add_epi64(u, madj);
    u = _mm_srli_epi64(u, c.sbit);
    u = _mm_slli_epi64(u, 52);
    u = _mm_or_si128(u, iax);
    __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
    y = _mm_mul_pd(y, mC2);
    y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
    _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
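  // Tail loop for the r elements that do not fill a complete vector.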
  for (size_t i = 0; i < r; i++) {
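// Packed single-precision exp on __m128: clamps the four lanes, derives the
// table indices v4 and biased exponents u4 from r = round(x * a), and fetches
// tbl entries via an AVX2 gather, SSE4.1 _mm_insert_epi32, or plain SSE
// shuffles, depending on the available instruction set.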
  using namespace local;
  __m128i limit =
      _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
  _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
  x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
  x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
  __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
  __m128 t =
      _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
  t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));
  __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
  __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
  u4 = _mm_srli_epi32(u4, expVar.s);
  u4 = _mm_slli_epi32(u4, 23);
  __m128i ti = _mm_i32gather_epi32((const int *)expVar.tbl, v4, 4);
  __m128 t0 = _mm_castsi128_ps(ti);
  unsigned int v0, v1, v2, v3;
  v0 = _mm_cvtsi128_si32(v4);
  v1 = _mm_extract_epi16(v4, 2);
  v2 = _mm_extract_epi16(v4, 4);
  v3 = _mm_extract_epi16(v4, 6);
  __m128 t0, t1, t2, t3;
  t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
  t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
  t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
  t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));
  t1 = _mm_movelh_ps(t1, t3);
  t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
  t0 = _mm_movelh_ps(t0, t2);
  t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
  t0 = _mm_or_ps(t0, t1);
  __m128i ti = _mm_castps_si128(_mm_load_ss((const float *)&expVar.tbl[v0]));
  ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
  __m128 t0 = _mm_castsi128_ps(ti);
  t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
  t = _mm_mul_ps(t, t0);
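// Packed single-precision log on __m128: splits each lane into exponent and
// mantissa with the m1..m5 masks, looks up tbl[i].app/.rev for the four
// indices, and returns (a + app) + b2 * rev.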
  using namespace local;
  __m128i xi = _mm_castps_si128(x);
  __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)),
  __m128 a = _mm_cvtepi32_ps(
      _mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)),
                    *cast_to<__m128i>(logVar.m5)));
  __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));
  a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4));
  unsigned int i0 = _mm_cvtsi128_si32(idx);
  unsigned int i1 = _mm_extract_epi16(idx, 2);
  unsigned int i2 = _mm_extract_epi16(idx, 4);
  unsigned int i3 = _mm_extract_epi16(idx, 6);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i1 = _mm_cvtsi128_si32(idx);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i2 = _mm_cvtsi128_si32(idx);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i3 = _mm_cvtsi128_si32(idx);
  __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
  __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
  __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
  H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
  rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
  rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));
  a = _mm_add_ps(a, app);
  rev = _mm_mul_ps(b2, rev);
  return _mm_add_ps(a, rev);
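// Usage sketch, assuming the surrounding header exposes these kernels as
// fmath::exp, fmath::expd, fmath::exp_ps and fmath::log_ps (the wrappers are
// not part of this fragment):
//   float  y  = fmath::exp(1.5f);                  // scalar float exp
//   double yd = fmath::expd(1.5);                   // scalar double exp
//   __m128 v  = fmath::exp_ps(_mm_set1_ps(0.5f));   // 4-wide packed exp
//   __m128 w  = fmath::log_ps(_mm_set1_ps(2.0f));   // 4-wide packed log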