inline unsigned int mask(int x) { return (1U << x) - 1; }
template <class T>
inline const T *cast_to(const void *p) {
  return reinterpret_cast<const T *>(p);
}
template <class T, size_t N>
size_t NumOfArray(const T (&)[N]) { return N; }
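// ExpVar<N>: constants and lookup table for the single-precision exp; its
// constructor fills tbl[] from 2^(i / n), and exp() later ORs those entries
// into the mantissa of a reconstructed float.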
template <size_t N = EXP_TABLE_SIZE>
struct ExpVar {
  float log_2 = ::logf(2.0f);
  for (int i = 0; i < 8; i++) {
  for (int i = 0; i < n; i++) {
    float y = pow(2.0f, (float)i / n);
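// ExpdVar<sbit_>: constants and lookup table for the double-precision exp;
// the constructor picks one of two C2/C3 coefficient sets and fills tbl[]
// from 2^(i / s).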
template <size_t sbit_ = EXPD_TABLE_SIZE>
struct ExpdVar {
  for (int i = 0; i < 2; i++) {
    C2[i] = 0.16667794882310216;
    C3[i] = 2.9997969303278795;
    C2[i] = 0.16666666685227835064;
    C3[i] = 3.0000000027955394;
  for (int i = 0; i < s; i++) {
    di.d = ::pow(2.0, i * (1.0 / s));
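// LogVar<N>: lookup table for the packed log over x in [1, 2); tbl[i].rev is
// a per-entry slope prescaled by 1 / (1 << 23) so it can be applied directly
// to raw mantissa bits.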
template <size_t N = LOG_TABLE_SIZE>
struct LogVar {
  const double e = 1 / double(1 << 24);
  const double h = 1 / double(1 << LEN);
  const size_t n = 1U << LEN;
  for (size_t i = 0; i < n; i++) {
    double x = 1 + double(i) / n;
    tbl[i].app = (float)a;
    double b = ::log(x + h - e);
    tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
    tbl[i].rev = (float)(1 / (x * (1 << 23)));
  for (int i = 0; i < 4; i++) {
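// Optional JIT backend: with FMATH_USE_XBYAK defined, ExpCode generates the
// scalar and packed exp kernels at runtime via Xbyak and reports generation
// failures on stderr.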
#ifdef FMATH_USE_XBYAK
struct ExpCode : public Xbyak::CodeGenerator {
  float (*exp_)(float);
  __m128 (*exp_ps_)(__m128);
  template <size_t N> ExpCode(const ExpVar<N> *self) {
    Xbyak::util::Cpu cpu;
    exp_ = getCode<float (*)(float)>();
    exp_ps_ = getCurr<__m128 (*)(__m128)>();
    makeExpPs(self, cpu);
  } catch (std::exception &e) {
    fprintf(stderr, "ExpCode ERR:%s\n", e.what());
    fprintf(stderr, "ExpCode ERR:unknown error\n");
  void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu &) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
    mov(base, (size_t)self);
    movss(xm0, ptr[esp + 4]);
    mulss(xm1, ptr[base + offsetof(Self, a)]);
    and_(edx, 0x7fffffff);
    cmp(edx, ExpVar<N>::f88);
    lea(edx, ptr[eax + (127 << self->s)]);
    and_(eax, mask(self->s));
    mov(eax, ptr[base + a * 4 + offsetof(Self, tbl)]);
    mulss(xm1, ptr[base + offsetof(Self, b)]);
    addss(xm0, ptr[base + offsetof(Self, f1)]);
    movss(ptr[esp + 4], xm0);
    minss(xm0, ptr[base + offsetof(Self, maxX)]);
    maxss(xm0, ptr[base + offsetof(Self, minX)]);
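  // makeExpPs: emits the 4-wide packed exp kernel; gathers the four table
  // entries with pinsrd when SSE4.1 is available (useSSE41) and with
  // movd/shufps otherwise.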
  void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu &cpu) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
    const Reg64 &d = rdx;
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
    const Reg32 &d = edx;
    const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
    movaps(xm0, ptr[rcx]);
    mov(base, (size_t)self);
    andps(xm5, ptr[base + offsetof(Self, i7fffffff)]);
    movaps(xm3, ptr[base + offsetof(Self, a)]);
    movaps(xm4, ptr[base + offsetof(Self, b)]);
    pcmpgtd(xm5, ptr[base + offsetof(Self, maxX)]);
    movaps(xm1, ptr[base + offsetof(Self, i127s)]);
    movaps(xm5, ptr[base + offsetof(Self, mask_s)]);
    movd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)]);
    addps(xm0, ptr[base + offsetof(Self, f1)]);
    pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 1);
    movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
    pinsrd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)], 2);
    pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 3);
    movd(xm2, ptr[base + a * 4 + offsetof(Self, tbl)]);
    movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
    shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
    minps(xm0, ptr[base + offsetof(Self, maxX)]);
    maxps(xm0, ptr[base + offsetof(Self, minX)]);
#ifdef FMATH_USE_XBYAK
  static const ExpCode &getInstance() {
    static const ExpCode expCode(&expVar);
template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;
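// Single-precision exp body: the SSE path clamps x to [minX, maxX], splits
// x * a into a table index v and a biased exponent u, and rebuilds a float
// whose mantissa comes from tbl[v]; the fallback path does the same using the
// round-to-int magic constant (1 << 23) + (1 << 22).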
  using namespace local;
  __m128 x1 = _mm_set_ss(x);
  int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
  x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
  x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
  int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
  unsigned int v = r & mask(expVar.s);
  float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
  int u = r >> expVar.s;
  fi.i = ((u + 127) << 23) | expVar.tbl[v];
  return (1 + t) * fi.f;
  x = std::min(x, expVar.maxX[0]);
  x = std::max(x, expVar.minX[0]);
  float t = x * expVar.a[0];
  const float magic = (1 << 23) + (1 << 22);
  t = x - (t - magic) * expVar.b[0];
  int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
  unsigned int v = fi.i & mask(expVar.s);
  fi.i = u | expVar.tbl[v];
  return (1 + t) * fi.f;
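// expd(double): rejects out-of-range input up front (+inf above ~709.78),
// then uses the double(3ULL << 51) rounding trick to split x * a into a table
// index and an exponent in one add, finishing with the C1..C3 cubic
// correction.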
  if (x <= -708.39641853226408) return 0; // underflows to 0
  if (x >= 709.78271289338397)
    return std::numeric_limits<double>::infinity();
  using namespace local;
  const double _b = double(uint64_t(3) << 51);
  __m128d b = _mm_load_sd(&_b);
  __m128d xx = _mm_load_sd(&x);
  __m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
  uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
  __m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
  uint64_t u = ((di + c.adj) >> c.sbit) << 52;
  _mm_store_sd(&t, _t);
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  memcpy(&did, &u, sizeof(did));
  double t = (di.d - b) * c.ra - x;
  uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  memcpy(buf, &x, sizeof(buf));
  buf[0] = expd(buf[0]);
  buf[1] = expd(buf[1]);
  memcpy(&y, buf, sizeof(buf));
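// Packed double exp on __m128d: broadcasts a, ra, adj and the C1..C3
// coefficients, clamps both lanes to [expMin, expMax], loads two table
// entries with movsd/unpacklo, and applies the same per-lane polynomial.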
  using namespace local;
  const double b = double(3ULL << 51);
  const __m128d mC1 = *cast_to<__m128d>(c.C1);
  const __m128d mC2 = *cast_to<__m128d>(c.C2);
  const __m128d mC3 = *cast_to<__m128d>(c.C3);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
  const __m128i madj = _mm_set1_epi32(c.adj);
  const double expMax[2] = {709.78271289338397, 709.78271289338397};
  const double expMin[2] = {-708.39641853226408, -708.39641853226408};
  x = _mm_min_pd(x, *(const __m128d *)expMax);
  x = _mm_max_pd(x, *(const __m128d *)expMin);
  __m128d d = _mm_mul_pd(x, ma);
  d = _mm_add_pd(d, _mm_set1_pd(b));
  int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
  int adr1 =
      _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
  __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
  __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
  iax = _mm_unpacklo_epi64(iaxL, iax);
  __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
  __m128i u = _mm_castpd_si128(d);
  u = _mm_add_epi64(u, madj);
  u = _mm_srli_epi64(u, c.sbit);
  u = _mm_slli_epi64(u, 52);
  u = _mm_or_si128(u, iax);
  __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
  y = _mm_mul_pd(y, mC2);
  y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
  y = _mm_mul_pd(y, _mm_castsi128_pd(u));
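// expd_v: exp over an array of doubles; the AVX2 path below handles four
// lanes per iteration with a gathered table load, the SSE2 path two lanes,
// and a tail loop covers the remaining elements.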
inline void expd_v(double *px, size_t n) {
  using namespace local;
  const double b = double(3ULL << 51);
  const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
  const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
  const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
  const __m256d ma = _mm256_set1_pd(c.a);
  const __m256d mra = _mm256_set1_pd(c.ra);
  const __m256i madj = _mm256_set1_epi64x(c.adj);
  const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
  const __m256d expMax = _mm256_set1_pd(709.78272569338397);
  const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 4) {
    __m256d x = _mm256_load_pd(px);
    x = _mm256_min_pd(x, expMax);
    x = _mm256_max_pd(x, expMin);
    __m256d d = _mm256_mul_pd(x, ma);
    d = _mm256_add_pd(d, _mm256_set1_pd(b));
    __m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
    __m256i iax = _mm256_i64gather_epi64((const long long *)c.tbl, adr, 8);
    __m256d t = _mm256_sub_pd(
        _mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
    __m256i u = _mm256_castpd_si256(d);
    u = _mm256_add_epi64(u, madj);
    u = _mm256_srli_epi64(u, c.sbit);
    u = _mm256_slli_epi64(u, 52);
    u = _mm256_or_si256(u, iax);
    __m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
    y = _mm256_mul_pd(y, mC2);
    y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
    _mm256_store_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
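  // SSE2 variant of the same loop: two lanes per iteration, with the table
  // entries fetched individually through adr0/adr1 instead of a gather.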
  const __m128d mC1 = _mm_set1_pd(c.C1[0]);
  const __m128d mC2 = _mm_set1_pd(c.C2[0]);
  const __m128d mC3 = _mm_set1_pd(c.C3[0]);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
#if defined(__x86_64__) || defined(_WIN64)
  const __m128i madj = _mm_set1_epi64x(c.adj);
  const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
  const __m128d expMax = _mm_set1_pd(709.78272569338397);
  const __m128d expMin = _mm_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 2) {
    __m128d x = _mm_load_pd(px);
    x = _mm_min_pd(x, expMax);
    x = _mm_max_pd(x, expMin);
    __m128d d = _mm_mul_pd(x, ma);
    d = _mm_add_pd(d, _mm_set1_pd(b));
    int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
    int adr1 =
        _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
    __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
    __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
    iax = _mm_unpacklo_epi64(iaxL, iax);
    __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
    __m128i u = _mm_castpd_si128(d);
    u = _mm_add_epi64(u, madj);
    u = _mm_srli_epi64(u, c.sbit);
    u = _mm_slli_epi64(u, 52);
    u = _mm_or_si128(u, iax);
    __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
    y = _mm_mul_pd(y, mC2);
    y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
    _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
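  // Tail loop for the r elements that do not fill a complete vector.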
  for (size_t i = 0; i < r; i++) {
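// Packed single-precision exp on __m128: clamps the four lanes, derives the
// table indices v4 and biased exponents u4 from r = round(x * a), and fetches
// tbl entries via an AVX2 gather, SSE4.1 _mm_insert_epi32, or plain SSE
// shuffles, depending on the available instruction set.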
  using namespace local;
  __m128i limit =
      _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
  _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
  x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
  x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
  __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
  __m128 t =
      _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
  t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));
  __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
  __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
  u4 = _mm_srli_epi32(u4, expVar.s);
  u4 = _mm_slli_epi32(u4, 23);
  __m128i ti = _mm_i32gather_epi32((const int *)expVar.tbl, v4, 4);
  __m128 t0 = _mm_castsi128_ps(ti);
  unsigned int v0, v1, v2, v3;
  v0 = _mm_cvtsi128_si32(v4);
  v1 = _mm_extract_epi16(v4, 2);
  v2 = _mm_extract_epi16(v4, 4);
  v3 = _mm_extract_epi16(v4, 6);
  __m128 t0, t1, t2, t3;
  t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
  t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
  t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
  t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));
  t1 = _mm_movelh_ps(t1, t3);
  t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
  t0 = _mm_movelh_ps(t0, t2);
  t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
  t0 = _mm_or_ps(t0, t1);
  __m128i ti = _mm_castps_si128(_mm_load_ss((const float *)&expVar.tbl[v0]));
  ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
  __m128 t0 = _mm_castsi128_ps(ti);
  t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
  t = _mm_mul_ps(t, t0);
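// Packed single-precision log on __m128: splits each lane into exponent and
// mantissa with the m1..m5 masks, looks up tbl[i].app/.rev for the four
// indices, and returns (a + app) + b2 * rev.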
  using namespace local;
  __m128i xi = _mm_castps_si128(x);
  __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)),
  __m128 a = _mm_cvtepi32_ps(
      _mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)),
                    *cast_to<__m128i>(logVar.m5)));
  __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));
  a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4));
  unsigned int i0 = _mm_cvtsi128_si32(idx);
  unsigned int i1 = _mm_extract_epi16(idx, 2);
  unsigned int i2 = _mm_extract_epi16(idx, 4);
  unsigned int i3 = _mm_extract_epi16(idx, 6);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i1 = _mm_cvtsi128_si32(idx);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i2 = _mm_cvtsi128_si32(idx);
  idx = _mm_srli_si128(idx, 4);
  unsigned int i3 = _mm_cvtsi128_si32(idx);
  __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
  __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
  __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
  H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
  rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
  rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));
  a = _mm_add_ps(a, app);
  rev = _mm_mul_ps(b2, rev);
  return _mm_add_ps(a, rev);
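// Usage sketch, assuming the surrounding header exposes these kernels as
// fmath::exp, fmath::expd, fmath::exp_ps and fmath::log_ps (the wrappers are
// not part of this fragment):
//   float  y  = fmath::exp(1.5f);                  // scalar float exp
//   double yd = fmath::expd(1.5);                   // scalar double exp
//   __m128 v  = fmath::exp_ps(_mm_set1_ps(0.5f));   // 4-wide packed exp
//   __m128 w  = fmath::log_ps(_mm_set1_ps(2.0f));   // 4-wide packed log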