openjij
Framework for the Ising model and QUBO.
fmath.hpp
#pragma once
/*
  function prototype list

  float fmath::exp(float);
  double fmath::expd(double);
  float fmath::log(float);

  __m128 fmath::exp_ps(__m128);
  __m256 fmath::exp_ps256(__m256);
  __m128 fmath::log_ps(__m128);

  double fmath::expd_v(double *, size_t n);

  if FMATH_USE_XBYAK is defined then the Xbyak version is used
*/
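/*
  hypothetical usage sketch (caller code, not part of this header):

    #include "fmath.hpp"
    float y = fmath::exp(1.0f);        // fast approximation of std::exp
    double buf[4] = {0.0, 0.5, 1.0, 2.0};
    fmath::expd_v(buf, 4);             // in-place exp over a double array
*/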
//#define FMATH_USE_XBYAK

#include <assert.h>
#include <float.h>
#include <limits>
#include <math.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h> // for memcpy
#if defined(_WIN32) && !defined(__GNUC__)
#include <intrin.h>
#ifndef MIE_ALIGN
#define MIE_ALIGN(x) __declspec(align(x))
#endif
#else
#ifndef __GNUC_PREREQ
#define __GNUC_PREREQ(major, minor) \
  ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || \
    !defined(__GNUC__)
/* GCC >= 4.4 or clang or non-GCC compilers */
#include <x86intrin.h>
#elif __GNUC_PREREQ(4, 1)
/* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header
 */
#include <emmintrin.h>
#endif
#ifndef MIE_ALIGN
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#ifndef MIE_PACK
#define MIE_PACK(x, y, z, w) ((x)*64 + (y)*16 + (z)*4 + (w))
#endif
#ifdef FMATH_USE_XBYAK
#define XBYAK_NO_OP_NAMES
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#endif

namespace fmath {

namespace local {

const size_t EXP_TABLE_SIZE = 10;
const size_t EXPD_TABLE_SIZE = 11;
const size_t LOG_TABLE_SIZE = 12;

typedef unsigned long long uint64_t;

union fi {
  float f;
  unsigned int i;
};

union di {
  double d;
  uint64_t i;
};

inline unsigned int mask(int x) { return (1U << x) - 1; }

inline uint64_t mask64(int x) { return (1ULL << x) - 1; }

template <class T> inline const T *cast_to(const void *p) {
  return reinterpret_cast<const T *>(p);
}

template <class T, size_t N> size_t NumOfArray(const T (&)[N]) { return N; }

/*
  exp(88.722839f) = inf ; 0x42b17218
  exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
  exp(-103.972081f) = 0 ; 0xc2cff1b5
*/
template <size_t N = EXP_TABLE_SIZE> struct ExpVar {
  enum {
    s = N,
    n = 1 << s,
    f88 = 0x42b00000 /* 88.0 */
  };
  float minX[8];
  float maxX[8];
  float a[8];
  float b[8];
  float f1[8];
  unsigned int i127s[8];
  unsigned int mask_s[8];
  unsigned int i7fffffff[8];
  unsigned int tbl[n];
  ExpVar() {
    float log_2 = ::logf(2.0f);
    for (int i = 0; i < 8; i++) {
      maxX[i] = 88;
      minX[i] = -88;
      a[i] = n / log_2;
      b[i] = log_2 / n;
      f1[i] = 1.0f;
      i127s[i] = 127 << s;
      i7fffffff[i] = 0x7fffffff;
      mask_s[i] = mask(s);
    }

    for (int i = 0; i < n; i++) {
      float y = pow(2.0f, (float)i / n);
      fi fi;
      fi.f = y;
      tbl[i] = fi.i & mask(23);
    }
  }
};

template <size_t sbit_ = EXPD_TABLE_SIZE> struct ExpdVar {
  enum {
    sbit = sbit_,
    s = 1UL << sbit,
    adj = (1UL << (sbit + 10)) - (1UL << sbit)
  };
  // A = 1, B = 1, C = 1/2, D = 1/6
  double C1[2]; // A
  double C2[2]; // D
  double C3[2]; // C/D
  uint64_t tbl[s];
  double a;
  double ra;
  ExpdVar() : a(s / ::log(2.0)), ra(1 / a) {
    for (int i = 0; i < 2; i++) {
#if 0
      C1[i] = 1.0;
      C2[i] = 0.16667794882310216;
      C3[i] = 2.9997969303278795;
#else
      C1[i] = 1.0;
      C2[i] = 0.16666666685227835064;
      C3[i] = 3.0000000027955394;
#endif
    }
    for (int i = 0; i < s; i++) {
      di di;
      di.d = ::pow(2.0, i * (1.0 / s));
      tbl[i] = di.i & mask64(52);
    }
  }
};

template <size_t N = LOG_TABLE_SIZE> struct LogVar {
  enum { LEN = N - 1 };
  unsigned int m1[4]; // 0
  unsigned int m2[4]; // 16
  unsigned int m3[4]; // 32
  float m4[4];        // 48
  unsigned int m5[4]; // 64
  struct {
    float app;
    float rev;
  } tbl[1 << LEN];
  float c_log2;
  LogVar() : c_log2(::logf(2.0f) / (1 << 23)) {
    const double e = 1 / double(1 << 24);
    const double h = 1 / double(1 << LEN);
    const size_t n = 1U << LEN;
    for (size_t i = 0; i < n; i++) {
      double x = 1 + double(i) / n;
      double a = ::log(x);
      tbl[i].app = (float)a;
      if (i < n - 1) {
        double b = ::log(x + h - e);
        tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
      } else {
        tbl[i].rev = (float)(1 / (x * (1 << 23)));
      }
    }
    for (int i = 0; i < 4; i++) {
      m1[i] = mask(8) << 23;
      m2[i] = mask(LEN) << (23 - LEN);
      m3[i] = mask(23 - LEN);
      m4[i] = c_log2;
      m5[i] = 127U << 23;
    }
  }
};

#ifdef FMATH_USE_XBYAK
struct ExpCode : public Xbyak::CodeGenerator {
  float (*exp_)(float);
  __m128 (*exp_ps_)(__m128);
  template <size_t N> ExpCode(const ExpVar<N> *self) {
    Xbyak::util::Cpu cpu;
    try {
      makeExp(self, cpu);
      exp_ = getCode<float (*)(float)>();
      align(16);
      exp_ps_ = getCurr<__m128 (*)(__m128)>();
      makeExpPs(self, cpu);
      return;
    } catch (std::exception &e) {
      fprintf(stderr, "ExpCode ERR:%s\n", e.what());
    } catch (...) {
      fprintf(stderr, "ExpCode ERR:unknown error\n");
    }
    ::exit(1);
  }
  template <size_t N>
  void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu & /*cpu*/) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;

    inLocalLabel();
#ifdef XBYAK64
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
#else
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
#endif

    mov(base, (size_t)self);

#ifdef XBYAK32
    movss(xm0, ptr[esp + 4]);
#endif
    L(".retry");
    movaps(xm1, xm0);
    movd(edx, xm0);
    mulss(xm1, ptr[base + offsetof(Self, a)]); // t
    and_(edx, 0x7fffffff);
    cvtss2si(eax, xm1);
    cmp(edx, ExpVar<N>::f88);
    jg(".overflow");
    lea(edx, ptr[eax + (127 << self->s)]);
    cvtsi2ss(xm1, eax);
    and_(eax, mask(self->s));                          // v
    mov(eax, ptr[base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v]
    shr(edx, self->s);
    mulss(xm1, ptr[base + offsetof(Self, b)]);
    shl(edx, 23);    // u
    subss(xm0, xm1); // t
    or_(eax, edx);   // fi.f
    addss(xm0, ptr[base + offsetof(Self, f1)]);
    movd(xm1, eax);
    mulss(xm0, xm1);
#ifdef XBYAK32
    movss(ptr[esp + 4], xm0);
    fld(dword[esp + 4]);
#endif
    ret();
    L(".overflow");
    minss(xm0, ptr[base + offsetof(Self, maxX)]);
    maxss(xm0, ptr[base + offsetof(Self, minX)]);
    jmp(".retry");
    outLocalLabel();
  }
  template <size_t N>
  void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu &cpu) {
    typedef ExpVar<N> Self;
    using namespace local;
    using namespace Xbyak;

    inLocalLabel();
#ifdef XBYAK64
    const Reg64 &base = rcx;
    const Reg64 &a = rax;
    const Reg64 &d = rdx;
#else
    const Reg32 &base = ecx;
    const Reg32 &a = eax;
    const Reg32 &d = edx;
#endif

    /*
      if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and retry;
      minps and maxps are very slow, so they are kept off the common path
    */
    const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
    movaps(xm0, ptr[rcx]);
#endif
    mov(base, (size_t)self);
    L(".retry");
    movaps(xm5, xm0);
    andps(xm5, ptr[base + offsetof(Self, i7fffffff)]);
    movaps(xm3, ptr[base + offsetof(Self, a)]);
    movaps(xm4, ptr[base + offsetof(Self, b)]);
    pcmpgtd(xm5, ptr[base + offsetof(Self, maxX)]);
    mulps(xm3, xm0);
    movaps(xm1, ptr[base + offsetof(Self, i127s)]);
    pmovmskb(eax, xm5);
    movaps(xm5, ptr[base + offsetof(Self, mask_s)]);
    cvtps2dq(xm2, xm3);
    pand(xm5, xm2);
    cvtdq2ps(xm3, xm2);
    test(eax, eax);
    jnz(".overflow");
    paddd(xm1, xm2);
    movd(eax, xm5);
    mulps(xm4, xm3);
    pextrw(edx, xm5, 2);
    subps(xm0, xm4);
    movd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)]);
    addps(xm0, ptr[base + offsetof(Self, f1)]);
    pextrw(eax, xm5, 4);
    if (useSSE41) {
      pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 1);
    } else {
      movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
      movlhps(xm4, xm3);
    }
    pextrw(edx, xm5, 6);
    psrld(xm1, self->s);
    pslld(xm1, 23);
    if (useSSE41) {
      pinsrd(xm4, ptr[base + a * 4 + offsetof(Self, tbl)], 2);
      pinsrd(xm4, ptr[base + d * 4 + offsetof(Self, tbl)], 3);
    } else {
      movd(xm2, ptr[base + a * 4 + offsetof(Self, tbl)]);
      movd(xm3, ptr[base + d * 4 + offsetof(Self, tbl)]);
      movlhps(xm2, xm3);
      shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
    }
    por(xm1, xm4);
    mulps(xm0, xm1);
    ret();
    L(".overflow");
    minps(xm0, ptr[base + offsetof(Self, maxX)]);
    maxps(xm0, ptr[base + offsetof(Self, minX)]);
    jmp(".retry");
    outLocalLabel();
  }
};
#endif

/* to define static variables in fmath.hpp */
template <size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE,
          size_t EXPD_N = EXPD_TABLE_SIZE>
struct C {
  static const ExpVar<EXP_N> expVar;
  static const LogVar<LOG_N> logVar;
  static const ExpdVar<EXPD_N> expdVar;
#ifdef FMATH_USE_XBYAK
  static const ExpCode &getInstance() {
    static const ExpCode expCode(&expVar);
    return expCode;
  }
#endif
};

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;

template <size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32)
const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;

} // namespace local

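/*
  algorithm sketch: exp(x) = 2^(x / log 2) = 2^u * 2^(v / n), where
  r = round(x * n / log 2) is split into u = r >> s (exponent bits) and
  v = r & mask(s) (table index); tbl[v] holds the 23-bit mantissa of
  2^(v / n), and the residual t is corrected to first order by (1 + t).
*/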
#ifdef FMATH_USE_XBYAK
inline float expC(float x)
#else
inline float exp(float x)
#endif
{
  using namespace local;
  const ExpVar<> &expVar = C<>::expVar;

#if 1
  __m128 x1 = _mm_set_ss(x);

  int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
  if (limit > ExpVar<>::f88) {
    x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
    x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
  }

  int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
  unsigned int v = r & mask(expVar.s);
  float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
  int u = r >> expVar.s;
  fi fi;
  fi.i = ((u + 127) << 23) | expVar.tbl[v];
  return (1 + t) * fi.f;
#else
  x = std::min(x, expVar.maxX[0]);
  x = std::max(x, expVar.minX[0]);
  float t = x * expVar.a[0];
  const float magic = (1 << 23) + (1 << 22); // to round
  t += magic;
  fi fi;
  fi.f = t;
  t = x - (t - magic) * expVar.b[0];
  int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
  unsigned int v = fi.i & mask(expVar.s);
  fi.i = u | expVar.tbl[v];
  return (1 + t) * fi.f;
// return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n);
#endif
}

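/*
  expd uses the "magic number" trick: adding b = 3 * 2^51 to x * a shifts
  the rounded integer part of x * a into the low mantissa bits of the
  double, so the table index and exponent can be extracted with integer
  operations instead of an explicit float-to-int conversion.
*/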
inline double expd(double x) {
  if (x <= -708.39641853226408)
    return 0;
  if (x >= 709.78271289338397)
    return std::numeric_limits<double>::infinity();
  using namespace local;
  const ExpdVar<> &c = C<>::expdVar;
#if 1
  const double _b = double(uint64_t(3) << 51);
  __m128d b = _mm_load_sd(&_b);
  __m128d xx = _mm_load_sd(&x);
  __m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
  uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
  uint64_t iax = c.tbl[di & mask(c.sbit)];
  __m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
  uint64_t u = ((di + c.adj) >> c.sbit) << 52;
  double t;
  _mm_store_sd(&t, _t);
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
  double did;
  u |= iax;
  memcpy(&did, &u, sizeof(did));
  return y * did;
#else
  /*
    remark : the -ffast-math option of gcc may generate bad code for
    fmath::expd
  */
  const uint64_t b = 3ULL << 51;
  di di;
  di.d = x * c.a + b;
  uint64_t iax = c.tbl[di.i & mask(c.sbit)];

  double t = (di.d - b) * c.ra - x;
  uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
  double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];

  di.i = u | iax;
  return y * di.d;
#endif
}

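// two-lane SSE2 variant of expd(): the same table lookup, magic-number
// rounding, and cubic correction, applied to both doubles of the register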
inline __m128d exp_pd(__m128d x) {
#if 0 // faster on Haswell
  MIE_ALIGN(16) double buf[2];
  memcpy(buf, &x, sizeof(buf));
  buf[0] = expd(buf[0]);
  buf[1] = expd(buf[1]);
  __m128d y;
  memcpy(&y, buf, sizeof(buf));
  return y;
#else // faster on Skylake
  using namespace local;
  const ExpdVar<> &c = C<>::expdVar;
  const double b = double(3ULL << 51);
  const __m128d mC1 = *cast_to<__m128d>(c.C1);
  const __m128d mC2 = *cast_to<__m128d>(c.C2);
  const __m128d mC3 = *cast_to<__m128d>(c.C3);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
  const __m128i madj = _mm_set1_epi32(c.adj);
  MIE_ALIGN(16)
  const double expMax[2] = {709.78271289338397, 709.78271289338397};
  MIE_ALIGN(16)
  const double expMin[2] = {-708.39641853226408, -708.39641853226408};
  x = _mm_min_pd(x, *(const __m128d *)expMax);
  x = _mm_max_pd(x, *(const __m128d *)expMin);

  __m128d d = _mm_mul_pd(x, ma);
  d = _mm_add_pd(d, _mm_set1_pd(b));
  int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
  int adr1 =
      _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
  __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
  __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
  iax = _mm_unpacklo_epi64(iaxL, iax);

  __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
  __m128i u = _mm_castpd_si128(d);
  u = _mm_add_epi64(u, madj);
  u = _mm_srli_epi64(u, c.sbit);
  u = _mm_slli_epi64(u, 52);
  u = _mm_or_si128(u, iax);
  __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
  y = _mm_mul_pd(y, mC2);
  y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
  y = _mm_mul_pd(y, _mm_castsi128_pd(u));
  return y;
#endif
}

/*
  px : pointer to array of double
  n : size of array
*/
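/*
  note: the loops below read and write with aligned loads/stores
  (_mm_load_pd / _mm256_load_pd), so px is assumed to be suitably aligned;
  the n % 2 (or n % 4) leftover elements are handled by the scalar expd().
*/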
inline void expd_v(double *px, size_t n) {
  using namespace local;
  const ExpdVar<> &c = C<>::expdVar;
  const double b = double(3ULL << 51);
#ifdef __AVX2__
  size_t r = n & 3;
  n &= ~3;
  const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
  const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
  const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
  const __m256d ma = _mm256_set1_pd(c.a);
  const __m256d mra = _mm256_set1_pd(c.ra);
  const __m256i madj = _mm256_set1_epi64x(c.adj);
  const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
  const __m256d expMax = _mm256_set1_pd(709.78272569338397);
  const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 4) {
    __m256d x = _mm256_load_pd(px);
    x = _mm256_min_pd(x, expMax);
    x = _mm256_max_pd(x, expMin);

    __m256d d = _mm256_mul_pd(x, ma);
    d = _mm256_add_pd(d, _mm256_set1_pd(b));
    __m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
    __m256i iax = _mm256_i64gather_epi64((const long long *)c.tbl, adr, 8);
    __m256d t = _mm256_sub_pd(
        _mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
    __m256i u = _mm256_castpd_si256(d);
    u = _mm256_add_epi64(u, madj);
    u = _mm256_srli_epi64(u, c.sbit);
    u = _mm256_slli_epi64(u, 52);
    u = _mm256_or_si256(u, iax);
    __m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
    y = _mm256_mul_pd(y, mC2);
    y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
    _mm256_store_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
    px += 4;
  }
#else
  size_t r = n & 1;
  n &= ~1;
  const __m128d mC1 = _mm_set1_pd(c.C1[0]);
  const __m128d mC2 = _mm_set1_pd(c.C2[0]);
  const __m128d mC3 = _mm_set1_pd(c.C3[0]);
  const __m128d ma = _mm_set1_pd(c.a);
  const __m128d mra = _mm_set1_pd(c.ra);
#if defined(__x86_64__) || defined(_WIN64)
  const __m128i madj = _mm_set1_epi64x(c.adj);
#else
  const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
#endif
  const __m128d expMax = _mm_set1_pd(709.78272569338397);
  const __m128d expMin = _mm_set1_pd(-708.39641853226408);
  for (size_t i = 0; i < n; i += 2) {
    __m128d x = _mm_load_pd(px);
    x = _mm_min_pd(x, expMax);
    x = _mm_max_pd(x, expMin);

    __m128d d = _mm_mul_pd(x, ma);
    d = _mm_add_pd(d, _mm_set1_pd(b));
    int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
    int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) &
               mask(c.sbit);

    __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr0]));
    __m128i iax = _mm_castpd_si128(_mm_load_sd((const double *)&c.tbl[adr1]));
    iax = _mm_unpacklo_epi64(iaxL, iax);

    __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
    __m128i u = _mm_castpd_si128(d);
    u = _mm_add_epi64(u, madj);
    u = _mm_srli_epi64(u, c.sbit);
    u = _mm_slli_epi64(u, 52);
    u = _mm_or_si128(u, iax);
    __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
    y = _mm_mul_pd(y, mC2);
    y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
    _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
    px += 2;
  }
#endif
  for (size_t i = 0; i < r; i++) {
    px[i] = expd(px[i]);
  }
}

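// four-lane SSE version of exp(float): the same exponent/table-index split
// as the scalar code, with the table entries gathered per lane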
#ifdef FMATH_USE_XBYAK
inline __m128 exp_psC(__m128 x)
#else
inline __m128 exp_ps(__m128 x)
#endif
{
  using namespace local;
  const ExpVar<> &expVar = C<>::expVar;

  __m128i limit =
      _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
  int over =
      _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
  if (over) {
    x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
    x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
  }

  __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
  __m128 t =
      _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
  t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));

  __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
  __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
  u4 = _mm_srli_epi32(u4, expVar.s);
  u4 = _mm_slli_epi32(u4, 23);

#ifdef __AVX2__ // fast?
  __m128i ti = _mm_i32gather_epi32((const int *)expVar.tbl, v4, 4);
  __m128 t0 = _mm_castsi128_ps(ti);
#else
  unsigned int v0, v1, v2, v3;
  v0 = _mm_cvtsi128_si32(v4);
  v1 = _mm_extract_epi16(v4, 2);
  v2 = _mm_extract_epi16(v4, 4);
  v3 = _mm_extract_epi16(v4, 6);
#if 1
  __m128 t0, t1, t2, t3;

  t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
  t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
  t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
  t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));

  t1 = _mm_movelh_ps(t1, t3);
  t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
  t0 = _mm_movelh_ps(t0, t2);
  t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
  t0 = _mm_or_ps(t0, t1);
#else
  __m128i ti = _mm_castps_si128(_mm_load_ss((const float *)&expVar.tbl[v0]));
  ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
  __m128 t0 = _mm_castsi128_ps(ti);
#endif
#endif
  t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));

  t = _mm_mul_ps(t, t0);

  return t;
}
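
// eight-lane AVX2 version of exp_ps; the table lookup becomes a single
// vpgatherdd instead of per-lane scalar loads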
#ifdef __AVX2__
inline __m256 exp_ps256(__m256 x) {
  using namespace local;
  const ExpVar<> &expVar = C<>::expVar;

  __m256i limit = _mm256_castps_si256(
      _mm256_and_ps(x, *reinterpret_cast<const __m256 *>(expVar.i7fffffff)));
  int over = _mm256_movemask_epi8(_mm256_cmpgt_epi32(
      limit, *reinterpret_cast<const __m256i *>(expVar.maxX)));
  if (over) {
    x = _mm256_min_ps(x, _mm256_load_ps(expVar.maxX));
    x = _mm256_max_ps(x, _mm256_load_ps(expVar.minX));
  }
  __m256i r = _mm256_cvtps_epi32(
      _mm256_mul_ps(x, *reinterpret_cast<const __m256 *>(expVar.a)));
  __m256 t = _mm256_sub_ps(
      x, _mm256_mul_ps(_mm256_cvtepi32_ps(r),
                       *reinterpret_cast<const __m256 *>(expVar.b)));
  t = _mm256_add_ps(t, *reinterpret_cast<const __m256 *>(expVar.f1));
  __m256i v8 =
      _mm256_and_si256(r, *reinterpret_cast<const __m256i *>(expVar.mask_s));
  __m256i u8 =
      _mm256_add_epi32(r, *reinterpret_cast<const __m256i *>(expVar.i127s));
  u8 = _mm256_srli_epi32(u8, expVar.s);
  u8 = _mm256_slli_epi32(u8, 23);
#if 1
  __m256i ti = _mm256_i32gather_epi32((const int *)expVar.tbl, v8, 4);
#else
  unsigned int v0, v1, v2, v3, v4, v5, v6, v7;
  v0 = _mm256_extract_epi16(v8, 0);
  v1 = _mm256_extract_epi16(v8, 2);
  v2 = _mm256_extract_epi16(v8, 4);
  v3 = _mm256_extract_epi16(v8, 6);
  v4 = _mm256_extract_epi16(v8, 8);
  v5 = _mm256_extract_epi16(v8, 10);
  v6 = _mm256_extract_epi16(v8, 12);
  v7 = _mm256_extract_epi16(v8, 14);
  __m256i ti = _mm256_setzero_si256();
  ti = _mm256_insert_epi32(ti, expVar.tbl[v0], 0);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v1], 1);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v2], 2);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v3], 3);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v4], 4);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v5], 5);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v6], 6);
  ti = _mm256_insert_epi32(ti, expVar.tbl[v7], 7);
#endif
  __m256 t0 = _mm256_castsi256_ps(ti);
  t0 = _mm256_or_ps(t0, _mm256_castsi256_ps(u8));
  t = _mm256_mul_ps(t, t0);
  return t;
}
#endif

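/*
  log(x) for x = m * 2^e is computed as e * log(2) + log(m): the exponent
  term comes from the float's exponent bits scaled by c_log2, log(m) is
  read from tbl[].app using the top LEN mantissa bits, and the remaining
  mantissa bits apply a linear correction through tbl[].rev.
*/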
inline float log(float x) {
  using namespace local;
  const LogVar<> &logVar = C<>::logVar;
  const size_t logLen = logVar.LEN;

  fi fi;
  fi.f = x;
  int a = fi.i & (mask(8) << 23);
  unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
  unsigned int b2 = fi.i & mask(23 - logLen);
  int idx = b1 >> (23 - logLen);
  float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app +
            float(b2) * logVar.tbl[idx].rev;
  return f;
}

inline __m128 log_ps(__m128 x) {
  using namespace local;
  const LogVar<> &logVar = C<>::logVar;

  __m128i xi = _mm_castps_si128(x);
  __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)),
                               (23 - logVar.LEN));
  __m128 a = _mm_cvtepi32_ps(
      _mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)),
                    *cast_to<__m128i>(logVar.m5)));
  __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));

  a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2

  unsigned int i0 = _mm_cvtsi128_si32(idx);

#if 1
  unsigned int i1 = _mm_extract_epi16(idx, 2);
  unsigned int i2 = _mm_extract_epi16(idx, 4);
  unsigned int i3 = _mm_extract_epi16(idx, 6);
#else
  idx = _mm_srli_si128(idx, 4);
  unsigned int i1 = _mm_cvtsi128_si32(idx);

  idx = _mm_srli_si128(idx, 4);
  unsigned int i2 = _mm_cvtsi128_si32(idx);

  idx = _mm_srli_si128(idx, 4);
  unsigned int i3 = _mm_cvtsi128_si32(idx);
#endif

  __m128 app, rev;
  __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
  __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
  __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
  H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
  rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
  app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
  rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));

  a = _mm_add_ps(a, app);
  rev = _mm_mul_ps(b2, rev);
  return _mm_add_ps(a, rev);
}

#ifndef __CYGWIN__
// cygwin defines log2() in the global namespace!
// log2(x) = log(x) / log(2)
inline float log2(float x) { return fmath::log(x) * 1.442695f; }
#endif

/*
  for a given y > 0,
  get f_y(x) := pow(x, y) for x >= 0
*/
class PowGenerator {
  enum { N = 11 };
  float tbl0_[256];
  struct {
    float app;
    float rev;
  } tbl1_[1 << N];

public:
  PowGenerator(float y) {
    for (int i = 0; i < 256; i++) {
      tbl0_[i] = ::powf(2, (i - 127) * y);
    }
    const double e = 1 / double(1 << 24);
    const double h = 1 / double(1 << N);
    const size_t n = 1U << N;
    for (size_t i = 0; i < n; i++) {
      double x = 1 + double(i) / n;
      double a = ::pow(x, (double)y);
      tbl1_[i].app = (float)a;
      double b = ::pow(x + h - e, (double)y);
      tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
    }
  }
  float get(float x) const {
    using namespace local;
    fi fi;
    fi.f = x;
    int a = (fi.i >> 23) & mask(8);
    unsigned int b = fi.i & mask(23);
    unsigned int b1 = b & (mask(N) << (23 - N));
    unsigned int b2 = b & mask(23 - N);
    float f;
    int idx = b1 >> (23 - N);
    f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
    return f;
  }
};
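
/*
  hypothetical usage sketch: fix the exponent once, then evaluate cheaply.

    fmath::PowGenerator g(1.0f / 3);  // f(x) = pow(x, 1/3)
    float r = g.get(8.0f);            // approximately powf(8.0f, 1.0f / 3)
*/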

// for the Xbyak version
#ifdef FMATH_USE_XBYAK
float (*const exp)(float) = local::C<>::getInstance().exp_;
__m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_;
#endif

// exp2(x) = pow(2, x)
inline float exp2(float x) { return fmath::exp(x * 0.6931472f); }

/*
  this function may be optimized in the future
*/
inline __m128d log_pd(__m128d x) {
  double d[2];
  memcpy(d, &x, sizeof(d));
  d[0] = ::log(d[0]);
  d[1] = ::log(d[1]);
  __m128d m;
  memcpy(&m, d, sizeof(m));
  return m;
}
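// pow(x, y) = exp(y * log(x)), applied lane-wise via the routines above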
inline __m128 pow_ps(__m128 x, __m128 y) {
  return exp_ps(_mm_mul_ps(y, log_ps(x)));
}
inline __m128d pow_pd(__m128d x, __m128d y) {
  return exp_pd(_mm_mul_pd(y, log_pd(x)));
}

} // namespace fmath