fathom/moab-docs/arch_2AltiVec_2MathFunctions_8h_source.html

00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2007 Julien Pommier
00005 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
00006 //
00007 // This Source Code Form is subject to the terms of the Mozilla
00008 // Public License v. 2.0. If a copy of the MPL was not distributed
00009 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00010
00011 /* The sin, cos, exp, and log functions of this file come from
00012  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
00013  */
00014
00015 #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
00016 #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
00017
00018 namespace Eigen {
00019
00020 namespace internal {
00021 // plog<Packet4f>: natural log of four packed floats; gives -INF for 0 and NaN for negative or NaN lanes.
00022 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00023 Packet4f plog<Packet4f>(const Packet4f& _x)
00024 {
00025   Packet4f x = _x;  // working copy, modified in place below
00026   _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00027   _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00028   _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
00029   _EIGEN_DECLARE_CONST_Packet4i(23, 23);
00030   // Constants declared by these macros are used below as p4f_<name> / p4i_<name>.
00031   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);  // every bit except bits 23..30 (the float exponent field)
00032
00033   /* the smallest non denormalized float number */
00034   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
00035   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
00036   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff); // all bits set: a NaN pattern with the sign bit set
00037
00038   /* natural logarithm computed for 4 simultaneous floats;
00039     returns NaN for x < 0 or NaN input, and -INF for x == 0 (see the two selects at the end)
00040   */
00041   _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
00042   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
00043   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
00044   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
00045   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
00046   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
00047   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
00048   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
00049   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
00050   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
00051   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
00052   _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
00053   // p0..p8 are the polynomial coefficients; q1 + q2 = 0.693147180..., i.e. ln(2) applied in two parts.
00054
00055   Packet4i emm0;  // integer lanes that receive the exponent extracted from x
00056
00057   /* isvalid_mask is 0 if x < 0 or x is NaN; iszero_mask is set where x == 0. Both are taken before x is modified. */
00058   Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));  // p4f_ZERO is defined outside this listing
00059   Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
00060
00061   x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
00062   emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
00063                 reinterpret_cast<Packet4ui>(p4i_23));  // shift by 23: the exponent field moves to the low bits
00064
00065   /* keep only the fractional part: clear the exponent bits, then OR in the bits of 0.5 so x lies in [0.5, 1) */
00066   x = pand(x, p4f_inv_mant_mask);
00067   x = por(x, p4f_half);
00068   // remove the exponent bias (0x7f), convert to float, and add 1 to match the [0.5, 1) mantissa range
00069   emm0 = psub(emm0, p4i_0x7f);
00070   Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
00071
00072   /* part2:
00073      if( x < SQRTHF ) {
00074        e -= 1;
00075        x = x + x - 1.0;
00076      } else { x = x - 1.0; }
00077   */
00078   Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));  // set in lanes where x < SQRTHF
00079   Packet4f tmp = pand(x, mask);    // x in those lanes, 0 elsewhere
00080   x = psub(x, p4f_1);              // x - 1 in every lane
00081   e = psub(e, pand(p4f_1, mask));  // e -= 1 only in the masked lanes
00082   x = padd(x, tmp);                // masked lanes become x + x - 1
00083   // Polynomial part: three groups of three coefficients in x, combined through x^3 (assuming pmadd(a,b,c) = a*b + c).
00084   Packet4f x2 = pmul(x,x);
00085   Packet4f x3 = pmul(x2,x);
00086
00087   Packet4f y, y1, y2;
00088   y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
00089   y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
00090   y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
00091   y  = pmadd(y , x, p4f_cephes_log_p2);
00092   y1 = pmadd(y1, x, p4f_cephes_log_p5);
00093   y2 = pmadd(y2, x, p4f_cephes_log_p8);
00094   y = pmadd(y, x3, y1);
00095   y = pmadd(y, x3, y2);
00096   y = pmul(y, x3);  // y = x^3 * (degree-8 polynomial in x)
00097   // Assemble the result: x - x^2/2 + y + e*q1 + e*q2.
00098   y1 = pmul(e, p4f_cephes_log_q1);
00099   tmp = pmul(x2, p4f_half);
00100   y = padd(y, y1);
00101   x = psub(x, tmp);
00102   y2 = pmul(e, p4f_cephes_log_q2);
00103   x = padd(x, y);
00104   x = padd(x, y2);
00105   // negative or NaN arg will be NAN, 0 will be -INF (the zero select runs first, the validity select last)
00106   x = vec_sel(x, p4f_minus_inf, iszero_mask);
00107   x = vec_sel(p4f_minus_nan, x, isvalid_mask);
00108   return x;
00109 }
00110 // pexp<Packet4f>: exponential of four packed floats via exp(x) = 2^n * exp(g); NaN lanes are returned unchanged.
00111 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00112 Packet4f pexp<Packet4f>(const Packet4f& _x)
00113 {
00114   Packet4f x = _x;  // working copy; _x is kept for the NaN check and the final pmax
00115   _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00116   _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00117   _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
00118   _EIGEN_DECLARE_CONST_Packet4i(23, 23);
00119   // Constants declared by these macros are used below as p4f_<name> / p4i_<name>.
00120   // exp_hi / exp_lo are the bounds the input is clamped to before the range reduction.
00121   _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
00122   _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
00123
00124   _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
00125   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
00126   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
00127   // C1 + C2 = 0.693147180..., i.e. ln(2) applied in two parts; p0..p5 below are the polynomial coefficients.
00128   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
00129   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
00130   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
00131   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
00132   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
00133   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
00134
00135   Packet4f tmp, fx;
00136   Packet4i emm0;
00137
00138   // clamp x to [exp_lo, exp_hi]
00139   x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
00140
00141   /* express exp(x) as exp(g + n*log(2)) */
00142   fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
00143   // ...then floor, so fx holds n = floor(x * LOG2EF + 0.5) (assuming pmadd(a,b,c) = a*b + c)
00144   fx = vec_floor(fx);
00145   // g = x - n*C1 - n*C2, subtracted in two steps
00146   tmp = pmul(fx, p4f_cephes_exp_C1);
00147   Packet4f z = pmul(fx, p4f_cephes_exp_C2);
00148   x = psub(x, tmp);
00149   x = psub(x, z);
00150
00151   z = pmul(x,x);  // z is reused to hold g^2
00152   // y = polynomial p0..p5 in x; the result approximates exp(g) as 1 + x + x^2 * y
00153   Packet4f y = p4f_cephes_exp_p0;
00154   y = pmadd(y, x, p4f_cephes_exp_p1);
00155   y = pmadd(y, x, p4f_cephes_exp_p2);
00156   y = pmadd(y, x, p4f_cephes_exp_p3);
00157   y = pmadd(y, x, p4f_cephes_exp_p4);
00158   y = pmadd(y, x, p4f_cephes_exp_p5);
00159   y = pmadd(y, z, x);
00160   y = padd(y, p4f_1);
00161
00162   // build 2^n: integer n plus the bias 0x7f, shifted left by 23 into the exponent field
00163   emm0 = vec_cts(fx, 0);
00164   emm0 = vec_add(emm0, p4i_0x7f);
00165   emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
00166   // NOTE(review): the purpose of the pmax(result, _x) in the return is not evident from this code - confirm before changing.
00167   // Altivec's max & min operators just drop silent NaNs. Check NaNs in
00168   // inputs and return them unmodified.
00169   Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));  // set where _x == _x, i.e. not NaN
00170   return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),  // y * 2^n, with emm0 reinterpreted as float bits
00171                  isnumber_mask);
00172 }
00173
00174 #ifdef __VSX__
00175 // ConvertToPacket2l: converts the two double lanes of x to the two integer lanes of a Packet2l.
00176 // VSX support varies between compilers and compiler versions: for gcc >= 4.9.3
00177 // we can use vec_cts to convert Packet2d to Packet2l efficiently; otherwise we
00178 // fall back to a slow per-lane scalar conversion that works with older compilers.
00179 static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
00180 #if EIGEN_GNUC_AT_LEAST(5, 0) || \
00181     (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
00182   return vec_cts(x, 0);    // TODO: check clang version.
00183 #else
00184   double tmp[2];
00185   memcpy(tmp, &x, sizeof(tmp));  // copy both lanes out to scalars
00186   Packet2l l = { static_cast<long long>(tmp[0]),  // scalar double -> long long conversion, lane by lane
00187                  static_cast<long long>(tmp[1]) };
00188   return l;
00189 #endif
00190 }
00191 // pexp<Packet2d>: exponential of two packed doubles via exp(x) = 2^n * exp(g), rational approximation for exp(g); NaN lanes are returned unchanged.
00192 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00193 Packet2d pexp<Packet2d>(const Packet2d& _x)
00194 {
00195   Packet2d x = _x;  // working copy; _x is kept for the NaN check and the final pmax
00196   // Constants declared by the macros below are used as p2d_<name> / p4i_<name>.
00197   _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
00198   _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
00199   _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
00200   // exp_hi / exp_lo are the bounds the input is clamped to before the range reduction.
00201   _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
00202   _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
00203
00204   _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
00205   // p0..p2: numerator coefficients; q0..q3: denominator coefficients of the rational approximation
00206   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
00207   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
00208   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
00209
00210   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
00211   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
00212   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
00213   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
00214   // C1 + C2 = 0.69314718055994..., i.e. ln(2) applied in two parts
00215   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
00216   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
00217
00218   Packet2d tmp, fx;
00219   Packet2l emm0;
00220
00221   // clamp x to [exp_lo, exp_hi]
00222   x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
00223   /* express exp(x) as exp(g + n*log(2)) */
00224   fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
00225   // ...then floor, so fx holds n = floor(x * LOG2EF + 0.5) (assuming pmadd(a,b,c) = a*b + c)
00226   fx = vec_floor(fx);
00227   // g = x - n*C1 - n*C2, subtracted in two steps
00228   tmp = pmul(fx, p2d_cephes_exp_C1);
00229   Packet2d z = pmul(fx, p2d_cephes_exp_C2);
00230   x = psub(x, tmp);
00231   x = psub(x, z);
00232
00233   Packet2d x2 = pmul(x,x);
00234   // px = x * (p0*x2^2 + p1*x2 + p2)
00235   Packet2d px = p2d_cephes_exp_p0;
00236   px = pmadd(px, x2, p2d_cephes_exp_p1);
00237   px = pmadd(px, x2, p2d_cephes_exp_p2);
00238   px = pmul (px, x);
00239   // qx = ((q0*x2 + q1)*x2 + q2)*x2 + q3
00240   Packet2d qx = p2d_cephes_exp_q0;
00241   qx = pmadd(qx, x2, p2d_cephes_exp_q1);
00242   qx = pmadd(qx, x2, p2d_cephes_exp_q2);
00243   qx = pmadd(qx, x2, p2d_cephes_exp_q3);
00244   // exp(g) is approximated as 1 + 2*px / (qx - px)
00245   x = pdiv(px,psub(qx,px));
00246   x = pmadd(p2d_2,x,p2d_1);
00247
00248   // build 2^n: start from n converted to integer lanes
00249   emm0 = ConvertToPacket2l(fx);
00250
00251 #ifdef __POWER8_VECTOR__
00252   static const Packet2l p2l_1023 = { 1023, 1023 };
00253   static const Packet2ul p2ul_52 = { 52, 52 };
00254   // add the exponent bias 1023 and shift left by 52 into the double exponent field
00255   emm0 = vec_add(emm0, p2l_1023);
00256   emm0 = vec_sl(emm0, p2ul_52);
00257 #else
00258   // Code is a bit complex for POWER7.  There is actually a
00259   // vec_xxsldi intrinsic but it is not supported by some gcc versions.
00260   // So we shift (52-32) bits and do a word swap with zeros.
00261   _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
00262   _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
00263   // Treat the two 64-bit lanes as four 32-bit words: the bias and the shift are applied to every word.
00264   Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
00265   emm04i = vec_add(emm04i, p4i_1023);
00266   emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
00267   static const Packet16uc perm = {  // byte selector for vec_perm; p4i_ZERO (defined outside this listing) supplies the zero words
00268     0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
00269     0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
00270 #ifdef  _BIG_ENDIAN
00271   emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
00272 #else
00273   emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));  // same selector, operands swapped for little endian
00274 #endif
00275
00276 #endif
00277   // NOTE(review): the purpose of the pmax(result, _x) in the return is not evident from this code - confirm before changing.
00278   // Altivec's max & min operators just drop silent NaNs. Check NaNs in
00279   // inputs and return them unmodified.
00280   Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));  // set where _x == _x, i.e. not NaN
00281   return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),  // x * 2^n, with emm0 reinterpreted as double bits
00282                  isnumber_mask);
00283 }
00284 #endif
00285
00286 }  // end namespace internal
00287
00288 }  // end namespace Eigen
00289
00290 #endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H