// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2014 Konstantinos Margaritis <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
#define EIGEN_PACKET_MATH_ALTIVEC_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif

// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

typedef __vector float          Packet4f;
typedef __vector int            Packet4i;
typedef __vector unsigned int   Packet4ui;
typedef __vector __bool int     Packet4bi;
typedef __vector short int      Packet8i;
typedef __vector unsigned char  Packet16uc;

// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead

#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)

#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
  Packet2l p2l_##NAME = pset1<Packet2l>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))

#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))


// These constants are endian-agnostic
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);     //{ 0.0, 0.0, 0.0, 0.0}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);     //{ 0, 0, 0, 0,}
#ifndef __VSX__
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);       //{ 1, 1, 1, 1}
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);          //{ 1.0, 1.0, 1.0, 1.0}
#endif
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);   //{ -1, -1, -1, -1}
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}

static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };

static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
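// Example (illustrative, not part of the original comments): the byte-index vectors above
// are permute-control operands for vec_perm, which gathers one byte per control entry from
// the concatenation of its two inputs.  p16uc_REVERSE32, for instance, selects bytes
// 12..15, 8..11, 4..7, 0..3 and therefore reverses the order of the four 32-bit lanes:
//
//   Packet4f x = { 1.0f, 2.0f, 3.0f, 4.0f };
//   Packet4f r = (Packet4f) vec_perm((Packet16uc)x, (Packet16uc)x, p16uc_REVERSE32);
//   // r holds { 4.0f, 3.0f, 2.0f, 1.0f }; this is exactly what preverse() below does.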
// Mask alignment
#ifdef __PPC64__
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#else
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif

#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)

// Handle endianness properly while loading constants
// Define global static constants:
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD  = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16  = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD  = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16  = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#endif // _BIG_ENDIAN

static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

#ifdef _BIG_ENDIAN
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#else
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif // _BIG_ENDIAN

template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket=0,

    // FIXME check the Has*
    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 0
  };
};
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4
  };
};


template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int   type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
{
  union {
    Packet16uc    v;
    unsigned char n[16];
  } vt;
  vt.v = v;
  for (int i=0; i< 16; i++)
    s << (int)vt.n[i] << ", ";
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
  union {
    Packet4f v;
    float    n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
{
  union {
    Packet4i v;
    int      n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}

inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
{
  union {
    Packet4ui    v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4bi & v)
{
  union {
    Packet4bi    v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
}*/


// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }

template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*     to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float EIGEN_ALIGN16 af[4];
  af[0] = from;
  Packet4f vc = pload<Packet4f>(af);
  vc = vec_splat(vc, 0);
  return vc;
}

template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from;
  Packet4i vc = pload<Packet4i>(ai);
  vc = vec_splat(vc, 0);
  return vc;
}
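/* Usage sketch (illustrative only; `x`, `y`, `alpha` and `n` are hypothetical, not part of
 * this file): generic Eigen kernels pick the packet type and width through packet_traits
 * and then combine the primitives defined in this header.  Assuming `x` and `y` are
 * 16-byte aligned float buffers and `n` is a multiple of the packet size, a
 * "y += alpha * x" loop would look like:
 *
 *   typedef packet_traits<float>::type Packet;           // Packet4f on AltiVec
 *   const int PacketSize = packet_traits<float>::size;   // 4
 *   Packet palpha = pset1<Packet>(alpha);                 // broadcast the scalar
 *   for (int i = 0; i < n; i += PacketSize) {
 *     Packet px = pload<Packet>(x + i);                   // aligned load of 4 floats
 *     Packet py = pload<Packet>(y + i);
 *     pstore<float>(y + i, pmadd(palpha, px, py));        // fused multiply-add, store back
 *   }
 */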
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
  a3 = pload<Packet4i>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  float EIGEN_ALIGN16 af[4];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  af[2] = from[2*stride];
  af[3] = from[3*stride];
  return pload<Packet4f>(af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return pload<Packet4i>(ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  float EIGEN_ALIGN16 af[4];
  pstore<float>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
  to[2*stride] = af[2];
  to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  int EIGEN_ALIGN16 ai[4];
  pstore<int>((int *)ai, from);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}
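/* Example (illustrative): the gather/scatter specializations above provide strided access,
 * e.g. reading one column of a hypothetical row-major 4x4 float matrix `m` by passing the
 * number of columns as the stride:
 *
 *   // loads m[0][1], m[1][1], m[2][1], m[3][1] into one packet
 *   Packet4f col1 = pgather<float, Packet4f>(&m[0][1], 4);
 */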
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
/* Commented out: it's actually slower than processing it scalar
 *
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
  // Set up constants, variables
  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;

  // Get the absolute values
  a1 = vec_abs(a);
  b1 = vec_abs(b);

  // Get the signs using xor
  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);

  // Do the multiplication for the absolute values.
  bswap     = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
  low_prod  = vec_mulo((Packet8i) a1, (Packet8i)b1);
  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
  prod      = vec_add( low_prod, high_prod );

  // NOR the product and select only the negative elements according to the sign mask
  prod_ = vec_nor(prod, prod);
  prod_ = vec_sel(p4i_ZERO, prod_, sgn);

  // Add 1 to the result to get the negative numbers
  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
  prod_ = vec_add(prod_, v1sel);

  // Merge the results back to the final vector.
  prod = vec_sel(prod, prod_, sgn);

  return prod;
}
*/
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifndef __VSX__  // VSX actually provides a div instruction
  Packet4f t, y_0, y_1;

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
  y_0 = vec_re(b);

  // Do one Newton-Raphson iteration to get the needed accuracy
  t   = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

  return vec_madd(a, y_1, p4f_ZERO);
#else
  return vec_div(a, b);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AltiVec");
  return pset1<Packet4i>(0);
}
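/* Worked form of the reciprocal refinement used in the non-VSX branch of pdiv<Packet4f>
 * above: with y_0 = vec_re(b), a rough estimate of 1/b, one Newton-Raphson step computes
 *
 *   t   = 1 - b*y_0      // vec_nmsub(y_0, b, p4f_ONE)
 *   y_1 = y_0 + y_0*t    // vec_madd(y_0, t, y_0)
 *
 * and a/b is then approximated by a*y_1 (vec_madd(a, y_1, p4f_ZERO)); each such step
 * roughly doubles the number of correct bits of the initial estimate.
 */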
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }

#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  Packet16uc MSQ, LSQ;
  Packet16uc mask;
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
  mask = vec_lvsl(0, from);                        // create the permute mask
  return (Packet4f) vec_perm(MSQ, LSQ, mask);      // align the data

}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  Packet16uc MSQ, LSQ;
  Packet16uc mask;
  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
  mask = vec_lvsl(0, from);                        // create the permute mask
  return (Packet4i) vec_perm(MSQ, LSQ, mask);      // align the data
}
#else
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
}
#endif

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  Packet4f p;
  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
  else                             p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i p;
  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
  else                             p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}

#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
  Packet16uc edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char *)to);            // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)to);           // least significant quadword
  edgeAlign = vec_lvsl(0, to);                     // permute map to extract edges
  edges = vec_perm(LSQ, MSQ, edgeAlign);           // extract the edges
  align = vec_lvsr( 0, to );                       // permute map to misalign data
  MSQ = vec_perm(edges, (Packet16uc)from, align);  // misalign the data (MSQ)
  LSQ = vec_perm((Packet16uc)from, edges, align);  // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );          // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to );           // Store the MSQ part
}
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
  Packet16uc edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char *)to);            // most significant quadword
  LSQ = vec_ld(15, (unsigned char *)to);           // least significant quadword
  edgeAlign = vec_lvsl(0, to);                     // permute map to extract edges
  edges = vec_perm(LSQ, MSQ, edgeAlign);           // extract the edges
  align = vec_lvsr( 0, to );                       // permute map to misalign data
  MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
  LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
  vec_st( LSQ, 15, (unsigned char *)to );          // Store the LSQ part first
  vec_st( MSQ, 0, (unsigned char *)to );           // Store the MSQ part
}
#else
// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
#endif

#ifndef __VSX__
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
#endif

template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE int   pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }

template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f b, sum;
  b   = (Packet4f) vec_sld(a, a, 8);
  sum = vec_add(a, b);
  b   = (Packet4f) vec_sld(sum, sum, 4);
  sum = vec_add(sum, b);
  return pfirst(sum);
}
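/* Worked example of the log-step reduction above: for a = {a0, a1, a2, a3},
 * vec_sld(a, a, 8) rotates the packet by two lanes, giving {a2, a3, a0, a1}, so the first
 * add yields {a0+a2, a1+a3, a2+a0, a3+a1}.  Rotating that by one lane and adding again
 * leaves a0+a1+a2+a3 in every lane, and pfirst() extracts it.  The same pattern drives
 * predux_min/predux_max further down.
 */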
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f v[4], sum[4];

  // It's easier and faster to transpose, then add as columns
  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
  // Do the transpose, first set of moves
  v[0] = vec_mergeh(vecs[0], vecs[2]);
  v[1] = vec_mergel(vecs[0], vecs[2]);
  v[2] = vec_mergeh(vecs[1], vecs[3]);
  v[3] = vec_mergel(vecs[1], vecs[3]);
  // Get the resulting vectors
  sum[0] = vec_mergeh(v[0], v[2]);
  sum[1] = vec_mergel(v[0], v[2]);
  sum[2] = vec_mergeh(v[1], v[3]);
  sum[3] = vec_mergel(v[1], v[3]);

  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);

  return sum[0];
}

template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i sum;
  sum = vec_sums(a, p4i_ZERO);
#ifdef _BIG_ENDIAN
  sum = vec_sld(sum, p4i_ZERO, 12);
#else
  sum = vec_sld(p4i_ZERO, sum, 4);
#endif
  return pfirst(sum);
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i v[4], sum[4];

  // It's easier and faster to transpose, then add as columns
  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
  // Do the transpose, first set of moves
  v[0] = vec_mergeh(vecs[0], vecs[2]);
  v[1] = vec_mergel(vecs[0], vecs[2]);
  v[2] = vec_mergeh(vecs[1], vecs[3]);
  v[3] = vec_mergel(vecs[1], vecs[3]);
  // Get the resulting vectors
  sum[0] = vec_mergeh(v[0], v[2]);
  sum[1] = vec_mergel(v[0], v[2]);
  sum[2] = vec_mergeh(v[1], v[3]);
  sum[3] = vec_mergel(v[1], v[3]);

  // Now do the summation:
  // Lines 0+1
  sum[0] = vec_add(sum[0], sum[1]);
  // Lines 2+3
  sum[1] = vec_add(sum[2], sum[3]);
  // Add the results
  sum[0] = vec_add(sum[0], sum[1]);

  return sum[0];
}

// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f prod;
  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
}

template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return aux[0] * aux[1] * aux[2] * aux[3];
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet4f b, res;
  b   = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  Packet4i b, res;
  b   = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet4f b, res;
  b   = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  Packet4i b, res;
  b   = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
#ifdef _BIG_ENDIAN
    switch (Offset % 4) {
    case 1:
      first = vec_sld(first, second, 4); break;
    case 2:
      first = vec_sld(first, second, 8); break;
    case 3:
      first = vec_sld(first, second, 12); break;
    }
#else
    switch (Offset % 4) {
    case 1:
      first = vec_sld(second, first, 12); break;
    case 2:
      first = vec_sld(second, first, 8); break;
    case 3:
      first = vec_sld(second, first, 4); break;
    }
#endif
  }
};
template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
#ifdef _BIG_ENDIAN
    switch (Offset % 4) {
    case 1:
      first = vec_sld(first, second, 4); break;
    case 2:
      first = vec_sld(first, second, 8); break;
    case 3:
      first = vec_sld(first, second, 12); break;
    }
#else
    switch (Offset % 4) {
    case 1:
      first = vec_sld(second, first, 12); break;
    case 2:
      first = vec_sld(second, first, 8); break;
    case 3:
      first = vec_sld(second, first, 4); break;
    }
#endif
  }
};
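/* Illustration (element-level view; the two branches above handle the endianness):
 * palign shifts a four-element window that starts Offset lanes into `first` and continues
 * into `second`.  For Offset == 1,
 *
 *   first  = { f0, f1, f2, f3 },  second = { s0, s1, s2, s3 }
 *   run(first, second)  =>  first == { f1, f2, f3, s0 }
 */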
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  Packet4f t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
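/* Usage sketch (illustrative; `m` is a hypothetical 16-byte aligned row-major 4x4 float
 * array): ptranspose treats the four packets of a PacketBlock as the rows of a 4x4 block
 * and transposes them in place with merge operations:
 *
 *   PacketBlock<Packet4f,4> block;
 *   for (int r = 0; r < 4; ++r)
 *     block.packet[r] = pload<Packet4f>(&m[r][0]);   // load row r
 *   ptranspose(block);                               // packet[c] now holds column c of m
 */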
//---------- double ----------
#ifdef __VSX__
typedef __vector double              Packet2d;
typedef __vector unsigned long long  Packet2ul;
typedef __vector long long           Packet2l;

static Packet2l  p2l_ZERO  = (Packet2l) p4i_ZERO;
static Packet2d  p2d_ONE   = { 1.0, 1.0 };
static Packet2d  p2d_ZERO  = (Packet2d) p4f_ZERO;
static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };

#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
#else
static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
#endif

static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
{
  switch (index) {
  case 0:
    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
  case 1:
    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
  }
  return a;
}

template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
    HasHalfPacket = 0,

    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 0
  };
};

template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };


inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
{
  union {
    Packet2d v;
    double   n[2];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
}

// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME

template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }

template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  double EIGEN_ALIGN16 af[2];
  af[0] = from;
  Packet2d vc = pload<Packet2d>(af);
  vc = vec_splat_dbl(vc, 0);
  return vc;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  a1 = pload<Packet2d>(a);
  a0 = vec_splat_dbl(a1, 0);
  a1 = vec_splat_dbl(a1, 1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec_splat_dbl(a3, 0);
  a3 = vec_splat_dbl(a3, 1);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  double EIGEN_ALIGN16 af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }

template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }

template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }

// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }

template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  Packet2d p;
  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
  else                             p = ploadu<Packet2d>(from);
  return vec_perm(p, p, p16uc_PSET64_HI);
}

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}

template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }

template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }

template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  Packet2d b, sum;
  b   = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
  sum = vec_add(a, b);
  return pfirst(sum);
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  Packet2d v[2], sum;
  v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
  v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));

#ifdef _BIG_ENDIAN
  sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
#else
  sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
#endif

  return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}

// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}

// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset == 1)
#ifdef _BIG_ENDIAN
      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
#else
      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
#endif
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  Packet2d t0, t1;
  t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
  t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}

#endif // __VSX__
} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_ALTIVEC_H