fathom/moab-docs/SSE_2PacketMath_8h_source.html

00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008-2009 Gael Guennebaud <[email protected]>
00005 //
00006 // This Source Code Form is subject to the terms of the Mozilla
00007 // Public License v. 2.0. If a copy of the MPL was not distributed
00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00009
00010 #ifndef EIGEN_PACKET_MATH_SSE_H
00011 #define EIGEN_PACKET_MATH_SSE_H
00012
00013 namespace Eigen {
00014
00015 namespace internal {
00016
// Architecture defaults. Each one is only set when it has not already been defined.

// Cache-friendly product threshold: defaults to 8.
00017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
00019 #endif
00020
// Default register count: twice the size of a pointer in bytes.
00021 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
00022 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
00023 #endif
00024
// When the compiler defines __FMA__, advertise a single-instruction multiply-add
// (the pmadd specializations for Packet4f/Packet2d are guarded by the same __FMA__ test).
00025 #ifdef __FMA__
00026 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
00027 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
00028 #endif
00029 #endif
00030
00031 #if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
00032 // With GCC's default ABI version, __m128 and __m256 are treated as the same type, and therefore we cannot
00033 // have overloads for both types without linking errors.
00034 // One solution is to increase the ABI version using -fabi-version=4 (or greater).
00035 // Otherwise, we work around this inconvenience by wrapping 128-bit types into the following helper
00036 // structure:
// Gives a raw vector type T its own distinct C++ type while remaining
// implicitly convertible to and assignable from T.
00037 template<typename T>
00038 struct eigen_packet_wrapper
00039 {
00040   T m_val;  // the wrapped vector; the default constructor leaves it uninitialized
00041
00042   EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
00043   EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T& value) : m_val(value) {}
00044
00045   EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T& value) { m_val = value; return *this; }
00046
00047   // Implicit access to the underlying vector, mutable and read-only.
00048   EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
00049   EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
00050 };
// Wrapped 128-bit packet types, used in the GCC/AVX/old-ABI configuration tested above.
00051 typedef eigen_packet_wrapper<__m128>  Packet4f;
00052 typedef eigen_packet_wrapper<__m128i> Packet4i;
00053 typedef eigen_packet_wrapper<__m128d> Packet2d;
00054 #else
// In every other configuration the packets are the raw SSE vector types.
00055 typedef __m128  Packet4f;
00056 typedef __m128i Packet4i;
00057 typedef __m128d Packet2d;
00058 #endif
00058 #endif
00059
// Mark the three raw SSE vector types as arithmetic (value = true).
00060 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
00061 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
00062 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
00063
// Lane-shuffle helpers. The selectors p,q,r,s must be compile-time constants; they are
// packed into the 8-bit shuffle immediate as (s<<6 | r<<4 | q<<2 | p).
// Every macro argument is parenthesized so that expressions such as "i+1" expand correctly.

// Result lanes 0..3 are lanes p,q,r,s of the float vector v.
00064 #define vec4f_swizzle1(v,p,q,r,s) \
00065   (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
00066
// Result lanes 0..3 are lanes p,q,r,s of the integer vector v.
00067 #define vec4i_swizzle1(v,p,q,r,s) \
00068   (_mm_shuffle_epi32( (v), ((s)<<6|(r)<<4|(q)<<2|(p))))
00069
// Result lanes 0..1 are 64-bit lanes p,q of the double vector v; each 64-bit lane is
// selected as the pair of 32-bit lanes (2*lane, 2*lane+1).
00070 #define vec2d_swizzle1(v,p,q) \
00071   (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (((q)*2+1)<<6|((q)*2)<<4|((p)*2+1)<<2|((p)*2)))))
00072
// Two-source shuffle via _mm_shuffle_ps: lanes p,q are taken from a, lanes r,s from b.
00073 #define vec4f_swizzle2(a,b,p,q,r,s) \
00074   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
00075
// Integer flavour of vec4f_swizzle2, implemented by casting through the float shuffle.
00076 #define vec4i_swizzle2(a,b,p,q,r,s) \
00077   (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
00078
// Helpers that declare a const packet named p4f_<NAME> / p2d_<NAME> / p4i_<NAME>
// with every lane set to X (via pset1).
00079 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
00080   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
00081
00082 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
00083   const Packet2d p2d_##NAME = pset1<Packet2d>(X)
00084
// Float packet whose lanes hold the bit pattern of the integer X (bit cast, not a value conversion).
00085 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
00086   const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
00087
00088 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
00089   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
00090
00091
00092 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
00093 // to leverage AVX instructions.
00094 #ifndef EIGEN_VECTORIZE_AVX
// Traits for float: full packet is Packet4f (4 lanes); no distinct half packet is provided.
00095 template<> struct packet_traits<float>  : default_packet_traits
00096 {
00097   typedef Packet4f type;
00098   typedef Packet4f half;
00099   enum {
00100     Vectorizable = 1,
00101     AlignedOnScalar = 1,
00102     size=4,
00103     HasHalfPacket = 0,
00104
00105     HasDiv  = 1,
      // Sin, Cos and Tanh are only advertised when EIGEN_FAST_MATH is non-zero.
00106     HasSin  = EIGEN_FAST_MATH,
00107     HasCos  = EIGEN_FAST_MATH,
00108     HasLog  = 1,
00109     HasExp  = 1,
00110     HasSqrt = 1,
00111     HasRsqrt = 1,
00112     HasTanh  = EIGEN_FAST_MATH,
00113     HasBlend = 1
00114
      // Rounding operations are only advertised when SSE4.1 is enabled
      // (pround/pceil/pfloor below are guarded by the same macro).
00115 #ifdef EIGEN_VECTORIZE_SSE4_1
00116     ,
00117     HasRound = 1,
00118     HasFloor = 1,
00119     HasCeil = 1
00120 #endif
00121   };
00122 };
// Traits for double: full packet is Packet2d (2 lanes); no distinct half packet is provided.
00123 template<> struct packet_traits<double> : default_packet_traits
00124 {
00125   typedef Packet2d type;
00126   typedef Packet2d half;
00127   enum {
00128     Vectorizable = 1,
00129     AlignedOnScalar = 1,
00130     size=2,
00131     HasHalfPacket = 0,
00132
00133     HasDiv  = 1,
00134     HasExp  = 1,
00135     HasSqrt = 1,
00136     HasRsqrt = 1,
00137     HasBlend = 1
00138
      // Same SSE4.1 gating of the rounding operations as for float.
00139 #ifdef EIGEN_VECTORIZE_SSE4_1
00140     ,
00141     HasRound = 1,
00142     HasFloor = 1,
00143     HasCeil = 1
00144 #endif
00145   };
00146 };
00147 #endif
// Traits for int: full packet is Packet4i (4 lanes). Unlike the float/double traits,
// this specialization is not guarded by EIGEN_VECTORIZE_AVX.
00148 template<> struct packet_traits<int>    : default_packet_traits
00149 {
00150   typedef Packet4i type;
00151   typedef Packet4i half;
00152   enum {
00153     Vectorizable = 1,
00154     AlignedOnScalar = 1,
00155     size=4,
00156
00157     HasBlend = 1
00158   };
00159 };
00160
// Reverse mapping from packet type to scalar type, lane count and required alignment (Aligned16).
00161 template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
00162 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
00163 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
00164
// pset1: returns a packet with every lane set to `from`.
00165 #if EIGEN_COMP_MSVC==1500
00166 // Workaround MSVC 9 internal compiler error.
00167 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
00168 // TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
// The workaround spells out every lane explicitly instead of using the set1 intrinsics.
00169 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
00170 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
00171 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
00172 #else
00173 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
00174 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
00175 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
00176 #endif
00177
00178 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
00179 // However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
00180 // Using inline assembly is also not an option because then gcc fails to properly reorder the instructions.
00181 // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
00182 // Also note that with AVX, we want it to generate a vbroadcastss.
00183 #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
// pload1: loads the single float at `from` and replicates it into all four lanes.
00184 template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
00185   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
00186 }
00187 #endif
00188
// plset: returns a + {0, 1, 2, ...} counted from lane 0 upward. The _mm_set_* intrinsics
// take their arguments from the highest lane down, hence the (3,2,1,0) / (1,0) ordering.
00189 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
00190 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
00191 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
00192
// padd: lane-wise a + b.
00193 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
00194 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
00195 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
00196
// psub: lane-wise a - b.
00197 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
00198 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
00199 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
00200
// pnegate (float): flips the sign bit (bit 31) of each lane with an XOR mask.
00201 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
00202 {
00203   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
00204   return _mm_xor_ps(a,mask);
00205 }
// pnegate (double): same idea; the mask sets only the top bit of the upper 32-bit word
// of each 64-bit lane (_mm_setr_epi32 lists 32-bit words from lowest to highest).
00206 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
00207 {
00208   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
00209   return _mm_xor_pd(a,mask);
00210 }
// pnegate (int): computed as 0 - a.
00211 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
00212 {
00213   return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
00214 }
00215
// pconj: returns its argument unchanged for these real-valued packets.
00216 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
00217 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
00218 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
00219
// pmul: lane-wise a * b.
00220 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
00221 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
// Integer product: a single instruction with SSE4.1, otherwise assembled from two
// _mm_mul_epu32 calls (one on the inputs as-is, one on the inputs with adjacent lanes
// swapped) whose results are then interleaved back into lane order by the two swizzles.
00222 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
00223 {
00224 #ifdef EIGEN_VECTORIZE_SSE4_1
00225   return _mm_mullo_epi32(a,b);
00226 #else
00227   // this version is slightly faster than 4 scalar products
00228   return vec4i_swizzle1(
00229             vec4i_swizzle2(
00230               _mm_mul_epu32(a,b),
00231               _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
00232                             vec4i_swizzle1(b,1,0,3,2)),
00233               0,2,0,2),
00234             0,2,1,3);
00235 #endif
00236 }
00237
// pdiv: lane-wise a / b. Only float and double specializations are provided here.
00238 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
00239 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
00240
00241 // for some weird reasons, it has to be overloaded for packets of integers
// pmadd: returns a*b + c. The integer version composes pmul and padd.
00242 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
// With __FMA__ defined, float and double use the fused multiply-add intrinsics.
00243 #ifdef __FMA__
00244 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
00245 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
00246 #endif
00247
// pmin: lane-wise minimum of a and b.
00248 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
00249 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
00250 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
00251 {
00252 #ifdef EIGEN_VECTORIZE_SSE4_1
00253   return _mm_min_epi32(a,b);
00254 #else
00255   // after some bench, this version *is* faster than a scalar implementation
      // Without SSE4.1: build an a<b mask, then select a where it is set and b elsewhere.
00256   Packet4i mask = _mm_cmplt_epi32(a,b);
00257   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00258 #endif
00259 }
00260
// pmax: lane-wise maximum of a and b.
00261 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
00262 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
00263 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
00264 {
00265 #ifdef EIGEN_VECTORIZE_SSE4_1
00266   return _mm_max_epi32(a,b);
00267 #else
00268   // after some bench, this version *is* faster than a scalar implementation
      // Without SSE4.1: build an a>b mask, then select a where it is set and b elsewhere.
00269   Packet4i mask = _mm_cmpgt_epi32(a,b);
00270   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00271 #endif
00272 }
00273
// Rounding operations; only available when SSE4.1 is enabled.
00274 #ifdef EIGEN_VECTORIZE_SSE4_1
// NOTE(review): the rounding-control immediate 0 presumably selects round-to-nearest
// as documented for _mm_round_ps/_mm_round_pd -- confirm the tie-breaking rule matches
// what callers of pround expect.
00275 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
00276 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
00277
00278 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
00279 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
00280
00281 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
00282 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
00283 #endif
00284
// Bitwise operations on the raw 128 bits of the packets.

// pand: a & b.
00285 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
00286 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
00287 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
00288
// por: a | b.
00289 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
00290 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
00291 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
00292
// pxor: a ^ b.
00293 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
00294 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
00295 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
00296
// pandnot: forwards (a,b) unchanged to the andnot intrinsics, which are documented to
// compute (~a) & b, i.e. the FIRST operand is the one that gets complemented.
// NOTE(review): confirm this operand order matches the generic pandnot contract
// (declared outside this file); if that contract is a & ~b the arguments must be swapped.
00297 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
00298 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
00299 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
00300
// pload: loads one packet from `from` using the aligned load intrinsics
// (the packets declare alignment=Aligned16 in unpacket_traits).
00301 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
00302 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
00303 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
00304
// ploadu: loads one packet from `from` using the unaligned load intrinsics.
00305 #if EIGEN_COMP_MSVC
00306   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
00307     EIGEN_DEBUG_UNALIGNED_LOAD
00308     #if (EIGEN_COMP_MSVC==1600)
00309     // NOTE: Some versions of MSVC10 generate bad code when using _mm_loadu_ps
00310     // (i.e., they do not generate an unaligned load!)
          // Workaround: assemble the packet from two 64-bit half loads (from[0..1], then from[2..3]).
00311     __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
00312     res = _mm_loadh_pi(res, (const __m64*)(from+2));
00313     return res;
00314     #else
00315     return _mm_loadu_ps(from);
00316     #endif
00317   }
00318 #else
00319 // NOTE: with the code below, MSVC's compiler crashes!
00320
00321 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00322 {
00323   EIGEN_DEBUG_UNALIGNED_LOAD
00324   return _mm_loadu_ps(from);
00325 }
00326 #endif
00327
00328 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
00329 {
00330   EIGEN_DEBUG_UNALIGNED_LOAD
00331   return _mm_loadu_pd(from);
00332 }
00333 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00334 {
00335   EIGEN_DEBUG_UNALIGNED_LOAD
00336   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
00337 }
00338
00339
// ploaddup (float): reads from[0] and from[1] with one 64-bit load and returns
// {from[0], from[0], from[1], from[1]}.
00340 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
00341 {
00342   return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
00343 }
// ploaddup (double): returns {from[0], from[0]}.
00344 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
00345 { return pset1<Packet2d>(from[0]); }
// ploaddup (int): same scheme as the float version, using a 64-bit integer load.
00346 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
00347 {
00348   Packet4i tmp;
00349   tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
00350   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
00351 }
00352
// pstore: writes the packet `from` to `to` using the aligned store intrinsics.
00353 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
00354 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
00355 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
00356
// pstoreu: writes the packet `from` to `to` using the unaligned store intrinsics.
00357 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
00358 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
00359 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
00360
// pgather: builds a packet whose lane i is from[i*stride]. The _mm_set_* intrinsics
// take the highest lane first, hence the descending index order in the calls.
00361 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
00362 {
00363  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
00364 }
00365 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
00366 {
00367  return _mm_set_pd(from[1*stride], from[0*stride]);
00368 }
00369 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
00370 {
00371  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
00372  }
00373
// pscatter: writes lane i of `from` to to[i*stride]. Lane 0 is extracted directly;
// every other lane is first shuffled into position 0 and then extracted.
00374 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
00375 {
00376   to[stride*0] = _mm_cvtss_f32(from);
00377   to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
00378   to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
00379   to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
00380 }
00381 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
00382 {
00383   to[stride*0] = _mm_cvtsd_f64(from);
00384   to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
00385 }
00386 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
00387 {
00388   to[stride*0] = _mm_cvtsi128_si32(from);
00389   to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
00390   to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
00391   to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
00392 }
00393
// pstore1: broadcasts the scalar `a` to all lanes and writes the packet to `to`
// through pstore (i.e. with an aligned store).
00394 // some compilers might be tempted to perform multiple moves instead of using a vector path.
00395 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
00396 {
00397   Packet4f pa = _mm_set_ss(a);
00398   pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
00399 }
00400 // some compilers might be tempted to perform multiple moves instead of using a vector path.
00401 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
00402 {
00403   Packet2d pa = _mm_set_sd(a);
00404   pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
00405 }
00406
// prefetch: issues a prefetch of `addr` with the _MM_HINT_T0 hint.
// Not defined here when EIGEN_VECTORIZE_AVX is set.
00407 #ifndef EIGEN_VECTORIZE_AVX
00408 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00409 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00410 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00411 #endif
00412
// pfirst: returns lane 0 of the packet. Three variants exist because of MSVC issues;
// do not simplify the MSVC branches, their exact form is the workaround.
00413 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
00414 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
00415 // Direct access to the struct members fixed bug #62.
00416 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
00417 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
00418 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00419 #elif EIGEN_COMP_MSVC_STRICT
00420 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
00421 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
00422 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
00423 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00424 #else
00425 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
00426 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
00427 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
00428 #endif
00429
// preverse: returns the packet with its lanes in reverse order.
// 0x1B is the shuffle immediate 00 01 10 11, i.e. lanes (3,2,1,0); 0x1 swaps the two doubles.
00430 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
00431 { return _mm_shuffle_ps(a,a,0x1B); }
00432 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
00433 { return _mm_shuffle_pd(a,a,0x1); }
00434 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
00435 { return _mm_shuffle_epi32(a,0x1B); }
00436
// protate_impl: rotates the lanes of a packet by a compile-time offset, so that
// result lane i is input lane (offset + i) modulo the packet size.
// Note that the first selector is passed as plain `offset` (not reduced modulo the size);
// presumably callers guarantee offset < packet size -- confirm against the callers.
00437 template<size_t offset>
00438 struct protate_impl<offset, Packet4f>
00439 {
00440   static Packet4f run(const Packet4f& a) {
00441     return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
00442   }
00443 };
00444
00445 template<size_t offset>
00446 struct protate_impl<offset, Packet4i>
00447 {
00448   static Packet4i run(const Packet4i& a) {
00449     return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
00450   }
00451 };
00452
00453 template<size_t offset>
00454 struct protate_impl<offset, Packet2d>
00455 {
00456   static Packet2d run(const Packet2d& a) {
00457     return vec2d_swizzle1(a, offset, (offset + 1) % 2);
00458   }
00459 };
00460
// pabs (float): clears the sign bit (bit 31) of every lane.
00461 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
00462 {
00463   const Packet4f magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
00464   return _mm_and_ps(a, magnitude_bits);
00465 }
// pabs (double): clears the top bit of the upper 32-bit word of each 64-bit lane.
00466 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
00467 {
00468   const Packet2d magnitude_bits = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
00469   return _mm_and_pd(a, magnitude_bits);
00470 }
// pabs (int): single instruction with SSSE3; otherwise (a ^ s) - s, where s is the
// arithmetic right shift of a by 31 (all ones for negative lanes, zero otherwise).
00471 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
00472 {
00473   #ifdef EIGEN_VECTORIZE_SSSE3
00474   return _mm_abs_epi32(a);
00475   #else
00476   Packet4i sign_fill = _mm_srai_epi32(a, 31);
00477   return _mm_sub_epi32(_mm_xor_si128(a, sign_fill), sign_fill);
00478   #endif
00479 }
00480
00481 // with AVX, the default implementations based on pload1 are faster
00482 #ifndef __AVX__
// pbroadcast4 (float): reads a[0..3] with one aligned load (pload) and fills
// a0..a3 with the broadcast of a[0]..a[3] respectively.
00483 template<> EIGEN_STRONG_INLINE void
00484 pbroadcast4<Packet4f>(const float *a,
00485                       Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
00486 {
      // a3 doubles as the temporary holding the loaded packet, so it must be overwritten last.
00487   a3 = pload<Packet4f>(a);
00488   a0 = vec4f_swizzle1(a3, 0,0,0,0);
00489   a1 = vec4f_swizzle1(a3, 1,1,1,1);
00490   a2 = vec4f_swizzle1(a3, 2,2,2,2);
00491   a3 = vec4f_swizzle1(a3, 3,3,3,3);
00492 }
// pbroadcast4 (double): fills a0..a3 with the broadcast of a[0]..a[3] respectively.
00493 template<> EIGEN_STRONG_INLINE void
00494 pbroadcast4<Packet2d>(const double *a,
00495                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
00496 {
00497 #ifdef EIGEN_VECTORIZE_SSE3
00498   a0 = _mm_loaddup_pd(a+0);
00499   a1 = _mm_loaddup_pd(a+1);
00500   a2 = _mm_loaddup_pd(a+2);
00501   a3 = _mm_loaddup_pd(a+3);
00502 #else
      // Two aligned loads (a[0..1] and a[2..3]); a1 and a3 are used as temporaries and
      // therefore overwritten after a0 and a2 have been derived from them.
00503   a1 = pload<Packet2d>(a);
00504   a0 = vec2d_swizzle1(a1, 0,0);
00505   a1 = vec2d_swizzle1(a1, 1,1);
00506   a3 = pload<Packet2d>(a+2);
00507   a2 = vec2d_swizzle1(a3, 0,0);
00508   a3 = vec2d_swizzle1(a3, 1,1);
00509 #endif
00510 }
00511 #endif
00512
// punpackp: expands vecs[0] in place into four packets, where vecs[i] becomes the
// broadcast of lane i of the original vecs[0]. `vecs` must point to at least 4 packets.
// vecs[0] is the source of every shuffle, so it is overwritten last.
00513 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
00514 {
00515   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
00516   vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
00517   vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
00518   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
00519 }
00520
// Sum reductions for float/double.
//   predux(a)     returns the sum of all lanes of a.
//   preduxp(vecs) returns a packet whose lane i is the sum of the lanes of vecs[i]
//                 (reads 4 packets for Packet4f, 2 packets for Packet2d).
00521 #ifdef EIGEN_VECTORIZE_SSE3
// SSE3 versions, built on the horizontal-add instructions.
00522 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00523 {
00524   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
00525 }
00526 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00527 {
00528   return _mm_hadd_pd(vecs[0], vecs[1]);
00529 }
00530
00531 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00532 {
00533   Packet4f tmp0 = _mm_hadd_ps(a,a);
00534   return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
00535 }
00536
00537 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
00538 #else
00539 // SSE2 versions
00540 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00541 {
      // Fold the upper half onto the lower half, then add lane 1 onto lane 0.
00542   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
00543   return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00544 }
00545 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
00546 {
00547   return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
00548 }
00549
00550 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00551 {
      // Interleave the packets pairwise (unpacklo/unpackhi), add the partial sums, then
      // recombine the low and high halves (movelh/movehl) for the final addition.
00552   Packet4f tmp0, tmp1, tmp2;
00553   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
00554   tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
00555   tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
00556   tmp0 = _mm_add_ps(tmp0, tmp1);
00557   tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
00558   tmp1 = _mm_add_ps(tmp1, tmp2);
00559   tmp2 = _mm_movehl_ps(tmp1, tmp0);
00560   tmp0 = _mm_movelh_ps(tmp0, tmp1);
00561   return _mm_add_ps(tmp0, tmp2);
00562 }
00563
00564 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00565 {
00566   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
00567 }
00568 #endif  // SSE3
00569
00570
// Sum reductions for int packets (same contracts as the float/double versions):
// horizontal adds with SSSE3, unpack/add sequences otherwise.
00571 #ifdef EIGEN_VECTORIZE_SSSE3
00572 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00573 {
00574   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
00575 }
00576 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00577 {
00578   Packet4i tmp0 = _mm_hadd_epi32(a,a);
00579   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
00580 }
00581 #else
00582 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00583 {
      // Fold the upper 64 bits onto the lower 64 bits, then add lanes 0 and 1 as scalars.
00584   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
00585   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
00586 }
00587
00588 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00589 {
      // Integer counterpart of the SSE2 float preduxp: interleave, add, recombine halves, add.
00590   Packet4i tmp0, tmp1, tmp2;
00591   tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
00592   tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
00593   tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
00594   tmp0 = _mm_add_epi32(tmp0, tmp1);
00595   tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
00596   tmp1 = _mm_add_epi32(tmp1, tmp2);
00597   tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
00598   tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
00599   return _mm_add_epi32(tmp0, tmp2);
00600 }
00601 #endif
00602 // Other reduction functions:
00603
00604 // mul
// predux_mul (float): returns the product of all four lanes. The upper half is multiplied
// onto the lower half, then lane 1 onto lane 0.
00605 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00606 {
00607   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
00608   return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00609 }
// predux_mul (double): returns the product of both lanes.
00610 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
00611 {
00612   return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
00613 }
// predux_mul (int): returns the product of all four lanes, computed with scalar
// multiplications after spilling the packet to a 16-byte aligned buffer.
00614 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00615 {
00616   // after some experiments, it seems this is the fastest way to implement it
00617   // for GCC (e.g., reusing pmul is very slow!)
00618   // TODO try to call _mm_mul_epu32 directly
00619   EIGEN_ALIGN16 int aux[4];
00620   pstore(aux, a);
00621   return (aux[0] * aux[1]) * (aux[2] * aux[3]);
00622 }
00623
00624 // min
// predux_min: returns the smallest lane of the packet.
00625 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00626 {
00627   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
00628   return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00629 }
00630 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
00631 {
00632   return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
00633 }
00634 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00635 {
00636 #ifdef EIGEN_VECTORIZE_SSE4_1
      // Compare the lower half against the upper half, then lane 0 against lane 1.
00637   Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
00638   return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
00639 #else
00640   // after some experiments, it seems this is the fastest way to implement it
00641   // for GCC (e.g., it does not like using std::min after the pstore!)
00642   EIGEN_ALIGN16 int aux[4];
00643   pstore(aux, a);
00644   int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
00645   int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
00646   return aux0<aux2 ? aux0 : aux2;
00647 #endif // EIGEN_VECTORIZE_SSE4_1
00648 }
00649
00650 // max
// predux_max: returns the largest lane of the packet (mirror image of predux_min).
00651 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00652 {
00653   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
00654   return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00655 }
00656 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
00657 {
00658   return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
00659 }
00660 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00661 {
00662 #ifdef EIGEN_VECTORIZE_SSE4_1
      // Compare the lower half against the upper half, then lane 0 against lane 1.
00663   Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
00664   return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
00665 #else
00666   // after some experiments, it seems this is the fastest way to implement it
00667   // for GCC (e.g., it does not like using std::min after the pstore!)
00668   EIGEN_ALIGN16 int aux[4];
00669   pstore(aux, a);
00670   int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
00671   int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
00672   return aux0>aux2 ? aux0 : aux2;
00673 #endif // EIGEN_VECTORIZE_SSE4_1
00674 }
00675
00676 #if EIGEN_COMP_GNUC
00677 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
00678 // {
00679 //   Packet4f res = b;
00680 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
00681 //   return res;
00682 // }
00683 // EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
00684 // {
00685 //   Packet4i res = a;
00686 //   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
00687 //   return res;
00688 // }
00689 #endif
00690
00691 #ifdef EIGEN_VECTORIZE_SSSE3
00692 // SSSE3 versions
00693 template<int Offset>
00694 struct palign_impl<Offset,Packet4f>
00695 {
00696   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00697   {
00698     if (Offset!=0)
00699       first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
00700   }
00701 };
00702
00703 template<int Offset>
00704 struct palign_impl<Offset,Packet4i>
00705 {
00706   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00707   {
00708     if (Offset!=0)
00709       first = _mm_alignr_epi8(second,first, Offset*4);
00710   }
00711 };
00712
00713 template<int Offset>
00714 struct palign_impl<Offset,Packet2d>
00715 {
00716   static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
00717   {
00718     if (Offset==1)
00719       first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
00720   }
00721 };
00722 #else
00723 // SSE2 versions
00724 template<int Offset>
00725 struct palign_impl<Offset,Packet4f>
00726 {
00727   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00728   {
00729     if (Offset==1)
00730     {
00731       first = _mm_move_ss(first,second);
00732       first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
00733     }
00734     else if (Offset==2)
00735     {
00736       first = _mm_movehl_ps(first,first);
00737       first = _mm_movelh_ps(first,second);
00738     }
00739     else if (Offset==3)
00740     {
00741       first = _mm_move_ss(first,second);
00742       first = _mm_shuffle_ps(first,second,0x93);
00743     }
00744   }
00745 };
00746
00747 template<int Offset>
00748 struct palign_impl<Offset,Packet4i>
00749 {
00750   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00751   {
00752     if (Offset==1)
00753     {
00754       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00755       first = _mm_shuffle_epi32(first,0x39);
00756     }
00757     else if (Offset==2)
00758     {
00759       first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
00760       first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00761     }
00762     else if (Offset==3)
00763     {
00764       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00765       first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
00766     }
00767   }
00768 };
00769
00770 template<int Offset>
00771 struct palign_impl<Offset,Packet2d>
00772 {
00773   static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
00774   {
00775     if (Offset==1)
00776     {
00777       first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
00778       first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
00779     }
00780   }
00781 };
00782 #endif
00783
00784 EIGEN_DEVICE_FUNC inline void
00785 ptranspose(PacketBlock<Packet4f,4>& kernel) {
00786   _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
00787 }
00788
00789 EIGEN_DEVICE_FUNC inline void
00790 ptranspose(PacketBlock<Packet2d,2>& kernel) {
00791   __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
00792   kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
00793   kernel.packet[1] = tmp;
00794 }
00795
00796 EIGEN_DEVICE_FUNC inline void
00797 ptranspose(PacketBlock<Packet4i,4>& kernel) {
00798   __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
00799   __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
00800   __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
00801   __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
00802
00803   kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
00804   kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
00805   kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
00806   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
00807 }
00808
00809 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
00810   const __m128i zero = _mm_setzero_si128();
00811   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
00812   __m128i false_mask = _mm_cmpeq_epi32(select, zero);
00813 #ifdef EIGEN_VECTORIZE_SSE4_1
00814   return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
00815 #else
00816   return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
00817 #endif
00818 }
00819 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
00820   const __m128 zero = _mm_setzero_ps();
00821   const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
00822   __m128 false_mask = _mm_cmpeq_ps(select, zero);
00823 #ifdef EIGEN_VECTORIZE_SSE4_1
00824   return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
00825 #else
00826   return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
00827 #endif
00828 }
00829 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
00830   const __m128d zero = _mm_setzero_pd();
00831   const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
00832   __m128d false_mask = _mm_cmpeq_pd(select, zero);
00833 #ifdef EIGEN_VECTORIZE_SSE4_1
00834   return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
00835 #else
00836   return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
00837 #endif
00838 }
00839
00840 } // end namespace internal
00841
00842 } // end namespace Eigen
00843
00844 #endif // EIGEN_PACKET_MATH_SSE_H