// Listing extracted from a MOAB 4.9.3pre Doxygen build of Eigen's AVX PacketMath.h.
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2014 Benoit Steiner ([email protected]) 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_PACKET_MATH_AVX_H 00011 #define EIGEN_PACKET_MATH_AVX_H 00012 00013 namespace Eigen { 00014 00015 namespace internal { 00016 00017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 00018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 00019 #endif 00020 00021 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 00022 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) 00023 #endif 00024 00025 #ifdef __FMA__ 00026 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 00027 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 00028 #endif 00029 #endif 00030 00031 typedef __m256 Packet8f; 00032 typedef __m256i Packet8i; 00033 typedef __m256d Packet4d; 00034 00035 template<> struct is_arithmetic<__m256> { enum { value = true }; }; 00036 template<> struct is_arithmetic<__m256i> { enum { value = true }; }; 00037 template<> struct is_arithmetic<__m256d> { enum { value = true }; }; 00038 00039 #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ 00040 const Packet8f p8f_##NAME = pset1<Packet8f>(X) 00041 00042 #define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ 00043 const Packet4d p4d_##NAME = pset1<Packet4d>(X) 00044 00045 #define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ 00046 const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X)) 00047 00048 #define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ 00049 const Packet8i p8i_##NAME = pset1<Packet8i>(X) 00050 00051 00052 template<> struct packet_traits<float> : default_packet_traits 00053 { 00054 typedef Packet8f type; 00055 typedef Packet4f half; 00056 enum { 00057 Vectorizable = 1, 00058 AlignedOnScalar = 1, 00059 
size=8, 00060 HasHalfPacket = 1, 00061 00062 HasDiv = 1, 00063 HasSin = EIGEN_FAST_MATH, 00064 HasCos = 0, 00065 HasLog = 1, 00066 HasExp = 1, 00067 HasSqrt = 1, 00068 HasRsqrt = 1, 00069 HasTanh = EIGEN_FAST_MATH, 00070 HasBlend = 1, 00071 HasRound = 1, 00072 HasFloor = 1, 00073 HasCeil = 1 00074 }; 00075 }; 00076 template<> struct packet_traits<double> : default_packet_traits 00077 { 00078 typedef Packet4d type; 00079 typedef Packet2d half; 00080 enum { 00081 Vectorizable = 1, 00082 AlignedOnScalar = 1, 00083 size=4, 00084 HasHalfPacket = 1, 00085 00086 HasDiv = 1, 00087 HasExp = 1, 00088 HasSqrt = 1, 00089 HasRsqrt = 1, 00090 HasBlend = 1, 00091 HasRound = 1, 00092 HasFloor = 1, 00093 HasCeil = 1 00094 }; 00095 }; 00096 00097 /* Proper support for integers is only provided by AVX2. In the meantime, we'll 00098 use SSE instructions and packets to deal with integers. 00099 template<> struct packet_traits<int> : default_packet_traits 00100 { 00101 typedef Packet8i type; 00102 enum { 00103 Vectorizable = 1, 00104 AlignedOnScalar = 1, 00105 size=8 00106 }; 00107 }; 00108 */ 00109 00110 template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; 00111 template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; 00112 template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; 00113 00114 template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } 00115 template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } 00116 template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); } 00117 00118 template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); } 00119 template<> 
EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); } 00120 00121 template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } 00122 template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } 00123 00124 template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } 00125 template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } 00126 00127 template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } 00128 template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } 00129 00130 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) 00131 { 00132 return _mm256_sub_ps(_mm256_set1_ps(0.0),a); 00133 } 00134 template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) 00135 { 00136 return _mm256_sub_pd(_mm256_set1_pd(0.0),a); 00137 } 00138 00139 template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } 00140 template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } 00141 template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } 00142 00143 template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } 00144 template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } 00145 00146 00147 template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } 00148 template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const 
Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } 00149 template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) 00150 { eigen_assert(false && "packet integer division are not supported by AVX"); 00151 return pset1<Packet8i>(0); 00152 } 00153 00154 #ifdef __FMA__ 00155 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { 00156 #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG 00157 // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, 00158 // and gcc stupidly generates a vfmadd132ps instruction, 00159 // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate 00160 // the result of the product. 00161 Packet8f res = c; 00162 __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); 00163 return res; 00164 #else 00165 return _mm256_fmadd_ps(a,b,c); 00166 #endif 00167 } 00168 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { 00169 #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG 00170 // see above 00171 Packet4d res = c; 00172 __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); 00173 return res; 00174 #else 00175 return _mm256_fmadd_pd(a,b,c); 00176 #endif 00177 } 00178 #endif 00179 00180 template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } 00181 template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } 00182 00183 template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } 00184 template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } 00185 00186 template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) 
{ return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } 00187 template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } 00188 00189 template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); } 00190 template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); } 00191 00192 template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); } 00193 template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); } 00194 00195 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } 00196 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } 00197 00198 template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } 00199 template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } 00200 00201 template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } 00202 template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } 00203 00204 template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } 00205 template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } 00206 00207 template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } 00208 template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return 
_mm256_load_pd(from); } 00209 template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); } 00210 00211 template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } 00212 template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } 00213 template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } 00214 00215 // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} 00216 template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) 00217 { 00218 // TODO try to find a way to avoid the need of a temporary register 00219 // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); 00220 // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); 00221 // return _mm256_unpacklo_ps(tmp,tmp); 00222 00223 // _mm256_insertf128_ps is very slow on Haswell, thus: 00224 Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); 00225 // mimic an "inplace" permutation of the lower 128bits using a blend 00226 tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); 00227 // then we can perform a consistent permutation on the global register to get everything in shape: 00228 return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); 00229 } 00230 // Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} 00231 template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) 00232 { 00233 Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); 00234 return _mm256_permute_pd(tmp, 3<<2); 00235 } 00236 00237 // Loads 2 
floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} 00238 template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) 00239 { 00240 Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); 00241 return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); 00242 } 00243 00244 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } 00245 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } 00246 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } 00247 00248 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } 00249 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } 00250 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } 00251 00252 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available 00253 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); 00254 template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) 00255 { 00256 return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], 00257 from[3*stride], from[2*stride], from[1*stride], from[0*stride]); 00258 } 00259 template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) 00260 { 00261 return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], 
from[0*stride]); 00262 } 00263 00264 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) 00265 { 00266 __m128 low = _mm256_extractf128_ps(from, 0); 00267 to[stride*0] = _mm_cvtss_f32(low); 00268 to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); 00269 to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); 00270 to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); 00271 00272 __m128 high = _mm256_extractf128_ps(from, 1); 00273 to[stride*4] = _mm_cvtss_f32(high); 00274 to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); 00275 to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); 00276 to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); 00277 } 00278 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) 00279 { 00280 __m128d low = _mm256_extractf128_pd(from, 0); 00281 to[stride*0] = _mm_cvtsd_f64(low); 00282 to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); 00283 __m128d high = _mm256_extractf128_pd(from, 1); 00284 to[stride*2] = _mm_cvtsd_f64(high); 00285 to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); 00286 } 00287 00288 template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) 00289 { 00290 Packet8f pa = pset1<Packet8f>(a); 00291 pstore(to, pa); 00292 } 00293 template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) 00294 { 00295 Packet4d pa = pset1<Packet4d>(a); 00296 pstore(to, pa); 00297 } 00298 template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) 00299 { 00300 Packet8i pa = pset1<Packet8i>(a); 00301 pstore(to, pa); 00302 } 00303 00304 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 00305 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 00306 template<> 
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 00307 00308 template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { 00309 return _mm_cvtss_f32(_mm256_castps256_ps128(a)); 00310 } 00311 template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) { 00312 return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); 00313 } 00314 template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) { 00315 return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); 00316 } 00317 00318 00319 template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) 00320 { 00321 __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); 00322 return _mm256_permute2f128_ps(tmp, tmp, 1); 00323 } 00324 template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) 00325 { 00326 __m256d tmp = _mm256_shuffle_pd(a,a,5); 00327 return _mm256_permute2f128_pd(tmp, tmp, 1); 00328 00329 __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); 00330 return _mm256_permute_pd(swap_halves,5); 00331 } 00332 00333 // pabs should be ok 00334 template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) 00335 { 00336 const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); 00337 return _mm256_and_ps(a,mask); 00338 } 00339 template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) 00340 { 00341 const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); 00342 return _mm256_and_pd(a,mask); 00343 } 00344 00345 // preduxp should be ok 00346 // FIXME: why is this ok? why isn't the simply implementation working as expected? 
00347 template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs) 00348 { 00349 __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); 00350 __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); 00351 __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); 00352 __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); 00353 00354 __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); 00355 __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); 00356 __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); 00357 __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); 00358 00359 __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); 00360 __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); 00361 __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); 00362 __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); 00363 00364 __m256 sum1 = _mm256_add_ps(perm1, hsum5); 00365 __m256 sum2 = _mm256_add_ps(perm2, hsum6); 00366 __m256 sum3 = _mm256_add_ps(perm3, hsum7); 00367 __m256 sum4 = _mm256_add_ps(perm4, hsum8); 00368 00369 __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); 00370 __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); 00371 00372 __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); 00373 return final; 00374 } 00375 template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs) 00376 { 00377 Packet4d tmp0, tmp1; 00378 00379 tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); 00380 tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); 00381 00382 tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); 00383 tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); 00384 00385 return _mm256_blend_pd(tmp0, tmp1, 0xC); 00386 } 00387 00388 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) 00389 { 00390 Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1)); 00391 tmp0 = _mm256_hadd_ps(tmp0,tmp0); 00392 return pfirst(_mm256_hadd_ps(tmp0, tmp0)); 00393 } 00394 template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) 00395 { 00396 
Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1)); 00397 return pfirst(_mm256_hadd_pd(tmp0,tmp0)); 00398 } 00399 00400 template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a) 00401 { 00402 return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); 00403 } 00404 00405 template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) 00406 { 00407 Packet8f tmp; 00408 tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); 00409 tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 00410 return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 00411 } 00412 template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) 00413 { 00414 Packet4d tmp; 00415 tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); 00416 return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); 00417 } 00418 00419 template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) 00420 { 00421 Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); 00422 tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 00423 return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 00424 } 00425 template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) 00426 { 00427 Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); 00428 return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); 00429 } 00430 00431 template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) 00432 { 00433 Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); 00434 tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); 00435 return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); 00436 } 00437 00438 template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) 00439 { 00440 Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); 00441 return pfirst(_mm256_max_pd(tmp, 
_mm256_shuffle_pd(tmp, tmp, 1))); 00442 } 00443 00444 00445 template<int Offset> 00446 struct palign_impl<Offset,Packet8f> 00447 { 00448 static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) 00449 { 00450 if (Offset==1) 00451 { 00452 first = _mm256_blend_ps(first, second, 1); 00453 Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); 00454 Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 00455 first = _mm256_blend_ps(tmp1, tmp2, 0x88); 00456 } 00457 else if (Offset==2) 00458 { 00459 first = _mm256_blend_ps(first, second, 3); 00460 Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); 00461 Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 00462 first = _mm256_blend_ps(tmp1, tmp2, 0xcc); 00463 } 00464 else if (Offset==3) 00465 { 00466 first = _mm256_blend_ps(first, second, 7); 00467 Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); 00468 Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 00469 first = _mm256_blend_ps(tmp1, tmp2, 0xee); 00470 } 00471 else if (Offset==4) 00472 { 00473 first = _mm256_blend_ps(first, second, 15); 00474 Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); 00475 Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); 00476 first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); 00477 } 00478 else if (Offset==5) 00479 { 00480 first = _mm256_blend_ps(first, second, 31); 00481 first = _mm256_permute2f128_ps(first, first, 1); 00482 Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); 00483 first = _mm256_permute2f128_ps(tmp, tmp, 1); 00484 first = _mm256_blend_ps(tmp, first, 0x88); 00485 } 00486 else if (Offset==6) 00487 { 00488 first = _mm256_blend_ps(first, second, 63); 00489 first = _mm256_permute2f128_ps(first, first, 1); 00490 Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); 00491 first = _mm256_permute2f128_ps(tmp, tmp, 1); 00492 first = _mm256_blend_ps(tmp, first, 0xcc); 00493 } 00494 else if (Offset==7) 00495 { 00496 
first = _mm256_blend_ps(first, second, 127); 00497 first = _mm256_permute2f128_ps(first, first, 1); 00498 Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); 00499 first = _mm256_permute2f128_ps(tmp, tmp, 1); 00500 first = _mm256_blend_ps(tmp, first, 0xee); 00501 } 00502 } 00503 }; 00504 00505 template<int Offset> 00506 struct palign_impl<Offset,Packet4d> 00507 { 00508 static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) 00509 { 00510 if (Offset==1) 00511 { 00512 first = _mm256_blend_pd(first, second, 1); 00513 __m256d tmp = _mm256_permute_pd(first, 5); 00514 first = _mm256_permute2f128_pd(tmp, tmp, 1); 00515 first = _mm256_blend_pd(tmp, first, 0xA); 00516 } 00517 else if (Offset==2) 00518 { 00519 first = _mm256_blend_pd(first, second, 3); 00520 first = _mm256_permute2f128_pd(first, first, 1); 00521 } 00522 else if (Offset==3) 00523 { 00524 first = _mm256_blend_pd(first, second, 7); 00525 __m256d tmp = _mm256_permute_pd(first, 5); 00526 first = _mm256_permute2f128_pd(tmp, tmp, 1); 00527 first = _mm256_blend_pd(tmp, first, 5); 00528 } 00529 } 00530 }; 00531 00532 EIGEN_DEVICE_FUNC inline void 00533 ptranspose(PacketBlock<Packet8f,8>& kernel) { 00534 __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); 00535 __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); 00536 __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); 00537 __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); 00538 __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); 00539 __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); 00540 __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); 00541 __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); 00542 __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); 00543 __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); 00544 __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); 00545 __m256 S3 
= _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); 00546 __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); 00547 __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); 00548 __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); 00549 __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); 00550 kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); 00551 kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); 00552 kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); 00553 kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); 00554 kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); 00555 kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); 00556 kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); 00557 kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); 00558 } 00559 00560 EIGEN_DEVICE_FUNC inline void 00561 ptranspose(PacketBlock<Packet8f,4>& kernel) { 00562 __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); 00563 __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); 00564 __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); 00565 __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); 00566 00567 __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); 00568 __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); 00569 __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); 00570 __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); 00571 00572 kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); 00573 kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); 00574 kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); 00575 kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); 00576 } 00577 00578 EIGEN_DEVICE_FUNC inline void 00579 ptranspose(PacketBlock<Packet4d,4>& kernel) { 00580 __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); 00581 __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); 00582 
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); 00583 __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); 00584 00585 kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); 00586 kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); 00587 kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); 00588 kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); 00589 } 00590 00591 template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { 00592 const __m256 zero = _mm256_setzero_ps(); 00593 const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); 00594 __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); 00595 return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); 00596 } 00597 template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { 00598 const __m256d zero = _mm256_setzero_pd(); 00599 const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); 00600 __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); 00601 return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); 00602 } 00603 00604 } // end namespace internal 00605 00606 } // end namespace Eigen 00607 00608 #endif // EIGEN_PACKET_MATH_AVX_H