MOAB
4.9.3pre
|
#include <SelfadjointMatrixVector.h>
Static Public Member Functions | |
static EIGEN_DONT_INLINE void | run (Index size, const Scalar *lhs, Index lhsStride, const Scalar *rhs, Scalar *res, Scalar alpha) |
Definition at line 27 of file SelfadjointMatrixVector.h.
EIGEN_DONT_INLINE void Eigen::internal::selfadjoint_matrix_vector_product< Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs, Version >::run | ( | Index | size, |
const Scalar * | lhs, | ||
Index | lhsStride, | ||
const Scalar * | rhs, | ||
Scalar * | res, | ||
Scalar | alpha | ||
) | [static] |
Definition at line 39 of file SelfadjointMatrixVector.h.
{ typedef typename packet_traits<Scalar>::type Packet; typedef typename NumTraits<Scalar>::Real RealScalar; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); enum { IsRowMajor = StorageOrder==RowMajor ? 1 : 0, IsLower = UpLo == Lower ? 1 : 0, FirstTriangular = IsRowMajor == IsLower }; conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd; conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha; Index bound = (std::max)(Index(0),size-8) & 0xfffffffe; if (FirstTriangular) bound = size - bound; for (Index j=FirstTriangular ? bound : 0; j<(FirstTriangular ? size : bound);j+=2) { const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; Scalar t0 = cjAlpha * rhs[j]; Packet ptmp0 = pset1<Packet>(t0); Scalar t1 = cjAlpha * rhs[j+1]; Packet ptmp1 = pset1<Packet>(t1); Scalar t2(0); Packet ptmp2 = pset1<Packet>(t2); Scalar t3(0); Packet ptmp3 = pset1<Packet>(t3); size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); res[j] += cjd.pmul(numext::real(A0[j]), t0); res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1); if(FirstTriangular) { res[j] += cj0.pmul(A1[j], t1); t3 += cj1.pmul(A1[j], rhs[j]); } else { res[j+1] += cj0.pmul(A0[j+1],t0); t2 += cj1.pmul(A0[j+1], rhs[j+1]); } for (size_t i=starti; i<alignedStart; ++i) { res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); t2 += cj1.pmul(A0[i], rhs[i]); t3 += cj1.pmul(A1[i], rhs[i]); } // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up) // gcc 4.2 does this optimization automatically. const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize; Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize; Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases Packet Xi = pload <Packet>(resIt); Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); pstore(resIt,Xi); resIt += PacketSize; } for (size_t i=alignedEnd; i<endi; i++) { res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); t2 += cj1.pmul(A0[i], rhs[i]); t3 += cj1.pmul(A1[i], rhs[i]); } res[j] += alpha * (t2 + predux(ptmp2)); res[j+1] += alpha * (t3 + predux(ptmp3)); } for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++) { const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; Scalar t1 = cjAlpha * rhs[j]; Scalar t2(0); res[j] += cjd.pmul(numext::real(A0[j]), t1); for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) { res[i] += cj0.pmul(A0[i], t1); t2 += cj1.pmul(A0[i], rhs[i]); } res[j] += alpha * t2; } }