MOAB
4.9.3pre
|
#include <GeneralBlockPanelKernel.h>
Public Types | |
enum | { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, RhsProgress = Traits::RhsProgress, ResPacketSize = Traits::ResPacketSize } |
typedef gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs > | Traits |
typedef Traits::ResScalar | ResScalar |
typedef Traits::LhsPacket | LhsPacket |
typedef Traits::RhsPacket | RhsPacket |
typedef Traits::ResPacket | ResPacket |
typedef Traits::AccPacket | AccPacket |
typedef gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs > | SwappedTraits |
typedef SwappedTraits::ResScalar | SResScalar |
typedef SwappedTraits::LhsPacket | SLhsPacket |
typedef SwappedTraits::RhsPacket | SRhsPacket |
typedef SwappedTraits::ResPacket | SResPacket |
typedef SwappedTraits::AccPacket | SAccPacket |
typedef DataMapper::LinearMapper | LinearMapper |
Public Member Functions | |
EIGEN_DONT_INLINE void | operator() (const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0) |
Static Public Attributes | |
static const bool | UseRotatingKernel |
Definition at line 948 of file GeneralBlockPanelKernel.h.
typedef Traits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::AccPacket |
Definition at line 955 of file GeneralBlockPanelKernel.h.
typedef Traits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LhsPacket |
Definition at line 952 of file GeneralBlockPanelKernel.h.
typedef DataMapper::LinearMapper Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LinearMapper |
Definition at line 964 of file GeneralBlockPanelKernel.h.
typedef Traits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResPacket |
Definition at line 954 of file GeneralBlockPanelKernel.h.
typedef Traits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResScalar |
Definition at line 951 of file GeneralBlockPanelKernel.h.
typedef Traits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacket |
Definition at line 953 of file GeneralBlockPanelKernel.h.
typedef SwappedTraits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SAccPacket |
Definition at line 962 of file GeneralBlockPanelKernel.h.
typedef SwappedTraits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SLhsPacket |
Definition at line 959 of file GeneralBlockPanelKernel.h.
typedef SwappedTraits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResPacket |
Definition at line 961 of file GeneralBlockPanelKernel.h.
typedef SwappedTraits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResScalar |
Definition at line 958 of file GeneralBlockPanelKernel.h.
typedef SwappedTraits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SRhsPacket |
Definition at line 960 of file GeneralBlockPanelKernel.h.
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SwappedTraits |
Definition at line 957 of file GeneralBlockPanelKernel.h.
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::Traits |
Definition at line 950 of file GeneralBlockPanelKernel.h.
anonymous enum |
Definition at line 966 of file GeneralBlockPanelKernel.h.
EIGEN_DONT_INLINE void Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::operator() (
    const DataMapper & res,
    const LhsScalar * blockA,
    const RhsScalar * blockB,
    Index rows,
    Index depth,
    Index cols,
    ResScalar alpha,
    Index strideA = -1,
    Index strideB = -1,
    Index offsetA = 0,
    Index offsetB = 0
)
Definition at line 992 of file GeneralBlockPanelKernel.h.
// Body of gebp_kernel::operator(): the block-times-panel micro kernel.
//
// The scalar tail at the very end spells the update out explicitly:
//   res(i, j2) += alpha * C0, with C0 accumulated by CJMADD over blA[k], blB[k]
//   for k in [0, depth).
// The packet paths follow the same load / madd / acc(alphav) / store shape through
// the `traits` / `straits` helpers (presumably the same update, vectorized -- the
// helpers are not visible here).
//
// Arguments, as they are used below:
//   res         result mapper; read and written back through getLinearMapper() +
//               loadPacket()/storePacket(), gatherPacket()/scatterPacket() and res(i, j).
//   blockA      lhs buffer; the rows starting at i are read from
//               blockA[i*strideA + offsetA*R], R being the number of rows handled per
//               step (3, 2 or 1 times LhsProgress, or 1 for the leftover rows).
//   blockB      rhs buffer; a 4-column panel is read from blockB[j2*strideB + offsetB*nr],
//               a single leftover column from blockB[j2*strideB + offsetB].
//   rows, depth, cols
//               extents of the row, inner-product and column loops.
//   alpha       factor applied when the accumulators are added into res.
//   strideA/B   a value of -1 is replaced by depth.
//   offsetA/B   extra offsets into blockA / blockB (see the indexing above).
//
// Row phases (1-3 are gated on mr, 4 on peeled_mc1 < rows):
//   1. [0, peeled_mc3)           3*LhsProgress rows per step
//   2. [peeled_mc3, peeled_mc2)  2*LhsProgress rows per step
//   3. [peeled_mc2, peeled_mc1)  1*LhsProgress rows per step
//   4. [peeled_mc1, rows)        one row per step; 4-column panels use the SwappedTraits
//                                packet path when SwappedTraits::LhsProgress % 4 == 0,
//                                otherwise plain scalar code.
// In every phase columns [0, packet_cols4) are handled four at a time and
// [packet_cols4, cols) one at a time. In phases 1-3 the depth loop is unrolled by
// pk = 8 up to peeled_kc = depth & ~(pk-1), followed by a one-step tail loop; the
// per-step work lives in a local *_ONESTEP macro that is #undef'ed after each use.
//
// NOTE(review): actual_panel_rows (phases 1 and 2) divides by
//   depth * sizeof(LhsScalar) * k*LhsProgress, which is zero when depth == 0 --
//   confirm callers never pass an empty depth.
// NOTE(review): the same expression mixes l1 with sizeof() terms; presumably the
//   subtraction is carried out unsigned and wraps instead of going negative when the
//   operands exceed l1 -- confirm that the max(1, ...) guard behaves as intended.
// NOTE(review): the panel loops advance j2 by nr but always touch columns j2+0..j2+3,
//   which looks like an assumption that nr == 4 -- confirm.
// NOTE(review): SRhsPacketHalf is derived from unpacket_traits<SLhsPacket>::half, not
//   from SRhsPacket -- confirm this is intentional.
{ Traits traits; SwappedTraits straits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); const Index prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- // This corresponds to 3*LhsProgress x nr register blocks. // Usually, this makes sense only with FMA if(mr>=3*Traits::LhsProgress) { PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits); // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth) // and on each largest micro vertical panel of the rhs (depth * nr). // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1. // However, if depth is too small, we can extend the number of rows of these horizontal panels. // This actual number of rows is computed as follows: const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), // or because we are testing specific blocking sizes. 
const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows) { const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3); for(Index j2=0; j2<packet_cols4; j2+=nr) { for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) { // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely // stored into 3 x nr registers. const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)]; prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); LinearMapper r0 = res.getLinearMapper(i, j2 + 0); LinearMapper r1 = res.getLinearMapper(i, j2 + 1); LinearMapper r2 = res.getLinearMapper(i, j2 + 2); LinearMapper r3 = res.getLinearMapper(i, j2 + 3); r0.prefetch(0); r1.prefetch(0); r2.prefetch(0); r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; prefetch(&blB[0]); LhsPacket A0, A1; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); RhsPacket B_0, T0; LhsPacket A2; #define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ internal::prefetch(blA+(3*K+16)*LhsProgress); \ if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \ traits.madd(A0, B_0, C0, T0); 
\ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ } while(false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); EIGEN_GEBP_ONESTEP(1); EIGEN_GEBP_ONESTEP(2); EIGEN_GEBP_ONESTEP(3); EIGEN_GEBP_ONESTEP(4); EIGEN_GEBP_ONESTEP(5); EIGEN_GEBP_ONESTEP(6); EIGEN_GEBP_ONESTEP(7); blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0, T0; LhsPacket A2; EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; blA += 3*Traits::LhsProgress; } #undef EIGEN_GEBP_ONESTEP possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3); possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7); possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11); ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * Traits::ResPacketSize); R1 = r0.loadPacket(1 * Traits::ResPacketSize); R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); r0.storePacket(0 * Traits::ResPacketSize, R0); r0.storePacket(1 * Traits::ResPacketSize, R1); r0.storePacket(2 * Traits::ResPacketSize, R2); R0 = r1.loadPacket(0 * Traits::ResPacketSize); R1 = r1.loadPacket(1 * Traits::ResPacketSize); R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); 
traits.acc(C9, alphav, R2); r1.storePacket(0 * Traits::ResPacketSize, R0); r1.storePacket(1 * Traits::ResPacketSize, R1); r1.storePacket(2 * Traits::ResPacketSize, R2); R0 = r2.loadPacket(0 * Traits::ResPacketSize); R1 = r2.loadPacket(1 * Traits::ResPacketSize); R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); r2.storePacket(0 * Traits::ResPacketSize, R0); r2.storePacket(1 * Traits::ResPacketSize, R1); r2.storePacket(2 * Traits::ResPacketSize, R2); R0 = r3.loadPacket(0 * Traits::ResPacketSize); R1 = r3.loadPacket(1 * Traits::ResPacketSize); R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); r3.storePacket(0 * Traits::ResPacketSize, R0); r3.storePacket(1 * Traits::ResPacketSize, R1); r3.storePacket(2 * Traits::ResPacketSize, R2); } } // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) { // One column at a time const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; prefetch(&blA[0]); // gets res block as register AccPacket C0, C4, C8; traits.initAcc(C0); traits.initAcc(C4); traits.initAcc(C8); LinearMapper r0 = res.getLinearMapper(i, j2); r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; LhsPacket A0, A1, A2; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ traits.madd(A0, B_0, C0, B_0); \ traits.madd(A1, B_0, C4, B_0); \ traits.madd(A2, B_0, 
C8, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ } while(false) EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3); EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); blB += pk*RhsProgress; blA += pk*3*Traits::LhsProgress; EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0; EIGEN_GEBGP_ONESTEP(0); blB += RhsProgress; blA += 3*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * Traits::ResPacketSize); R1 = r0.loadPacket(1 * Traits::ResPacketSize); R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); r0.storePacket(0 * Traits::ResPacketSize, R0); r0.storePacket(1 * Traits::ResPacketSize, R1); r0.storePacket(2 * Traits::ResPacketSize, R2); } } } } //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), // or because we are testing specific blocking sizes. 
Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows) { Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2); for(Index j2=0; j2<packet_cols4; j2+=nr) { for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) { // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely // stored into 2 x nr registers. const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3, C4, C5, C6, C7; traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); LinearMapper r0 = res.getLinearMapper(i, j2 + 0); LinearMapper r1 = res.getLinearMapper(i, j2 + 1); LinearMapper r2 = res.getLinearMapper(i, j2 + 2); LinearMapper r3 = res.getLinearMapper(i, j2 + 3); r0.prefetch(prefetch_res_offset); r1.prefetch(prefetch_res_offset); r2.prefetch(prefetch_res_offset); r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; prefetch(&blB[0]); LhsPacket A0, A1; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); RhsPacket B_0, B1, B2, B3, T0; #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, B_0); \ traits.madd(A0, B1, C1, T0); \ traits.madd(A1, B1, C5, B1); \ traits.madd(A0, B2, C2, T0); \ traits.madd(A1, B2, C6, B2); \ traits.madd(A0, B3, C3, T0); \ traits.madd(A1, 
B3, C7, B3); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ } while(false) internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3); internal::prefetch(blB+(48+16)); EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0, B1, B2, B3, T0; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP ResPacket R0, R1, R2, R3; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * Traits::ResPacketSize); R1 = r0.loadPacket(1 * Traits::ResPacketSize); R2 = r1.loadPacket(0 * Traits::ResPacketSize); R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); r0.storePacket(0 * Traits::ResPacketSize, R0); r0.storePacket(1 * Traits::ResPacketSize, R1); r1.storePacket(0 * Traits::ResPacketSize, R2); r1.storePacket(1 * Traits::ResPacketSize, R3); R0 = r2.loadPacket(0 * Traits::ResPacketSize); R1 = r2.loadPacket(1 * Traits::ResPacketSize); R2 = r3.loadPacket(0 * Traits::ResPacketSize); R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); r2.storePacket(0 * Traits::ResPacketSize, R0); r2.storePacket(1 * Traits::ResPacketSize, R1); r3.storePacket(0 * Traits::ResPacketSize, R2); r3.storePacket(1 * Traits::ResPacketSize, R3); } } // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) { // One column at a time const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; prefetch(&blA[0]); // gets res block as 
register AccPacket C0, C4; traits.initAcc(C0); traits.initAcc(C4); LinearMapper r0 = res.getLinearMapper(i, j2); r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; LhsPacket A0, A1; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1"); RhsPacket B_0, B1; #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ traits.madd(A0, B_0, C0, B1); \ traits.madd(A1, B_0, C4, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3); EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); blB += pk*RhsProgress; blA += pk*2*Traits::LhsProgress; EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0, B1; EIGEN_GEBGP_ONESTEP(0); blB += RhsProgress; blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP ResPacket R0, R1; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * Traits::ResPacketSize); R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); r0.storePacket(1 * Traits::ResPacketSize, R1); } } } } //---------- Process 1 * LhsProgress rows at once ---------- if(mr>=1*Traits::LhsProgress) { // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) { // loops on each largest micro vertical panel of rhs (depth * nr) for(Index j2=0; j2<packet_cols4; j2+=nr) { // We select a 1*Traits::LhsProgress x nr 
micro block of res which is entirely // stored into 1 x nr registers. const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3; traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); LinearMapper r0 = res.getLinearMapper(i, j2 + 0); LinearMapper r1 = res.getLinearMapper(i, j2 + 1); LinearMapper r2 = res.getLinearMapper(i, j2 + 2); LinearMapper r3 = res.getLinearMapper(i, j2 + 3); r0.prefetch(prefetch_res_offset); r1.prefetch(prefetch_res_offset); r2.prefetch(prefetch_res_offset); r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; prefetch(&blB[0]); LhsPacket A0; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); RhsPacket B_0, B1, B2, B3; #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ traits.madd(A0, B_0, C0, B_0); \ traits.madd(A0, B1, C1, B1); \ traits.madd(A0, B2, C2, B2); \ traits.madd(A0, B3, C3, B3); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ } while(false) internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3); internal::prefetch(blB+(48+16)); EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0, B1, B2, B3; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP ResPacket R0, R1; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * 
Traits::ResPacketSize); R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); r1.storePacket(0 * Traits::ResPacketSize, R1); R0 = r2.loadPacket(0 * Traits::ResPacketSize); R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); r2.storePacket(0 * Traits::ResPacketSize, R0); r3.storePacket(0 * Traits::ResPacketSize, R1); } // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { // One column at a time const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; prefetch(&blA[0]); // gets res block as register AccPacket C0; traits.initAcc(C0); LinearMapper r0 = res.getLinearMapper(i, j2); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; LhsPacket A0; for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); RhsPacket B_0; #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ traits.madd(A0, B_0, C0, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ } while(false); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3); EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); blB += pk*RhsProgress; blA += pk*1*Traits::LhsProgress; EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { RhsPacket B_0; EIGEN_GEBGP_ONESTEP(0); blB += RhsProgress; blA += 1*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP ResPacket R0; ResPacket alphav = pset1<ResPacket>(alpha); R0 = r0.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); r0.storePacket(0 * 
Traits::ResPacketSize, R0); } } } //---------- Process remaining rows, 1 at once ---------- if(peeled_mc1<rows) { // loop on each panel of the rhs for(Index j2=0; j2<packet_cols4; j2+=nr) { // loop on each row of the lhs (1*LhsProgress x depth) for(Index i=peeled_mc1; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; if( (SwappedTraits::LhsProgress % 4)==0 ) { // NOTE The following piece of code will not work for 512 bit registers SAccPacket C0, C1, C2, C3; straits.initAcc(C0); straits.initAcc(C1); straits.initAcc(C2); straits.initAcc(C3); const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); const Index endk = (depth/spk)*spk; const Index endk4 = (depth/(spk*4))*(spk*4); Index k=0; for(; k<endk4; k+=4*spk) { SLhsPacket A0,A1; SRhsPacket B_0,B_1; straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); straits.madd(A0,B_0,C0,B_0); straits.madd(A1,B_1,C1,B_1); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); straits.madd(A0,B_0,C2,B_0); straits.madd(A1,B_1,C3,B_1); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; } C0 = padd(padd(C0,C1),padd(C2,C3)); for(; k<endk; k+=spk) { SLhsPacket A0; SRhsPacket B_0; straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); straits.madd(A0,B_0,C0,B_0); blB += SwappedTraits::LhsProgress; blA += spk; } if(SwappedTraits::LhsProgress==8) { // Special case where we have to first reduce the accumulation register C0 typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress==8,typename 
unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); SResPacketHalf alphav = pset1<SResPacketHalf>(alpha); if(depth-endk>0) { // We have to handle the last row of the rhs which corresponds to a half-packet SLhsPacketHalf a0; SRhsPacketHalf b0; straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); SAccPacketHalf c0 = predux4(C0); straits.madd(a0,b0,c0,b0); straits.acc(c0, alphav, R); } else { straits.acc(predux4(C0), alphav, R); } res.scatterPacket(i, j2, R); } else { SResPacket R = res.template gatherPacket<SResPacket>(i, j2); SResPacket alphav = pset1<SResPacket>(alpha); straits.acc(C0, alphav, R); res.scatterPacket(i, j2, R); } } else // scalar path { // get a 1 x 4 res block as registers ResScalar C0(0), C1(0), C2(0), C3(0); for(Index k=0; k<depth; k++) { LhsScalar A0; RhsScalar B_0, B_1; A0 = blA[k]; B_0 = blB[0]; B_1 = blB[1]; CJMADD(cj,A0,B_0,C0, B_0); CJMADD(cj,A0,B_1,C1, B_1); B_0 = blB[2]; B_1 = blB[3]; CJMADD(cj,A0,B_0,C2, B_0); CJMADD(cj,A0,B_1,C3, B_1); blB += 4; } res(i, j2 + 0) += alpha * C0; res(i, j2 + 1) += alpha * C1; res(i, j2 + 2) += alpha * C2; res(i, j2 + 3) += alpha * C3; } } } // remaining columns for(Index j2=packet_cols4; j2<cols; j2++) { // loop on each row of the lhs (1*LhsProgress x depth) for(Index i=peeled_mc1; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); // gets a 1 x 1 res block as registers ResScalar C0(0); const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k<depth; k++) { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; CJMADD(cj, A0, B_0, C0, B_0); } res(i, j2) += alpha * C0; } } } }
const bool Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::UseRotatingKernel [static] |
Initial value: EIGEN_ARCH_ARM && internal::is_same<LhsScalar, float>::value && internal::is_same<RhsScalar, float>::value && internal::is_same<ResScalar, float>::value && Traits::LhsPacketSize == 4 && Traits::RhsPacketSize == 4 && Traits::ResPacketSize == 4
Definition at line 974 of file GeneralBlockPanelKernel.h.