Actual source code: baijfact12.c

  1: /*$Id: baijfact12.c,v 1.17 2001/08/31 16:22:11 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */
 5:  #include "src/mat/impls/baij/seq/baij.h"
 6:  #include "src/vec/vecimpl.h"
 7:  #include "src/inline/ilu.h"

  9: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B)
 10: {
 11: /*
 12:     Default version for when blocks are 4 by 4, using natural ordering
 13: */
 14:   Mat         C = *B;
 15:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 16:   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 17:   int         *ajtmpold,*ajtmp,nz,row;
 18:   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
 19:   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
 20:   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 21:   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 22:   MatScalar   p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 23:   MatScalar   m13,m14,m15,m16;
 24:   MatScalar   *ba = b->a,*aa = a->a;

 27:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
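      /* rtmp is a dense work row: one 16-scalar (4x4, column-major) block per block column of the current row */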

 29:   for (i=0; i<n; i++) {
 30:     nz    = bi[i+1] - bi[i];
 31:     ajtmp = bj + bi[i];
 32:     for  (j=0; j<nz; j++) {
 33:       x = rtmp+16*ajtmp[j];
 34:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 35:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 36:     }
 37:     /* load in initial (unfactored) row */
 38:     nz       = ai[i+1] - ai[i];
 39:     ajtmpold = aj + ai[i];
 40:     v        = aa + 16*ai[i];
 41:     for (j=0; j<nz; j++) {
 42:       x    = rtmp+16*ajtmpold[j];
 43:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 44:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 45:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 46:       x[14] = v[14]; x[15] = v[15];
 47:       v    += 16;
 48:     }
 49:     row = *ajtmp++;
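      /* eliminate row i using each already-factored row (block column index < i) that appears in its sparsity pattern */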
 50:     while (row < i) {
 51:       pc  = rtmp + 16*row;
 52:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 53:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 54:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 55:       p15 = pc[14]; p16 = pc[15];
 56:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 57:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 58:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 59:           || p16 != 0.0) {
 60:         pv = ba + 16*diag_offset[row];
 61:         pj = bj + diag_offset[row] + 1;
 62:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 63:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 64:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 65:         x15 = pv[14]; x16 = pv[15];
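          /* form the multiplier for this block column: pc <- pc * inv(diagonal block of row `row`);
             that diagonal block was already inverted in place when row `row` was finished,
             and all 4x4 blocks are stored column-major */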
 66:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 67:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 68:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 69:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 71:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 72:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 73:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 74:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 76:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 77:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 78:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 79:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 81:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 82:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 83:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 84:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
 85:         nz = bi[row+1] - diag_offset[row] - 1;
 86:         pv += 16;
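          /* Schur update: for each of the nz blocks to the right of the diagonal in factored row `row`,
             subtract multiplier * that block from the corresponding block of the work row */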
 87:         for (j=0; j<nz; j++) {
 88:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 89:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
 90:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
 91:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
 92:           x    = rtmp + 16*pj[j];
 93:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
 94:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
 95:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
 96:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

 98:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
 99:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
100:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
101:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

103:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
104:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
105:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
106:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

108:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
109:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
110:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
111:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

113:           pv   += 16;
114:         }
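          /* 112 flops to form the 4x4 multiplier (16 entries x 7 flops), 128 flops per updated block (16 entries x 8 flops) */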
115:         PetscLogFlops(128*nz+112);
116:       }
117:       row = *ajtmp++;
118:     }
119:     /* finished row so stick it into b->a */
120:     pv = ba + 16*bi[i];
121:     pj = bj + bi[i];
122:     nz = bi[i+1] - bi[i];
123:     for (j=0; j<nz; j++) {
124:       x      = rtmp+16*pj[j];
125:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
126:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
127:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
128:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
129:       pv   += 16;
130:     }
131:     /* invert diagonal block */
132:     w = ba + 16*diag_offset[i];
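      /* the factored diagonal block is replaced by its inverse, so later rows (and the triangular solve)
         can multiply by it instead of solving a 4x4 system */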
133:     Kernel_A_gets_inverse_A_4(w);
134:   }

136:   PetscFree(rtmp);
137:   C->factor    = FACTOR_LU;
138:   C->assembled = PETSC_TRUE;
139:   PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
140:   return(0);
141: }
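/*
   For reference: a minimal, unoptimized sketch of the two 4x4 column-major block
   kernels that the unrolled loops above implement -- forming the multiplier
   M = P*inv(D) and the block update X -= M*V.  This sketch is not part of
   baijfact12.c or of PETSc; the helper names and the use of double for the
   scalar type are illustrative assumptions only.
*/

/* entry (r,c) of a 4x4 block stored column-major in 16 contiguous scalars */
#define BLK4(a,r,c) ((a)[(r) + 4*(c)])

/* M = P * Dinv, where Dinv is the already-inverted diagonal block (112 flops) */
static void sketch_blk4_mult(double M[16],const double P[16],const double Dinv[16])
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += BLK4(P,r,k)*BLK4(Dinv,k,c);
      BLK4(M,r,c) = s;
    }
  }
}

/* X -= M * V: update of one 4x4 block of the dense work row (128 flops) */
static void sketch_blk4_mult_sub(double X[16],const double M[16],const double V[16])
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += BLK4(M,r,k)*BLK4(V,k,c);
      BLK4(X,r,c) -= s;
    }
  }
}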


144: #if defined(PETSC_HAVE_SSE)

146: #include PETSC_HAVE_SSE

148: /* SSE version for when blocks are 4 by 4, using natural ordering */
149: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,Mat *B)
150: {
151:   Mat         C = *B;
152:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
153:   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
154:   int         *ajtmpold,*ajtmp,nz,row;
155:   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
156:   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
157:   MatScalar   *ba = b->a,*aa = a->a;
158:   int nonzero=0;
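  /* the packed-single (_PS) SSE operations below process four floats at a time,
     so this path assumes MatScalar is single precision */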

161:   SSE_SCOPE_BEGIN;

163:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
164:   for (i=0; i<n; i++) {
165:     nz    = bi[i+1] - bi[i];
166:     ajtmp = bj + bi[i];
167:     /* zero out the 4x4 block accumulators */
168:     /* zero out one register */
169:     XOR_PS(XMM7,XMM7);
170:     for  (j=0; j<nz; j++) {
171:       x = rtmp+16*ajtmp[j];
172:       SSE_INLINE_BEGIN_1(x)
173:         /* Copy zero register to memory locations */
174:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
175:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
176:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
177:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
178:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
179:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
180:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
181:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
182:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
183:       SSE_INLINE_END_1;
184:     }
185:     /* load in initial (unfactored) row */
186:     nz       = ai[i+1] - ai[i];
187:     ajtmpold = aj + ai[i];
188:     v        = aa + 16*ai[i];
189:     for (j=0; j<nz; j++) {
190:       x = rtmp+16*ajtmpold[j];
191:       /* Copy v block into x block */
192:       SSE_INLINE_BEGIN_2(v,x)
193:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
194:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
195:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

197:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
198:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

200:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
201:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

203:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
204:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

206:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
207:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

209:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
210:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

212:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
213:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

215:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
216:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
217:       SSE_INLINE_END_2;

219:       v += 16;
220:     }
221:     row = *ajtmp++;
222:     while (row < i) {
223:       pc  = rtmp + 16*row;
224:       SSE_INLINE_BEGIN_1(pc)
225:         /* Load block from lower triangle */
226:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
227:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
228:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

230:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
231:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

233:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
234:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

236:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
237:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

239:         /* Compare block to zero block */

241:         SSE_COPY_PS(XMM4,XMM7)
242:         SSE_CMPNEQ_PS(XMM4,XMM0)

244:         SSE_COPY_PS(XMM5,XMM7)
245:         SSE_CMPNEQ_PS(XMM5,XMM1)

247:         SSE_COPY_PS(XMM6,XMM7)
248:         SSE_CMPNEQ_PS(XMM6,XMM2)

250:         SSE_CMPNEQ_PS(XMM7,XMM3)

252:         /* Reduce the comparisons to one SSE register */
253:         SSE_OR_PS(XMM6,XMM7)
254:         SSE_OR_PS(XMM5,XMM4)
255:         SSE_OR_PS(XMM5,XMM6)
256:       SSE_INLINE_END_1;

258:       /* Reduce the one SSE register to an integer register for branching */
259:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
260:       MOVEMASK(nonzero,XMM5);
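      /* CMPNEQ left all-ones in every lane that differed from zero; MOVEMASK gathers the four sign bits,
         so nonzero is 0 exactly when the whole 4x4 block is zero */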

262:       /* If block is nonzero ... */
263:       if (nonzero) {
264:         pv = ba + 16*diag_offset[row];
265:         PREFETCH_L1(&pv[16]);
266:         pj = bj + diag_offset[row] + 1;

268:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
269:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
270:         /* but the diagonal was inverted already */
271:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
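        /* the multiplier columns are stored back to pc and, at the end of the inline block,
           copied into XMM0-XMM3 for the update loop that follows */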

273:         SSE_INLINE_BEGIN_2(pv,pc)
274:           /* Column 0, product is accumulated in XMM4 */
275:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
276:           SSE_SHUFFLE(XMM4,XMM4,0x00)
277:           SSE_MULT_PS(XMM4,XMM0)

279:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
280:           SSE_SHUFFLE(XMM5,XMM5,0x00)
281:           SSE_MULT_PS(XMM5,XMM1)
282:           SSE_ADD_PS(XMM4,XMM5)

284:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
285:           SSE_SHUFFLE(XMM6,XMM6,0x00)
286:           SSE_MULT_PS(XMM6,XMM2)
287:           SSE_ADD_PS(XMM4,XMM6)

289:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
290:           SSE_SHUFFLE(XMM7,XMM7,0x00)
291:           SSE_MULT_PS(XMM7,XMM3)
292:           SSE_ADD_PS(XMM4,XMM7)

294:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
295:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

297:           /* Column 1, product is accumulated in XMM5 */
298:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
299:           SSE_SHUFFLE(XMM5,XMM5,0x00)
300:           SSE_MULT_PS(XMM5,XMM0)

302:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
303:           SSE_SHUFFLE(XMM6,XMM6,0x00)
304:           SSE_MULT_PS(XMM6,XMM1)
305:           SSE_ADD_PS(XMM5,XMM6)

307:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
308:           SSE_SHUFFLE(XMM7,XMM7,0x00)
309:           SSE_MULT_PS(XMM7,XMM2)
310:           SSE_ADD_PS(XMM5,XMM7)

312:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
313:           SSE_SHUFFLE(XMM6,XMM6,0x00)
314:           SSE_MULT_PS(XMM6,XMM3)
315:           SSE_ADD_PS(XMM5,XMM6)

317:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
318:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

320:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

322:           /* Column 2, product is accumulated in XMM6 */
323:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
324:           SSE_SHUFFLE(XMM6,XMM6,0x00)
325:           SSE_MULT_PS(XMM6,XMM0)

327:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
328:           SSE_SHUFFLE(XMM7,XMM7,0x00)
329:           SSE_MULT_PS(XMM7,XMM1)
330:           SSE_ADD_PS(XMM6,XMM7)

332:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
333:           SSE_SHUFFLE(XMM7,XMM7,0x00)
334:           SSE_MULT_PS(XMM7,XMM2)
335:           SSE_ADD_PS(XMM6,XMM7)

337:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
338:           SSE_SHUFFLE(XMM7,XMM7,0x00)
339:           SSE_MULT_PS(XMM7,XMM3)
340:           SSE_ADD_PS(XMM6,XMM7)
341: 
342:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
343:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

345:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
346:           /* Column 3, product is accumulated in XMM0 */
347:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
348:           SSE_SHUFFLE(XMM7,XMM7,0x00)
349:           SSE_MULT_PS(XMM0,XMM7)

351:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
352:           SSE_SHUFFLE(XMM7,XMM7,0x00)
353:           SSE_MULT_PS(XMM1,XMM7)
354:           SSE_ADD_PS(XMM0,XMM1)

356:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
357:           SSE_SHUFFLE(XMM1,XMM1,0x00)
358:           SSE_MULT_PS(XMM1,XMM2)
359:           SSE_ADD_PS(XMM0,XMM1)

361:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
362:           SSE_SHUFFLE(XMM7,XMM7,0x00)
363:           SSE_MULT_PS(XMM3,XMM7)
364:           SSE_ADD_PS(XMM0,XMM3)

366:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
367:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

369:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
370:           /* This is code to be maintained and read by humans after all. */
371:           /* Copy Multiplier Col 3 into XMM3 */
372:           SSE_COPY_PS(XMM3,XMM0)
373:           /* Copy Multiplier Col 2 into XMM2 */
374:           SSE_COPY_PS(XMM2,XMM6)
375:           /* Copy Multiplier Col 1 into XMM1 */
376:           SSE_COPY_PS(XMM1,XMM5)
377:           /* Copy Multiplier Col 0 into XMM0 */
378:           SSE_COPY_PS(XMM0,XMM4)
379:         SSE_INLINE_END_2;

381:         /* Update the row: */
382:         nz = bi[row+1] - diag_offset[row] - 1;
383:         pv += 16;
384:         for (j=0; j<nz; j++) {
385:           PREFETCH_L1(&pv[16]);
386:           x = rtmp + 16*pj[j];

388:           /* X:=X-M*PV, One column at a time */
389:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
390:           SSE_INLINE_BEGIN_2(x,pv)
391:             /* Load First Column of X*/
392:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
393:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

395:             /* Matrix-Vector Product: */
396:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
397:             SSE_SHUFFLE(XMM5,XMM5,0x00)
398:             SSE_MULT_PS(XMM5,XMM0)
399:             SSE_SUB_PS(XMM4,XMM5)

401:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
402:             SSE_SHUFFLE(XMM6,XMM6,0x00)
403:             SSE_MULT_PS(XMM6,XMM1)
404:             SSE_SUB_PS(XMM4,XMM6)

406:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
407:             SSE_SHUFFLE(XMM7,XMM7,0x00)
408:             SSE_MULT_PS(XMM7,XMM2)
409:             SSE_SUB_PS(XMM4,XMM7)

411:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
412:             SSE_SHUFFLE(XMM5,XMM5,0x00)
413:             SSE_MULT_PS(XMM5,XMM3)
414:             SSE_SUB_PS(XMM4,XMM5)

416:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
417:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

419:             /* Second Column */
420:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
421:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

423:             /* Matrix-Vector Product: */
424:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
425:             SSE_SHUFFLE(XMM6,XMM6,0x00)
426:             SSE_MULT_PS(XMM6,XMM0)
427:             SSE_SUB_PS(XMM5,XMM6)

429:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
430:             SSE_SHUFFLE(XMM7,XMM7,0x00)
431:             SSE_MULT_PS(XMM7,XMM1)
432:             SSE_SUB_PS(XMM5,XMM7)

434:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
435:             SSE_SHUFFLE(XMM4,XMM4,0x00)
436:             SSE_MULT_PS(XMM4,XMM2)
437:             SSE_SUB_PS(XMM5,XMM4)

439:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
440:             SSE_SHUFFLE(XMM6,XMM6,0x00)
441:             SSE_MULT_PS(XMM6,XMM3)
442:             SSE_SUB_PS(XMM5,XMM6)
443: 
444:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
445:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

447:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

449:             /* Third Column */
450:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
451:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

453:             /* Matrix-Vector Product: */
454:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
455:             SSE_SHUFFLE(XMM7,XMM7,0x00)
456:             SSE_MULT_PS(XMM7,XMM0)
457:             SSE_SUB_PS(XMM6,XMM7)

459:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
460:             SSE_SHUFFLE(XMM4,XMM4,0x00)
461:             SSE_MULT_PS(XMM4,XMM1)
462:             SSE_SUB_PS(XMM6,XMM4)

464:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
465:             SSE_SHUFFLE(XMM5,XMM5,0x00)
466:             SSE_MULT_PS(XMM5,XMM2)
467:             SSE_SUB_PS(XMM6,XMM5)

469:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
470:             SSE_SHUFFLE(XMM7,XMM7,0x00)
471:             SSE_MULT_PS(XMM7,XMM3)
472:             SSE_SUB_PS(XMM6,XMM7)
473: 
474:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
475:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
476: 
477:             /* Fourth Column */
478:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
479:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

481:             /* Matrix-Vector Product: */
482:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
483:             SSE_SHUFFLE(XMM5,XMM5,0x00)
484:             SSE_MULT_PS(XMM5,XMM0)
485:             SSE_SUB_PS(XMM4,XMM5)

487:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
488:             SSE_SHUFFLE(XMM6,XMM6,0x00)
489:             SSE_MULT_PS(XMM6,XMM1)
490:             SSE_SUB_PS(XMM4,XMM6)

492:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
493:             SSE_SHUFFLE(XMM7,XMM7,0x00)
494:             SSE_MULT_PS(XMM7,XMM2)
495:             SSE_SUB_PS(XMM4,XMM7)

497:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
498:             SSE_SHUFFLE(XMM5,XMM5,0x00)
499:             SSE_MULT_PS(XMM5,XMM3)
500:             SSE_SUB_PS(XMM4,XMM5)
501: 
502:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
503:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
504:           SSE_INLINE_END_2;
505:           pv   += 16;
506:         }
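        /* same operation count as the scalar kernel above: 112 flops for the multiplier plus 128 per updated block */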
507:         PetscLogFlops(128*nz+112);
508:       }
509:       row = *ajtmp++;
510:     }
511:     /* finished row so stick it into b->a */
512:     pv = ba + 16*bi[i];
513:     pj = bj + bi[i];
514:     nz = bi[i+1] - bi[i];

516:     /* Copy x block back into pv block */
517:     for (j=0; j<nz; j++) {
518:       x  = rtmp+16*pj[j];

520:       SSE_INLINE_BEGIN_2(x,pv)
521:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
522:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
523:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

525:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
526:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

528:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
529:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

531:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
532:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

534:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
535:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

537:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
538:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

540:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
541:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

543:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
544:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
545:       SSE_INLINE_END_2;
546:       pv += 16;
547:     }
548:     /* invert diagonal block */
549:     w = ba + 16*diag_offset[i];
550:     Kernel_A_gets_inverse_A_4_SSE(w);
551:     /* Note: using Cramer's rule here, the flop count below might be unfairly high or low */
552:   }

554:   PetscFree(rtmp);
555:   C->factor    = FACTOR_LU;
556:   C->assembled = PETSC_TRUE;
557:   PetscLogFlops(1.3333*64*b->mbs);
558:   /* Flop Count from inverting diagonal blocks */
559:   SSE_SCOPE_END;
560:   return(0);
561: }

563: #endif