Actual source code: baijfact12.c
1: /*$Id: baijfact12.c,v 1.17 2001/08/31 16:22:11 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include "src/mat/impls/baij/seq/baij.h"
6: #include "src/vec/vecimpl.h"
7: #include "src/inline/ilu.h"
9: #undef __FUNCT__
10: #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
11: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B)
12: {
13: /*
14: Default Version for when blocks are 4 by 4 Using natural ordering
15: */
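/*
     Algorithm: for each block row i, scatter the sparse row into the dense work
     array rtmp (one 4x4 block per block column), eliminate it against the block
     rows already factored, gather the result back into b->a, and invert the
     diagonal block in place.
*/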
16: Mat C = *B;
17: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
18: int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19: int *ajtmpold,*ajtmp,nz,row;
20: int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
21: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
22: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25: MatScalar m13,m14,m15,m16;
26: MatScalar *ba = b->a,*aa = a->a;
27: PetscTruth pivotinblocks = b->pivotinblocks;
30: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
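  /* rtmp holds the current block row densely: the 4x4 block for block column k starts at rtmp+16*k */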
32: for (i=0; i<n; i++) {
33: nz = bi[i+1] - bi[i];
34: ajtmp = bj + bi[i];
35: for (j=0; j<nz; j++) {
36: x = rtmp+16*ajtmp[j];
37: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
38: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
39: }
40: /* load in initial (unfactored row) */
41: nz = ai[i+1] - ai[i];
42: ajtmpold = aj + ai[i];
43: v = aa + 16*ai[i];
44: for (j=0; j<nz; j++) {
45: x = rtmp+16*ajtmpold[j];
46: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
47: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
48: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
49: x[14] = v[14]; x[15] = v[15];
50: v += 16;
51: }
52: row = *ajtmp++;
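  /* eliminate against each previously factored block row (row < i) whose column occurs in block row i */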
53: while (row < i) {
54: pc = rtmp + 16*row;
55: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
56: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
57: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
58: p15 = pc[14]; p16 = pc[15];
59: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
60: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
61: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
62: || p16 != 0.0) {
63: pv = ba + 16*diag_offset[row];
64: pj = bj + diag_offset[row] + 1;
65: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
66: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
67: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
68: x15 = pv[14]; x16 = pv[15];
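      /* form the multiplier block: pc := pc * inv(U(row,row)); the pivot's diagonal block
         was already inverted in place, so only a 4x4 matrix-matrix product is needed */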
69: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
70: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
71: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
72: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
74: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
75: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
76: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
77: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
79: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
80: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
81: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
82: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
84: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
85: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
86: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
87: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
88: nz = bi[row+1] - diag_offset[row] - 1;
89: pv += 16;
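      /* update the rest of the work row: for every block of U to the right of the pivot's
         diagonal, subtract multiplier * U(row,k) from the corresponding block in rtmp */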
90: for (j=0; j<nz; j++) {
91: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
92: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
93: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
94: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
95: x = rtmp + 16*pj[j];
96: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
97: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
98: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
99: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
101: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
102: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
103: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
104: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
106: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
107: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
108: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
109: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
111: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
112: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
113: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
114: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
116: pv += 16;
117: }
118: PetscLogFlops(128*nz+112);
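      /* 112 flops to form the 4x4 multiplier (16 entries * 7 flops) plus 128 flops per updated block (16 entries * 8 flops) */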
119: }
120: row = *ajtmp++;
121: }
122: /* finished row so stick it into b->a */
123: pv = ba + 16*bi[i];
124: pj = bj + bi[i];
125: nz = bi[i+1] - bi[i];
126: for (j=0; j<nz; j++) {
127: x = rtmp+16*pj[j];
128: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
129: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
130: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
131: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
132: pv += 16;
133: }
134: /* invert diagonal block */
135: w = ba + 16*diag_offset[i];
136: if (pivotinblocks) {
137: Kernel_A_gets_inverse_A_4(w);
138: } else {
139: Kernel_A_gets_inverse_A_4_nopivot(w);
140: }
141: }
143: PetscFree(rtmp);
144: C->factor = FACTOR_LU;
145: C->assembled = PETSC_TRUE;
146: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
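  /* 1.3333*64 is roughly (4/3)*4^3, the flop charge for inverting each 4x4 diagonal block */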
147: return(0);
148: }
151: #if defined(PETSC_HAVE_SSE)
153: #include PETSC_HAVE_SSE
155: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
156: #undef __FUNCT__
157: #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
158: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,Mat *B)
159: {
160: Mat C = *B;
161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
162: int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
163: int *ajtmpold,*ajtmp,nz,row;
164: int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
165: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
166: MatScalar *ba = b->a,*aa = a->a;
167: int nonzero=0,colscale = 16;
168: PetscTruth pivotinblocks = b->pivotinblocks;
171: SSE_SCOPE_BEGIN;
173: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
174: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
175: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
176: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
177: if (bj==aj) {
178: colscale = 4;
179: }
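  /* the bj column indices appear to be stored pre-scaled by 4 in this kernel (note row = (*ajtmp)/4 below);
     when the factor shares aj with A, the same scaling applies, so a stride of 4 rather than 16
     converts a column index into an offset into rtmp */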
180: for (i=0; i<n; i++) {
181: nz = bi[i+1] - bi[i];
182: ajtmp = bj + bi[i];
183: /* zero out the 4x4 block accumulators */
184: /* zero out one register */
185: XOR_PS(XMM7,XMM7);
186: for (j=0; j<nz; j++) {
187: x = rtmp+4*ajtmp[j];
188: SSE_INLINE_BEGIN_1(x)
189: /* Copy zero register to memory locations */
190: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
191: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
192: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
193: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
194: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
195: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
196: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
197: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
198: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
199: SSE_INLINE_END_1;
200: }
201: /* load in initial (unfactored row) */
202: nz = ai[i+1] - ai[i];
203: ajtmpold = aj + ai[i];
204: v = aa + 16*ai[i];
205: for (j=0; j<nz; j++) {
206: x = rtmp+colscale*ajtmpold[j];
207: /* Copy v block into x block */
208: SSE_INLINE_BEGIN_2(v,x)
209: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
210: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
211: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
213: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
214: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
216: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
217: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
219: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
220: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
222: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
223: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
225: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
226: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
228: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
229: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
231: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
232: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
233: SSE_INLINE_END_2;
235: v += 16;
236: }
237: row = (*ajtmp)/4;
238: ajtmp++;
239: while (row < i) {
240: pc = rtmp + 16*row;
241: SSE_INLINE_BEGIN_1(pc)
242: /* Load block from lower triangle */
243: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
244: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
245: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
247: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
248: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
250: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
251: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
253: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
254: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
256: /* Compare block to zero block */
258: SSE_COPY_PS(XMM4,XMM7)
259: SSE_CMPNEQ_PS(XMM4,XMM0)
261: SSE_COPY_PS(XMM5,XMM7)
262: SSE_CMPNEQ_PS(XMM5,XMM1)
264: SSE_COPY_PS(XMM6,XMM7)
265: SSE_CMPNEQ_PS(XMM6,XMM2)
267: SSE_CMPNEQ_PS(XMM7,XMM3)
269: /* Reduce the comparisons to one SSE register */
270: SSE_OR_PS(XMM6,XMM7)
271: SSE_OR_PS(XMM5,XMM4)
272: SSE_OR_PS(XMM5,XMM6)
273: SSE_INLINE_END_1;
275: /* Reduce the one SSE register to an integer register for branching */
276: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
277: MOVEMASK(nonzero,XMM5);
279: /* If block is nonzero ... */
280: if (nonzero) {
281: pv = ba + 16*diag_offset[row];
282: PREFETCH_L1(&pv[16]);
283: pj = bj + diag_offset[row] + 1;
285: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
286: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
287: /* but the diagonal was inverted already */
288: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
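            /* The four columns of the product are accumulated in XMM4, XMM5, XMM6 and XMM0,
               stored back into pc, and then copied into XMM0-XMM3 for the update loop below. */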
290: SSE_INLINE_BEGIN_2(pv,pc)
291: /* Column 0, product is accumulated in XMM4 */
292: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
293: SSE_SHUFFLE(XMM4,XMM4,0x00)
294: SSE_MULT_PS(XMM4,XMM0)
296: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
297: SSE_SHUFFLE(XMM5,XMM5,0x00)
298: SSE_MULT_PS(XMM5,XMM1)
299: SSE_ADD_PS(XMM4,XMM5)
301: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
302: SSE_SHUFFLE(XMM6,XMM6,0x00)
303: SSE_MULT_PS(XMM6,XMM2)
304: SSE_ADD_PS(XMM4,XMM6)
306: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
307: SSE_SHUFFLE(XMM7,XMM7,0x00)
308: SSE_MULT_PS(XMM7,XMM3)
309: SSE_ADD_PS(XMM4,XMM7)
311: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
312: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
314: /* Column 1, product is accumulated in XMM5 */
315: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
316: SSE_SHUFFLE(XMM5,XMM5,0x00)
317: SSE_MULT_PS(XMM5,XMM0)
319: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
320: SSE_SHUFFLE(XMM6,XMM6,0x00)
321: SSE_MULT_PS(XMM6,XMM1)
322: SSE_ADD_PS(XMM5,XMM6)
324: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
325: SSE_SHUFFLE(XMM7,XMM7,0x00)
326: SSE_MULT_PS(XMM7,XMM2)
327: SSE_ADD_PS(XMM5,XMM7)
329: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
330: SSE_SHUFFLE(XMM6,XMM6,0x00)
331: SSE_MULT_PS(XMM6,XMM3)
332: SSE_ADD_PS(XMM5,XMM6)
334: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
335: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
337: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
339: /* Column 2, product is accumulated in XMM6 */
340: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
341: SSE_SHUFFLE(XMM6,XMM6,0x00)
342: SSE_MULT_PS(XMM6,XMM0)
344: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
345: SSE_SHUFFLE(XMM7,XMM7,0x00)
346: SSE_MULT_PS(XMM7,XMM1)
347: SSE_ADD_PS(XMM6,XMM7)
349: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
350: SSE_SHUFFLE(XMM7,XMM7,0x00)
351: SSE_MULT_PS(XMM7,XMM2)
352: SSE_ADD_PS(XMM6,XMM7)
354: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
355: SSE_SHUFFLE(XMM7,XMM7,0x00)
356: SSE_MULT_PS(XMM7,XMM3)
357: SSE_ADD_PS(XMM6,XMM7)
358:
359: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
360: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
362: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
363: /* Column 3, product is accumulated in XMM0 */
364: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
365: SSE_SHUFFLE(XMM7,XMM7,0x00)
366: SSE_MULT_PS(XMM0,XMM7)
368: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
369: SSE_SHUFFLE(XMM7,XMM7,0x00)
370: SSE_MULT_PS(XMM1,XMM7)
371: SSE_ADD_PS(XMM0,XMM1)
373: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
374: SSE_SHUFFLE(XMM1,XMM1,0x00)
375: SSE_MULT_PS(XMM1,XMM2)
376: SSE_ADD_PS(XMM0,XMM1)
378: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
379: SSE_SHUFFLE(XMM7,XMM7,0x00)
380: SSE_MULT_PS(XMM3,XMM7)
381: SSE_ADD_PS(XMM0,XMM3)
383: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
384: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
386: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
387: /* This is code to be maintained and read by humans, after all. */
388: /* Copy Multiplier Col 3 into XMM3 */
389: SSE_COPY_PS(XMM3,XMM0)
390: /* Copy Multiplier Col 2 into XMM2 */
391: SSE_COPY_PS(XMM2,XMM6)
392: /* Copy Multiplier Col 1 into XMM1 */
393: SSE_COPY_PS(XMM1,XMM5)
394: /* Copy Multiplier Col 0 into XMM0 */
395: SSE_COPY_PS(XMM0,XMM4)
396: SSE_INLINE_END_2;
398: /* Update the row: */
399: nz = bi[row+1] - diag_offset[row] - 1;
400: pv += 16;
401: for (j=0; j<nz; j++) {
402: PREFETCH_L1(&pv[16]);
403: x = rtmp + 4*pj[j];
405: /* X:=X-M*PV, One column at a time */
406: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
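          /* Each column of X is updated by broadcast-multiply-subtract: a scalar from the
             corresponding column of PV is splatted (LOAD_SS + SHUFFLE 0x00), multiplied by a
             column of M, and subtracted from the loaded column of X. */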
407: SSE_INLINE_BEGIN_2(x,pv)
408: /* Load First Column of X*/
409: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
410: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
412: /* Matrix-Vector Product: */
413: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
414: SSE_SHUFFLE(XMM5,XMM5,0x00)
415: SSE_MULT_PS(XMM5,XMM0)
416: SSE_SUB_PS(XMM4,XMM5)
418: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
419: SSE_SHUFFLE(XMM6,XMM6,0x00)
420: SSE_MULT_PS(XMM6,XMM1)
421: SSE_SUB_PS(XMM4,XMM6)
423: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
424: SSE_SHUFFLE(XMM7,XMM7,0x00)
425: SSE_MULT_PS(XMM7,XMM2)
426: SSE_SUB_PS(XMM4,XMM7)
428: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
429: SSE_SHUFFLE(XMM5,XMM5,0x00)
430: SSE_MULT_PS(XMM5,XMM3)
431: SSE_SUB_PS(XMM4,XMM5)
433: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
434: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
436: /* Second Column */
437: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
438: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
440: /* Matrix-Vector Product: */
441: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
442: SSE_SHUFFLE(XMM6,XMM6,0x00)
443: SSE_MULT_PS(XMM6,XMM0)
444: SSE_SUB_PS(XMM5,XMM6)
446: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
447: SSE_SHUFFLE(XMM7,XMM7,0x00)
448: SSE_MULT_PS(XMM7,XMM1)
449: SSE_SUB_PS(XMM5,XMM7)
451: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
452: SSE_SHUFFLE(XMM4,XMM4,0x00)
453: SSE_MULT_PS(XMM4,XMM2)
454: SSE_SUB_PS(XMM5,XMM4)
456: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
457: SSE_SHUFFLE(XMM6,XMM6,0x00)
458: SSE_MULT_PS(XMM6,XMM3)
459: SSE_SUB_PS(XMM5,XMM6)
460:
461: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
462: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
464: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
466: /* Third Column */
467: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
468: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
470: /* Matrix-Vector Product: */
471: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
472: SSE_SHUFFLE(XMM7,XMM7,0x00)
473: SSE_MULT_PS(XMM7,XMM0)
474: SSE_SUB_PS(XMM6,XMM7)
476: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
477: SSE_SHUFFLE(XMM4,XMM4,0x00)
478: SSE_MULT_PS(XMM4,XMM1)
479: SSE_SUB_PS(XMM6,XMM4)
481: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
482: SSE_SHUFFLE(XMM5,XMM5,0x00)
483: SSE_MULT_PS(XMM5,XMM2)
484: SSE_SUB_PS(XMM6,XMM5)
486: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
487: SSE_SHUFFLE(XMM7,XMM7,0x00)
488: SSE_MULT_PS(XMM7,XMM3)
489: SSE_SUB_PS(XMM6,XMM7)
490:
491: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
492: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
493:
494: /* Fourth Column */
495: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
496: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
498: /* Matrix-Vector Product: */
499: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
500: SSE_SHUFFLE(XMM5,XMM5,0x00)
501: SSE_MULT_PS(XMM5,XMM0)
502: SSE_SUB_PS(XMM4,XMM5)
504: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
505: SSE_SHUFFLE(XMM6,XMM6,0x00)
506: SSE_MULT_PS(XMM6,XMM1)
507: SSE_SUB_PS(XMM4,XMM6)
509: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
510: SSE_SHUFFLE(XMM7,XMM7,0x00)
511: SSE_MULT_PS(XMM7,XMM2)
512: SSE_SUB_PS(XMM4,XMM7)
514: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
515: SSE_SHUFFLE(XMM5,XMM5,0x00)
516: SSE_MULT_PS(XMM5,XMM3)
517: SSE_SUB_PS(XMM4,XMM5)
518:
519: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
520: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
521: SSE_INLINE_END_2;
522: pv += 16;
523: }
524: PetscLogFlops(128*nz+112);
525: }
526: row = (*ajtmp)/4;
527: ajtmp++;
528: }
529: /* finished row so stick it into b->a */
530: pv = ba + 16*bi[i];
531: pj = bj + bi[i];
532: nz = bi[i+1] - bi[i];
534: /* Copy x block back into pv block */
535: for (j=0; j<nz; j++) {
536: x = rtmp+4*pj[j];
538: SSE_INLINE_BEGIN_2(x,pv)
539: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
540: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
541: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
543: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
544: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
546: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
547: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
549: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
550: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
552: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
553: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
555: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
556: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
558: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
559: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
561: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
562: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
563: SSE_INLINE_END_2;
564: pv += 16;
565: }
566: /* invert diagonal block */
567: w = ba + 16*diag_offset[i];
568: if (pivotinblocks) {
569: Kernel_A_gets_inverse_A_4(w);
570: } else {
571: Kernel_A_gets_inverse_A_4_nopivot(w);
572: }
573: /* Kernel_A_gets_inverse_A_4_SSE(w); */
574: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
575: }
577: PetscFree(rtmp);
578: C->factor = FACTOR_LU;
579: C->assembled = PETSC_TRUE;
580: PetscLogFlops(1.3333*64*b->mbs);
581: /* Flop Count from inverting diagonal blocks */
582: SSE_SCOPE_END;
583: return(0);
584: }
586: #endif