Actual source code: baijfact12.c
1: /*$Id: baijfact12.c,v 1.17 2001/08/31 16:22:11 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include "src/mat/impls/baij/seq/baij.h"
6: #include "src/vec/vecimpl.h"
7: #include "src/inline/ilu.h"
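/*
   Numeric LU factorization kernels for BAIJ matrices with 4x4 blocks and natural ordering:
   a portable C version and, when PETSC_HAVE_SSE is defined, SSE versions including the
   _usj variants that read the block column indices as unsigned short.
*/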
11: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B)
12: {
13: /*
14: Default version for the case of 4-by-4 blocks using the natural ordering
15: */
16: Mat C = *B;
17: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
18: int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19: int *ajtmpold,*ajtmp,nz,row;
20: int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
21: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
22: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25: MatScalar m13,m14,m15,m16;
26: MatScalar *ba = b->a,*aa = a->a;
27: PetscTruth pivotinblocks = b->pivotinblocks;
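  /* rtmp is a dense work copy of one block row: 16 contiguous MatScalar entries
     (one column-major 4x4 block) per block column */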
30: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
32: for (i=0; i<n; i++) {
33: nz = bi[i+1] - bi[i];
34: ajtmp = bj + bi[i];
35: for (j=0; j<nz; j++) {
36: x = rtmp+16*ajtmp[j];
37: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
38: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
39: }
40: /* load in the initial (unfactored) row */
41: nz = ai[i+1] - ai[i];
42: ajtmpold = aj + ai[i];
43: v = aa + 16*ai[i];
44: for (j=0; j<nz; j++) {
45: x = rtmp+16*ajtmpold[j];
46: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
47: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
48: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
49: x[14] = v[14]; x[15] = v[15];
50: v += 16;
51: }
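    /* eliminate with previously factored block rows: for each block column 'row' < i
       holding a nonzero block, form the multiplier and update the rest of block row i */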
52: row = *ajtmp++;
53: while (row < i) {
54: pc = rtmp + 16*row;
55: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
56: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
57: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
58: p15 = pc[14]; p16 = pc[15];
59: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
60: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
61: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
62: || p16 != 0.0) {
63: pv = ba + 16*diag_offset[row];
64: pj = bj + diag_offset[row] + 1;
65: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
66: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
67: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
68: x15 = pv[14]; x16 = pv[15];
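        /* form the multiplier block pc <- pc*inv(diagonal block of row 'row'); the diagonal
           block was inverted when row 'row' was finished, and blocks are stored column-major,
           so the product is built one column at a time */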
69: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
70: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
71: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
72: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
74: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
75: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
76: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
77: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
79: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
80: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
81: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
82: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
84: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
85: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
86: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
87: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
88: nz = bi[row+1] - diag_offset[row] - 1;
89: pv += 16;
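        /* subtract multiplier times each block to the right of the diagonal in the
           factored row from the corresponding block of row i */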
90: for (j=0; j<nz; j++) {
91: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
92: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
93: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
94: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
95: x = rtmp + 16*pj[j];
96: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
97: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
98: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
99: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
101: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
102: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
103: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
104: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
106: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
107: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
108: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
109: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
111: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
112: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
113: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
114: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
116: pv += 16;
117: }
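        /* 112 flops to form the 4x4 multiplier (64 multiplies + 48 adds) plus
           128 flops (64 multiplies + 64 adds/subtracts) for each updated block */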
118: PetscLogFlops(128*nz+112);
119: }
120: row = *ajtmp++;
121: }
122: /* finished row so stick it into b->a */
123: pv = ba + 16*bi[i];
124: pj = bj + bi[i];
125: nz = bi[i+1] - bi[i];
126: for (j=0; j<nz; j++) {
127: x = rtmp+16*pj[j];
128: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
129: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
130: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
131: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
132: pv += 16;
133: }
134: /* invert diagonal block */
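    /* the inverse is stored in place of the diagonal block so later rows can multiply
       by it instead of solving; pivotinblocks selects partial pivoting within the block */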
135: w = ba + 16*diag_offset[i];
136: if (pivotinblocks) {
137: Kernel_A_gets_inverse_A_4(w);
138: } else {
139: Kernel_A_gets_inverse_A_4_nopivot(w);
140: }
141: }
143: PetscFree(rtmp);
144: C->factor = FACTOR_LU;
145: C->assembled = PETSC_TRUE;
146: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
147: return(0);
148: }
151: #if defined(PETSC_HAVE_SSE)
153: #include PETSC_HAVE_SSE
155: /* SSE version for the case of 4-by-4 blocks using the natural ordering */
158: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,Mat *B)
159: {
160: Mat C = *B;
161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
162: int ierr,i,j,n = a->mbs;
163: int *bj = b->j,*bjtmp,*pj;
164: int row;
165: int *ajtmpold,nz,*bi=b->i;
166: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
167: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
168: MatScalar *ba = b->a,*aa = a->a;
169: int nonzero=0;
170: /* int nonzero=0,colscale = 16; */
171: PetscTruth pivotinblocks = b->pivotinblocks;
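  /* These SSE kernels use packed single-precision arithmetic, so they are only valid
     when MatScalar is a 4-byte float; each 4x4 block (16 floats) spans four XMM registers. */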
174: SSE_SCOPE_BEGIN;
176: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
177: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
178: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
179: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
180: /* if ((unsigned long)bj==(unsigned long)aj) { */
181: /* colscale = 4; */
182: /* } */
183: for (i=0; i<n; i++) {
184: nz = bi[i+1] - bi[i];
185: bjtmp = bj + bi[i];
186: /* zero out the 4x4 block accumulators */
187: /* zero out one register */
188: XOR_PS(XMM7,XMM7);
189: for (j=0; j<nz; j++) {
190: x = rtmp+16*bjtmp[j];
191: /* x = rtmp+4*bjtmp[j]; */
192: SSE_INLINE_BEGIN_1(x)
193: /* Copy zero register to memory locations */
194: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
195: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
196: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
197: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
198: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
199: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
200: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
201: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
202: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
203: SSE_INLINE_END_1;
204: }
205: /* load in the initial (unfactored) row */
206: nz = ai[i+1] - ai[i];
207: ajtmpold = aj + ai[i];
208: v = aa + 16*ai[i];
209: for (j=0; j<nz; j++) {
210: x = rtmp+16*ajtmpold[j];
211: /* x = rtmp+colscale*ajtmpold[j]; */
212: /* Copy v block into x block */
213: SSE_INLINE_BEGIN_2(v,x)
214: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
215: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
216: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
218: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
219: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
221: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
222: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
224: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
225: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
227: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
228: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
230: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
231: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
233: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
234: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
236: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
237: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
238: SSE_INLINE_END_2;
240: v += 16;
241: }
242: /* row = (*bjtmp++)/4; */
243: row = *bjtmp++;
244: while (row < i) {
245: pc = rtmp + 16*row;
246: SSE_INLINE_BEGIN_1(pc)
247: /* Load block from lower triangle */
248: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
249: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
250: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
252: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
253: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
255: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
256: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
258: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
259: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
261: /* Compare block to zero block */
263: SSE_COPY_PS(XMM4,XMM7)
264: SSE_CMPNEQ_PS(XMM4,XMM0)
266: SSE_COPY_PS(XMM5,XMM7)
267: SSE_CMPNEQ_PS(XMM5,XMM1)
269: SSE_COPY_PS(XMM6,XMM7)
270: SSE_CMPNEQ_PS(XMM6,XMM2)
272: SSE_CMPNEQ_PS(XMM7,XMM3)
274: /* Reduce the comparisons to one SSE register */
275: SSE_OR_PS(XMM6,XMM7)
276: SSE_OR_PS(XMM5,XMM4)
277: SSE_OR_PS(XMM5,XMM6)
278: SSE_INLINE_END_1;
280: /* Reduce the one SSE register to an integer register for branching */
281: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
282: MOVEMASK(nonzero,XMM5);
284: /* If block is nonzero ... */
285: if (nonzero) {
286: pv = ba + 16*diag_offset[row];
287: PREFETCH_L1(&pv[16]);
288: pj = bj + diag_offset[row] + 1;
290: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
291: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
292: /* but the diagonal was inverted already */
293: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
295: SSE_INLINE_BEGIN_2(pv,pc)
296: /* Column 0, product is accumulated in XMM4 */
297: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
298: SSE_SHUFFLE(XMM4,XMM4,0x00)
299: SSE_MULT_PS(XMM4,XMM0)
301: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
302: SSE_SHUFFLE(XMM5,XMM5,0x00)
303: SSE_MULT_PS(XMM5,XMM1)
304: SSE_ADD_PS(XMM4,XMM5)
306: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
307: SSE_SHUFFLE(XMM6,XMM6,0x00)
308: SSE_MULT_PS(XMM6,XMM2)
309: SSE_ADD_PS(XMM4,XMM6)
311: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
312: SSE_SHUFFLE(XMM7,XMM7,0x00)
313: SSE_MULT_PS(XMM7,XMM3)
314: SSE_ADD_PS(XMM4,XMM7)
316: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
317: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
319: /* Column 1, product is accumulated in XMM5 */
320: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
321: SSE_SHUFFLE(XMM5,XMM5,0x00)
322: SSE_MULT_PS(XMM5,XMM0)
324: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
325: SSE_SHUFFLE(XMM6,XMM6,0x00)
326: SSE_MULT_PS(XMM6,XMM1)
327: SSE_ADD_PS(XMM5,XMM6)
329: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
330: SSE_SHUFFLE(XMM7,XMM7,0x00)
331: SSE_MULT_PS(XMM7,XMM2)
332: SSE_ADD_PS(XMM5,XMM7)
334: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
335: SSE_SHUFFLE(XMM6,XMM6,0x00)
336: SSE_MULT_PS(XMM6,XMM3)
337: SSE_ADD_PS(XMM5,XMM6)
339: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
340: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
342: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
344: /* Column 2, product is accumulated in XMM6 */
345: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
346: SSE_SHUFFLE(XMM6,XMM6,0x00)
347: SSE_MULT_PS(XMM6,XMM0)
349: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
350: SSE_SHUFFLE(XMM7,XMM7,0x00)
351: SSE_MULT_PS(XMM7,XMM1)
352: SSE_ADD_PS(XMM6,XMM7)
354: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
355: SSE_SHUFFLE(XMM7,XMM7,0x00)
356: SSE_MULT_PS(XMM7,XMM2)
357: SSE_ADD_PS(XMM6,XMM7)
359: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
360: SSE_SHUFFLE(XMM7,XMM7,0x00)
361: SSE_MULT_PS(XMM7,XMM3)
362: SSE_ADD_PS(XMM6,XMM7)
363:
364: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
365: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
367: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
368: /* Column 3, product is accumulated in XMM0 */
369: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
370: SSE_SHUFFLE(XMM7,XMM7,0x00)
371: SSE_MULT_PS(XMM0,XMM7)
373: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
374: SSE_SHUFFLE(XMM7,XMM7,0x00)
375: SSE_MULT_PS(XMM1,XMM7)
376: SSE_ADD_PS(XMM0,XMM1)
378: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
379: SSE_SHUFFLE(XMM1,XMM1,0x00)
380: SSE_MULT_PS(XMM1,XMM2)
381: SSE_ADD_PS(XMM0,XMM1)
383: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
384: SSE_SHUFFLE(XMM7,XMM7,0x00)
385: SSE_MULT_PS(XMM3,XMM7)
386: SSE_ADD_PS(XMM0,XMM3)
388: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
389: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
391: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
392: /* This is code to be maintained and read by humans, after all. */
393: /* Copy Multiplier Col 3 into XMM3 */
394: SSE_COPY_PS(XMM3,XMM0)
395: /* Copy Multiplier Col 2 into XMM2 */
396: SSE_COPY_PS(XMM2,XMM6)
397: /* Copy Multiplier Col 1 into XMM1 */
398: SSE_COPY_PS(XMM1,XMM5)
399: /* Copy Multiplier Col 0 into XMM0 */
400: SSE_COPY_PS(XMM0,XMM4)
401: SSE_INLINE_END_2;
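        /* the multiplier block now lives both in pc (memory) and, column by column,
           in XMM0-XMM3 for the row update below */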
403: /* Update the row: */
404: nz = bi[row+1] - diag_offset[row] - 1;
405: pv += 16;
406: for (j=0; j<nz; j++) {
407: PREFETCH_L1(&pv[16]);
408: x = rtmp + 16*pj[j];
409: /* x = rtmp + 4*pj[j]; */
411: /* X:=X-M*PV, One column at a time */
412: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
413: SSE_INLINE_BEGIN_2(x,pv)
414: /* Load First Column of X*/
415: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
416: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
418: /* Matrix-Vector Product: */
419: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
420: SSE_SHUFFLE(XMM5,XMM5,0x00)
421: SSE_MULT_PS(XMM5,XMM0)
422: SSE_SUB_PS(XMM4,XMM5)
424: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
425: SSE_SHUFFLE(XMM6,XMM6,0x00)
426: SSE_MULT_PS(XMM6,XMM1)
427: SSE_SUB_PS(XMM4,XMM6)
429: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
430: SSE_SHUFFLE(XMM7,XMM7,0x00)
431: SSE_MULT_PS(XMM7,XMM2)
432: SSE_SUB_PS(XMM4,XMM7)
434: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
435: SSE_SHUFFLE(XMM5,XMM5,0x00)
436: SSE_MULT_PS(XMM5,XMM3)
437: SSE_SUB_PS(XMM4,XMM5)
439: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
440: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
442: /* Second Column */
443: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
444: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
446: /* Matrix-Vector Product: */
447: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
448: SSE_SHUFFLE(XMM6,XMM6,0x00)
449: SSE_MULT_PS(XMM6,XMM0)
450: SSE_SUB_PS(XMM5,XMM6)
452: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
453: SSE_SHUFFLE(XMM7,XMM7,0x00)
454: SSE_MULT_PS(XMM7,XMM1)
455: SSE_SUB_PS(XMM5,XMM7)
457: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
458: SSE_SHUFFLE(XMM4,XMM4,0x00)
459: SSE_MULT_PS(XMM4,XMM2)
460: SSE_SUB_PS(XMM5,XMM4)
462: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
463: SSE_SHUFFLE(XMM6,XMM6,0x00)
464: SSE_MULT_PS(XMM6,XMM3)
465: SSE_SUB_PS(XMM5,XMM6)
466:
467: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
468: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
470: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
472: /* Third Column */
473: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
474: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
476: /* Matrix-Vector Product: */
477: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
478: SSE_SHUFFLE(XMM7,XMM7,0x00)
479: SSE_MULT_PS(XMM7,XMM0)
480: SSE_SUB_PS(XMM6,XMM7)
482: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
483: SSE_SHUFFLE(XMM4,XMM4,0x00)
484: SSE_MULT_PS(XMM4,XMM1)
485: SSE_SUB_PS(XMM6,XMM4)
487: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
488: SSE_SHUFFLE(XMM5,XMM5,0x00)
489: SSE_MULT_PS(XMM5,XMM2)
490: SSE_SUB_PS(XMM6,XMM5)
492: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
493: SSE_SHUFFLE(XMM7,XMM7,0x00)
494: SSE_MULT_PS(XMM7,XMM3)
495: SSE_SUB_PS(XMM6,XMM7)
496:
497: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
498: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
499:
500: /* Fourth Column */
501: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
502: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
504: /* Matrix-Vector Product: */
505: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
506: SSE_SHUFFLE(XMM5,XMM5,0x00)
507: SSE_MULT_PS(XMM5,XMM0)
508: SSE_SUB_PS(XMM4,XMM5)
510: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
511: SSE_SHUFFLE(XMM6,XMM6,0x00)
512: SSE_MULT_PS(XMM6,XMM1)
513: SSE_SUB_PS(XMM4,XMM6)
515: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
516: SSE_SHUFFLE(XMM7,XMM7,0x00)
517: SSE_MULT_PS(XMM7,XMM2)
518: SSE_SUB_PS(XMM4,XMM7)
520: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
521: SSE_SHUFFLE(XMM5,XMM5,0x00)
522: SSE_MULT_PS(XMM5,XMM3)
523: SSE_SUB_PS(XMM4,XMM5)
524:
525: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
526: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
527: SSE_INLINE_END_2;
528: pv += 16;
529: }
530: PetscLogFlops(128*nz+112);
531: }
532: row = *bjtmp++;
533: /* row = (*bjtmp++)/4; */
534: }
535: /* finished row so stick it into b->a */
536: pv = ba + 16*bi[i];
537: pj = bj + bi[i];
538: nz = bi[i+1] - bi[i];
540: /* Copy x block back into pv block */
541: for (j=0; j<nz; j++) {
542: x = rtmp+16*pj[j];
543: /* x = rtmp+4*pj[j]; */
545: SSE_INLINE_BEGIN_2(x,pv)
546: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
547: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
548: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
550: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
551: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
553: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
554: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
556: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
557: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
559: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
560: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
562: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
563: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
565: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
566: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
568: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
569: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
570: SSE_INLINE_END_2;
571: pv += 16;
572: }
573: /* invert diagonal block */
574: w = ba + 16*diag_offset[i];
575: if (pivotinblocks) {
576: Kernel_A_gets_inverse_A_4(w);
577: } else {
578: Kernel_A_gets_inverse_A_4_nopivot(w);
579: }
580: /* Kernel_A_gets_inverse_A_4_SSE(w); */
581: /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
582: }
584: PetscFree(rtmp);
585: C->factor = FACTOR_LU;
586: C->assembled = PETSC_TRUE;
587: PetscLogFlops(1.3333*64*b->mbs);
588: /* Flop Count from inverting diagonal blocks */
589: SSE_SCOPE_END;
590: return(0);
591: }
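/* The _usj variants read the block column index arrays as unsigned short (so the block
   dimension must fit in 16 bits), halving the index storage that must be streamed. */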
595: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat *B)
596: {
597: Mat A=*B,C=*B;
598: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
599: int ierr,i,j,n = a->mbs;
600: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
601: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
602: unsigned int row;
603: int nz,*bi=b->i;
604: int *diag_offset = b->diag,*ai=a->i;
605: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
606: MatScalar *ba = b->a,*aa = a->a;
607: int nonzero=0;
608: /* int nonzero=0,colscale = 16; */
609: PetscTruth pivotinblocks = b->pivotinblocks;
612: SSE_SCOPE_BEGIN;
614: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
615: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
616: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
617: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
618: /* if ((unsigned long)bj==(unsigned long)aj) { */
619: /* colscale = 4; */
620: /* } */
621:
622: for (i=0; i<n; i++) {
623: nz = bi[i+1] - bi[i];
624: bjtmp = bj + bi[i];
625: /* zero out the 4x4 block accumulators */
626: /* zero out one register */
627: XOR_PS(XMM7,XMM7);
628: for (j=0; j<nz; j++) {
629: x = rtmp+16*((unsigned int)bjtmp[j]);
630: /* x = rtmp+4*bjtmp[j]; */
631: SSE_INLINE_BEGIN_1(x)
632: /* Copy zero register to memory locations */
633: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
634: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
635: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
636: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
637: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
638: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
639: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
640: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
641: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
642: SSE_INLINE_END_1;
643: }
644: /* load in the initial (unfactored) row */
645: nz = ai[i+1] - ai[i];
646: ajtmp = aj + ai[i];
647: v = aa + 16*ai[i];
648: for (j=0; j<nz; j++) {
649: x = rtmp+16*((unsigned int)ajtmp[j]);
650: /* x = rtmp+colscale*ajtmp[j]; */
651: /* Copy v block into x block */
652: SSE_INLINE_BEGIN_2(v,x)
653: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
654: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
655: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
657: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
658: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
660: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
661: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
663: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
664: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
666: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
667: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
669: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
670: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
672: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
673: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
675: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
676: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
677: SSE_INLINE_END_2;
679: v += 16;
680: }
681: /* row = (*bjtmp++)/4; */
682: row = (unsigned int)(*bjtmp++);
683: while (row < i) {
684: pc = rtmp + 16*row;
685: SSE_INLINE_BEGIN_1(pc)
686: /* Load block from lower triangle */
687: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
688: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
689: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
691: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
692: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
694: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
695: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
697: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
698: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
700: /* Compare block to zero block */
702: SSE_COPY_PS(XMM4,XMM7)
703: SSE_CMPNEQ_PS(XMM4,XMM0)
705: SSE_COPY_PS(XMM5,XMM7)
706: SSE_CMPNEQ_PS(XMM5,XMM1)
708: SSE_COPY_PS(XMM6,XMM7)
709: SSE_CMPNEQ_PS(XMM6,XMM2)
711: SSE_CMPNEQ_PS(XMM7,XMM3)
713: /* Reduce the comparisons to one SSE register */
714: SSE_OR_PS(XMM6,XMM7)
715: SSE_OR_PS(XMM5,XMM4)
716: SSE_OR_PS(XMM5,XMM6)
717: SSE_INLINE_END_1;
719: /* Reduce the one SSE register to an integer register for branching */
720: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
721: MOVEMASK(nonzero,XMM5);
723: /* If block is nonzero ... */
724: if (nonzero) {
725: pv = ba + 16*diag_offset[row];
726: PREFETCH_L1(&pv[16]);
727: pj = bj + diag_offset[row] + 1;
729: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
730: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
731: /* but the diagonal was inverted already */
732: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
734: SSE_INLINE_BEGIN_2(pv,pc)
735: /* Column 0, product is accumulated in XMM4 */
736: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
737: SSE_SHUFFLE(XMM4,XMM4,0x00)
738: SSE_MULT_PS(XMM4,XMM0)
740: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
741: SSE_SHUFFLE(XMM5,XMM5,0x00)
742: SSE_MULT_PS(XMM5,XMM1)
743: SSE_ADD_PS(XMM4,XMM5)
745: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
746: SSE_SHUFFLE(XMM6,XMM6,0x00)
747: SSE_MULT_PS(XMM6,XMM2)
748: SSE_ADD_PS(XMM4,XMM6)
750: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
751: SSE_SHUFFLE(XMM7,XMM7,0x00)
752: SSE_MULT_PS(XMM7,XMM3)
753: SSE_ADD_PS(XMM4,XMM7)
755: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
756: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
758: /* Column 1, product is accumulated in XMM5 */
759: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
760: SSE_SHUFFLE(XMM5,XMM5,0x00)
761: SSE_MULT_PS(XMM5,XMM0)
763: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
764: SSE_SHUFFLE(XMM6,XMM6,0x00)
765: SSE_MULT_PS(XMM6,XMM1)
766: SSE_ADD_PS(XMM5,XMM6)
768: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
769: SSE_SHUFFLE(XMM7,XMM7,0x00)
770: SSE_MULT_PS(XMM7,XMM2)
771: SSE_ADD_PS(XMM5,XMM7)
773: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
774: SSE_SHUFFLE(XMM6,XMM6,0x00)
775: SSE_MULT_PS(XMM6,XMM3)
776: SSE_ADD_PS(XMM5,XMM6)
778: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
779: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
781: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
783: /* Column 2, product is accumulated in XMM6 */
784: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
785: SSE_SHUFFLE(XMM6,XMM6,0x00)
786: SSE_MULT_PS(XMM6,XMM0)
788: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
789: SSE_SHUFFLE(XMM7,XMM7,0x00)
790: SSE_MULT_PS(XMM7,XMM1)
791: SSE_ADD_PS(XMM6,XMM7)
793: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
794: SSE_SHUFFLE(XMM7,XMM7,0x00)
795: SSE_MULT_PS(XMM7,XMM2)
796: SSE_ADD_PS(XMM6,XMM7)
798: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
799: SSE_SHUFFLE(XMM7,XMM7,0x00)
800: SSE_MULT_PS(XMM7,XMM3)
801: SSE_ADD_PS(XMM6,XMM7)
802:
803: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
804: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
806: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
807: /* Column 3, product is accumulated in XMM0 */
808: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
809: SSE_SHUFFLE(XMM7,XMM7,0x00)
810: SSE_MULT_PS(XMM0,XMM7)
812: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
813: SSE_SHUFFLE(XMM7,XMM7,0x00)
814: SSE_MULT_PS(XMM1,XMM7)
815: SSE_ADD_PS(XMM0,XMM1)
817: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
818: SSE_SHUFFLE(XMM1,XMM1,0x00)
819: SSE_MULT_PS(XMM1,XMM2)
820: SSE_ADD_PS(XMM0,XMM1)
822: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
823: SSE_SHUFFLE(XMM7,XMM7,0x00)
824: SSE_MULT_PS(XMM3,XMM7)
825: SSE_ADD_PS(XMM0,XMM3)
827: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
828: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
830: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
831: /* This is code to be maintained and read by humans, after all. */
832: /* Copy Multiplier Col 3 into XMM3 */
833: SSE_COPY_PS(XMM3,XMM0)
834: /* Copy Multiplier Col 2 into XMM2 */
835: SSE_COPY_PS(XMM2,XMM6)
836: /* Copy Multiplier Col 1 into XMM1 */
837: SSE_COPY_PS(XMM1,XMM5)
838: /* Copy Multiplier Col 0 into XMM0 */
839: SSE_COPY_PS(XMM0,XMM4)
840: SSE_INLINE_END_2;
842: /* Update the row: */
843: nz = bi[row+1] - diag_offset[row] - 1;
844: pv += 16;
845: for (j=0; j<nz; j++) {
846: PREFETCH_L1(&pv[16]);
847: x = rtmp + 16*((unsigned int)pj[j]);
848: /* x = rtmp + 4*pj[j]; */
850: /* X:=X-M*PV, One column at a time */
851: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
852: SSE_INLINE_BEGIN_2(x,pv)
853: /* Load First Column of X*/
854: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
855: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
857: /* Matrix-Vector Product: */
858: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
859: SSE_SHUFFLE(XMM5,XMM5,0x00)
860: SSE_MULT_PS(XMM5,XMM0)
861: SSE_SUB_PS(XMM4,XMM5)
863: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
864: SSE_SHUFFLE(XMM6,XMM6,0x00)
865: SSE_MULT_PS(XMM6,XMM1)
866: SSE_SUB_PS(XMM4,XMM6)
868: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
869: SSE_SHUFFLE(XMM7,XMM7,0x00)
870: SSE_MULT_PS(XMM7,XMM2)
871: SSE_SUB_PS(XMM4,XMM7)
873: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
874: SSE_SHUFFLE(XMM5,XMM5,0x00)
875: SSE_MULT_PS(XMM5,XMM3)
876: SSE_SUB_PS(XMM4,XMM5)
878: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
879: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
881: /* Second Column */
882: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
883: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
885: /* Matrix-Vector Product: */
886: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
887: SSE_SHUFFLE(XMM6,XMM6,0x00)
888: SSE_MULT_PS(XMM6,XMM0)
889: SSE_SUB_PS(XMM5,XMM6)
891: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
892: SSE_SHUFFLE(XMM7,XMM7,0x00)
893: SSE_MULT_PS(XMM7,XMM1)
894: SSE_SUB_PS(XMM5,XMM7)
896: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
897: SSE_SHUFFLE(XMM4,XMM4,0x00)
898: SSE_MULT_PS(XMM4,XMM2)
899: SSE_SUB_PS(XMM5,XMM4)
901: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
902: SSE_SHUFFLE(XMM6,XMM6,0x00)
903: SSE_MULT_PS(XMM6,XMM3)
904: SSE_SUB_PS(XMM5,XMM6)
905:
906: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
907: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
909: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
911: /* Third Column */
912: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
913: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
915: /* Matrix-Vector Product: */
916: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
917: SSE_SHUFFLE(XMM7,XMM7,0x00)
918: SSE_MULT_PS(XMM7,XMM0)
919: SSE_SUB_PS(XMM6,XMM7)
921: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
922: SSE_SHUFFLE(XMM4,XMM4,0x00)
923: SSE_MULT_PS(XMM4,XMM1)
924: SSE_SUB_PS(XMM6,XMM4)
926: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
927: SSE_SHUFFLE(XMM5,XMM5,0x00)
928: SSE_MULT_PS(XMM5,XMM2)
929: SSE_SUB_PS(XMM6,XMM5)
931: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
932: SSE_SHUFFLE(XMM7,XMM7,0x00)
933: SSE_MULT_PS(XMM7,XMM3)
934: SSE_SUB_PS(XMM6,XMM7)
935:
936: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
937: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
938:
939: /* Fourth Column */
940: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
941: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
943: /* Matrix-Vector Product: */
944: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
945: SSE_SHUFFLE(XMM5,XMM5,0x00)
946: SSE_MULT_PS(XMM5,XMM0)
947: SSE_SUB_PS(XMM4,XMM5)
949: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
950: SSE_SHUFFLE(XMM6,XMM6,0x00)
951: SSE_MULT_PS(XMM6,XMM1)
952: SSE_SUB_PS(XMM4,XMM6)
954: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
955: SSE_SHUFFLE(XMM7,XMM7,0x00)
956: SSE_MULT_PS(XMM7,XMM2)
957: SSE_SUB_PS(XMM4,XMM7)
959: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
960: SSE_SHUFFLE(XMM5,XMM5,0x00)
961: SSE_MULT_PS(XMM5,XMM3)
962: SSE_SUB_PS(XMM4,XMM5)
963:
964: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
965: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
966: SSE_INLINE_END_2;
967: pv += 16;
968: }
969: PetscLogFlops(128*nz+112);
970: }
971: row = (unsigned int)(*bjtmp++);
972: /* row = (*bjtmp++)/4; */
973: /* bjtmp++; */
974: }
975: /* finished row so stick it into b->a */
976: pv = ba + 16*bi[i];
977: pj = bj + bi[i];
978: nz = bi[i+1] - bi[i];
980: /* Copy x block back into pv block */
981: for (j=0; j<nz; j++) {
982: x = rtmp+16*((unsigned int)pj[j]);
983: /* x = rtmp+4*pj[j]; */
985: SSE_INLINE_BEGIN_2(x,pv)
986: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
987: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
988: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
990: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
991: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
993: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
994: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
996: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
997: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
999: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1000: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1002: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1003: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1005: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1006: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1008: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1009: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1010: SSE_INLINE_END_2;
1011: pv += 16;
1012: }
1013: /* invert diagonal block */
1014: w = ba + 16*diag_offset[i];
1015: if (pivotinblocks) {
1016: Kernel_A_gets_inverse_A_4(w);
1017: } else {
1018: Kernel_A_gets_inverse_A_4_nopivot(w);
1019: }
1020: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1021: /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1022: }
1024: PetscFree(rtmp);
1025: C->factor = FACTOR_LU;
1026: C->assembled = PETSC_TRUE;
1027: PetscLogFlops(1.3333*64*b->mbs);
1028: /* Flop Count from inverting diagonal blocks */
1029: SSE_SCOPE_END;
1030: return(0);
1031: }
1035: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A,Mat *B)
1036: {
1037: Mat C = *B;
1038: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1039: int ierr,i,j,n = a->mbs;
1040: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1041: unsigned int row;
1042: int *ajtmpold,nz,*bi=b->i;
1043: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1044: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1045: MatScalar *ba = b->a,*aa = a->a;
1046: int nonzero=0;
1047: /* int nonzero=0,colscale = 16; */
1048: PetscTruth pivotinblocks = b->pivotinblocks;
1051: SSE_SCOPE_BEGIN;
1053: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1054: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1055: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1056: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1057: /* if ((unsigned long)bj==(unsigned long)aj) { */
1058: /* colscale = 4; */
1059: /* } */
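  /* if the factor shares its column index array with A, the factorization is in place;
     hand off to the in-place kernel */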
1060: if ((unsigned long)bj==(unsigned long)aj) {
1061: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(B));
1062: }
1063:
1064: for (i=0; i<n; i++) {
1065: nz = bi[i+1] - bi[i];
1066: bjtmp = bj + bi[i];
1067: /* zero out the 4x4 block accumulators */
1068: /* zero out one register */
1069: XOR_PS(XMM7,XMM7);
1070: for (j=0; j<nz; j++) {
1071: x = rtmp+16*((unsigned int)bjtmp[j]);
1072: /* x = rtmp+4*bjtmp[j]; */
1073: SSE_INLINE_BEGIN_1(x)
1074: /* Copy zero register to memory locations */
1075: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1076: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1077: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1078: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1079: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1080: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1081: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1082: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1083: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1084: SSE_INLINE_END_1;
1085: }
1086: /* load in the initial (unfactored) row */
1087: nz = ai[i+1] - ai[i];
1088: ajtmpold = aj + ai[i];
1089: v = aa + 16*ai[i];
1090: for (j=0; j<nz; j++) {
1091: x = rtmp+16*ajtmpold[j];
1092: /* x = rtmp+colscale*ajtmpold[j]; */
1093: /* Copy v block into x block */
1094: SSE_INLINE_BEGIN_2(v,x)
1095: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1096: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1097: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1099: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1100: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1102: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1103: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1105: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1106: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1108: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1109: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1111: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1112: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1114: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1115: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1117: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1118: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1119: SSE_INLINE_END_2;
1121: v += 16;
1122: }
1123: /* row = (*bjtmp++)/4; */
1124: row = (unsigned int)(*bjtmp++);
1125: while (row < i) {
1126: pc = rtmp + 16*row;
1127: SSE_INLINE_BEGIN_1(pc)
1128: /* Load block from lower triangle */
1129: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1130: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1131: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1133: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1134: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1136: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1137: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1139: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1140: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1142: /* Compare block to zero block */
1144: SSE_COPY_PS(XMM4,XMM7)
1145: SSE_CMPNEQ_PS(XMM4,XMM0)
1147: SSE_COPY_PS(XMM5,XMM7)
1148: SSE_CMPNEQ_PS(XMM5,XMM1)
1150: SSE_COPY_PS(XMM6,XMM7)
1151: SSE_CMPNEQ_PS(XMM6,XMM2)
1153: SSE_CMPNEQ_PS(XMM7,XMM3)
1155: /* Reduce the comparisons to one SSE register */
1156: SSE_OR_PS(XMM6,XMM7)
1157: SSE_OR_PS(XMM5,XMM4)
1158: SSE_OR_PS(XMM5,XMM6)
1159: SSE_INLINE_END_1;
1161: /* Reduce the one SSE register to an integer register for branching */
1162: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1163: MOVEMASK(nonzero,XMM5);
1165: /* If block is nonzero ... */
1166: if (nonzero) {
1167: pv = ba + 16*diag_offset[row];
1168: PREFETCH_L1(&pv[16]);
1169: pj = bj + diag_offset[row] + 1;
1171: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1172: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1173: /* but the diagonal was inverted already */
1174: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1176: SSE_INLINE_BEGIN_2(pv,pc)
1177: /* Column 0, product is accumulated in XMM4 */
1178: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1179: SSE_SHUFFLE(XMM4,XMM4,0x00)
1180: SSE_MULT_PS(XMM4,XMM0)
1182: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1183: SSE_SHUFFLE(XMM5,XMM5,0x00)
1184: SSE_MULT_PS(XMM5,XMM1)
1185: SSE_ADD_PS(XMM4,XMM5)
1187: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1188: SSE_SHUFFLE(XMM6,XMM6,0x00)
1189: SSE_MULT_PS(XMM6,XMM2)
1190: SSE_ADD_PS(XMM4,XMM6)
1192: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1193: SSE_SHUFFLE(XMM7,XMM7,0x00)
1194: SSE_MULT_PS(XMM7,XMM3)
1195: SSE_ADD_PS(XMM4,XMM7)
1197: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1198: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1200: /* Column 1, product is accumulated in XMM5 */
1201: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1202: SSE_SHUFFLE(XMM5,XMM5,0x00)
1203: SSE_MULT_PS(XMM5,XMM0)
1205: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1206: SSE_SHUFFLE(XMM6,XMM6,0x00)
1207: SSE_MULT_PS(XMM6,XMM1)
1208: SSE_ADD_PS(XMM5,XMM6)
1210: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1211: SSE_SHUFFLE(XMM7,XMM7,0x00)
1212: SSE_MULT_PS(XMM7,XMM2)
1213: SSE_ADD_PS(XMM5,XMM7)
1215: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1216: SSE_SHUFFLE(XMM6,XMM6,0x00)
1217: SSE_MULT_PS(XMM6,XMM3)
1218: SSE_ADD_PS(XMM5,XMM6)
1220: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1221: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1223: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1225: /* Column 2, product is accumulated in XMM6 */
1226: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1227: SSE_SHUFFLE(XMM6,XMM6,0x00)
1228: SSE_MULT_PS(XMM6,XMM0)
1230: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1231: SSE_SHUFFLE(XMM7,XMM7,0x00)
1232: SSE_MULT_PS(XMM7,XMM1)
1233: SSE_ADD_PS(XMM6,XMM7)
1235: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1236: SSE_SHUFFLE(XMM7,XMM7,0x00)
1237: SSE_MULT_PS(XMM7,XMM2)
1238: SSE_ADD_PS(XMM6,XMM7)
1240: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1241: SSE_SHUFFLE(XMM7,XMM7,0x00)
1242: SSE_MULT_PS(XMM7,XMM3)
1243: SSE_ADD_PS(XMM6,XMM7)
1244:
1245: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1246: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1248: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1249: /* Column 3, product is accumulated in XMM0 */
1250: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1251: SSE_SHUFFLE(XMM7,XMM7,0x00)
1252: SSE_MULT_PS(XMM0,XMM7)
1254: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1255: SSE_SHUFFLE(XMM7,XMM7,0x00)
1256: SSE_MULT_PS(XMM1,XMM7)
1257: SSE_ADD_PS(XMM0,XMM1)
1259: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1260: SSE_SHUFFLE(XMM1,XMM1,0x00)
1261: SSE_MULT_PS(XMM1,XMM2)
1262: SSE_ADD_PS(XMM0,XMM1)
1264: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1265: SSE_SHUFFLE(XMM7,XMM7,0x00)
1266: SSE_MULT_PS(XMM3,XMM7)
1267: SSE_ADD_PS(XMM0,XMM3)
1269: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1270: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1272: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1273: /* This is code to be maintained and read by humans, after all. */
1274: /* Copy Multiplier Col 3 into XMM3 */
1275: SSE_COPY_PS(XMM3,XMM0)
1276: /* Copy Multiplier Col 2 into XMM2 */
1277: SSE_COPY_PS(XMM2,XMM6)
1278: /* Copy Multiplier Col 1 into XMM1 */
1279: SSE_COPY_PS(XMM1,XMM5)
1280: /* Copy Multiplier Col 0 into XMM0 */
1281: SSE_COPY_PS(XMM0,XMM4)
1282: SSE_INLINE_END_2;
1284: /* Update the row: */
1285: nz = bi[row+1] - diag_offset[row] - 1;
1286: pv += 16;
1287: for (j=0; j<nz; j++) {
1288: PREFETCH_L1(&pv[16]);
1289: x = rtmp + 16*((unsigned int)pj[j]);
1290: /* x = rtmp + 4*pj[j]; */
1292: /* X:=X-M*PV, One column at a time */
1293: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1294: SSE_INLINE_BEGIN_2(x,pv)
1295: /* Load First Column of X*/
1296: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1297: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1299: /* Matrix-Vector Product: */
1300: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1301: SSE_SHUFFLE(XMM5,XMM5,0x00)
1302: SSE_MULT_PS(XMM5,XMM0)
1303: SSE_SUB_PS(XMM4,XMM5)
1305: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1306: SSE_SHUFFLE(XMM6,XMM6,0x00)
1307: SSE_MULT_PS(XMM6,XMM1)
1308: SSE_SUB_PS(XMM4,XMM6)
1310: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1311: SSE_SHUFFLE(XMM7,XMM7,0x00)
1312: SSE_MULT_PS(XMM7,XMM2)
1313: SSE_SUB_PS(XMM4,XMM7)
1315: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1316: SSE_SHUFFLE(XMM5,XMM5,0x00)
1317: SSE_MULT_PS(XMM5,XMM3)
1318: SSE_SUB_PS(XMM4,XMM5)
1320: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1321: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1323: /* Second Column */
1324: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1325: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1327: /* Matrix-Vector Product: */
1328: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1329: SSE_SHUFFLE(XMM6,XMM6,0x00)
1330: SSE_MULT_PS(XMM6,XMM0)
1331: SSE_SUB_PS(XMM5,XMM6)
1333: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1334: SSE_SHUFFLE(XMM7,XMM7,0x00)
1335: SSE_MULT_PS(XMM7,XMM1)
1336: SSE_SUB_PS(XMM5,XMM7)
1338: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1339: SSE_SHUFFLE(XMM4,XMM4,0x00)
1340: SSE_MULT_PS(XMM4,XMM2)
1341: SSE_SUB_PS(XMM5,XMM4)
1343: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1344: SSE_SHUFFLE(XMM6,XMM6,0x00)
1345: SSE_MULT_PS(XMM6,XMM3)
1346: SSE_SUB_PS(XMM5,XMM6)
1347:
1348: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1349: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1351: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1353: /* Third Column */
1354: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1355: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1357: /* Matrix-Vector Product: */
1358: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1359: SSE_SHUFFLE(XMM7,XMM7,0x00)
1360: SSE_MULT_PS(XMM7,XMM0)
1361: SSE_SUB_PS(XMM6,XMM7)
1363: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1364: SSE_SHUFFLE(XMM4,XMM4,0x00)
1365: SSE_MULT_PS(XMM4,XMM1)
1366: SSE_SUB_PS(XMM6,XMM4)
1368: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1369: SSE_SHUFFLE(XMM5,XMM5,0x00)
1370: SSE_MULT_PS(XMM5,XMM2)
1371: SSE_SUB_PS(XMM6,XMM5)
1373: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1374: SSE_SHUFFLE(XMM7,XMM7,0x00)
1375: SSE_MULT_PS(XMM7,XMM3)
1376: SSE_SUB_PS(XMM6,XMM7)
1377:
1378: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1379: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1380:
1381: /* Fourth Column */
1382: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1383: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1385: /* Matrix-Vector Product: */
1386: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1387: SSE_SHUFFLE(XMM5,XMM5,0x00)
1388: SSE_MULT_PS(XMM5,XMM0)
1389: SSE_SUB_PS(XMM4,XMM5)
1391: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1392: SSE_SHUFFLE(XMM6,XMM6,0x00)
1393: SSE_MULT_PS(XMM6,XMM1)
1394: SSE_SUB_PS(XMM4,XMM6)
1396: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1397: SSE_SHUFFLE(XMM7,XMM7,0x00)
1398: SSE_MULT_PS(XMM7,XMM2)
1399: SSE_SUB_PS(XMM4,XMM7)
1401: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1402: SSE_SHUFFLE(XMM5,XMM5,0x00)
1403: SSE_MULT_PS(XMM5,XMM3)
1404: SSE_SUB_PS(XMM4,XMM5)
1405:
1406: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1407: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1408: SSE_INLINE_END_2;
1409: pv += 16;
1410: }
1411: PetscLogFlops(128*nz+112);
1412: }
1413: row = (unsigned int)(*bjtmp++);
1414: /* row = (*bjtmp++)/4; */
1415: /* bjtmp++; */
1416: }
1417: /* finished row so stick it into b->a */
1418: pv = ba + 16*bi[i];
1419: pj = bj + bi[i];
1420: nz = bi[i+1] - bi[i];
1422: /* Copy x block back into pv block */
1423: for (j=0; j<nz; j++) {
1424: x = rtmp+16*((unsigned int)pj[j]);
1425: /* x = rtmp+4*pj[j]; */
1427: SSE_INLINE_BEGIN_2(x,pv)
1428: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1429: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1430: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1432: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1433: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1435: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1436: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1438: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1439: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1441: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1442: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1444: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1445: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1447: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1448: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1450: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1451: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1452: SSE_INLINE_END_2;
1453: pv += 16;
1454: }
1455: /* invert diagonal block */
1456: w = ba + 16*diag_offset[i];
1457: if (pivotinblocks) {
1458: Kernel_A_gets_inverse_A_4(w);
1459: } else {
1460: Kernel_A_gets_inverse_A_4_nopivot(w);
1461: }
1462: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1463: /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1464: }
1466: PetscFree(rtmp);
1467: C->factor = FACTOR_LU;
1468: C->assembled = PETSC_TRUE;
1469: PetscLogFlops(1.3333*64*b->mbs);
1470: /* Flop Count from inverting diagonal blocks */
1471: SSE_SCOPE_END;
1472: return(0);
1473: }
1475: #endif