Actual source code: baijfact12.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include "src/mat/impls/baij/seq/baij.h"
5: #include "src/inline/ilu.h"
9: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B)
10: {
11: /*
12: Default version for when blocks are 4 by 4, using natural ordering
13: */
14: Mat C = *B;
15: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
18: PetscInt *ajtmpold,*ajtmp,nz,row;
19: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
20: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
21: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
22: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
23: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
24: MatScalar m13,m14,m15,m16;
25: MatScalar *ba = b->a,*aa = a->a;
26: PetscTruth pivotinblocks = b->pivotinblocks;
29: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
31: for (i=0; i<n; i++) {
32: nz = bi[i+1] - bi[i];
33: ajtmp = bj + bi[i];
34: for (j=0; j<nz; j++) {
35: x = rtmp+16*ajtmp[j];
36: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
37: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
38: }
39: /* load in the initial (unfactored) row */
40: nz = ai[i+1] - ai[i];
41: ajtmpold = aj + ai[i];
42: v = aa + 16*ai[i];
43: for (j=0; j<nz; j++) {
44: x = rtmp+16*ajtmpold[j];
45: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
46: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
47: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
48: x[14] = v[14]; x[15] = v[15];
49: v += 16;
50: }
51: row = *ajtmp++;
52: while (row < i) {
53: pc = rtmp + 16*row;
54: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
55: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
56: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
57: p15 = pc[14]; p16 = pc[15];
58: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
59: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
60: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
61: || p16 != 0.0) {
62: pv = ba + 16*diag_offset[row];
63: pj = bj + diag_offset[row] + 1;
64: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
65: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
66: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
67: x15 = pv[14]; x16 = pv[15];
68: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
69: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
70: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
71: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
73: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
74: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
75: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
76: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
78: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
79: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
80: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
81: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
83: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
84: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
85: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
86: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
87: nz = bi[row+1] - diag_offset[row] - 1;
88: pv += 16;
89: for (j=0; j<nz; j++) {
90: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
91: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
92: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
93: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
94: x = rtmp + 16*pj[j];
95: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
96: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
97: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
98: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
100: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
101: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
102: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
103: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
105: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
106: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
107: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
108: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
110: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
111: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
112: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
113: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
115: pv += 16;
116: }
117: PetscLogFlops(128*nz+112);
118: }
119: row = *ajtmp++;
120: }
121: /* finished row so stick it into b->a */
122: pv = ba + 16*bi[i];
123: pj = bj + bi[i];
124: nz = bi[i+1] - bi[i];
125: for (j=0; j<nz; j++) {
126: x = rtmp+16*pj[j];
127: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
128: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
129: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
130: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
131: pv += 16;
132: }
133: /* invert diagonal block */
134: w = ba + 16*diag_offset[i];
135: if (pivotinblocks) {
136: Kernel_A_gets_inverse_A_4(w);
137: } else {
138: Kernel_A_gets_inverse_A_4_nopivot(w);
139: }
140: }
142: PetscFree(rtmp);
143: C->factor = FACTOR_LU;
144: C->assembled = PETSC_TRUE;
145: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
146: return(0);
147: }
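/*
   Illustrative sketch (not part of the original file): the fully unrolled
   multiplier formation above, pc[0..15] = m1..m16, is the column-major 4x4
   block product pc := pc*d, where d = pv is the already-inverted diagonal
   block.  The helper name below is hypothetical and is shown only to make
   the indexing explicit: entry (r,c) of a block lives at offset 4*c+r.
*/
static void BlockMult4_ColumnMajor_Sketch(MatScalar pc[16],const MatScalar d[16])
{
  MatScalar tmp[16];
  int       r,c,k;
  for (c=0; c<4; c++) {                 /* column of the result */
    for (r=0; r<4; r++) {               /* row of the result */
      tmp[4*c+r] = 0.0;
      for (k=0; k<4; k++) tmp[4*c+r] += pc[4*k+r]*d[4*c+k];
    }
  }
  for (k=0; k<16; k++) pc[k] = tmp[k];
}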
150: #if defined(PETSC_HAVE_SSE)
152: #include PETSC_HAVE_SSE
154: /* SSE version for when blocks are 4 by 4, using natural ordering */
157: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,Mat *B)
158: {
159: Mat C = *B;
160: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
162: int i,j,n = a->mbs;
163: int *bj = b->j,*bjtmp,*pj;
164: int row;
165: int *ajtmpold,nz,*bi=b->i;
166: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
167: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
168: MatScalar *ba = b->a,*aa = a->a;
169: int nonzero=0;
170: /* int nonzero=0,colscale = 16; */
171: PetscTruth pivotinblocks = b->pivotinblocks;
174: SSE_SCOPE_BEGIN;
176: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
177: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
178: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
179: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
180: /* if ((unsigned long)bj==(unsigned long)aj) { */
181: /* colscale = 4; */
182: /* } */
183: for (i=0; i<n; i++) {
184: nz = bi[i+1] - bi[i];
185: bjtmp = bj + bi[i];
186: /* zero out the 4x4 block accumulators */
187: /* zero out one register */
188: XOR_PS(XMM7,XMM7);
189: for (j=0; j<nz; j++) {
190: x = rtmp+16*bjtmp[j];
191: /* x = rtmp+4*bjtmp[j]; */
192: SSE_INLINE_BEGIN_1(x)
193: /* Copy zero register to memory locations */
194: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
195: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
196: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
197: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
198: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
199: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
200: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
201: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
202: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
203: SSE_INLINE_END_1;
204: }
205: /* load in the initial (unfactored) row */
206: nz = ai[i+1] - ai[i];
207: ajtmpold = aj + ai[i];
208: v = aa + 16*ai[i];
209: for (j=0; j<nz; j++) {
210: x = rtmp+16*ajtmpold[j];
211: /* x = rtmp+colscale*ajtmpold[j]; */
212: /* Copy v block into x block */
213: SSE_INLINE_BEGIN_2(v,x)
214: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
215: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
216: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
218: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
219: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
221: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
222: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
224: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
225: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
227: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
228: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
230: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
231: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
233: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
234: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
236: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
237: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
238: SSE_INLINE_END_2;
240: v += 16;
241: }
242: /* row = (*bjtmp++)/4; */
243: row = *bjtmp++;
244: while (row < i) {
245: pc = rtmp + 16*row;
246: SSE_INLINE_BEGIN_1(pc)
247: /* Load block from lower triangle */
248: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
249: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
250: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
252: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
253: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
255: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
256: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
258: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
259: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
261: /* Compare block to zero block */
263: SSE_COPY_PS(XMM4,XMM7)
264: SSE_CMPNEQ_PS(XMM4,XMM0)
266: SSE_COPY_PS(XMM5,XMM7)
267: SSE_CMPNEQ_PS(XMM5,XMM1)
269: SSE_COPY_PS(XMM6,XMM7)
270: SSE_CMPNEQ_PS(XMM6,XMM2)
272: SSE_CMPNEQ_PS(XMM7,XMM3)
274: /* Reduce the comparisons to one SSE register */
275: SSE_OR_PS(XMM6,XMM7)
276: SSE_OR_PS(XMM5,XMM4)
277: SSE_OR_PS(XMM5,XMM6)
278: SSE_INLINE_END_1;
280: /* Reduce the one SSE register to an integer register for branching */
281: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
282: MOVEMASK(nonzero,XMM5);
284: /* If block is nonzero ... */
285: if (nonzero) {
286: pv = ba + 16*diag_offset[row];
287: PREFETCH_L1(&pv[16]);
288: pj = bj + diag_offset[row] + 1;
290: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
291: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
292: /* but the diagonal was inverted already */
293: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
295: SSE_INLINE_BEGIN_2(pv,pc)
296: /* Column 0, product is accumulated in XMM4 */
297: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
298: SSE_SHUFFLE(XMM4,XMM4,0x00)
299: SSE_MULT_PS(XMM4,XMM0)
301: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
302: SSE_SHUFFLE(XMM5,XMM5,0x00)
303: SSE_MULT_PS(XMM5,XMM1)
304: SSE_ADD_PS(XMM4,XMM5)
306: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
307: SSE_SHUFFLE(XMM6,XMM6,0x00)
308: SSE_MULT_PS(XMM6,XMM2)
309: SSE_ADD_PS(XMM4,XMM6)
311: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
312: SSE_SHUFFLE(XMM7,XMM7,0x00)
313: SSE_MULT_PS(XMM7,XMM3)
314: SSE_ADD_PS(XMM4,XMM7)
316: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
317: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
319: /* Column 1, product is accumulated in XMM5 */
320: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
321: SSE_SHUFFLE(XMM5,XMM5,0x00)
322: SSE_MULT_PS(XMM5,XMM0)
324: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
325: SSE_SHUFFLE(XMM6,XMM6,0x00)
326: SSE_MULT_PS(XMM6,XMM1)
327: SSE_ADD_PS(XMM5,XMM6)
329: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
330: SSE_SHUFFLE(XMM7,XMM7,0x00)
331: SSE_MULT_PS(XMM7,XMM2)
332: SSE_ADD_PS(XMM5,XMM7)
334: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
335: SSE_SHUFFLE(XMM6,XMM6,0x00)
336: SSE_MULT_PS(XMM6,XMM3)
337: SSE_ADD_PS(XMM5,XMM6)
339: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
340: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
342: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
344: /* Column 2, product is accumulated in XMM6 */
345: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
346: SSE_SHUFFLE(XMM6,XMM6,0x00)
347: SSE_MULT_PS(XMM6,XMM0)
349: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
350: SSE_SHUFFLE(XMM7,XMM7,0x00)
351: SSE_MULT_PS(XMM7,XMM1)
352: SSE_ADD_PS(XMM6,XMM7)
354: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
355: SSE_SHUFFLE(XMM7,XMM7,0x00)
356: SSE_MULT_PS(XMM7,XMM2)
357: SSE_ADD_PS(XMM6,XMM7)
359: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
360: SSE_SHUFFLE(XMM7,XMM7,0x00)
361: SSE_MULT_PS(XMM7,XMM3)
362: SSE_ADD_PS(XMM6,XMM7)
363:
364: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
365: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
367: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
368: /* Column 3, product is accumulated in XMM0 */
369: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
370: SSE_SHUFFLE(XMM7,XMM7,0x00)
371: SSE_MULT_PS(XMM0,XMM7)
373: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
374: SSE_SHUFFLE(XMM7,XMM7,0x00)
375: SSE_MULT_PS(XMM1,XMM7)
376: SSE_ADD_PS(XMM0,XMM1)
378: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
379: SSE_SHUFFLE(XMM1,XMM1,0x00)
380: SSE_MULT_PS(XMM1,XMM2)
381: SSE_ADD_PS(XMM0,XMM1)
383: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
384: SSE_SHUFFLE(XMM7,XMM7,0x00)
385: SSE_MULT_PS(XMM3,XMM7)
386: SSE_ADD_PS(XMM0,XMM3)
388: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
389: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
391: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
392: /* This is code to be maintained and read by humans, after all. */
393: /* Copy Multiplier Col 3 into XMM3 */
394: SSE_COPY_PS(XMM3,XMM0)
395: /* Copy Multiplier Col 2 into XMM2 */
396: SSE_COPY_PS(XMM2,XMM6)
397: /* Copy Multiplier Col 1 into XMM1 */
398: SSE_COPY_PS(XMM1,XMM5)
399: /* Copy Multiplier Col 0 into XMM0 */
400: SSE_COPY_PS(XMM0,XMM4)
401: SSE_INLINE_END_2;
403: /* Update the row: */
404: nz = bi[row+1] - diag_offset[row] - 1;
405: pv += 16;
406: for (j=0; j<nz; j++) {
407: PREFETCH_L1(&pv[16]);
408: x = rtmp + 16*pj[j];
409: /* x = rtmp + 4*pj[j]; */
411: /* X:=X-M*PV, One column at a time */
412: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
413: SSE_INLINE_BEGIN_2(x,pv)
414: /* Load First Column of X */
415: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
416: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
418: /* Matrix-Vector Product: */
419: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
420: SSE_SHUFFLE(XMM5,XMM5,0x00)
421: SSE_MULT_PS(XMM5,XMM0)
422: SSE_SUB_PS(XMM4,XMM5)
424: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
425: SSE_SHUFFLE(XMM6,XMM6,0x00)
426: SSE_MULT_PS(XMM6,XMM1)
427: SSE_SUB_PS(XMM4,XMM6)
429: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
430: SSE_SHUFFLE(XMM7,XMM7,0x00)
431: SSE_MULT_PS(XMM7,XMM2)
432: SSE_SUB_PS(XMM4,XMM7)
434: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
435: SSE_SHUFFLE(XMM5,XMM5,0x00)
436: SSE_MULT_PS(XMM5,XMM3)
437: SSE_SUB_PS(XMM4,XMM5)
439: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
440: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
442: /* Second Column */
443: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
444: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
446: /* Matrix-Vector Product: */
447: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
448: SSE_SHUFFLE(XMM6,XMM6,0x00)
449: SSE_MULT_PS(XMM6,XMM0)
450: SSE_SUB_PS(XMM5,XMM6)
452: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
453: SSE_SHUFFLE(XMM7,XMM7,0x00)
454: SSE_MULT_PS(XMM7,XMM1)
455: SSE_SUB_PS(XMM5,XMM7)
457: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
458: SSE_SHUFFLE(XMM4,XMM4,0x00)
459: SSE_MULT_PS(XMM4,XMM2)
460: SSE_SUB_PS(XMM5,XMM4)
462: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
463: SSE_SHUFFLE(XMM6,XMM6,0x00)
464: SSE_MULT_PS(XMM6,XMM3)
465: SSE_SUB_PS(XMM5,XMM6)
466:
467: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
468: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
470: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
472: /* Third Column */
473: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
474: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
476: /* Matrix-Vector Product: */
477: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
478: SSE_SHUFFLE(XMM7,XMM7,0x00)
479: SSE_MULT_PS(XMM7,XMM0)
480: SSE_SUB_PS(XMM6,XMM7)
482: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
483: SSE_SHUFFLE(XMM4,XMM4,0x00)
484: SSE_MULT_PS(XMM4,XMM1)
485: SSE_SUB_PS(XMM6,XMM4)
487: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
488: SSE_SHUFFLE(XMM5,XMM5,0x00)
489: SSE_MULT_PS(XMM5,XMM2)
490: SSE_SUB_PS(XMM6,XMM5)
492: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
493: SSE_SHUFFLE(XMM7,XMM7,0x00)
494: SSE_MULT_PS(XMM7,XMM3)
495: SSE_SUB_PS(XMM6,XMM7)
496:
497: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
498: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
499:
500: /* Fourth Column */
501: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
502: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
504: /* Matrix-Vector Product: */
505: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
506: SSE_SHUFFLE(XMM5,XMM5,0x00)
507: SSE_MULT_PS(XMM5,XMM0)
508: SSE_SUB_PS(XMM4,XMM5)
510: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
511: SSE_SHUFFLE(XMM6,XMM6,0x00)
512: SSE_MULT_PS(XMM6,XMM1)
513: SSE_SUB_PS(XMM4,XMM6)
515: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
516: SSE_SHUFFLE(XMM7,XMM7,0x00)
517: SSE_MULT_PS(XMM7,XMM2)
518: SSE_SUB_PS(XMM4,XMM7)
520: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
521: SSE_SHUFFLE(XMM5,XMM5,0x00)
522: SSE_MULT_PS(XMM5,XMM3)
523: SSE_SUB_PS(XMM4,XMM5)
524:
525: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
526: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
527: SSE_INLINE_END_2;
528: pv += 16;
529: }
530: PetscLogFlops(128*nz+112);
531: }
532: row = *bjtmp++;
533: /* row = (*bjtmp++)/4; */
534: }
535: /* finished row so stick it into b->a */
536: pv = ba + 16*bi[i];
537: pj = bj + bi[i];
538: nz = bi[i+1] - bi[i];
540: /* Copy x block back into pv block */
541: for (j=0; j<nz; j++) {
542: x = rtmp+16*pj[j];
543: /* x = rtmp+4*pj[j]; */
545: SSE_INLINE_BEGIN_2(x,pv)
546: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
547: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
548: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
550: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
551: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
553: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
554: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
556: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
557: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
559: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
560: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
562: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
563: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
565: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
566: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
568: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
569: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
570: SSE_INLINE_END_2;
571: pv += 16;
572: }
573: /* invert diagonal block */
574: w = ba + 16*diag_offset[i];
575: if (pivotinblocks) {
576: Kernel_A_gets_inverse_A_4(w);
577: } else {
578: Kernel_A_gets_inverse_A_4_nopivot(w);
579: }
580: /* Kernel_A_gets_inverse_A_4_SSE(w); */
581: /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
582: }
584: PetscFree(rtmp);
585: C->factor = FACTOR_LU;
586: C->assembled = PETSC_TRUE;
587: PetscLogFlops(1.3333*64*b->mbs);
588: /* Flop Count from inverting diagonal blocks */
589: SSE_SCOPE_END;
590: return(0);
591: }
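/*
   Illustrative sketch (not part of the original file): the CMPNEQ/OR/MOVEMASK
   sequence above is a vectorized version of this scalar test, which decides
   whether the 4x4 block pc contains any nonzero entry before the row update
   is performed.  The helper name is hypothetical.
*/
static int Block4IsNonzero_Sketch(const MatScalar pc[16])
{
  int k;
  for (k=0; k<16; k++) {
    if (pc[k] != 0.0) return 1;         /* at least one entry is nonzero */
  }
  return 0;                             /* entire block is zero; skip the update */
}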
595: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat *B)
596: {
597: Mat A=*B,C=*B;
598: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
600: int i,j,n = a->mbs;
601: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
602: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
603: unsigned int row;
604: int nz,*bi=b->i;
605: int *diag_offset = b->diag,*ai=a->i;
606: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
607: MatScalar *ba = b->a,*aa = a->a;
608: int nonzero=0;
609: /* int nonzero=0,colscale = 16; */
610: PetscTruth pivotinblocks = b->pivotinblocks;
613: SSE_SCOPE_BEGIN;
615: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
616: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
617: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
618: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
619: /* if ((unsigned long)bj==(unsigned long)aj) { */
620: /* colscale = 4; */
621: /* } */
622:
623: for (i=0; i<n; i++) {
624: nz = bi[i+1] - bi[i];
625: bjtmp = bj + bi[i];
626: /* zero out the 4x4 block accumulators */
627: /* zero out one register */
628: XOR_PS(XMM7,XMM7);
629: for (j=0; j<nz; j++) {
630: x = rtmp+16*((unsigned int)bjtmp[j]);
631: /* x = rtmp+4*bjtmp[j]; */
632: SSE_INLINE_BEGIN_1(x)
633: /* Copy zero register to memory locations */
634: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
635: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
636: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
637: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
638: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
639: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
640: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
641: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
642: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
643: SSE_INLINE_END_1;
644: }
645: /* load in the initial (unfactored) row */
646: nz = ai[i+1] - ai[i];
647: ajtmp = aj + ai[i];
648: v = aa + 16*ai[i];
649: for (j=0; j<nz; j++) {
650: x = rtmp+16*((unsigned int)ajtmp[j]);
651: /* x = rtmp+colscale*ajtmp[j]; */
652: /* Copy v block into x block */
653: SSE_INLINE_BEGIN_2(v,x)
654: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
655: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
656: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
658: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
659: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
661: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
662: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
664: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
665: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
667: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
668: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
670: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
671: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
673: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
674: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
676: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
677: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
678: SSE_INLINE_END_2;
680: v += 16;
681: }
682: /* row = (*bjtmp++)/4; */
683: row = (unsigned int)(*bjtmp++);
684: while (row < i) {
685: pc = rtmp + 16*row;
686: SSE_INLINE_BEGIN_1(pc)
687: /* Load block from lower triangle */
688: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
689: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
690: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
692: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
693: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
695: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
696: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
698: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
699: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
701: /* Compare block to zero block */
703: SSE_COPY_PS(XMM4,XMM7)
704: SSE_CMPNEQ_PS(XMM4,XMM0)
706: SSE_COPY_PS(XMM5,XMM7)
707: SSE_CMPNEQ_PS(XMM5,XMM1)
709: SSE_COPY_PS(XMM6,XMM7)
710: SSE_CMPNEQ_PS(XMM6,XMM2)
712: SSE_CMPNEQ_PS(XMM7,XMM3)
714: /* Reduce the comparisons to one SSE register */
715: SSE_OR_PS(XMM6,XMM7)
716: SSE_OR_PS(XMM5,XMM4)
717: SSE_OR_PS(XMM5,XMM6)
718: SSE_INLINE_END_1;
720: /* Reduce the one SSE register to an integer register for branching */
721: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
722: MOVEMASK(nonzero,XMM5);
724: /* If block is nonzero ... */
725: if (nonzero) {
726: pv = ba + 16*diag_offset[row];
727: PREFETCH_L1(&pv[16]);
728: pj = bj + diag_offset[row] + 1;
730: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
731: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
732: /* but the diagonal was inverted already */
733: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
735: SSE_INLINE_BEGIN_2(pv,pc)
736: /* Column 0, product is accumulated in XMM4 */
737: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
738: SSE_SHUFFLE(XMM4,XMM4,0x00)
739: SSE_MULT_PS(XMM4,XMM0)
741: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
742: SSE_SHUFFLE(XMM5,XMM5,0x00)
743: SSE_MULT_PS(XMM5,XMM1)
744: SSE_ADD_PS(XMM4,XMM5)
746: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
747: SSE_SHUFFLE(XMM6,XMM6,0x00)
748: SSE_MULT_PS(XMM6,XMM2)
749: SSE_ADD_PS(XMM4,XMM6)
751: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
752: SSE_SHUFFLE(XMM7,XMM7,0x00)
753: SSE_MULT_PS(XMM7,XMM3)
754: SSE_ADD_PS(XMM4,XMM7)
756: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
757: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
759: /* Column 1, product is accumulated in XMM5 */
760: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
761: SSE_SHUFFLE(XMM5,XMM5,0x00)
762: SSE_MULT_PS(XMM5,XMM0)
764: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
765: SSE_SHUFFLE(XMM6,XMM6,0x00)
766: SSE_MULT_PS(XMM6,XMM1)
767: SSE_ADD_PS(XMM5,XMM6)
769: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
770: SSE_SHUFFLE(XMM7,XMM7,0x00)
771: SSE_MULT_PS(XMM7,XMM2)
772: SSE_ADD_PS(XMM5,XMM7)
774: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
775: SSE_SHUFFLE(XMM6,XMM6,0x00)
776: SSE_MULT_PS(XMM6,XMM3)
777: SSE_ADD_PS(XMM5,XMM6)
779: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
780: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
782: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
784: /* Column 2, product is accumulated in XMM6 */
785: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
786: SSE_SHUFFLE(XMM6,XMM6,0x00)
787: SSE_MULT_PS(XMM6,XMM0)
789: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
790: SSE_SHUFFLE(XMM7,XMM7,0x00)
791: SSE_MULT_PS(XMM7,XMM1)
792: SSE_ADD_PS(XMM6,XMM7)
794: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
795: SSE_SHUFFLE(XMM7,XMM7,0x00)
796: SSE_MULT_PS(XMM7,XMM2)
797: SSE_ADD_PS(XMM6,XMM7)
799: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
800: SSE_SHUFFLE(XMM7,XMM7,0x00)
801: SSE_MULT_PS(XMM7,XMM3)
802: SSE_ADD_PS(XMM6,XMM7)
803:
804: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
805: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
807: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
808: /* Column 3, product is accumulated in XMM0 */
809: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
810: SSE_SHUFFLE(XMM7,XMM7,0x00)
811: SSE_MULT_PS(XMM0,XMM7)
813: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
814: SSE_SHUFFLE(XMM7,XMM7,0x00)
815: SSE_MULT_PS(XMM1,XMM7)
816: SSE_ADD_PS(XMM0,XMM1)
818: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
819: SSE_SHUFFLE(XMM1,XMM1,0x00)
820: SSE_MULT_PS(XMM1,XMM2)
821: SSE_ADD_PS(XMM0,XMM1)
823: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
824: SSE_SHUFFLE(XMM7,XMM7,0x00)
825: SSE_MULT_PS(XMM3,XMM7)
826: SSE_ADD_PS(XMM0,XMM3)
828: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
829: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
831: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
832: /* This is code to be maintained and read by humans, after all. */
833: /* Copy Multiplier Col 3 into XMM3 */
834: SSE_COPY_PS(XMM3,XMM0)
835: /* Copy Multiplier Col 2 into XMM2 */
836: SSE_COPY_PS(XMM2,XMM6)
837: /* Copy Multiplier Col 1 into XMM1 */
838: SSE_COPY_PS(XMM1,XMM5)
839: /* Copy Multiplier Col 0 into XMM0 */
840: SSE_COPY_PS(XMM0,XMM4)
841: SSE_INLINE_END_2;
843: /* Update the row: */
844: nz = bi[row+1] - diag_offset[row] - 1;
845: pv += 16;
846: for (j=0; j<nz; j++) {
847: PREFETCH_L1(&pv[16]);
848: x = rtmp + 16*((unsigned int)pj[j]);
849: /* x = rtmp + 4*pj[j]; */
851: /* X:=X-M*PV, One column at a time */
852: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
853: SSE_INLINE_BEGIN_2(x,pv)
854: /* Load First Column of X */
855: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
856: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
858: /* Matrix-Vector Product: */
859: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
860: SSE_SHUFFLE(XMM5,XMM5,0x00)
861: SSE_MULT_PS(XMM5,XMM0)
862: SSE_SUB_PS(XMM4,XMM5)
864: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
865: SSE_SHUFFLE(XMM6,XMM6,0x00)
866: SSE_MULT_PS(XMM6,XMM1)
867: SSE_SUB_PS(XMM4,XMM6)
869: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
870: SSE_SHUFFLE(XMM7,XMM7,0x00)
871: SSE_MULT_PS(XMM7,XMM2)
872: SSE_SUB_PS(XMM4,XMM7)
874: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
875: SSE_SHUFFLE(XMM5,XMM5,0x00)
876: SSE_MULT_PS(XMM5,XMM3)
877: SSE_SUB_PS(XMM4,XMM5)
879: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
880: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
882: /* Second Column */
883: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
884: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
886: /* Matrix-Vector Product: */
887: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
888: SSE_SHUFFLE(XMM6,XMM6,0x00)
889: SSE_MULT_PS(XMM6,XMM0)
890: SSE_SUB_PS(XMM5,XMM6)
892: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
893: SSE_SHUFFLE(XMM7,XMM7,0x00)
894: SSE_MULT_PS(XMM7,XMM1)
895: SSE_SUB_PS(XMM5,XMM7)
897: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
898: SSE_SHUFFLE(XMM4,XMM4,0x00)
899: SSE_MULT_PS(XMM4,XMM2)
900: SSE_SUB_PS(XMM5,XMM4)
902: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
903: SSE_SHUFFLE(XMM6,XMM6,0x00)
904: SSE_MULT_PS(XMM6,XMM3)
905: SSE_SUB_PS(XMM5,XMM6)
906:
907: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
908: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
910: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
912: /* Third Column */
913: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
914: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
916: /* Matrix-Vector Product: */
917: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
918: SSE_SHUFFLE(XMM7,XMM7,0x00)
919: SSE_MULT_PS(XMM7,XMM0)
920: SSE_SUB_PS(XMM6,XMM7)
922: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
923: SSE_SHUFFLE(XMM4,XMM4,0x00)
924: SSE_MULT_PS(XMM4,XMM1)
925: SSE_SUB_PS(XMM6,XMM4)
927: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
928: SSE_SHUFFLE(XMM5,XMM5,0x00)
929: SSE_MULT_PS(XMM5,XMM2)
930: SSE_SUB_PS(XMM6,XMM5)
932: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
933: SSE_SHUFFLE(XMM7,XMM7,0x00)
934: SSE_MULT_PS(XMM7,XMM3)
935: SSE_SUB_PS(XMM6,XMM7)
936:
937: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
938: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
939:
940: /* Fourth Column */
941: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
942: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
944: /* Matrix-Vector Product: */
945: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
946: SSE_SHUFFLE(XMM5,XMM5,0x00)
947: SSE_MULT_PS(XMM5,XMM0)
948: SSE_SUB_PS(XMM4,XMM5)
950: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
951: SSE_SHUFFLE(XMM6,XMM6,0x00)
952: SSE_MULT_PS(XMM6,XMM1)
953: SSE_SUB_PS(XMM4,XMM6)
955: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
956: SSE_SHUFFLE(XMM7,XMM7,0x00)
957: SSE_MULT_PS(XMM7,XMM2)
958: SSE_SUB_PS(XMM4,XMM7)
960: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
961: SSE_SHUFFLE(XMM5,XMM5,0x00)
962: SSE_MULT_PS(XMM5,XMM3)
963: SSE_SUB_PS(XMM4,XMM5)
964:
965: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
966: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
967: SSE_INLINE_END_2;
968: pv += 16;
969: }
970: PetscLogFlops(128*nz+112);
971: }
972: row = (unsigned int)(*bjtmp++);
973: /* row = (*bjtmp++)/4; */
974: /* bjtmp++; */
975: }
976: /* finished row so stick it into b->a */
977: pv = ba + 16*bi[i];
978: pj = bj + bi[i];
979: nz = bi[i+1] - bi[i];
981: /* Copy x block back into pv block */
982: for (j=0; j<nz; j++) {
983: x = rtmp+16*((unsigned int)pj[j]);
984: /* x = rtmp+4*pj[j]; */
986: SSE_INLINE_BEGIN_2(x,pv)
987: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
988: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
989: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
991: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
992: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
994: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
995: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
997: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
998: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1000: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1001: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1003: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1004: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1006: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1007: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1009: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1010: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1011: SSE_INLINE_END_2;
1012: pv += 16;
1013: }
1014: /* invert diagonal block */
1015: w = ba + 16*diag_offset[i];
1016: if (pivotinblocks) {
1017: Kernel_A_gets_inverse_A_4(w);
1018: } else {
1019: Kernel_A_gets_inverse_A_4_nopivot(w);
1020: }
1021: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1022: /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
1023: }
1025: PetscFree(rtmp);
1026: C->factor = FACTOR_LU;
1027: C->assembled = PETSC_TRUE;
1028: PetscLogFlops(1.3333*64*b->mbs);
1029: /* Flop Count from inverting diagonal blocks */
1030: SSE_SCOPE_END;
1031: return(0);
1032: }
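/*
   Illustrative sketch (not part of the original file): the _usj variants read
   the column indices through unsigned short pointers to halve the index
   traffic, which is only valid when every block column index fits in 16 bits.
   A hypothetical range check of the kind a caller would need looks like this.
*/
static int BlockIndicesFitUnsignedShort_Sketch(int nz,const int *idx)
{
  int k;
  for (k=0; k<nz; k++) {
    if (idx[k] < 0 || idx[k] > 65535) return 0;  /* index would not survive the cast */
  }
  return 1;
}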
1036: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A,Mat *B)
1037: {
1038: Mat C = *B;
1039: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1041: int i,j,n = a->mbs;
1042: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1043: unsigned int row;
1044: int *ajtmpold,nz,*bi=b->i;
1045: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1046: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1047: MatScalar *ba = b->a,*aa = a->a;
1048: int nonzero=0;
1049: /* int nonzero=0,colscale = 16; */
1050: PetscTruth pivotinblocks = b->pivotinblocks;
1053: SSE_SCOPE_BEGIN;
1055: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1056: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1057: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1058: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1059: /* if ((unsigned long)bj==(unsigned long)aj) { */
1060: /* colscale = 4; */
1061: /* } */
1062: if ((unsigned long)bj==(unsigned long)aj) {
1063: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(B));
1064: }
1065:
1066: for (i=0; i<n; i++) {
1067: nz = bi[i+1] - bi[i];
1068: bjtmp = bj + bi[i];
1069: /* zero out the 4x4 block accumulators */
1070: /* zero out one register */
1071: XOR_PS(XMM7,XMM7);
1072: for (j=0; j<nz; j++) {
1073: x = rtmp+16*((unsigned int)bjtmp[j]);
1074: /* x = rtmp+4*bjtmp[j]; */
1075: SSE_INLINE_BEGIN_1(x)
1076: /* Copy zero register to memory locations */
1077: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1078: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1079: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1080: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1081: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1082: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1083: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1084: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1085: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1086: SSE_INLINE_END_1;
1087: }
1088: /* load in the initial (unfactored) row */
1089: nz = ai[i+1] - ai[i];
1090: ajtmpold = aj + ai[i];
1091: v = aa + 16*ai[i];
1092: for (j=0; j<nz; j++) {
1093: x = rtmp+16*ajtmpold[j];
1094: /* x = rtmp+colscale*ajtmpold[j]; */
1095: /* Copy v block into x block */
1096: SSE_INLINE_BEGIN_2(v,x)
1097: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1098: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1099: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1101: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1102: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1104: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1105: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1107: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1108: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1110: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1111: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1113: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1114: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1116: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1117: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1119: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1120: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1121: SSE_INLINE_END_2;
1123: v += 16;
1124: }
1125: /* row = (*bjtmp++)/4; */
1126: row = (unsigned int)(*bjtmp++);
1127: while (row < i) {
1128: pc = rtmp + 16*row;
1129: SSE_INLINE_BEGIN_1(pc)
1130: /* Load block from lower triangle */
1131: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1132: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1133: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1135: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1136: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1138: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1139: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1141: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1142: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1144: /* Compare block to zero block */
1146: SSE_COPY_PS(XMM4,XMM7)
1147: SSE_CMPNEQ_PS(XMM4,XMM0)
1149: SSE_COPY_PS(XMM5,XMM7)
1150: SSE_CMPNEQ_PS(XMM5,XMM1)
1152: SSE_COPY_PS(XMM6,XMM7)
1153: SSE_CMPNEQ_PS(XMM6,XMM2)
1155: SSE_CMPNEQ_PS(XMM7,XMM3)
1157: /* Reduce the comparisons to one SSE register */
1158: SSE_OR_PS(XMM6,XMM7)
1159: SSE_OR_PS(XMM5,XMM4)
1160: SSE_OR_PS(XMM5,XMM6)
1161: SSE_INLINE_END_1;
1163: /* Reduce the one SSE register to an integer register for branching */
1164: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1165: MOVEMASK(nonzero,XMM5);
1167: /* If block is nonzero ... */
1168: if (nonzero) {
1169: pv = ba + 16*diag_offset[row];
1170: PREFETCH_L1(&pv[16]);
1171: pj = bj + diag_offset[row] + 1;
1173: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1174: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1175: /* but the diagonal was inverted already */
1176: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1178: SSE_INLINE_BEGIN_2(pv,pc)
1179: /* Column 0, product is accumulated in XMM4 */
1180: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1181: SSE_SHUFFLE(XMM4,XMM4,0x00)
1182: SSE_MULT_PS(XMM4,XMM0)
1184: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1185: SSE_SHUFFLE(XMM5,XMM5,0x00)
1186: SSE_MULT_PS(XMM5,XMM1)
1187: SSE_ADD_PS(XMM4,XMM5)
1189: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1190: SSE_SHUFFLE(XMM6,XMM6,0x00)
1191: SSE_MULT_PS(XMM6,XMM2)
1192: SSE_ADD_PS(XMM4,XMM6)
1194: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1195: SSE_SHUFFLE(XMM7,XMM7,0x00)
1196: SSE_MULT_PS(XMM7,XMM3)
1197: SSE_ADD_PS(XMM4,XMM7)
1199: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1200: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1202: /* Column 1, product is accumulated in XMM5 */
1203: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1204: SSE_SHUFFLE(XMM5,XMM5,0x00)
1205: SSE_MULT_PS(XMM5,XMM0)
1207: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1208: SSE_SHUFFLE(XMM6,XMM6,0x00)
1209: SSE_MULT_PS(XMM6,XMM1)
1210: SSE_ADD_PS(XMM5,XMM6)
1212: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1213: SSE_SHUFFLE(XMM7,XMM7,0x00)
1214: SSE_MULT_PS(XMM7,XMM2)
1215: SSE_ADD_PS(XMM5,XMM7)
1217: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1218: SSE_SHUFFLE(XMM6,XMM6,0x00)
1219: SSE_MULT_PS(XMM6,XMM3)
1220: SSE_ADD_PS(XMM5,XMM6)
1222: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1223: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1225: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1227: /* Column 2, product is accumulated in XMM6 */
1228: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1229: SSE_SHUFFLE(XMM6,XMM6,0x00)
1230: SSE_MULT_PS(XMM6,XMM0)
1232: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1233: SSE_SHUFFLE(XMM7,XMM7,0x00)
1234: SSE_MULT_PS(XMM7,XMM1)
1235: SSE_ADD_PS(XMM6,XMM7)
1237: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1238: SSE_SHUFFLE(XMM7,XMM7,0x00)
1239: SSE_MULT_PS(XMM7,XMM2)
1240: SSE_ADD_PS(XMM6,XMM7)
1242: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1243: SSE_SHUFFLE(XMM7,XMM7,0x00)
1244: SSE_MULT_PS(XMM7,XMM3)
1245: SSE_ADD_PS(XMM6,XMM7)
1246:
1247: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1248: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1250: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1251: /* Column 3, product is accumulated in XMM0 */
1252: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1253: SSE_SHUFFLE(XMM7,XMM7,0x00)
1254: SSE_MULT_PS(XMM0,XMM7)
1256: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1257: SSE_SHUFFLE(XMM7,XMM7,0x00)
1258: SSE_MULT_PS(XMM1,XMM7)
1259: SSE_ADD_PS(XMM0,XMM1)
1261: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1262: SSE_SHUFFLE(XMM1,XMM1,0x00)
1263: SSE_MULT_PS(XMM1,XMM2)
1264: SSE_ADD_PS(XMM0,XMM1)
1266: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1267: SSE_SHUFFLE(XMM7,XMM7,0x00)
1268: SSE_MULT_PS(XMM3,XMM7)
1269: SSE_ADD_PS(XMM0,XMM3)
1271: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1272: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1274: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1275: /* This is code to be maintained and read by humans, after all. */
1276: /* Copy Multiplier Col 3 into XMM3 */
1277: SSE_COPY_PS(XMM3,XMM0)
1278: /* Copy Multiplier Col 2 into XMM2 */
1279: SSE_COPY_PS(XMM2,XMM6)
1280: /* Copy Multiplier Col 1 into XMM1 */
1281: SSE_COPY_PS(XMM1,XMM5)
1282: /* Copy Multiplier Col 0 into XMM0 */
1283: SSE_COPY_PS(XMM0,XMM4)
1284: SSE_INLINE_END_2;
1286: /* Update the row: */
1287: nz = bi[row+1] - diag_offset[row] - 1;
1288: pv += 16;
1289: for (j=0; j<nz; j++) {
1290: PREFETCH_L1(&pv[16]);
1291: x = rtmp + 16*((unsigned int)pj[j]);
1292: /* x = rtmp + 4*pj[j]; */
1294: /* X:=X-M*PV, One column at a time */
1295: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1296: SSE_INLINE_BEGIN_2(x,pv)
1297: /* Load First Column of X */
1298: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1299: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1301: /* Matrix-Vector Product: */
1302: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1303: SSE_SHUFFLE(XMM5,XMM5,0x00)
1304: SSE_MULT_PS(XMM5,XMM0)
1305: SSE_SUB_PS(XMM4,XMM5)
1307: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1308: SSE_SHUFFLE(XMM6,XMM6,0x00)
1309: SSE_MULT_PS(XMM6,XMM1)
1310: SSE_SUB_PS(XMM4,XMM6)
1312: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1313: SSE_SHUFFLE(XMM7,XMM7,0x00)
1314: SSE_MULT_PS(XMM7,XMM2)
1315: SSE_SUB_PS(XMM4,XMM7)
1317: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1318: SSE_SHUFFLE(XMM5,XMM5,0x00)
1319: SSE_MULT_PS(XMM5,XMM3)
1320: SSE_SUB_PS(XMM4,XMM5)
1322: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1323: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1325: /* Second Column */
1326: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1327: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1329: /* Matrix-Vector Product: */
1330: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1331: SSE_SHUFFLE(XMM6,XMM6,0x00)
1332: SSE_MULT_PS(XMM6,XMM0)
1333: SSE_SUB_PS(XMM5,XMM6)
1335: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1336: SSE_SHUFFLE(XMM7,XMM7,0x00)
1337: SSE_MULT_PS(XMM7,XMM1)
1338: SSE_SUB_PS(XMM5,XMM7)
1340: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1341: SSE_SHUFFLE(XMM4,XMM4,0x00)
1342: SSE_MULT_PS(XMM4,XMM2)
1343: SSE_SUB_PS(XMM5,XMM4)
1345: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1346: SSE_SHUFFLE(XMM6,XMM6,0x00)
1347: SSE_MULT_PS(XMM6,XMM3)
1348: SSE_SUB_PS(XMM5,XMM6)
1349:
1350: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1351: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1353: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1355: /* Third Column */
1356: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1357: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1359: /* Matrix-Vector Product: */
1360: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1361: SSE_SHUFFLE(XMM7,XMM7,0x00)
1362: SSE_MULT_PS(XMM7,XMM0)
1363: SSE_SUB_PS(XMM6,XMM7)
1365: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1366: SSE_SHUFFLE(XMM4,XMM4,0x00)
1367: SSE_MULT_PS(XMM4,XMM1)
1368: SSE_SUB_PS(XMM6,XMM4)
1370: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1371: SSE_SHUFFLE(XMM5,XMM5,0x00)
1372: SSE_MULT_PS(XMM5,XMM2)
1373: SSE_SUB_PS(XMM6,XMM5)
1375: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1376: SSE_SHUFFLE(XMM7,XMM7,0x00)
1377: SSE_MULT_PS(XMM7,XMM3)
1378: SSE_SUB_PS(XMM6,XMM7)
1379:
1380: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1381: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1382:
1383: /* Fourth Column */
1384: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1385: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1387: /* Matrix-Vector Product: */
1388: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1389: SSE_SHUFFLE(XMM5,XMM5,0x00)
1390: SSE_MULT_PS(XMM5,XMM0)
1391: SSE_SUB_PS(XMM4,XMM5)
1393: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1394: SSE_SHUFFLE(XMM6,XMM6,0x00)
1395: SSE_MULT_PS(XMM6,XMM1)
1396: SSE_SUB_PS(XMM4,XMM6)
1398: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1399: SSE_SHUFFLE(XMM7,XMM7,0x00)
1400: SSE_MULT_PS(XMM7,XMM2)
1401: SSE_SUB_PS(XMM4,XMM7)
1403: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1404: SSE_SHUFFLE(XMM5,XMM5,0x00)
1405: SSE_MULT_PS(XMM5,XMM3)
1406: SSE_SUB_PS(XMM4,XMM5)
1407:
1408: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1409: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1410: SSE_INLINE_END_2;
1411: pv += 16;
1412: }
1413: PetscLogFlops(128*nz+112);
1414: }
1415: row = (unsigned int)(*bjtmp++);
1416: /* row = (*bjtmp++)/4; */
1417: /* bjtmp++; */
1418: }
1419: /* finished row so stick it into b->a */
1420: pv = ba + 16*bi[i];
1421: pj = bj + bi[i];
1422: nz = bi[i+1] - bi[i];
1424: /* Copy x block back into pv block */
1425: for (j=0; j<nz; j++) {
1426: x = rtmp+16*((unsigned int)pj[j]);
1427: /* x = rtmp+4*pj[j]; */
1429: SSE_INLINE_BEGIN_2(x,pv)
1430: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1431: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1432: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1434: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1435: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1437: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1438: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1440: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1441: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1443: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1444: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1446: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1447: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1449: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1450: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1452: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1453: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1454: SSE_INLINE_END_2;
1455: pv += 16;
1456: }
1457: /* invert diagonal block */
1458: w = ba + 16*diag_offset[i];
1459: if (pivotinblocks) {
1460: Kernel_A_gets_inverse_A_4(w);
1461: } else {
1462: Kernel_A_gets_inverse_A_4_nopivot(w);
1463: }
1464: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1465: /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
1466: }
1468: PetscFree(rtmp);
1469: C->factor = FACTOR_LU;
1470: C->assembled = PETSC_TRUE;
1471: PetscLogFlops(1.3333*64*b->mbs);
1472: /* Flop Count from inverting diagonal blocks */
1473: SSE_SCOPE_END;
1474: return(0);
1475: }
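/*
   Illustrative sketch (not part of the original file): each "X := X - M*PV"
   SSE section above performs this column-major 4x4 block update in plain C,
   with m the multiplier block and pv the block from the already-factored row.
   The helper name is hypothetical.
*/
static void BlockMultSub4_ColumnMajor_Sketch(MatScalar x[16],const MatScalar m[16],const MatScalar pv[16])
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[4*c+r] -= m[4*k+r]*pv[4*c+k];
    }
  }
}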
1477: #endif