Actual source code: baijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
7: #include src/mat/impls/baij/seq/baij.h
8: #include src/inline/ilu.h
9: #include src/inline/dot.h
13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
17: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18: PetscInt *diag = a->diag;
19: MatScalar *aa=a->a,*v;
20: PetscScalar s1,*x,*b;
23: VecCopy(bb,xx);
24: VecGetArray(bb,&b);
25: VecGetArray(xx,&x);
26:
27: /* forward solve the U^T */
28: for (i=0; i<n; i++) {
30: v = aa + diag[i];
31: /* multiply by the inverse of the block diagonal */
32: s1 = (*v++)*x[i];
33: vi = aj + diag[i] + 1;
34: nz = ai[i+1] - diag[i] - 1;
35: while (nz--) {
36: x[*vi++] -= (*v++)*s1;
37: }
38: x[i] = s1;
39: }
40: /* backward solve the L^T */
41: for (i=n-1; i>=0; i--){
42: v = aa + diag[i] - 1;
43: vi = aj + diag[i] - 1;
44: nz = diag[i] - ai[i];
45: s1 = x[i];
46: while (nz--) {
47: x[*vi--] -= (*v--)*s1;
48: }
49: }
50: VecRestoreArray(bb,&b);
51: VecRestoreArray(xx,&x);
52: PetscLogFlops(2*(a->nz) - A->cmap.n);
53: return(0);
54: }
58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59: {
60: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
62: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63: PetscInt *diag = a->diag,oidx;
64: MatScalar *aa=a->a,*v;
65: PetscScalar s1,s2,x1,x2;
66: PetscScalar *x,*b;
69: VecCopy(bb,xx);
70: VecGetArray(bb,&b);
71: VecGetArray(xx,&x);
73: /* forward solve the U^T */
74: idx = 0;
75: for (i=0; i<n; i++) {
77: v = aa + 4*diag[i];
78: /* multiply by the inverse of the block diagonal */
79: x1 = x[idx]; x2 = x[1+idx];
80: s1 = v[0]*x1 + v[1]*x2;
81: s2 = v[2]*x1 + v[3]*x2;
82: v += 4;
84: vi = aj + diag[i] + 1;
85: nz = ai[i+1] - diag[i] - 1;
86: while (nz--) {
87: oidx = 2*(*vi++);
88: x[oidx] -= v[0]*s1 + v[1]*s2;
89: x[oidx+1] -= v[2]*s1 + v[3]*s2;
90: v += 4;
91: }
92: x[idx] = s1;x[1+idx] = s2;
93: idx += 2;
94: }
95: /* backward solve the L^T */
96: for (i=n-1; i>=0; i--){
97: v = aa + 4*diag[i] - 4;
98: vi = aj + diag[i] - 1;
99: nz = diag[i] - ai[i];
100: idt = 2*i;
101: s1 = x[idt]; s2 = x[1+idt];
102: while (nz--) {
103: idx = 2*(*vi--);
104: x[idx] -= v[0]*s1 + v[1]*s2;
105: x[idx+1] -= v[2]*s1 + v[3]*s2;
106: v -= 4;
107: }
108: }
109: VecRestoreArray(bb,&b);
110: VecRestoreArray(xx,&x);
111: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
112: return(0);
113: }
117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
121: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122: PetscInt *diag = a->diag,oidx;
123: MatScalar *aa=a->a,*v;
124: PetscScalar s1,s2,s3,x1,x2,x3;
125: PetscScalar *x,*b;
128: VecCopy(bb,xx);
129: VecGetArray(bb,&b);
130: VecGetArray(xx,&x);
132: /* forward solve the U^T */
133: idx = 0;
134: for (i=0; i<n; i++) {
136: v = aa + 9*diag[i];
137: /* multiply by the inverse of the block diagonal */
138: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
139: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
140: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
141: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
142: v += 9;
144: vi = aj + diag[i] + 1;
145: nz = ai[i+1] - diag[i] - 1;
146: while (nz--) {
147: oidx = 3*(*vi++);
148: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
149: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
150: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151: v += 9;
152: }
153: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
154: idx += 3;
155: }
156: /* backward solve the L^T */
157: for (i=n-1; i>=0; i--){
158: v = aa + 9*diag[i] - 9;
159: vi = aj + diag[i] - 1;
160: nz = diag[i] - ai[i];
161: idt = 3*i;
162: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
163: while (nz--) {
164: idx = 3*(*vi--);
165: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
166: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
167: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168: v -= 9;
169: }
170: }
171: VecRestoreArray(bb,&b);
172: VecRestoreArray(xx,&x);
173: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
174: return(0);
175: }
179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
183: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184: PetscInt *diag = a->diag,oidx;
185: MatScalar *aa=a->a,*v;
186: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
187: PetscScalar *x,*b;
190: VecCopy(bb,xx);
191: VecGetArray(bb,&b);
192: VecGetArray(xx,&x);
194: /* forward solve the U^T */
195: idx = 0;
196: for (i=0; i<n; i++) {
198: v = aa + 16*diag[i];
199: /* multiply by the inverse of the block diagonal */
200: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
201: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
202: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
203: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
204: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205: v += 16;
207: vi = aj + diag[i] + 1;
208: nz = ai[i+1] - diag[i] - 1;
209: while (nz--) {
210: oidx = 4*(*vi++);
211: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
212: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
213: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215: v += 16;
216: }
217: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218: idx += 4;
219: }
220: /* backward solve the L^T */
221: for (i=n-1; i>=0; i--){
222: v = aa + 16*diag[i] - 16;
223: vi = aj + diag[i] - 1;
224: nz = diag[i] - ai[i];
225: idt = 4*i;
226: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227: while (nz--) {
228: idx = 4*(*vi--);
229: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
230: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
231: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233: v -= 16;
234: }
235: }
236: VecRestoreArray(bb,&b);
237: VecRestoreArray(xx,&x);
238: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
239: return(0);
240: }
244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
248: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249: PetscInt *diag = a->diag,oidx;
250: MatScalar *aa=a->a,*v;
251: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252: PetscScalar *x,*b;
255: VecCopy(bb,xx);
256: VecGetArray(bb,&b);
257: VecGetArray(xx,&x);
259: /* forward solve the U^T */
260: idx = 0;
261: for (i=0; i<n; i++) {
263: v = aa + 25*diag[i];
264: /* multiply by the inverse of the block diagonal */
265: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
267: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
268: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271: v += 25;
273: vi = aj + diag[i] + 1;
274: nz = ai[i+1] - diag[i] - 1;
275: while (nz--) {
276: oidx = 5*(*vi++);
277: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
278: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
279: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282: v += 25;
283: }
284: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285: idx += 5;
286: }
287: /* backward solve the L^T */
288: for (i=n-1; i>=0; i--){
289: v = aa + 25*diag[i] - 25;
290: vi = aj + diag[i] - 1;
291: nz = diag[i] - ai[i];
292: idt = 5*i;
293: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294: while (nz--) {
295: idx = 5*(*vi--);
296: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
297: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
298: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301: v -= 25;
302: }
303: }
304: VecRestoreArray(bb,&b);
305: VecRestoreArray(xx,&x);
306: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
307: return(0);
308: }
312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
316: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317: PetscInt *diag = a->diag,oidx;
318: MatScalar *aa=a->a,*v;
319: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320: PetscScalar *x,*b;
323: VecCopy(bb,xx);
324: VecGetArray(bb,&b);
325: VecGetArray(xx,&x);
327: /* forward solve the U^T */
328: idx = 0;
329: for (i=0; i<n; i++) {
331: v = aa + 36*diag[i];
332: /* multiply by the inverse of the block diagonal */
333: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334: x6 = x[5+idx];
335: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
336: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
337: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341: v += 36;
343: vi = aj + diag[i] + 1;
344: nz = ai[i+1] - diag[i] - 1;
345: while (nz--) {
346: oidx = 6*(*vi++);
347: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
348: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
349: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353: v += 36;
354: }
355: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356: x[5+idx] = s6;
357: idx += 6;
358: }
359: /* backward solve the L^T */
360: for (i=n-1; i>=0; i--){
361: v = aa + 36*diag[i] - 36;
362: vi = aj + diag[i] - 1;
363: nz = diag[i] - ai[i];
364: idt = 6*i;
365: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366: s6 = x[5+idt];
367: while (nz--) {
368: idx = 6*(*vi--);
369: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
370: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
371: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375: v -= 36;
376: }
377: }
378: VecRestoreArray(bb,&b);
379: VecRestoreArray(xx,&x);
380: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
381: return(0);
382: }
386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
390: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391: PetscInt *diag = a->diag,oidx;
392: MatScalar *aa=a->a,*v;
393: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394: PetscScalar *x,*b;
397: VecCopy(bb,xx);
398: VecGetArray(bb,&b);
399: VecGetArray(xx,&x);
401: /* forward solve the U^T */
402: idx = 0;
403: for (i=0; i<n; i++) {
405: v = aa + 49*diag[i];
406: /* multiply by the inverse of the block diagonal */
407: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408: x6 = x[5+idx]; x7 = x[6+idx];
409: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
410: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416: v += 49;
418: vi = aj + diag[i] + 1;
419: nz = ai[i+1] - diag[i] - 1;
420: while (nz--) {
421: oidx = 7*(*vi++);
422: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
423: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429: v += 49;
430: }
431: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432: x[5+idx] = s6;x[6+idx] = s7;
433: idx += 7;
434: }
435: /* backward solve the L^T */
436: for (i=n-1; i>=0; i--){
437: v = aa + 49*diag[i] - 49;
438: vi = aj + diag[i] - 1;
439: nz = diag[i] - ai[i];
440: idt = 7*i;
441: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442: s6 = x[5+idt];s7 = x[6+idt];
443: while (nz--) {
444: idx = 7*(*vi--);
445: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
446: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452: v -= 49;
453: }
454: }
455: VecRestoreArray(bb,&b);
456: VecRestoreArray(xx,&x);
457: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
458: return(0);
459: }
461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
467: IS iscol=a->col,isrow=a->row;
469: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470: PetscInt *diag = a->diag;
471: MatScalar *aa=a->a,*v;
472: PetscScalar s1,*x,*b,*t;
475: VecGetArray(bb,&b);
476: VecGetArray(xx,&x);
477: t = a->solve_work;
479: ISGetIndices(isrow,&rout); r = rout;
480: ISGetIndices(iscol,&cout); c = cout;
482: /* copy the b into temp work space according to permutation */
483: for (i=0; i<n; i++) {
484: t[i] = b[c[i]];
485: }
487: /* forward solve the U^T */
488: for (i=0; i<n; i++) {
490: v = aa + diag[i];
491: /* multiply by the inverse of the block diagonal */
492: s1 = (*v++)*t[i];
493: vi = aj + diag[i] + 1;
494: nz = ai[i+1] - diag[i] - 1;
495: while (nz--) {
496: t[*vi++] -= (*v++)*s1;
497: }
498: t[i] = s1;
499: }
500: /* backward solve the L^T */
501: for (i=n-1; i>=0; i--){
502: v = aa + diag[i] - 1;
503: vi = aj + diag[i] - 1;
504: nz = diag[i] - ai[i];
505: s1 = t[i];
506: while (nz--) {
507: t[*vi--] -= (*v--)*s1;
508: }
509: }
511: /* copy t into x according to permutation */
512: for (i=0; i<n; i++) {
513: x[r[i]] = t[i];
514: }
516: ISRestoreIndices(isrow,&rout);
517: ISRestoreIndices(iscol,&cout);
518: VecRestoreArray(bb,&b);
519: VecRestoreArray(xx,&x);
520: PetscLogFlops(2*(a->nz) - A->cmap.n);
521: return(0);
522: }
526: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527: {
528: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
529: IS iscol=a->col,isrow=a->row;
531: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532: PetscInt *diag = a->diag,ii,ic,ir,oidx;
533: MatScalar *aa=a->a,*v;
534: PetscScalar s1,s2,x1,x2;
535: PetscScalar *x,*b,*t;
538: VecGetArray(bb,&b);
539: VecGetArray(xx,&x);
540: t = a->solve_work;
542: ISGetIndices(isrow,&rout); r = rout;
543: ISGetIndices(iscol,&cout); c = cout;
545: /* copy the b into temp work space according to permutation */
546: ii = 0;
547: for (i=0; i<n; i++) {
548: ic = 2*c[i];
549: t[ii] = b[ic];
550: t[ii+1] = b[ic+1];
551: ii += 2;
552: }
554: /* forward solve the U^T */
555: idx = 0;
556: for (i=0; i<n; i++) {
558: v = aa + 4*diag[i];
559: /* multiply by the inverse of the block diagonal */
560: x1 = t[idx]; x2 = t[1+idx];
561: s1 = v[0]*x1 + v[1]*x2;
562: s2 = v[2]*x1 + v[3]*x2;
563: v += 4;
565: vi = aj + diag[i] + 1;
566: nz = ai[i+1] - diag[i] - 1;
567: while (nz--) {
568: oidx = 2*(*vi++);
569: t[oidx] -= v[0]*s1 + v[1]*s2;
570: t[oidx+1] -= v[2]*s1 + v[3]*s2;
571: v += 4;
572: }
573: t[idx] = s1;t[1+idx] = s2;
574: idx += 2;
575: }
576: /* backward solve the L^T */
577: for (i=n-1; i>=0; i--){
578: v = aa + 4*diag[i] - 4;
579: vi = aj + diag[i] - 1;
580: nz = diag[i] - ai[i];
581: idt = 2*i;
582: s1 = t[idt]; s2 = t[1+idt];
583: while (nz--) {
584: idx = 2*(*vi--);
585: t[idx] -= v[0]*s1 + v[1]*s2;
586: t[idx+1] -= v[2]*s1 + v[3]*s2;
587: v -= 4;
588: }
589: }
591: /* copy t into x according to permutation */
592: ii = 0;
593: for (i=0; i<n; i++) {
594: ir = 2*r[i];
595: x[ir] = t[ii];
596: x[ir+1] = t[ii+1];
597: ii += 2;
598: }
600: ISRestoreIndices(isrow,&rout);
601: ISRestoreIndices(iscol,&cout);
602: VecRestoreArray(bb,&b);
603: VecRestoreArray(xx,&x);
604: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
605: return(0);
606: }
610: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611: {
612: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
613: IS iscol=a->col,isrow=a->row;
615: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616: PetscInt *diag = a->diag,ii,ic,ir,oidx;
617: MatScalar *aa=a->a,*v;
618: PetscScalar s1,s2,s3,x1,x2,x3;
619: PetscScalar *x,*b,*t;
622: VecGetArray(bb,&b);
623: VecGetArray(xx,&x);
624: t = a->solve_work;
626: ISGetIndices(isrow,&rout); r = rout;
627: ISGetIndices(iscol,&cout); c = cout;
629: /* copy the b into temp work space according to permutation */
630: ii = 0;
631: for (i=0; i<n; i++) {
632: ic = 3*c[i];
633: t[ii] = b[ic];
634: t[ii+1] = b[ic+1];
635: t[ii+2] = b[ic+2];
636: ii += 3;
637: }
639: /* forward solve the U^T */
640: idx = 0;
641: for (i=0; i<n; i++) {
643: v = aa + 9*diag[i];
644: /* multiply by the inverse of the block diagonal */
645: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
646: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
647: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
648: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
649: v += 9;
651: vi = aj + diag[i] + 1;
652: nz = ai[i+1] - diag[i] - 1;
653: while (nz--) {
654: oidx = 3*(*vi++);
655: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
656: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
657: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658: v += 9;
659: }
660: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
661: idx += 3;
662: }
663: /* backward solve the L^T */
664: for (i=n-1; i>=0; i--){
665: v = aa + 9*diag[i] - 9;
666: vi = aj + diag[i] - 1;
667: nz = diag[i] - ai[i];
668: idt = 3*i;
669: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
670: while (nz--) {
671: idx = 3*(*vi--);
672: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
673: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
674: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675: v -= 9;
676: }
677: }
679: /* copy t into x according to permutation */
680: ii = 0;
681: for (i=0; i<n; i++) {
682: ir = 3*r[i];
683: x[ir] = t[ii];
684: x[ir+1] = t[ii+1];
685: x[ir+2] = t[ii+2];
686: ii += 3;
687: }
689: ISRestoreIndices(isrow,&rout);
690: ISRestoreIndices(iscol,&cout);
691: VecRestoreArray(bb,&b);
692: VecRestoreArray(xx,&x);
693: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
694: return(0);
695: }
699: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700: {
701: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
702: IS iscol=a->col,isrow=a->row;
704: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705: PetscInt *diag = a->diag,ii,ic,ir,oidx;
706: MatScalar *aa=a->a,*v;
707: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
708: PetscScalar *x,*b,*t;
711: VecGetArray(bb,&b);
712: VecGetArray(xx,&x);
713: t = a->solve_work;
715: ISGetIndices(isrow,&rout); r = rout;
716: ISGetIndices(iscol,&cout); c = cout;
718: /* copy the b into temp work space according to permutation */
719: ii = 0;
720: for (i=0; i<n; i++) {
721: ic = 4*c[i];
722: t[ii] = b[ic];
723: t[ii+1] = b[ic+1];
724: t[ii+2] = b[ic+2];
725: t[ii+3] = b[ic+3];
726: ii += 4;
727: }
729: /* forward solve the U^T */
730: idx = 0;
731: for (i=0; i<n; i++) {
733: v = aa + 16*diag[i];
734: /* multiply by the inverse of the block diagonal */
735: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
736: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
737: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
738: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
739: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740: v += 16;
742: vi = aj + diag[i] + 1;
743: nz = ai[i+1] - diag[i] - 1;
744: while (nz--) {
745: oidx = 4*(*vi++);
746: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
747: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
748: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750: v += 16;
751: }
752: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753: idx += 4;
754: }
755: /* backward solve the L^T */
756: for (i=n-1; i>=0; i--){
757: v = aa + 16*diag[i] - 16;
758: vi = aj + diag[i] - 1;
759: nz = diag[i] - ai[i];
760: idt = 4*i;
761: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762: while (nz--) {
763: idx = 4*(*vi--);
764: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
765: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
766: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768: v -= 16;
769: }
770: }
772: /* copy t into x according to permutation */
773: ii = 0;
774: for (i=0; i<n; i++) {
775: ir = 4*r[i];
776: x[ir] = t[ii];
777: x[ir+1] = t[ii+1];
778: x[ir+2] = t[ii+2];
779: x[ir+3] = t[ii+3];
780: ii += 4;
781: }
783: ISRestoreIndices(isrow,&rout);
784: ISRestoreIndices(iscol,&cout);
785: VecRestoreArray(bb,&b);
786: VecRestoreArray(xx,&x);
787: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
788: return(0);
789: }
793: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794: {
795: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
796: IS iscol=a->col,isrow=a->row;
798: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799: PetscInt *diag = a->diag,ii,ic,ir,oidx;
800: MatScalar *aa=a->a,*v;
801: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802: PetscScalar *x,*b,*t;
805: VecGetArray(bb,&b);
806: VecGetArray(xx,&x);
807: t = a->solve_work;
809: ISGetIndices(isrow,&rout); r = rout;
810: ISGetIndices(iscol,&cout); c = cout;
812: /* copy the b into temp work space according to permutation */
813: ii = 0;
814: for (i=0; i<n; i++) {
815: ic = 5*c[i];
816: t[ii] = b[ic];
817: t[ii+1] = b[ic+1];
818: t[ii+2] = b[ic+2];
819: t[ii+3] = b[ic+3];
820: t[ii+4] = b[ic+4];
821: ii += 5;
822: }
824: /* forward solve the U^T */
825: idx = 0;
826: for (i=0; i<n; i++) {
828: v = aa + 25*diag[i];
829: /* multiply by the inverse of the block diagonal */
830: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
832: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
833: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836: v += 25;
838: vi = aj + diag[i] + 1;
839: nz = ai[i+1] - diag[i] - 1;
840: while (nz--) {
841: oidx = 5*(*vi++);
842: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
843: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
844: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847: v += 25;
848: }
849: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850: idx += 5;
851: }
852: /* backward solve the L^T */
853: for (i=n-1; i>=0; i--){
854: v = aa + 25*diag[i] - 25;
855: vi = aj + diag[i] - 1;
856: nz = diag[i] - ai[i];
857: idt = 5*i;
858: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859: while (nz--) {
860: idx = 5*(*vi--);
861: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
862: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
863: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866: v -= 25;
867: }
868: }
870: /* copy t into x according to permutation */
871: ii = 0;
872: for (i=0; i<n; i++) {
873: ir = 5*r[i];
874: x[ir] = t[ii];
875: x[ir+1] = t[ii+1];
876: x[ir+2] = t[ii+2];
877: x[ir+3] = t[ii+3];
878: x[ir+4] = t[ii+4];
879: ii += 5;
880: }
882: ISRestoreIndices(isrow,&rout);
883: ISRestoreIndices(iscol,&cout);
884: VecRestoreArray(bb,&b);
885: VecRestoreArray(xx,&x);
886: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
887: return(0);
888: }
892: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893: {
894: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
895: IS iscol=a->col,isrow=a->row;
897: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898: PetscInt *diag = a->diag,ii,ic,ir,oidx;
899: MatScalar *aa=a->a,*v;
900: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901: PetscScalar *x,*b,*t;
904: VecGetArray(bb,&b);
905: VecGetArray(xx,&x);
906: t = a->solve_work;
908: ISGetIndices(isrow,&rout); r = rout;
909: ISGetIndices(iscol,&cout); c = cout;
911: /* copy the b into temp work space according to permutation */
912: ii = 0;
913: for (i=0; i<n; i++) {
914: ic = 6*c[i];
915: t[ii] = b[ic];
916: t[ii+1] = b[ic+1];
917: t[ii+2] = b[ic+2];
918: t[ii+3] = b[ic+3];
919: t[ii+4] = b[ic+4];
920: t[ii+5] = b[ic+5];
921: ii += 6;
922: }
924: /* forward solve the U^T */
925: idx = 0;
926: for (i=0; i<n; i++) {
928: v = aa + 36*diag[i];
929: /* multiply by the inverse of the block diagonal */
930: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931: x6 = t[5+idx];
932: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
933: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
934: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938: v += 36;
940: vi = aj + diag[i] + 1;
941: nz = ai[i+1] - diag[i] - 1;
942: while (nz--) {
943: oidx = 6*(*vi++);
944: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
945: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
946: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950: v += 36;
951: }
952: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953: t[5+idx] = s6;
954: idx += 6;
955: }
956: /* backward solve the L^T */
957: for (i=n-1; i>=0; i--){
958: v = aa + 36*diag[i] - 36;
959: vi = aj + diag[i] - 1;
960: nz = diag[i] - ai[i];
961: idt = 6*i;
962: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963: s6 = t[5+idt];
964: while (nz--) {
965: idx = 6*(*vi--);
966: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
967: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
968: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972: v -= 36;
973: }
974: }
976: /* copy t into x according to permutation */
977: ii = 0;
978: for (i=0; i<n; i++) {
979: ir = 6*r[i];
980: x[ir] = t[ii];
981: x[ir+1] = t[ii+1];
982: x[ir+2] = t[ii+2];
983: x[ir+3] = t[ii+3];
984: x[ir+4] = t[ii+4];
985: x[ir+5] = t[ii+5];
986: ii += 6;
987: }
989: ISRestoreIndices(isrow,&rout);
990: ISRestoreIndices(iscol,&cout);
991: VecRestoreArray(bb,&b);
992: VecRestoreArray(xx,&x);
993: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
994: return(0);
995: }
999: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000: {
1001: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1002: IS iscol=a->col,isrow=a->row;
1004: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005: PetscInt *diag = a->diag,ii,ic,ir,oidx;
1006: MatScalar *aa=a->a,*v;
1007: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008: PetscScalar *x,*b,*t;
1011: VecGetArray(bb,&b);
1012: VecGetArray(xx,&x);
1013: t = a->solve_work;
1015: ISGetIndices(isrow,&rout); r = rout;
1016: ISGetIndices(iscol,&cout); c = cout;
1018: /* copy the b into temp work space according to permutation */
1019: ii = 0;
1020: for (i=0; i<n; i++) {
1021: ic = 7*c[i];
1022: t[ii] = b[ic];
1023: t[ii+1] = b[ic+1];
1024: t[ii+2] = b[ic+2];
1025: t[ii+3] = b[ic+3];
1026: t[ii+4] = b[ic+4];
1027: t[ii+5] = b[ic+5];
1028: t[ii+6] = b[ic+6];
1029: ii += 7;
1030: }
1032: /* forward solve the U^T */
1033: idx = 0;
1034: for (i=0; i<n; i++) {
1036: v = aa + 49*diag[i];
1037: /* multiply by the inverse of the block diagonal */
1038: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039: x6 = t[5+idx]; x7 = t[6+idx];
1040: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1041: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047: v += 49;
1049: vi = aj + diag[i] + 1;
1050: nz = ai[i+1] - diag[i] - 1;
1051: while (nz--) {
1052: oidx = 7*(*vi++);
1053: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1054: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060: v += 49;
1061: }
1062: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063: t[5+idx] = s6;t[6+idx] = s7;
1064: idx += 7;
1065: }
1066: /* backward solve the L^T */
1067: for (i=n-1; i>=0; i--){
1068: v = aa + 49*diag[i] - 49;
1069: vi = aj + diag[i] - 1;
1070: nz = diag[i] - ai[i];
1071: idt = 7*i;
1072: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073: s6 = t[5+idt];s7 = t[6+idt];
1074: while (nz--) {
1075: idx = 7*(*vi--);
1076: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1077: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083: v -= 49;
1084: }
1085: }
1087: /* copy t into x according to permutation */
1088: ii = 0;
1089: for (i=0; i<n; i++) {
1090: ir = 7*r[i];
1091: x[ir] = t[ii];
1092: x[ir+1] = t[ii+1];
1093: x[ir+2] = t[ii+2];
1094: x[ir+3] = t[ii+3];
1095: x[ir+4] = t[ii+4];
1096: x[ir+5] = t[ii+5];
1097: x[ir+6] = t[ii+6];
1098: ii += 7;
1099: }
1101: ISRestoreIndices(isrow,&rout);
1102: ISRestoreIndices(iscol,&cout);
1103: VecRestoreArray(bb,&b);
1104: VecRestoreArray(xx,&x);
1105: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1106: return(0);
1107: }
1109: /* ----------------------------------------------------------- */
1112: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113: {
1114: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1115: IS iscol=a->col,isrow=a->row;
1117: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118: PetscInt nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout;
1119: MatScalar *aa=a->a,*v;
1120: PetscScalar *x,*b,*s,*t,*ls;
1123: VecGetArray(bb,&b);
1124: VecGetArray(xx,&x);
1125: t = a->solve_work;
1127: ISGetIndices(isrow,&rout); r = rout;
1128: ISGetIndices(iscol,&cout); c = cout + (n-1);
1130: /* forward solve the lower triangular */
1131: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1132: for (i=1; i<n; i++) {
1133: v = aa + bs2*ai[i];
1134: vi = aj + ai[i];
1135: nz = a->diag[i] - ai[i];
1136: s = t + bs*i;
1137: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1138: while (nz--) {
1139: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140: v += bs2;
1141: }
1142: }
1143: /* backward solve the upper triangular */
1144: ls = a->solve_work + A->cmap.n;
1145: for (i=n-1; i>=0; i--){
1146: v = aa + bs2*(a->diag[i] + 1);
1147: vi = aj + a->diag[i] + 1;
1148: nz = ai[i+1] - a->diag[i] - 1;
1149: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1150: while (nz--) {
1151: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152: v += bs2;
1153: }
1154: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1156: }
1158: ISRestoreIndices(isrow,&rout);
1159: ISRestoreIndices(iscol,&cout);
1160: VecRestoreArray(bb,&b);
1161: VecRestoreArray(xx,&x);
1162: PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);
1163: return(0);
1164: }
1168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169: {
1170: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1171: IS iscol=a->col,isrow=a->row;
1173: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174: PetscInt *diag = a->diag;
1175: MatScalar *aa=a->a,*v;
1176: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177: PetscScalar *x,*b,*t;
1180: VecGetArray(bb,&b);
1181: VecGetArray(xx,&x);
1182: t = a->solve_work;
1184: ISGetIndices(isrow,&rout); r = rout;
1185: ISGetIndices(iscol,&cout); c = cout + (n-1);
1187: /* forward solve the lower triangular */
1188: idx = 7*(*r++);
1189: t[0] = b[idx]; t[1] = b[1+idx];
1190: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191: t[5] = b[5+idx]; t[6] = b[6+idx];
1193: for (i=1; i<n; i++) {
1194: v = aa + 49*ai[i];
1195: vi = aj + ai[i];
1196: nz = diag[i] - ai[i];
1197: idx = 7*(*r++);
1198: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200: while (nz--) {
1201: idx = 7*(*vi++);
1202: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1203: x4 = t[3+idx];x5 = t[4+idx];
1204: x6 = t[5+idx];x7 = t[6+idx];
1205: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212: v += 49;
1213: }
1214: idx = 7*i;
1215: t[idx] = s1;t[1+idx] = s2;
1216: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217: t[5+idx] = s6;t[6+idx] = s7;
1218: }
1219: /* backward solve the upper triangular */
1220: for (i=n-1; i>=0; i--){
1221: v = aa + 49*diag[i] + 49;
1222: vi = aj + diag[i] + 1;
1223: nz = ai[i+1] - diag[i] - 1;
1224: idt = 7*i;
1225: s1 = t[idt]; s2 = t[1+idt];
1226: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227: s6 = t[5+idt];s7 = t[6+idt];
1228: while (nz--) {
1229: idx = 7*(*vi++);
1230: x1 = t[idx]; x2 = t[1+idx];
1231: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232: x6 = t[5+idx]; x7 = t[6+idx];
1233: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240: v += 49;
1241: }
1242: idc = 7*(*c--);
1243: v = aa + 49*diag[i];
1244: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1245: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258: }
1260: ISRestoreIndices(isrow,&rout);
1261: ISRestoreIndices(iscol,&cout);
1262: VecRestoreArray(bb,&b);
1263: VecRestoreArray(xx,&x);
1264: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1265: return(0);
1266: }
1270: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271: {
1272: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1273: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274: PetscErrorCode ierr;
1275: PetscInt *diag = a->diag,jdx;
1276: const MatScalar *aa=a->a,*v;
1277: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278: const PetscScalar *b;
1281: VecGetArray(bb,(PetscScalar**)&b);
1282: VecGetArray(xx,&x);
1283: /* forward solve the lower triangular */
1284: idx = 0;
1285: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1286: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1287: x[6] = b[6+idx];
1288: for (i=1; i<n; i++) {
1289: v = aa + 49*ai[i];
1290: vi = aj + ai[i];
1291: nz = diag[i] - ai[i];
1292: idx = 7*i;
1293: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1294: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1295: s7 = b[6+idx];
1296: while (nz--) {
1297: jdx = 7*(*vi++);
1298: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1299: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1300: x7 = x[6+jdx];
1301: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1302: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1303: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1304: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1305: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1306: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1307: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1308: v += 49;
1309: }
1310: x[idx] = s1;
1311: x[1+idx] = s2;
1312: x[2+idx] = s3;
1313: x[3+idx] = s4;
1314: x[4+idx] = s5;
1315: x[5+idx] = s6;
1316: x[6+idx] = s7;
1317: }
1318: /* backward solve the upper triangular */
1319: for (i=n-1; i>=0; i--){
1320: v = aa + 49*diag[i] + 49;
1321: vi = aj + diag[i] + 1;
1322: nz = ai[i+1] - diag[i] - 1;
1323: idt = 7*i;
1324: s1 = x[idt]; s2 = x[1+idt];
1325: s3 = x[2+idt]; s4 = x[3+idt];
1326: s5 = x[4+idt]; s6 = x[5+idt];
1327: s7 = x[6+idt];
1328: while (nz--) {
1329: idx = 7*(*vi++);
1330: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1331: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1332: x7 = x[6+idx];
1333: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1334: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1335: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1336: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1337: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1338: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1339: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1340: v += 49;
1341: }
1342: v = aa + 49*diag[i];
1343: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1344: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1345: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1346: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1347: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1348: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1349: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1350: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1351: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1352: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1353: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1354: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1355: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1356: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1357: }
1359: VecRestoreArray(bb,(PetscScalar**)&b);
1360: VecRestoreArray(xx,&x);
1361: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1362: return(0);
1363: }
1367: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1368: {
1369: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1370: IS iscol=a->col,isrow=a->row;
1371: PetscErrorCode ierr;
1372: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1373: PetscInt *diag = a->diag;
1374: const MatScalar *aa=a->a,*v;
1375: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1376: const PetscScalar *b;
1378: VecGetArray(bb,(PetscScalar**)&b);
1379: VecGetArray(xx,&x);
1380: t = a->solve_work;
1382: ISGetIndices(isrow,&rout); r = rout;
1383: ISGetIndices(iscol,&cout); c = cout + (n-1);
1385: /* forward solve the lower triangular */
1386: idx = 6*(*r++);
1387: t[0] = b[idx]; t[1] = b[1+idx];
1388: t[2] = b[2+idx]; t[3] = b[3+idx];
1389: t[4] = b[4+idx]; t[5] = b[5+idx];
1390: for (i=1; i<n; i++) {
1391: v = aa + 36*ai[i];
1392: vi = aj + ai[i];
1393: nz = diag[i] - ai[i];
1394: idx = 6*(*r++);
1395: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1396: s5 = b[4+idx]; s6 = b[5+idx];
1397: while (nz--) {
1398: idx = 6*(*vi++);
1399: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1400: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1401: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1402: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1403: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1404: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1405: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1406: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1407: v += 36;
1408: }
1409: idx = 6*i;
1410: t[idx] = s1;t[1+idx] = s2;
1411: t[2+idx] = s3;t[3+idx] = s4;
1412: t[4+idx] = s5;t[5+idx] = s6;
1413: }
1414: /* backward solve the upper triangular */
1415: for (i=n-1; i>=0; i--){
1416: v = aa + 36*diag[i] + 36;
1417: vi = aj + diag[i] + 1;
1418: nz = ai[i+1] - diag[i] - 1;
1419: idt = 6*i;
1420: s1 = t[idt]; s2 = t[1+idt];
1421: s3 = t[2+idt];s4 = t[3+idt];
1422: s5 = t[4+idt];s6 = t[5+idt];
1423: while (nz--) {
1424: idx = 6*(*vi++);
1425: x1 = t[idx]; x2 = t[1+idx];
1426: x3 = t[2+idx]; x4 = t[3+idx];
1427: x5 = t[4+idx]; x6 = t[5+idx];
1428: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1429: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1430: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1431: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1432: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1433: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1434: v += 36;
1435: }
1436: idc = 6*(*c--);
1437: v = aa + 36*diag[i];
1438: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1439: v[18]*s4+v[24]*s5+v[30]*s6;
1440: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1441: v[19]*s4+v[25]*s5+v[31]*s6;
1442: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1443: v[20]*s4+v[26]*s5+v[32]*s6;
1444: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1445: v[21]*s4+v[27]*s5+v[33]*s6;
1446: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1447: v[22]*s4+v[28]*s5+v[34]*s6;
1448: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1449: v[23]*s4+v[29]*s5+v[35]*s6;
1450: }
1452: ISRestoreIndices(isrow,&rout);
1453: ISRestoreIndices(iscol,&cout);
1454: VecRestoreArray(bb,(PetscScalar**)&b);
1455: VecRestoreArray(xx,&x);
1456: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1457: return(0);
1458: }
1462: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1463: {
1464: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1465: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466: PetscErrorCode ierr;
1467: PetscInt *diag = a->diag,jdx;
1468: const MatScalar *aa=a->a,*v;
1469: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1470: const PetscScalar *b;
1473: VecGetArray(bb,(PetscScalar**)&b);
1474: VecGetArray(xx,&x);
1475: /* forward solve the lower triangular */
1476: idx = 0;
1477: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1478: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1479: for (i=1; i<n; i++) {
1480: v = aa + 36*ai[i];
1481: vi = aj + ai[i];
1482: nz = diag[i] - ai[i];
1483: idx = 6*i;
1484: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1485: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1486: while (nz--) {
1487: jdx = 6*(*vi++);
1488: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1489: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1490: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496: v += 36;
1497: }
1498: x[idx] = s1;
1499: x[1+idx] = s2;
1500: x[2+idx] = s3;
1501: x[3+idx] = s4;
1502: x[4+idx] = s5;
1503: x[5+idx] = s6;
1504: }
1505: /* backward solve the upper triangular */
1506: for (i=n-1; i>=0; i--){
1507: v = aa + 36*diag[i] + 36;
1508: vi = aj + diag[i] + 1;
1509: nz = ai[i+1] - diag[i] - 1;
1510: idt = 6*i;
1511: s1 = x[idt]; s2 = x[1+idt];
1512: s3 = x[2+idt]; s4 = x[3+idt];
1513: s5 = x[4+idt]; s6 = x[5+idt];
1514: while (nz--) {
1515: idx = 6*(*vi++);
1516: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1517: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1518: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1519: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1520: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1521: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1522: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1523: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1524: v += 36;
1525: }
1526: v = aa + 36*diag[i];
1527: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1528: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1529: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1530: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1531: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1532: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1533: }
1535: VecRestoreArray(bb,(PetscScalar**)&b);
1536: VecRestoreArray(xx,&x);
1537: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1538: return(0);
1539: }
1543: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1544: {
1545: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1546: IS iscol=a->col,isrow=a->row;
1547: PetscErrorCode ierr;
1548: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1549: PetscInt *diag = a->diag;
1550: const MatScalar *aa=a->a,*v;
1551: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552: const PetscScalar *b;
1555: VecGetArray(bb,(PetscScalar**)&b);
1556: VecGetArray(xx,&x);
1557: t = a->solve_work;
1559: ISGetIndices(isrow,&rout); r = rout;
1560: ISGetIndices(iscol,&cout); c = cout + (n-1);
1562: /* forward solve the lower triangular */
1563: idx = 5*(*r++);
1564: t[0] = b[idx]; t[1] = b[1+idx];
1565: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1566: for (i=1; i<n; i++) {
1567: v = aa + 25*ai[i];
1568: vi = aj + ai[i];
1569: nz = diag[i] - ai[i];
1570: idx = 5*(*r++);
1571: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1572: s5 = b[4+idx];
1573: while (nz--) {
1574: idx = 5*(*vi++);
1575: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1576: x4 = t[3+idx];x5 = t[4+idx];
1577: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582: v += 25;
1583: }
1584: idx = 5*i;
1585: t[idx] = s1;t[1+idx] = s2;
1586: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1587: }
1588: /* backward solve the upper triangular */
1589: for (i=n-1; i>=0; i--){
1590: v = aa + 25*diag[i] + 25;
1591: vi = aj + diag[i] + 1;
1592: nz = ai[i+1] - diag[i] - 1;
1593: idt = 5*i;
1594: s1 = t[idt]; s2 = t[1+idt];
1595: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1596: while (nz--) {
1597: idx = 5*(*vi++);
1598: x1 = t[idx]; x2 = t[1+idx];
1599: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1600: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1601: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1602: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1603: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1604: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1605: v += 25;
1606: }
1607: idc = 5*(*c--);
1608: v = aa + 25*diag[i];
1609: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1610: v[15]*s4+v[20]*s5;
1611: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1612: v[16]*s4+v[21]*s5;
1613: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1614: v[17]*s4+v[22]*s5;
1615: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1616: v[18]*s4+v[23]*s5;
1617: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1618: v[19]*s4+v[24]*s5;
1619: }
1621: ISRestoreIndices(isrow,&rout);
1622: ISRestoreIndices(iscol,&cout);
1623: VecRestoreArray(bb,(PetscScalar**)&b);
1624: VecRestoreArray(xx,&x);
1625: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1626: return(0);
1627: }
1631: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1632: {
1633: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1634: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1635: PetscErrorCode ierr;
1636: PetscInt *diag = a->diag,jdx;
1637: const MatScalar *aa=a->a,*v;
1638: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1639: const PetscScalar *b;
1642: VecGetArray(bb,(PetscScalar**)&b);
1643: VecGetArray(xx,&x);
1644: /* forward solve the lower triangular */
1645: idx = 0;
1646: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1647: for (i=1; i<n; i++) {
1648: v = aa + 25*ai[i];
1649: vi = aj + ai[i];
1650: nz = diag[i] - ai[i];
1651: idx = 5*i;
1652: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1653: while (nz--) {
1654: jdx = 5*(*vi++);
1655: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1656: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1657: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1658: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1659: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1660: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1661: v += 25;
1662: }
1663: x[idx] = s1;
1664: x[1+idx] = s2;
1665: x[2+idx] = s3;
1666: x[3+idx] = s4;
1667: x[4+idx] = s5;
1668: }
1669: /* backward solve the upper triangular */
1670: for (i=n-1; i>=0; i--){
1671: v = aa + 25*diag[i] + 25;
1672: vi = aj + diag[i] + 1;
1673: nz = ai[i+1] - diag[i] - 1;
1674: idt = 5*i;
1675: s1 = x[idt]; s2 = x[1+idt];
1676: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1677: while (nz--) {
1678: idx = 5*(*vi++);
1679: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1680: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1681: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1682: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1683: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1684: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1685: v += 25;
1686: }
1687: v = aa + 25*diag[i];
1688: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1689: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1690: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1691: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1692: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1693: }
1695: VecRestoreArray(bb,(PetscScalar**)&b);
1696: VecRestoreArray(xx,&x);
1697: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1698: return(0);
1699: }
1703: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1704: {
1705: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1706: IS iscol=a->col,isrow=a->row;
1707: PetscErrorCode ierr;
1708: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1709: PetscInt *diag = a->diag;
1710: const MatScalar *aa=a->a,*v;
1711: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1712: const PetscScalar *b;
1715: VecGetArray(bb,(PetscScalar**)&b);
1716: VecGetArray(xx,&x);
1717: t = a->solve_work;
1719: ISGetIndices(isrow,&rout); r = rout;
1720: ISGetIndices(iscol,&cout); c = cout + (n-1);
1722: /* forward solve the lower triangular */
1723: idx = 4*(*r++);
1724: t[0] = b[idx]; t[1] = b[1+idx];
1725: t[2] = b[2+idx]; t[3] = b[3+idx];
1726: for (i=1; i<n; i++) {
1727: v = aa + 16*ai[i];
1728: vi = aj + ai[i];
1729: nz = diag[i] - ai[i];
1730: idx = 4*(*r++);
1731: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1732: while (nz--) {
1733: idx = 4*(*vi++);
1734: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1735: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1736: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1737: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1738: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1739: v += 16;
1740: }
1741: idx = 4*i;
1742: t[idx] = s1;t[1+idx] = s2;
1743: t[2+idx] = s3;t[3+idx] = s4;
1744: }
1745: /* backward solve the upper triangular */
1746: for (i=n-1; i>=0; i--){
1747: v = aa + 16*diag[i] + 16;
1748: vi = aj + diag[i] + 1;
1749: nz = ai[i+1] - diag[i] - 1;
1750: idt = 4*i;
1751: s1 = t[idt]; s2 = t[1+idt];
1752: s3 = t[2+idt];s4 = t[3+idt];
1753: while (nz--) {
1754: idx = 4*(*vi++);
1755: x1 = t[idx]; x2 = t[1+idx];
1756: x3 = t[2+idx]; x4 = t[3+idx];
1757: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1758: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1759: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1760: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1761: v += 16;
1762: }
1763: idc = 4*(*c--);
1764: v = aa + 16*diag[i];
1765: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1766: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1767: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1768: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1769: }
1771: ISRestoreIndices(isrow,&rout);
1772: ISRestoreIndices(iscol,&cout);
1773: VecRestoreArray(bb,(PetscScalar**)&b);
1774: VecRestoreArray(xx,&x);
1775: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1776: return(0);
1777: }
1781: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1782: {
1783: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1784: IS iscol=a->col,isrow=a->row;
1785: PetscErrorCode ierr;
1786: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1787: PetscInt *diag = a->diag;
1788: const MatScalar *aa=a->a,*v;
1789: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790: PetscScalar *x;
1791: const PetscScalar *b;
1794: VecGetArray(bb,(PetscScalar**)&b);
1795: VecGetArray(xx,&x);
1796: t = (MatScalar *)a->solve_work;
1798: ISGetIndices(isrow,&rout); r = rout;
1799: ISGetIndices(iscol,&cout); c = cout + (n-1);
1801: /* forward solve the lower triangular */
1802: idx = 4*(*r++);
1803: t[0] = (MatScalar)b[idx];
1804: t[1] = (MatScalar)b[1+idx];
1805: t[2] = (MatScalar)b[2+idx];
1806: t[3] = (MatScalar)b[3+idx];
1807: for (i=1; i<n; i++) {
1808: v = aa + 16*ai[i];
1809: vi = aj + ai[i];
1810: nz = diag[i] - ai[i];
1811: idx = 4*(*r++);
1812: s1 = (MatScalar)b[idx];
1813: s2 = (MatScalar)b[1+idx];
1814: s3 = (MatScalar)b[2+idx];
1815: s4 = (MatScalar)b[3+idx];
1816: while (nz--) {
1817: idx = 4*(*vi++);
1818: x1 = t[idx];
1819: x2 = t[1+idx];
1820: x3 = t[2+idx];
1821: x4 = t[3+idx];
1822: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1823: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1824: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1825: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1826: v += 16;
1827: }
1828: idx = 4*i;
1829: t[idx] = s1;
1830: t[1+idx] = s2;
1831: t[2+idx] = s3;
1832: t[3+idx] = s4;
1833: }
1834: /* backward solve the upper triangular */
1835: for (i=n-1; i>=0; i--){
1836: v = aa + 16*diag[i] + 16;
1837: vi = aj + diag[i] + 1;
1838: nz = ai[i+1] - diag[i] - 1;
1839: idt = 4*i;
1840: s1 = t[idt];
1841: s2 = t[1+idt];
1842: s3 = t[2+idt];
1843: s4 = t[3+idt];
1844: while (nz--) {
1845: idx = 4*(*vi++);
1846: x1 = t[idx];
1847: x2 = t[1+idx];
1848: x3 = t[2+idx];
1849: x4 = t[3+idx];
1850: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1851: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1852: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1853: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1854: v += 16;
1855: }
1856: idc = 4*(*c--);
1857: v = aa + 16*diag[i];
1858: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1859: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1860: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1861: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1862: x[idc] = (PetscScalar)t[idt];
1863: x[1+idc] = (PetscScalar)t[1+idt];
1864: x[2+idc] = (PetscScalar)t[2+idt];
1865: x[3+idc] = (PetscScalar)t[3+idt];
1866: }
1868: ISRestoreIndices(isrow,&rout);
1869: ISRestoreIndices(iscol,&cout);
1870: VecRestoreArray(bb,(PetscScalar**)&b);
1871: VecRestoreArray(xx,&x);
1872: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1873: return(0);
1874: }
1876: #if defined (PETSC_HAVE_SSE)
1878: #include PETSC_HAVE_SSE
1882: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1883: {
1884: /*
1885: Note: This code uses demotion of double
1886: to float when performing the mixed-mode computation.
1887: This may not be numerically reasonable for all applications.
1888: */
1889: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1890: IS iscol=a->col,isrow=a->row;
1892: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1893: PetscInt *diag = a->diag,ai16;
1894: MatScalar *aa=a->a,*v;
1895: PetscScalar *x,*b,*t;
1897: /* Make space in temp stack for 16 Byte Aligned arrays */
1898: float ssealignedspace[11],*tmps,*tmpx;
1899: unsigned long offset;
1900:
1902: SSE_SCOPE_BEGIN;
1904: offset = (unsigned long)ssealignedspace % 16;
1905: if (offset) offset = (16 - offset)/4;
1906: tmps = &ssealignedspace[offset];
1907: tmpx = &ssealignedspace[offset+4];
1908: PREFETCH_NTA(aa+16*ai[1]);
1910: VecGetArray(bb,&b);
1911: VecGetArray(xx,&x);
1912: t = a->solve_work;
1914: ISGetIndices(isrow,&rout); r = rout;
1915: ISGetIndices(iscol,&cout); c = cout + (n-1);
1917: /* forward solve the lower triangular */
1918: idx = 4*(*r++);
1919: t[0] = b[idx]; t[1] = b[1+idx];
1920: t[2] = b[2+idx]; t[3] = b[3+idx];
1921: v = aa + 16*ai[1];
1923: for (i=1; i<n;) {
1924: PREFETCH_NTA(&v[8]);
1925: vi = aj + ai[i];
1926: nz = diag[i] - ai[i];
1927: idx = 4*(*r++);
1929: /* Demote sum from double to float */
1930: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1931: LOAD_PS(tmps,XMM7);
1933: while (nz--) {
1934: PREFETCH_NTA(&v[16]);
1935: idx = 4*(*vi++);
1936:
1937: /* Demote solution (so far) from double to float */
1938: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1940: /* 4x4 Matrix-Vector product with negative accumulation: */
1941: SSE_INLINE_BEGIN_2(tmpx,v)
1942: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1944: /* First Column */
1945: SSE_COPY_PS(XMM0,XMM6)
1946: SSE_SHUFFLE(XMM0,XMM0,0x00)
1947: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1948: SSE_SUB_PS(XMM7,XMM0)
1949:
1950: /* Second Column */
1951: SSE_COPY_PS(XMM1,XMM6)
1952: SSE_SHUFFLE(XMM1,XMM1,0x55)
1953: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1954: SSE_SUB_PS(XMM7,XMM1)
1955:
1956: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1957:
1958: /* Third Column */
1959: SSE_COPY_PS(XMM2,XMM6)
1960: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1961: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1962: SSE_SUB_PS(XMM7,XMM2)
1964: /* Fourth Column */
1965: SSE_COPY_PS(XMM3,XMM6)
1966: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1967: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1968: SSE_SUB_PS(XMM7,XMM3)
1969: SSE_INLINE_END_2
1970:
1971: v += 16;
1972: }
1973: idx = 4*i;
1974: v = aa + 16*ai[++i];
1975: PREFETCH_NTA(v);
1976: STORE_PS(tmps,XMM7);
1978: /* Promote result from float to double */
1979: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1980: }
1981: /* backward solve the upper triangular */
1982: idt = 4*(n-1);
1983: ai16 = 16*diag[n-1];
1984: v = aa + ai16 + 16;
1985: for (i=n-1; i>=0;){
1986: PREFETCH_NTA(&v[8]);
1987: vi = aj + diag[i] + 1;
1988: nz = ai[i+1] - diag[i] - 1;
1989:
1990: /* Demote accumulator from double to float */
1991: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1992: LOAD_PS(tmps,XMM7);
1994: while (nz--) {
1995: PREFETCH_NTA(&v[16]);
1996: idx = 4*(*vi++);
1998: /* Demote solution (so far) from double to float */
1999: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2001: /* 4x4 Matrix-Vector Product with negative accumulation: */
2002: SSE_INLINE_BEGIN_2(tmpx,v)
2003: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2005: /* First Column */
2006: SSE_COPY_PS(XMM0,XMM6)
2007: SSE_SHUFFLE(XMM0,XMM0,0x00)
2008: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2009: SSE_SUB_PS(XMM7,XMM0)
2011: /* Second Column */
2012: SSE_COPY_PS(XMM1,XMM6)
2013: SSE_SHUFFLE(XMM1,XMM1,0x55)
2014: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2015: SSE_SUB_PS(XMM7,XMM1)
2017: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2018:
2019: /* Third Column */
2020: SSE_COPY_PS(XMM2,XMM6)
2021: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2022: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2023: SSE_SUB_PS(XMM7,XMM2)
2025: /* Fourth Column */
2026: SSE_COPY_PS(XMM3,XMM6)
2027: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2028: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2029: SSE_SUB_PS(XMM7,XMM3)
2030: SSE_INLINE_END_2
2031: v += 16;
2032: }
2033: v = aa + ai16;
2034: ai16 = 16*diag[--i];
2035: PREFETCH_NTA(aa+ai16+16);
2036: /*
2037: Scale the result by the diagonal 4x4 block,
2038: which was inverted as part of the factorization
2039: */
2040: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2041: /* First Column */
2042: SSE_COPY_PS(XMM0,XMM7)
2043: SSE_SHUFFLE(XMM0,XMM0,0x00)
2044: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2046: /* Second Column */
2047: SSE_COPY_PS(XMM1,XMM7)
2048: SSE_SHUFFLE(XMM1,XMM1,0x55)
2049: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2050: SSE_ADD_PS(XMM0,XMM1)
2052: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2053:
2054: /* Third Column */
2055: SSE_COPY_PS(XMM2,XMM7)
2056: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2057: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2058: SSE_ADD_PS(XMM0,XMM2)
2060: /* Fourth Column */
2061: SSE_COPY_PS(XMM3,XMM7)
2062: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2063: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2064: SSE_ADD_PS(XMM0,XMM3)
2065:
2066: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2067: SSE_INLINE_END_3
2069: /* Promote solution from float to double */
2070: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2072: /* Apply reordering to t and stream into x. */
2073: /* This way, x doesn't pollute the cache. */
2074: /* Be careful with size: 2 doubles = 4 floats! */
2075: idc = 4*(*c--);
2076: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2077: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2078: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2079: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2080: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2081: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2082: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2083: SSE_INLINE_END_2
2084: v = aa + ai16 + 16;
2085: idt -= 4;
2086: }
2088: ISRestoreIndices(isrow,&rout);
2089: ISRestoreIndices(iscol,&cout);
2090: VecRestoreArray(bb,&b);
2091: VecRestoreArray(xx,&x);
2092: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2093: SSE_SCOPE_END;
2094: return(0);
2095: }
2097: #endif
2100: /*
2101: Special case where the matrix was ILU(0) factored in the natural
2102: ordering. This eliminates the need for the column and row permutation.
2103: */
2106: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2107: {
2108: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2109: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2110: PetscErrorCode ierr;
2111: PetscInt *diag = a->diag;
2112: const MatScalar *aa=a->a;
2113: PetscScalar *x;
2114: const PetscScalar *b;
2117: VecGetArray(bb,(PetscScalar**)&b);
2118: VecGetArray(xx,&x);
2120: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2121: {
2122: static PetscScalar w[2000]; /* very BAD need to fix */
2123: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2124: }
2125: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2126: {
2127: static PetscScalar w[2000]; /* very BAD need to fix */
2128: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2129: }
2130: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2131: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2132: #else
2133: {
2134: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2135: const MatScalar *v;
2136: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2138: /* forward solve the lower triangular */
2139: idx = 0;
2140: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2141: for (i=1; i<n; i++) {
2142: v = aa + 16*ai[i];
2143: vi = aj + ai[i];
2144: nz = diag[i] - ai[i];
2145: idx += 4;
2146: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2147: while (nz--) {
2148: jdx = 4*(*vi++);
2149: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2150: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2151: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2152: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2153: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2154: v += 16;
2155: }
2156: x[idx] = s1;
2157: x[1+idx] = s2;
2158: x[2+idx] = s3;
2159: x[3+idx] = s4;
2160: }
2161: /* backward solve the upper triangular */
2162: idt = 4*(n-1);
2163: for (i=n-1; i>=0; i--){
2164: ai16 = 16*diag[i];
2165: v = aa + ai16 + 16;
2166: vi = aj + diag[i] + 1;
2167: nz = ai[i+1] - diag[i] - 1;
2168: s1 = x[idt]; s2 = x[1+idt];
2169: s3 = x[2+idt];s4 = x[3+idt];
2170: while (nz--) {
2171: idx = 4*(*vi++);
2172: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2173: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2174: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2175: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2176: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2177: v += 16;
2178: }
2179: v = aa + ai16;
2180: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2181: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2182: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2183: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2184: idt -= 4;
2185: }
2186: }
2187: #endif
2189: VecRestoreArray(bb,(PetscScalar**)&b);
2190: VecRestoreArray(xx,&x);
2191: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2192: return(0);
2193: }
2197: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2198: {
2199: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2200: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2202: PetscInt *diag = a->diag;
2203: MatScalar *aa=a->a;
2204: PetscScalar *x,*b;
2207: VecGetArray(bb,&b);
2208: VecGetArray(xx,&x);
2210: {
2211: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2212: MatScalar *v,*t=(MatScalar *)x;
2213: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2215: /* forward solve the lower triangular */
2216: idx = 0;
2217: t[0] = (MatScalar)b[0];
2218: t[1] = (MatScalar)b[1];
2219: t[2] = (MatScalar)b[2];
2220: t[3] = (MatScalar)b[3];
2221: for (i=1; i<n; i++) {
2222: v = aa + 16*ai[i];
2223: vi = aj + ai[i];
2224: nz = diag[i] - ai[i];
2225: idx += 4;
2226: s1 = (MatScalar)b[idx];
2227: s2 = (MatScalar)b[1+idx];
2228: s3 = (MatScalar)b[2+idx];
2229: s4 = (MatScalar)b[3+idx];
2230: while (nz--) {
2231: jdx = 4*(*vi++);
2232: x1 = t[jdx];
2233: x2 = t[1+jdx];
2234: x3 = t[2+jdx];
2235: x4 = t[3+jdx];
2236: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2237: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2238: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2239: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2240: v += 16;
2241: }
2242: t[idx] = s1;
2243: t[1+idx] = s2;
2244: t[2+idx] = s3;
2245: t[3+idx] = s4;
2246: }
2247: /* backward solve the upper triangular */
2248: idt = 4*(n-1);
2249: for (i=n-1; i>=0; i--){
2250: ai16 = 16*diag[i];
2251: v = aa + ai16 + 16;
2252: vi = aj + diag[i] + 1;
2253: nz = ai[i+1] - diag[i] - 1;
2254: s1 = t[idt];
2255: s2 = t[1+idt];
2256: s3 = t[2+idt];
2257: s4 = t[3+idt];
2258: while (nz--) {
2259: idx = 4*(*vi++);
2260: x1 = (MatScalar)x[idx];
2261: x2 = (MatScalar)x[1+idx];
2262: x3 = (MatScalar)x[2+idx];
2263: x4 = (MatScalar)x[3+idx];
2264: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2265: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2266: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2267: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2268: v += 16;
2269: }
2270: v = aa + ai16;
2271: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2272: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2273: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2274: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2275: idt -= 4;
2276: }
2277: }
2279: VecRestoreArray(bb,&b);
2280: VecRestoreArray(xx,&x);
2281: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2282: return(0);
2283: }
2285: #if defined (PETSC_HAVE_SSE)
2287: #include PETSC_HAVE_SSE
2290: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2291: {
2292: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2293: unsigned short *aj=(unsigned short *)a->j;
2295: int *ai=a->i,n=a->mbs,*diag = a->diag;
2296: MatScalar *aa=a->a;
2297: PetscScalar *x,*b;
2300: SSE_SCOPE_BEGIN;
2301: /*
2302: Note: This code currently uses demotion of double
2303: to float when performing the mixed-mode computation.
2304: This may not be numerically reasonable for all applications.
2305: */
2306: PREFETCH_NTA(aa+16*ai[1]);
2308: VecGetArray(bb,&b);
2309: VecGetArray(xx,&x);
2310: {
2311: /* x will first be computed in single precision then promoted inplace to double */
2312: MatScalar *v,*t=(MatScalar *)x;
2313: int nz,i,idt,ai16;
2314: unsigned int jdx,idx;
2315: unsigned short *vi;
2316: /* Forward solve the lower triangular factor. */
2318: /* First block is the identity. */
2319: idx = 0;
2320: CONVERT_DOUBLE4_FLOAT4(t,b);
2321: v = aa + 16*((unsigned int)ai[1]);
2323: for (i=1; i<n;) {
2324: PREFETCH_NTA(&v[8]);
2325: vi = aj + ai[i];
2326: nz = diag[i] - ai[i];
2327: idx += 4;
2329: /* Demote RHS from double to float. */
2330: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2331: LOAD_PS(&t[idx],XMM7);
2333: while (nz--) {
2334: PREFETCH_NTA(&v[16]);
2335: jdx = 4*((unsigned int)(*vi++));
2336:
2337: /* 4x4 Matrix-Vector product with negative accumulation: */
2338: SSE_INLINE_BEGIN_2(&t[jdx],v)
2339: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2341: /* First Column */
2342: SSE_COPY_PS(XMM0,XMM6)
2343: SSE_SHUFFLE(XMM0,XMM0,0x00)
2344: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2345: SSE_SUB_PS(XMM7,XMM0)
2347: /* Second Column */
2348: SSE_COPY_PS(XMM1,XMM6)
2349: SSE_SHUFFLE(XMM1,XMM1,0x55)
2350: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2351: SSE_SUB_PS(XMM7,XMM1)
2353: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2354:
2355: /* Third Column */
2356: SSE_COPY_PS(XMM2,XMM6)
2357: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2358: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2359: SSE_SUB_PS(XMM7,XMM2)
2361: /* Fourth Column */
2362: SSE_COPY_PS(XMM3,XMM6)
2363: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2364: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2365: SSE_SUB_PS(XMM7,XMM3)
2366: SSE_INLINE_END_2
2367:
2368: v += 16;
2369: }
2370: v = aa + 16*ai[++i];
2371: PREFETCH_NTA(v);
2372: STORE_PS(&t[idx],XMM7);
2373: }
2375: /* Backward solve the upper triangular factor.*/
2377: idt = 4*(n-1);
2378: ai16 = 16*diag[n-1];
2379: v = aa + ai16 + 16;
2380: for (i=n-1; i>=0;){
2381: PREFETCH_NTA(&v[8]);
2382: vi = aj + diag[i] + 1;
2383: nz = ai[i+1] - diag[i] - 1;
2384:
2385: LOAD_PS(&t[idt],XMM7);
2387: while (nz--) {
2388: PREFETCH_NTA(&v[16]);
2389: idx = 4*((unsigned int)(*vi++));
2391: /* 4x4 Matrix-Vector Product with negative accumulation: */
2392: SSE_INLINE_BEGIN_2(&t[idx],v)
2393: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2395: /* First Column */
2396: SSE_COPY_PS(XMM0,XMM6)
2397: SSE_SHUFFLE(XMM0,XMM0,0x00)
2398: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2399: SSE_SUB_PS(XMM7,XMM0)
2401: /* Second Column */
2402: SSE_COPY_PS(XMM1,XMM6)
2403: SSE_SHUFFLE(XMM1,XMM1,0x55)
2404: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2405: SSE_SUB_PS(XMM7,XMM1)
2407: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2408:
2409: /* Third Column */
2410: SSE_COPY_PS(XMM2,XMM6)
2411: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2412: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2413: SSE_SUB_PS(XMM7,XMM2)
2415: /* Fourth Column */
2416: SSE_COPY_PS(XMM3,XMM6)
2417: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2418: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2419: SSE_SUB_PS(XMM7,XMM3)
2420: SSE_INLINE_END_2
2421: v += 16;
2422: }
2423: v = aa + ai16;
2424: ai16 = 16*diag[--i];
2425: PREFETCH_NTA(aa+ai16+16);
2426: /*
2427: Scale the result by the diagonal 4x4 block,
2428: which was inverted as part of the factorization
2429: */
2430: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2431: /* First Column */
2432: SSE_COPY_PS(XMM0,XMM7)
2433: SSE_SHUFFLE(XMM0,XMM0,0x00)
2434: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2436: /* Second Column */
2437: SSE_COPY_PS(XMM1,XMM7)
2438: SSE_SHUFFLE(XMM1,XMM1,0x55)
2439: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2440: SSE_ADD_PS(XMM0,XMM1)
2442: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2443:
2444: /* Third Column */
2445: SSE_COPY_PS(XMM2,XMM7)
2446: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2447: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2448: SSE_ADD_PS(XMM0,XMM2)
2450: /* Fourth Column */
2451: SSE_COPY_PS(XMM3,XMM7)
2452: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2453: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2454: SSE_ADD_PS(XMM0,XMM3)
2456: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2457: SSE_INLINE_END_3
2459: v = aa + ai16 + 16;
2460: idt -= 4;
2461: }
2463: /* Convert t from single precision back to double precision (inplace)*/
2464: idt = 4*(n-1);
2465: for (i=n-1;i>=0;i--) {
2466: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2467: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2468: PetscScalar *xtemp=&x[idt];
2469: MatScalar *ttemp=&t[idt];
2470: xtemp[3] = (PetscScalar)ttemp[3];
2471: xtemp[2] = (PetscScalar)ttemp[2];
2472: xtemp[1] = (PetscScalar)ttemp[1];
2473: xtemp[0] = (PetscScalar)ttemp[0];
2474: idt -= 4;
2475: }
2477: } /* End of artificial scope. */
2478: VecRestoreArray(bb,&b);
2479: VecRestoreArray(xx,&x);
2480: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2481: SSE_SCOPE_END;
2482: return(0);
2483: }
2487: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2488: {
2489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2490: int *aj=a->j;
2492: int *ai=a->i,n=a->mbs,*diag = a->diag;
2493: MatScalar *aa=a->a;
2494: PetscScalar *x,*b;
2497: SSE_SCOPE_BEGIN;
2498: /*
2499: Note: This code currently uses demotion of double
2500: to float when performing the mixed-mode computation.
2501: This may not be numerically reasonable for all applications.
2502: */
2503: PREFETCH_NTA(aa+16*ai[1]);
2505: VecGetArray(bb,&b);
2506: VecGetArray(xx,&x);
2507: {
2508: /* x will first be computed in single precision then promoted inplace to double */
2509: MatScalar *v,*t=(MatScalar *)x;
2510: int nz,i,idt,ai16;
2511: int jdx,idx;
2512: int *vi;
2513: /* Forward solve the lower triangular factor. */
2515: /* First block is the identity. */
2516: idx = 0;
2517: CONVERT_DOUBLE4_FLOAT4(t,b);
2518: v = aa + 16*ai[1];
2520: for (i=1; i<n;) {
2521: PREFETCH_NTA(&v[8]);
2522: vi = aj + ai[i];
2523: nz = diag[i] - ai[i];
2524: idx += 4;
2526: /* Demote RHS from double to float. */
2527: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2528: LOAD_PS(&t[idx],XMM7);
2530: while (nz--) {
2531: PREFETCH_NTA(&v[16]);
2532: jdx = 4*(*vi++);
2533: /* jdx = *vi++; */
2534:
2535: /* 4x4 Matrix-Vector product with negative accumulation: */
2536: SSE_INLINE_BEGIN_2(&t[jdx],v)
2537: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2539: /* First Column */
2540: SSE_COPY_PS(XMM0,XMM6)
2541: SSE_SHUFFLE(XMM0,XMM0,0x00)
2542: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2543: SSE_SUB_PS(XMM7,XMM0)
2545: /* Second Column */
2546: SSE_COPY_PS(XMM1,XMM6)
2547: SSE_SHUFFLE(XMM1,XMM1,0x55)
2548: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2549: SSE_SUB_PS(XMM7,XMM1)
2551: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2552:
2553: /* Third Column */
2554: SSE_COPY_PS(XMM2,XMM6)
2555: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2556: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2557: SSE_SUB_PS(XMM7,XMM2)
2559: /* Fourth Column */
2560: SSE_COPY_PS(XMM3,XMM6)
2561: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2562: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2563: SSE_SUB_PS(XMM7,XMM3)
2564: SSE_INLINE_END_2
2565:
2566: v += 16;
2567: }
2568: v = aa + 16*ai[++i];
2569: PREFETCH_NTA(v);
2570: STORE_PS(&t[idx],XMM7);
2571: }
2573: /* Backward solve the upper triangular factor.*/
2575: idt = 4*(n-1);
2576: ai16 = 16*diag[n-1];
2577: v = aa + ai16 + 16;
2578: for (i=n-1; i>=0;){
2579: PREFETCH_NTA(&v[8]);
2580: vi = aj + diag[i] + 1;
2581: nz = ai[i+1] - diag[i] - 1;
2582:
2583: LOAD_PS(&t[idt],XMM7);
2585: while (nz--) {
2586: PREFETCH_NTA(&v[16]);
2587: idx = 4*(*vi++);
2588: /* idx = *vi++; */
2590: /* 4x4 Matrix-Vector Product with negative accumulation: */
2591: SSE_INLINE_BEGIN_2(&t[idx],v)
2592: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2594: /* First Column */
2595: SSE_COPY_PS(XMM0,XMM6)
2596: SSE_SHUFFLE(XMM0,XMM0,0x00)
2597: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2598: SSE_SUB_PS(XMM7,XMM0)
2600: /* Second Column */
2601: SSE_COPY_PS(XMM1,XMM6)
2602: SSE_SHUFFLE(XMM1,XMM1,0x55)
2603: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2604: SSE_SUB_PS(XMM7,XMM1)
2606: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2607:
2608: /* Third Column */
2609: SSE_COPY_PS(XMM2,XMM6)
2610: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2612: SSE_SUB_PS(XMM7,XMM2)
2614: /* Fourth Column */
2615: SSE_COPY_PS(XMM3,XMM6)
2616: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2618: SSE_SUB_PS(XMM7,XMM3)
2619: SSE_INLINE_END_2
2620: v += 16;
2621: }
2622: v = aa + ai16;
2623: ai16 = 16*diag[--i];
2624: PREFETCH_NTA(aa+ai16+16);
2625: /*
2626: Scale the result by the diagonal 4x4 block,
2627: which was inverted as part of the factorization
2628: */
2629: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2630: /* First Column */
2631: SSE_COPY_PS(XMM0,XMM7)
2632: SSE_SHUFFLE(XMM0,XMM0,0x00)
2633: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2635: /* Second Column */
2636: SSE_COPY_PS(XMM1,XMM7)
2637: SSE_SHUFFLE(XMM1,XMM1,0x55)
2638: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2639: SSE_ADD_PS(XMM0,XMM1)
2641: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2642:
2643: /* Third Column */
2644: SSE_COPY_PS(XMM2,XMM7)
2645: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2646: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2647: SSE_ADD_PS(XMM0,XMM2)
2649: /* Fourth Column */
2650: SSE_COPY_PS(XMM3,XMM7)
2651: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2652: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2653: SSE_ADD_PS(XMM0,XMM3)
2655: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2656: SSE_INLINE_END_3
2658: v = aa + ai16 + 16;
2659: idt -= 4;
2660: }
2662: /* Convert t from single precision back to double precision (inplace)*/
2663: idt = 4*(n-1);
2664: for (i=n-1;i>=0;i--) {
2665: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2666: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2667: PetscScalar *xtemp=&x[idt];
2668: MatScalar *ttemp=&t[idt];
2669: xtemp[3] = (PetscScalar)ttemp[3];
2670: xtemp[2] = (PetscScalar)ttemp[2];
2671: xtemp[1] = (PetscScalar)ttemp[1];
2672: xtemp[0] = (PetscScalar)ttemp[0];
2673: idt -= 4;
2674: }
2676: } /* End of artificial scope. */
2677: VecRestoreArray(bb,&b);
2678: VecRestoreArray(xx,&x);
2679: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2680: SSE_SCOPE_END;
2681: return(0);
2682: }
2684: #endif
2687: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2688: {
2689: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2690: IS iscol=a->col,isrow=a->row;
2691: PetscErrorCode ierr;
2692: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2693: PetscInt *diag = a->diag;
2694: const MatScalar *aa=a->a,*v;
2695: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
2696: const PetscScalar *b;
2699: VecGetArray(bb,(PetscScalar**)&b);
2700: VecGetArray(xx,&x);
2701: t = a->solve_work;
2703: ISGetIndices(isrow,&rout); r = rout;
2704: ISGetIndices(iscol,&cout); c = cout + (n-1);
2706: /* forward solve the lower triangular */
2707: idx = 3*(*r++);
2708: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2709: for (i=1; i<n; i++) {
2710: v = aa + 9*ai[i];
2711: vi = aj + ai[i];
2712: nz = diag[i] - ai[i];
2713: idx = 3*(*r++);
2714: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2715: while (nz--) {
2716: idx = 3*(*vi++);
2717: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2718: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2719: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2720: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2721: v += 9;
2722: }
2723: idx = 3*i;
2724: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2725: }
2726: /* backward solve the upper triangular */
2727: for (i=n-1; i>=0; i--){
2728: v = aa + 9*diag[i] + 9;
2729: vi = aj + diag[i] + 1;
2730: nz = ai[i+1] - diag[i] - 1;
2731: idt = 3*i;
2732: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2733: while (nz--) {
2734: idx = 3*(*vi++);
2735: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2736: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2737: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2738: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2739: v += 9;
2740: }
2741: idc = 3*(*c--);
2742: v = aa + 9*diag[i];
2743: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2744: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2745: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2746: }
2747: ISRestoreIndices(isrow,&rout);
2748: ISRestoreIndices(iscol,&cout);
2749: VecRestoreArray(bb,(PetscScalar**)&b);
2750: VecRestoreArray(xx,&x);
2751: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2752: return(0);
2753: }
2755: /*
2756: Special case where the matrix was ILU(0) factored in the natural
2757: ordering. This eliminates the need for the column and row permutation.
2758: */
2761: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2762: {
2763: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2764: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2765: PetscErrorCode ierr;
2766: PetscInt *diag = a->diag;
2767: const MatScalar *aa=a->a,*v;
2768: PetscScalar *x,s1,s2,s3,x1,x2,x3;
2769: const PetscScalar *b;
2770: PetscInt jdx,idt,idx,nz,*vi,i;
2773: VecGetArray(bb,(PetscScalar**)&b);
2774: VecGetArray(xx,&x);
2776: /* forward solve the lower triangular */
2777: idx = 0;
2778: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2779: for (i=1; i<n; i++) {
2780: v = aa + 9*ai[i];
2781: vi = aj + ai[i];
2782: nz = diag[i] - ai[i];
2783: idx += 3;
2784: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2785: while (nz--) {
2786: jdx = 3*(*vi++);
2787: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2788: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2789: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2790: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2791: v += 9;
2792: }
2793: x[idx] = s1;
2794: x[1+idx] = s2;
2795: x[2+idx] = s3;
2796: }
2797: /* backward solve the upper triangular */
2798: for (i=n-1; i>=0; i--){
2799: v = aa + 9*diag[i] + 9;
2800: vi = aj + diag[i] + 1;
2801: nz = ai[i+1] - diag[i] - 1;
2802: idt = 3*i;
2803: s1 = x[idt]; s2 = x[1+idt];
2804: s3 = x[2+idt];
2805: while (nz--) {
2806: idx = 3*(*vi++);
2807: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2808: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2809: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2810: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2811: v += 9;
2812: }
2813: v = aa + 9*diag[i];
2814: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2815: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2816: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2817: }
2819: VecRestoreArray(bb,(PetscScalar**)&b);
2820: VecRestoreArray(xx,&x);
2821: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2822: return(0);
2823: }
2827: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2828: {
2829: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2830: IS iscol=a->col,isrow=a->row;
2831: PetscErrorCode ierr;
2832: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2833: PetscInt *diag = a->diag;
2834: const MatScalar *aa=a->a,*v;
2835: PetscScalar *x,s1,s2,x1,x2,*t;
2836: const PetscScalar *b;
2839: VecGetArray(bb,(PetscScalar**)&b);
2840: VecGetArray(xx,&x);
2841: t = a->solve_work;
2843: ISGetIndices(isrow,&rout); r = rout;
2844: ISGetIndices(iscol,&cout); c = cout + (n-1);
2846: /* forward solve the lower triangular */
2847: idx = 2*(*r++);
2848: t[0] = b[idx]; t[1] = b[1+idx];
2849: for (i=1; i<n; i++) {
2850: v = aa + 4*ai[i];
2851: vi = aj + ai[i];
2852: nz = diag[i] - ai[i];
2853: idx = 2*(*r++);
2854: s1 = b[idx]; s2 = b[1+idx];
2855: while (nz--) {
2856: idx = 2*(*vi++);
2857: x1 = t[idx]; x2 = t[1+idx];
2858: s1 -= v[0]*x1 + v[2]*x2;
2859: s2 -= v[1]*x1 + v[3]*x2;
2860: v += 4;
2861: }
2862: idx = 2*i;
2863: t[idx] = s1; t[1+idx] = s2;
2864: }
2865: /* backward solve the upper triangular */
2866: for (i=n-1; i>=0; i--){
2867: v = aa + 4*diag[i] + 4;
2868: vi = aj + diag[i] + 1;
2869: nz = ai[i+1] - diag[i] - 1;
2870: idt = 2*i;
2871: s1 = t[idt]; s2 = t[1+idt];
2872: while (nz--) {
2873: idx = 2*(*vi++);
2874: x1 = t[idx]; x2 = t[1+idx];
2875: s1 -= v[0]*x1 + v[2]*x2;
2876: s2 -= v[1]*x1 + v[3]*x2;
2877: v += 4;
2878: }
2879: idc = 2*(*c--);
2880: v = aa + 4*diag[i];
2881: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2882: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2883: }
2884: ISRestoreIndices(isrow,&rout);
2885: ISRestoreIndices(iscol,&cout);
2886: VecRestoreArray(bb,(PetscScalar**)&b);
2887: VecRestoreArray(xx,&x);
2888: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2889: return(0);
2890: }
2892: /*
2893: Special case where the matrix was ILU(0) factored in the natural
2894: ordering. This eliminates the need for the column and row permutation.
2895: */
2898: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2899: {
2900: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2901: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2902: PetscErrorCode ierr;
2903: PetscInt *diag = a->diag;
2904: const MatScalar *aa=a->a,*v;
2905: PetscScalar *x,s1,s2,x1,x2;
2906: const PetscScalar *b;
2907: PetscInt jdx,idt,idx,nz,*vi,i;
2910: VecGetArray(bb,(PetscScalar**)&b);
2911: VecGetArray(xx,&x);
2913: /* forward solve the lower triangular */
2914: idx = 0;
2915: x[0] = b[0]; x[1] = b[1];
2916: for (i=1; i<n; i++) {
2917: v = aa + 4*ai[i];
2918: vi = aj + ai[i];
2919: nz = diag[i] - ai[i];
2920: idx += 2;
2921: s1 = b[idx];s2 = b[1+idx];
2922: while (nz--) {
2923: jdx = 2*(*vi++);
2924: x1 = x[jdx];x2 = x[1+jdx];
2925: s1 -= v[0]*x1 + v[2]*x2;
2926: s2 -= v[1]*x1 + v[3]*x2;
2927: v += 4;
2928: }
2929: x[idx] = s1;
2930: x[1+idx] = s2;
2931: }
2932: /* backward solve the upper triangular */
2933: for (i=n-1; i>=0; i--){
2934: v = aa + 4*diag[i] + 4;
2935: vi = aj + diag[i] + 1;
2936: nz = ai[i+1] - diag[i] - 1;
2937: idt = 2*i;
2938: s1 = x[idt]; s2 = x[1+idt];
2939: while (nz--) {
2940: idx = 2*(*vi++);
2941: x1 = x[idx]; x2 = x[1+idx];
2942: s1 -= v[0]*x1 + v[2]*x2;
2943: s2 -= v[1]*x1 + v[3]*x2;
2944: v += 4;
2945: }
2946: v = aa + 4*diag[i];
2947: x[idt] = v[0]*s1 + v[2]*s2;
2948: x[1+idt] = v[1]*s1 + v[3]*s2;
2949: }
2951: VecRestoreArray(bb,(PetscScalar**)&b);
2952: VecRestoreArray(xx,&x);
2953: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2954: return(0);
2955: }
2959: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2960: {
2961: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2962: IS iscol=a->col,isrow=a->row;
2964: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2965: PetscInt *diag = a->diag;
2966: MatScalar *aa=a->a,*v;
2967: PetscScalar *x,*b,s1,*t;
2970: if (!n) return(0);
2972: VecGetArray(bb,&b);
2973: VecGetArray(xx,&x);
2974: t = a->solve_work;
2976: ISGetIndices(isrow,&rout); r = rout;
2977: ISGetIndices(iscol,&cout); c = cout + (n-1);
2979: /* forward solve the lower triangular */
2980: t[0] = b[*r++];
2981: for (i=1; i<n; i++) {
2982: v = aa + ai[i];
2983: vi = aj + ai[i];
2984: nz = diag[i] - ai[i];
2985: s1 = b[*r++];
2986: while (nz--) {
2987: s1 -= (*v++)*t[*vi++];
2988: }
2989: t[i] = s1;
2990: }
2991: /* backward solve the upper triangular */
2992: for (i=n-1; i>=0; i--){
2993: v = aa + diag[i] + 1;
2994: vi = aj + diag[i] + 1;
2995: nz = ai[i+1] - diag[i] - 1;
2996: s1 = t[i];
2997: while (nz--) {
2998: s1 -= (*v++)*t[*vi++];
2999: }
3000: x[*c--] = t[i] = aa[diag[i]]*s1;
3001: }
3003: ISRestoreIndices(isrow,&rout);
3004: ISRestoreIndices(iscol,&cout);
3005: VecRestoreArray(bb,&b);
3006: VecRestoreArray(xx,&x);
3007: PetscLogFlops(2*1*(a->nz) - A->cmap.n);
3008: return(0);
3009: }
3010: /*
3011: Special case where the matrix was ILU(0) factored in the natural
3012: ordering. This eliminates the need for the column and row permutation.
3013: */
3016: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3017: {
3018: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3019: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
3021: PetscInt *diag = a->diag;
3022: MatScalar *aa=a->a;
3023: PetscScalar *x,*b;
3024: PetscScalar s1,x1;
3025: MatScalar *v;
3026: PetscInt jdx,idt,idx,nz,*vi,i;
3029: VecGetArray(bb,&b);
3030: VecGetArray(xx,&x);
3032: /* forward solve the lower triangular */
3033: idx = 0;
3034: x[0] = b[0];
3035: for (i=1; i<n; i++) {
3036: v = aa + ai[i];
3037: vi = aj + ai[i];
3038: nz = diag[i] - ai[i];
3039: idx += 1;
3040: s1 = b[idx];
3041: while (nz--) {
3042: jdx = *vi++;
3043: x1 = x[jdx];
3044: s1 -= v[0]*x1;
3045: v += 1;
3046: }
3047: x[idx] = s1;
3048: }
3049: /* backward solve the upper triangular */
3050: for (i=n-1; i>=0; i--){
3051: v = aa + diag[i] + 1;
3052: vi = aj + diag[i] + 1;
3053: nz = ai[i+1] - diag[i] - 1;
3054: idt = i;
3055: s1 = x[idt];
3056: while (nz--) {
3057: idx = *vi++;
3058: x1 = x[idx];
3059: s1 -= v[0]*x1;
3060: v += 1;
3061: }
3062: v = aa + diag[i];
3063: x[idt] = v[0]*s1;
3064: }
3065: VecRestoreArray(bb,&b);
3066: VecRestoreArray(xx,&x);
3067: PetscLogFlops(2*(a->nz) - A->cmap.n);
3068: return(0);
3069: }
3071: /* ----------------------------------------------------------------*/
3072: /*
3073: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3074: except that the data structure of Mat_SeqAIJ is slightly different.
3075: Not a good example of code reuse.
3076: */
3080: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3081: {
3082: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3083: IS isicol;
3085: PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3086: PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3087: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3088: PetscInt incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3089: PetscTruth col_identity,row_identity,flg;
3090: PetscReal f;
3093: f = info->fill;
3094: levels = (PetscInt)info->levels;
3095: diagonal_fill = (PetscInt)info->diagonal_fill;
3096: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3097: ISIdentity(isrow,&row_identity);
3098: ISIdentity(iscol,&col_identity);
3100: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
3101: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3102: (*fact)->factor = FACTOR_LU;
3103: b = (Mat_SeqBAIJ*)(*fact)->data;
3104: MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);
3105: if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3106: b->row = isrow;
3107: b->col = iscol;
3108: PetscObjectReference((PetscObject)isrow);
3109: PetscObjectReference((PetscObject)iscol);
3110: b->icol = isicol;
3111: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3112: PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);
3113: } else { /* general case perform the symbolic factorization */
3114: ISGetIndices(isrow,&r);
3115: ISGetIndices(isicol,&ic);
3117: /* get new row pointers */
3118: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3119: ainew[0] = 0;
3120: /* don't know how many column pointers are needed so estimate */
3121: jmax = (PetscInt)(f*ai[n] + 1);
3122: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3123: /* ajfill is level of fill for each fill entry */
3124: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3125: /* fill is a linked list of nonzeros in active row */
3126: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3127: /* im is level for each filled value */
3128: PetscMalloc((n+1)*sizeof(PetscInt),&im);
3129: /* dloc is location of diagonal in factor */
3130: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3131: dloc[0] = 0;
3132: for (prow=0; prow<n; prow++) {
3134: /* copy prow into linked list */
3135: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3136: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3137: xi = aj + ai[r[prow]];
3138: fill[n] = n;
3139: fill[prow] = -1; /* marker for diagonal entry */
3140: while (nz--) {
3141: fm = n;
3142: idx = ic[*xi++];
3143: do {
3144: m = fm;
3145: fm = fill[m];
3146: } while (fm < idx);
3147: fill[m] = idx;
3148: fill[idx] = fm;
3149: im[idx] = 0;
3150: }
3152: /* make sure diagonal entry is included */
3153: if (diagonal_fill && fill[prow] == -1) {
3154: fm = n;
3155: while (fill[fm] < prow) fm = fill[fm];
3156: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3157: fill[fm] = prow;
3158: im[prow] = 0;
3159: nzf++;
3160: dcount++;
3161: }
3163: nzi = 0;
3164: row = fill[n];
3165: while (row < prow) {
3166: incrlev = im[row] + 1;
3167: nz = dloc[row];
3168: xi = ajnew + ainew[row] + nz + 1;
3169: flev = ajfill + ainew[row] + nz + 1;
3170: nnz = ainew[row+1] - ainew[row] - nz - 1;
3171: fm = row;
3172: while (nnz-- > 0) {
3173: idx = *xi++;
3174: if (*flev + incrlev > levels) {
3175: flev++;
3176: continue;
3177: }
3178: do {
3179: m = fm;
3180: fm = fill[m];
3181: } while (fm < idx);
3182: if (fm != idx) {
3183: im[idx] = *flev + incrlev;
3184: fill[m] = idx;
3185: fill[idx] = fm;
3186: fm = idx;
3187: nzf++;
3188: } else {
3189: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3190: }
3191: flev++;
3192: }
3193: row = fill[row];
3194: nzi++;
3195: }
3196: /* copy new filled row into permanent storage */
3197: ainew[prow+1] = ainew[prow] + nzf;
3198: if (ainew[prow+1] > jmax) {
3200: /* estimate how much additional space we will need */
3201: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3202: /* just double the memory each time */
3203: PetscInt maxadd = jmax;
3204: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3205: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3206: jmax += maxadd;
3208: /* allocate a longer ajnew and ajfill */
3209: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3210: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3211: PetscFree(ajnew);
3212: ajnew = xi;
3213: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3214: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3215: PetscFree(ajfill);
3216: ajfill = xi;
3217: reallocate++; /* count how many reallocations are needed */
3218: }
3219: xi = ajnew + ainew[prow];
3220: flev = ajfill + ainew[prow];
3221: dloc[prow] = nzi;
3222: fm = fill[n];
3223: while (nzf--) {
3224: *xi++ = fm;
3225: *flev++ = im[fm];
3226: fm = fill[fm];
3227: }
3228: /* make sure row has diagonal entry */
3229: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3230: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3231: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3232: }
3233: }
3234: PetscFree(ajfill);
3235: ISRestoreIndices(isrow,&r);
3236: ISRestoreIndices(isicol,&ic);
3237: PetscFree(fill);
3238: PetscFree(im);
3240: #if defined(PETSC_USE_INFO)
3241: {
3242: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3243: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3244: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3245: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3246: PetscInfo(A,"for best performance.\n");
3247: if (diagonal_fill) {
3248: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3249: }
3250: }
3251: #endif
3253: /* put together the new matrix */
3254: MatCreate(((PetscObject)A)->comm,fact);
3255: MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);
3256: MatSetType(*fact,((PetscObject)A)->type_name);
3257: MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3258: PetscLogObjectParent(*fact,isicol);
3259: b = (Mat_SeqBAIJ*)(*fact)->data;
3260: b->free_a = PETSC_TRUE;
3261: b->free_ij = PETSC_TRUE;
3262: b->singlemalloc = PETSC_FALSE;
3263: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3264: b->j = ajnew;
3265: b->i = ainew;
3266: for (i=0; i<n; i++) dloc[i] += ainew[i];
3267: b->diag = dloc;
3268: b->ilen = 0;
3269: b->imax = 0;
3270: b->row = isrow;
3271: b->col = iscol;
3272: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3273: PetscObjectReference((PetscObject)isrow);
3274: PetscObjectReference((PetscObject)iscol);
3275: b->icol = isicol;
3276: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3277: /* In b structure: Free imax, ilen, old a, old j.
3278: Allocate dloc, solve_work, new a, new j */
3279: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3280: b->maxnz = b->nz = ainew[n];
3281: (*fact)->factor = FACTOR_LU;
3283: (*fact)->info.factor_mallocs = reallocate;
3284: (*fact)->info.fill_ratio_given = f;
3285: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3286: }
3288: if (row_identity && col_identity) {
3289: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3290: }
3291: return(0);
3292: }
3296: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3297: {
3298: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3299: /* int i,*AJ=a->j,nz=a->nz; */
3301: /* Undo Column scaling */
3302: /* while (nz--) { */
3303: /* AJ[i] = AJ[i]/4; */
3304: /* } */
3305: /* This should really invoke a push/pop logic, but we don't have that yet. */
3306: A->ops->setunfactored = PETSC_NULL;
3307: return(0);
3308: }
3312: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3313: {
3314: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3315: PetscInt *AJ=a->j,nz=a->nz;
3316: unsigned short *aj=(unsigned short *)AJ;
3318: /* Is this really necessary? */
3319: while (nz--) {
3320: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3321: }
3322: A->ops->setunfactored = PETSC_NULL;
3323: return(0);
3324: }
3328: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3329: {
3331: /*
3332: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3333: with natural ordering
3334: */
3336: inA->ops->solve = MatSolve_SeqBAIJ_Update;
3337: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
3338: switch (inA->rmap.bs) {
3339: case 1:
3340: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3341: PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");
3342: break;
3343: case 2:
3344: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3345: PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");
3346: break;
3347: case 3:
3348: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3349: PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");
3350: break;
3351: case 4:
3352: #if defined(PETSC_USE_MAT_SINGLE)
3353: {
3354: PetscTruth sse_enabled_local;
3356: PetscSSEIsEnabled(((PetscObject)inA)->comm,&sse_enabled_local,PETSC_NULL);
3357: if (sse_enabled_local) {
3358: # if defined(PETSC_HAVE_SSE)
3359: int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3360: if (n==(unsigned short)n) {
3361: unsigned short *aj=(unsigned short *)AJ;
3362: for (i=0;i<nz;i++) {
3363: aj[i] = (unsigned short)AJ[i];
3364: }
3365: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3366: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3367: PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3368: } else {
3369: /* Scale the column indices for easier indexing in MatSolve. */
3370: /* for (i=0;i<nz;i++) { */
3371: /* AJ[i] = AJ[i]*4; */
3372: /* } */
3373: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3374: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3375: PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3376: }
3377: # else
3378: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
3379: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3380: # endif
3381: } else {
3382: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3383: PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3384: }
3385: }
3386: #else
3387: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3388: PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3389: #endif
3390: break;
3391: case 5:
3392: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3393: PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");
3394: break;
3395: case 6:
3396: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3397: PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");
3398: break;
3399: case 7:
3400: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3401: PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");
3402: break;
3403: }
3404: return(0);
3405: }
3409: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3410: {
3411: /*
3412: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3413: with natural ordering
3414: */
3415: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3416: IS row = a->row, col = a->col;
3417: PetscTruth row_identity, col_identity;
3418: PetscTruth use_natural;
3423: use_natural = PETSC_FALSE;
3424: if (row && col) {
3425: ISIdentity(row,&row_identity);
3426: ISIdentity(col,&col_identity);
3428: if (row_identity && col_identity) {
3429: use_natural = PETSC_TRUE;
3430: }
3431: } else {
3432: use_natural = PETSC_TRUE;
3433: }
3435: switch (A->rmap.bs) {
3436: case 1:
3437: if (use_natural) {
3438: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
3439: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3440: PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");
3441: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3442: } else {
3443: A->ops->solve = MatSolve_SeqBAIJ_1;
3444: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
3445: }
3446: break;
3447: case 2:
3448: if (use_natural) {
3449: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
3450: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3451: PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");
3452: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3453: } else {
3454: A->ops->solve = MatSolve_SeqBAIJ_2;
3455: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
3456: }
3457: break;
3458: case 3:
3459: if (use_natural) {
3460: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
3461: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3462: PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");
3463: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3464: } else {
3465: A->ops->solve = MatSolve_SeqBAIJ_3;
3466: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
3467: }
3468: break;
3469: case 4:
3470: {
3471: PetscTruth sse_enabled_local;
3472: PetscSSEIsEnabled(((PetscObject)A)->comm,&sse_enabled_local,PETSC_NULL);
3473: if (use_natural) {
3474: #if defined(PETSC_USE_MAT_SINGLE)
3475: if (sse_enabled_local) { /* Natural + Single + SSE */
3476: # if defined(PETSC_HAVE_SSE)
3477: PetscInt n=a->mbs;
3478: if (n==(unsigned short)n) {
3479: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3480: PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3481: } else {
3482: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3483: PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3484: }
3485: # else
3486: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3487: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3488: # endif
3489: } else { /* Natural + Single */
3490: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3491: PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");
3492: }
3493: #else
3494: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
3495: PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3496: #endif
3497: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3498: PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3499: } else { /* Arbitrary ordering */
3500: #if defined(PETSC_USE_MAT_SINGLE)
3501: if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3502: # if defined(PETSC_HAVE_SSE)
3503: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
3504: PetscInfo(A,"Using single precision, SSE solve BS=4\n");
3505: # else
3506: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3507: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3508: # endif
3509: } else { /* Arbitrary + Single */
3510: A->ops->solve = MatSolve_SeqBAIJ_4_Demotion;
3511: PetscInfo(A,"Using single precision solve BS=4\n");
3512: }
3513: #else
3514: A->ops->solve = MatSolve_SeqBAIJ_4;
3515: #endif
3516: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
3517: }
3518: }
3519: break;
3520: case 5:
3521: if (use_natural) {
3522: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
3523: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3524: PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3525: PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3526: } else {
3527: A->ops->solve = MatSolve_SeqBAIJ_5;
3528: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
3529: }
3530: break;
3531: case 6:
3532: if (use_natural) {
3533: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
3534: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3535: PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3536: PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3537: } else {
3538: A->ops->solve = MatSolve_SeqBAIJ_6;
3539: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
3540: }
3541: break;
3542: case 7:
3543: if (use_natural) {
3544: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
3545: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3546: PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3547: PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3548: } else {
3549: A->ops->solve = MatSolve_SeqBAIJ_7;
3550: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3551: }
3552: break;
3553: default:
3554: A->ops->solve = MatSolve_SeqBAIJ_N;
3555: break;
3556: }
3557: return(0);
3558: }
3562: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3563: {
3567: MatSeqBAIJ_UpdateSolvers(A);
3568: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3569: (*A->ops->solve)(A,x,y);
3570: } else {
3571: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3572: }
3573: return(0);
3574: }
3578: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3579: {
3583: MatSeqBAIJ_UpdateSolvers(A);
3584: (*A->ops->solvetranspose)(A,x,y);
3585: return(0);
3586: }