Actual source code: baijfact2.c
1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include src/mat/impls/baij/seq/baij.h
7: #include src/vec/vecimpl.h
8: #include src/inline/ilu.h
9: #include src/inline/dot.h
13: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
16: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17: int *diag = a->diag;
18: MatScalar *aa=a->a,*v;
19: PetscScalar s1,*x,*b;
22: VecCopy(bb,xx);
23: VecGetArray(bb,&b);
24: VecGetArray(xx,&x);
25:
26: /* forward solve the U^T */
27: for (i=0; i<n; i++) {
29: v = aa + diag[i];
30: /* multiply by the inverse of the block diagonal */
31: s1 = (*v++)*x[i];
32: vi = aj + diag[i] + 1;
33: nz = ai[i+1] - diag[i] - 1;
34: while (nz--) {
35: x[*vi++] -= (*v++)*s1;
36: }
37: x[i] = s1;
38: }
39: /* backward solve the L^T */
40: for (i=n-1; i>=0; i--){
41: v = aa + diag[i] - 1;
42: vi = aj + diag[i] - 1;
43: nz = diag[i] - ai[i];
44: s1 = x[i];
45: while (nz--) {
46: x[*vi--] -= (*v--)*s1;
47: }
48: }
49: VecRestoreArray(bb,&b);
50: VecRestoreArray(xx,&x);
51: PetscLogFlops(2*(a->nz) - A->n);
52: return(0);
53: }
57: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58: {
59: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
60: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61: int *diag = a->diag,oidx;
62: MatScalar *aa=a->a,*v;
63: PetscScalar s1,s2,x1,x2;
64: PetscScalar *x,*b;
67: VecCopy(bb,xx);
68: VecGetArray(bb,&b);
69: VecGetArray(xx,&x);
71: /* forward solve the U^T */
72: idx = 0;
73: for (i=0; i<n; i++) {
75: v = aa + 4*diag[i];
76: /* multiply by the inverse of the block diagonal */
77: x1 = x[idx]; x2 = x[1+idx];
78: s1 = v[0]*x1 + v[1]*x2;
79: s2 = v[2]*x1 + v[3]*x2;
80: v += 4;
82: vi = aj + diag[i] + 1;
83: nz = ai[i+1] - diag[i] - 1;
84: while (nz--) {
85: oidx = 2*(*vi++);
86: x[oidx] -= v[0]*s1 + v[1]*s2;
87: x[oidx+1] -= v[2]*s1 + v[3]*s2;
88: v += 4;
89: }
90: x[idx] = s1;x[1+idx] = s2;
91: idx += 2;
92: }
93: /* backward solve the L^T */
94: for (i=n-1; i>=0; i--){
95: v = aa + 4*diag[i] - 4;
96: vi = aj + diag[i] - 1;
97: nz = diag[i] - ai[i];
98: idt = 2*i;
99: s1 = x[idt]; s2 = x[1+idt];
100: while (nz--) {
101: idx = 2*(*vi--);
102: x[idx] -= v[0]*s1 + v[1]*s2;
103: x[idx+1] -= v[2]*s1 + v[3]*s2;
104: v -= 4;
105: }
106: }
107: VecRestoreArray(bb,&b);
108: VecRestoreArray(xx,&x);
109: PetscLogFlops(2*4*(a->nz) - 2*A->n);
110: return(0);
111: }
115: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116: {
117: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
118: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119: int *diag = a->diag,oidx;
120: MatScalar *aa=a->a,*v;
121: PetscScalar s1,s2,s3,x1,x2,x3;
122: PetscScalar *x,*b;
125: VecCopy(bb,xx);
126: VecGetArray(bb,&b);
127: VecGetArray(xx,&x);
129: /* forward solve the U^T */
130: idx = 0;
131: for (i=0; i<n; i++) {
133: v = aa + 9*diag[i];
134: /* multiply by the inverse of the block diagonal */
135: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
136: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
137: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
138: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
139: v += 9;
141: vi = aj + diag[i] + 1;
142: nz = ai[i+1] - diag[i] - 1;
143: while (nz--) {
144: oidx = 3*(*vi++);
145: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
146: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
147: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148: v += 9;
149: }
150: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
151: idx += 3;
152: }
153: /* backward solve the L^T */
154: for (i=n-1; i>=0; i--){
155: v = aa + 9*diag[i] - 9;
156: vi = aj + diag[i] - 1;
157: nz = diag[i] - ai[i];
158: idt = 3*i;
159: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
160: while (nz--) {
161: idx = 3*(*vi--);
162: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
163: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
164: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165: v -= 9;
166: }
167: }
168: VecRestoreArray(bb,&b);
169: VecRestoreArray(xx,&x);
170: PetscLogFlops(2*9*(a->nz) - 3*A->n);
171: return(0);
172: }
176: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177: {
178: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
179: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180: int *diag = a->diag,oidx;
181: MatScalar *aa=a->a,*v;
182: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
183: PetscScalar *x,*b;
186: VecCopy(bb,xx);
187: VecGetArray(bb,&b);
188: VecGetArray(xx,&x);
190: /* forward solve the U^T */
191: idx = 0;
192: for (i=0; i<n; i++) {
194: v = aa + 16*diag[i];
195: /* multiply by the inverse of the block diagonal */
196: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
197: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
198: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
199: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
200: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201: v += 16;
203: vi = aj + diag[i] + 1;
204: nz = ai[i+1] - diag[i] - 1;
205: while (nz--) {
206: oidx = 4*(*vi++);
207: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
208: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
209: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211: v += 16;
212: }
213: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214: idx += 4;
215: }
216: /* backward solve the L^T */
217: for (i=n-1; i>=0; i--){
218: v = aa + 16*diag[i] - 16;
219: vi = aj + diag[i] - 1;
220: nz = diag[i] - ai[i];
221: idt = 4*i;
222: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223: while (nz--) {
224: idx = 4*(*vi--);
225: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
226: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
227: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229: v -= 16;
230: }
231: }
232: VecRestoreArray(bb,&b);
233: VecRestoreArray(xx,&x);
234: PetscLogFlops(2*16*(a->nz) - 4*A->n);
235: return(0);
236: }
240: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241: {
242: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
243: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244: int *diag = a->diag,oidx;
245: MatScalar *aa=a->a,*v;
246: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247: PetscScalar *x,*b;
250: VecCopy(bb,xx);
251: VecGetArray(bb,&b);
252: VecGetArray(xx,&x);
254: /* forward solve the U^T */
255: idx = 0;
256: for (i=0; i<n; i++) {
258: v = aa + 25*diag[i];
259: /* multiply by the inverse of the block diagonal */
260: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
262: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
263: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266: v += 25;
268: vi = aj + diag[i] + 1;
269: nz = ai[i+1] - diag[i] - 1;
270: while (nz--) {
271: oidx = 5*(*vi++);
272: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
273: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
274: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277: v += 25;
278: }
279: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280: idx += 5;
281: }
282: /* backward solve the L^T */
283: for (i=n-1; i>=0; i--){
284: v = aa + 25*diag[i] - 25;
285: vi = aj + diag[i] - 1;
286: nz = diag[i] - ai[i];
287: idt = 5*i;
288: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289: while (nz--) {
290: idx = 5*(*vi--);
291: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
292: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
293: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296: v -= 25;
297: }
298: }
299: VecRestoreArray(bb,&b);
300: VecRestoreArray(xx,&x);
301: PetscLogFlops(2*25*(a->nz) - 5*A->n);
302: return(0);
303: }
307: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308: {
309: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
310: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311: int *diag = a->diag,oidx;
312: MatScalar *aa=a->a,*v;
313: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314: PetscScalar *x,*b;
317: VecCopy(bb,xx);
318: VecGetArray(bb,&b);
319: VecGetArray(xx,&x);
321: /* forward solve the U^T */
322: idx = 0;
323: for (i=0; i<n; i++) {
325: v = aa + 36*diag[i];
326: /* multiply by the inverse of the block diagonal */
327: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328: x6 = x[5+idx];
329: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
330: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
331: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335: v += 36;
337: vi = aj + diag[i] + 1;
338: nz = ai[i+1] - diag[i] - 1;
339: while (nz--) {
340: oidx = 6*(*vi++);
341: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
342: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
343: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347: v += 36;
348: }
349: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350: x[5+idx] = s6;
351: idx += 6;
352: }
353: /* backward solve the L^T */
354: for (i=n-1; i>=0; i--){
355: v = aa + 36*diag[i] - 36;
356: vi = aj + diag[i] - 1;
357: nz = diag[i] - ai[i];
358: idt = 6*i;
359: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360: s6 = x[5+idt];
361: while (nz--) {
362: idx = 6*(*vi--);
363: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
364: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
365: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369: v -= 36;
370: }
371: }
372: VecRestoreArray(bb,&b);
373: VecRestoreArray(xx,&x);
374: PetscLogFlops(2*36*(a->nz) - 6*A->n);
375: return(0);
376: }
380: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381: {
382: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
383: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384: int *diag = a->diag,oidx;
385: MatScalar *aa=a->a,*v;
386: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387: PetscScalar *x,*b;
390: VecCopy(bb,xx);
391: VecGetArray(bb,&b);
392: VecGetArray(xx,&x);
394: /* forward solve the U^T */
395: idx = 0;
396: for (i=0; i<n; i++) {
398: v = aa + 49*diag[i];
399: /* multiply by the inverse of the block diagonal */
400: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401: x6 = x[5+idx]; x7 = x[6+idx];
402: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
403: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409: v += 49;
411: vi = aj + diag[i] + 1;
412: nz = ai[i+1] - diag[i] - 1;
413: while (nz--) {
414: oidx = 7*(*vi++);
415: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
416: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422: v += 49;
423: }
424: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425: x[5+idx] = s6;x[6+idx] = s7;
426: idx += 7;
427: }
428: /* backward solve the L^T */
429: for (i=n-1; i>=0; i--){
430: v = aa + 49*diag[i] - 49;
431: vi = aj + diag[i] - 1;
432: nz = diag[i] - ai[i];
433: idt = 7*i;
434: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435: s6 = x[5+idt];s7 = x[6+idt];
436: while (nz--) {
437: idx = 7*(*vi--);
438: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
439: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445: v -= 49;
446: }
447: }
448: VecRestoreArray(bb,&b);
449: VecRestoreArray(xx,&x);
450: PetscLogFlops(2*49*(a->nz) - 7*A->n);
451: return(0);
452: }
454: /*---------------------------------------------------------------------------------------------*/
457: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458: {
459: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
460: IS iscol=a->col,isrow=a->row;
461: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462: int *diag = a->diag;
463: MatScalar *aa=a->a,*v;
464: PetscScalar s1,*x,*b,*t;
467: VecGetArray(bb,&b);
468: VecGetArray(xx,&x);
469: t = a->solve_work;
471: ISGetIndices(isrow,&rout); r = rout;
472: ISGetIndices(iscol,&cout); c = cout;
474: /* copy the b into temp work space according to permutation */
475: for (i=0; i<n; i++) {
476: t[i] = b[c[i]];
477: }
479: /* forward solve the U^T */
480: for (i=0; i<n; i++) {
482: v = aa + diag[i];
483: /* multiply by the inverse of the block diagonal */
484: s1 = (*v++)*t[i];
485: vi = aj + diag[i] + 1;
486: nz = ai[i+1] - diag[i] - 1;
487: while (nz--) {
488: t[*vi++] -= (*v++)*s1;
489: }
490: t[i] = s1;
491: }
492: /* backward solve the L^T */
493: for (i=n-1; i>=0; i--){
494: v = aa + diag[i] - 1;
495: vi = aj + diag[i] - 1;
496: nz = diag[i] - ai[i];
497: s1 = t[i];
498: while (nz--) {
499: t[*vi--] -= (*v--)*s1;
500: }
501: }
503: /* copy t into x according to permutation */
504: for (i=0; i<n; i++) {
505: x[r[i]] = t[i];
506: }
508: ISRestoreIndices(isrow,&rout);
509: ISRestoreIndices(iscol,&cout);
510: VecRestoreArray(bb,&b);
511: VecRestoreArray(xx,&x);
512: PetscLogFlops(2*(a->nz) - A->n);
513: return(0);
514: }
518: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519: {
520: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
521: IS iscol=a->col,isrow=a->row;
522: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523: int *diag = a->diag,ii,ic,ir,oidx;
524: MatScalar *aa=a->a,*v;
525: PetscScalar s1,s2,x1,x2;
526: PetscScalar *x,*b,*t;
529: VecGetArray(bb,&b);
530: VecGetArray(xx,&x);
531: t = a->solve_work;
533: ISGetIndices(isrow,&rout); r = rout;
534: ISGetIndices(iscol,&cout); c = cout;
536: /* copy the b into temp work space according to permutation */
537: ii = 0;
538: for (i=0; i<n; i++) {
539: ic = 2*c[i];
540: t[ii] = b[ic];
541: t[ii+1] = b[ic+1];
542: ii += 2;
543: }
545: /* forward solve the U^T */
546: idx = 0;
547: for (i=0; i<n; i++) {
549: v = aa + 4*diag[i];
550: /* multiply by the inverse of the block diagonal */
551: x1 = t[idx]; x2 = t[1+idx];
552: s1 = v[0]*x1 + v[1]*x2;
553: s2 = v[2]*x1 + v[3]*x2;
554: v += 4;
556: vi = aj + diag[i] + 1;
557: nz = ai[i+1] - diag[i] - 1;
558: while (nz--) {
559: oidx = 2*(*vi++);
560: t[oidx] -= v[0]*s1 + v[1]*s2;
561: t[oidx+1] -= v[2]*s1 + v[3]*s2;
562: v += 4;
563: }
564: t[idx] = s1;t[1+idx] = s2;
565: idx += 2;
566: }
567: /* backward solve the L^T */
568: for (i=n-1; i>=0; i--){
569: v = aa + 4*diag[i] - 4;
570: vi = aj + diag[i] - 1;
571: nz = diag[i] - ai[i];
572: idt = 2*i;
573: s1 = t[idt]; s2 = t[1+idt];
574: while (nz--) {
575: idx = 2*(*vi--);
576: t[idx] -= v[0]*s1 + v[1]*s2;
577: t[idx+1] -= v[2]*s1 + v[3]*s2;
578: v -= 4;
579: }
580: }
582: /* copy t into x according to permutation */
583: ii = 0;
584: for (i=0; i<n; i++) {
585: ir = 2*r[i];
586: x[ir] = t[ii];
587: x[ir+1] = t[ii+1];
588: ii += 2;
589: }
591: ISRestoreIndices(isrow,&rout);
592: ISRestoreIndices(iscol,&cout);
593: VecRestoreArray(bb,&b);
594: VecRestoreArray(xx,&x);
595: PetscLogFlops(2*4*(a->nz) - 2*A->n);
596: return(0);
597: }
601: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602: {
603: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
604: IS iscol=a->col,isrow=a->row;
605: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606: int *diag = a->diag,ii,ic,ir,oidx;
607: MatScalar *aa=a->a,*v;
608: PetscScalar s1,s2,s3,x1,x2,x3;
609: PetscScalar *x,*b,*t;
612: VecGetArray(bb,&b);
613: VecGetArray(xx,&x);
614: t = a->solve_work;
616: ISGetIndices(isrow,&rout); r = rout;
617: ISGetIndices(iscol,&cout); c = cout;
619: /* copy the b into temp work space according to permutation */
620: ii = 0;
621: for (i=0; i<n; i++) {
622: ic = 3*c[i];
623: t[ii] = b[ic];
624: t[ii+1] = b[ic+1];
625: t[ii+2] = b[ic+2];
626: ii += 3;
627: }
629: /* forward solve the U^T */
630: idx = 0;
631: for (i=0; i<n; i++) {
633: v = aa + 9*diag[i];
634: /* multiply by the inverse of the block diagonal */
635: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
636: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
637: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
638: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
639: v += 9;
641: vi = aj + diag[i] + 1;
642: nz = ai[i+1] - diag[i] - 1;
643: while (nz--) {
644: oidx = 3*(*vi++);
645: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
646: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
647: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648: v += 9;
649: }
650: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
651: idx += 3;
652: }
653: /* backward solve the L^T */
654: for (i=n-1; i>=0; i--){
655: v = aa + 9*diag[i] - 9;
656: vi = aj + diag[i] - 1;
657: nz = diag[i] - ai[i];
658: idt = 3*i;
659: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
660: while (nz--) {
661: idx = 3*(*vi--);
662: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
663: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
664: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665: v -= 9;
666: }
667: }
669: /* copy t into x according to permutation */
670: ii = 0;
671: for (i=0; i<n; i++) {
672: ir = 3*r[i];
673: x[ir] = t[ii];
674: x[ir+1] = t[ii+1];
675: x[ir+2] = t[ii+2];
676: ii += 3;
677: }
679: ISRestoreIndices(isrow,&rout);
680: ISRestoreIndices(iscol,&cout);
681: VecRestoreArray(bb,&b);
682: VecRestoreArray(xx,&x);
683: PetscLogFlops(2*9*(a->nz) - 3*A->n);
684: return(0);
685: }
689: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690: {
691: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
692: IS iscol=a->col,isrow=a->row;
693: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694: int *diag = a->diag,ii,ic,ir,oidx;
695: MatScalar *aa=a->a,*v;
696: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
697: PetscScalar *x,*b,*t;
700: VecGetArray(bb,&b);
701: VecGetArray(xx,&x);
702: t = a->solve_work;
704: ISGetIndices(isrow,&rout); r = rout;
705: ISGetIndices(iscol,&cout); c = cout;
707: /* copy the b into temp work space according to permutation */
708: ii = 0;
709: for (i=0; i<n; i++) {
710: ic = 4*c[i];
711: t[ii] = b[ic];
712: t[ii+1] = b[ic+1];
713: t[ii+2] = b[ic+2];
714: t[ii+3] = b[ic+3];
715: ii += 4;
716: }
718: /* forward solve the U^T */
719: idx = 0;
720: for (i=0; i<n; i++) {
722: v = aa + 16*diag[i];
723: /* multiply by the inverse of the block diagonal */
724: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
725: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
726: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
727: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
728: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729: v += 16;
731: vi = aj + diag[i] + 1;
732: nz = ai[i+1] - diag[i] - 1;
733: while (nz--) {
734: oidx = 4*(*vi++);
735: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
736: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
737: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739: v += 16;
740: }
741: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742: idx += 4;
743: }
744: /* backward solve the L^T */
745: for (i=n-1; i>=0; i--){
746: v = aa + 16*diag[i] - 16;
747: vi = aj + diag[i] - 1;
748: nz = diag[i] - ai[i];
749: idt = 4*i;
750: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751: while (nz--) {
752: idx = 4*(*vi--);
753: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
754: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
755: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757: v -= 16;
758: }
759: }
761: /* copy t into x according to permutation */
762: ii = 0;
763: for (i=0; i<n; i++) {
764: ir = 4*r[i];
765: x[ir] = t[ii];
766: x[ir+1] = t[ii+1];
767: x[ir+2] = t[ii+2];
768: x[ir+3] = t[ii+3];
769: ii += 4;
770: }
772: ISRestoreIndices(isrow,&rout);
773: ISRestoreIndices(iscol,&cout);
774: VecRestoreArray(bb,&b);
775: VecRestoreArray(xx,&x);
776: PetscLogFlops(2*16*(a->nz) - 4*A->n);
777: return(0);
778: }
782: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783: {
784: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
785: IS iscol=a->col,isrow=a->row;
786: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787: int *diag = a->diag,ii,ic,ir,oidx;
788: MatScalar *aa=a->a,*v;
789: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790: PetscScalar *x,*b,*t;
793: VecGetArray(bb,&b);
794: VecGetArray(xx,&x);
795: t = a->solve_work;
797: ISGetIndices(isrow,&rout); r = rout;
798: ISGetIndices(iscol,&cout); c = cout;
800: /* copy the b into temp work space according to permutation */
801: ii = 0;
802: for (i=0; i<n; i++) {
803: ic = 5*c[i];
804: t[ii] = b[ic];
805: t[ii+1] = b[ic+1];
806: t[ii+2] = b[ic+2];
807: t[ii+3] = b[ic+3];
808: t[ii+4] = b[ic+4];
809: ii += 5;
810: }
812: /* forward solve the U^T */
813: idx = 0;
814: for (i=0; i<n; i++) {
816: v = aa + 25*diag[i];
817: /* multiply by the inverse of the block diagonal */
818: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
820: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
821: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824: v += 25;
826: vi = aj + diag[i] + 1;
827: nz = ai[i+1] - diag[i] - 1;
828: while (nz--) {
829: oidx = 5*(*vi++);
830: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
831: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
832: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835: v += 25;
836: }
837: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838: idx += 5;
839: }
840: /* backward solve the L^T */
841: for (i=n-1; i>=0; i--){
842: v = aa + 25*diag[i] - 25;
843: vi = aj + diag[i] - 1;
844: nz = diag[i] - ai[i];
845: idt = 5*i;
846: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847: while (nz--) {
848: idx = 5*(*vi--);
849: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
850: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
851: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854: v -= 25;
855: }
856: }
858: /* copy t into x according to permutation */
859: ii = 0;
860: for (i=0; i<n; i++) {
861: ir = 5*r[i];
862: x[ir] = t[ii];
863: x[ir+1] = t[ii+1];
864: x[ir+2] = t[ii+2];
865: x[ir+3] = t[ii+3];
866: x[ir+4] = t[ii+4];
867: ii += 5;
868: }
870: ISRestoreIndices(isrow,&rout);
871: ISRestoreIndices(iscol,&cout);
872: VecRestoreArray(bb,&b);
873: VecRestoreArray(xx,&x);
874: PetscLogFlops(2*25*(a->nz) - 5*A->n);
875: return(0);
876: }
880: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881: {
882: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
883: IS iscol=a->col,isrow=a->row;
884: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885: int *diag = a->diag,ii,ic,ir,oidx;
886: MatScalar *aa=a->a,*v;
887: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888: PetscScalar *x,*b,*t;
891: VecGetArray(bb,&b);
892: VecGetArray(xx,&x);
893: t = a->solve_work;
895: ISGetIndices(isrow,&rout); r = rout;
896: ISGetIndices(iscol,&cout); c = cout;
898: /* copy the b into temp work space according to permutation */
899: ii = 0;
900: for (i=0; i<n; i++) {
901: ic = 6*c[i];
902: t[ii] = b[ic];
903: t[ii+1] = b[ic+1];
904: t[ii+2] = b[ic+2];
905: t[ii+3] = b[ic+3];
906: t[ii+4] = b[ic+4];
907: t[ii+5] = b[ic+5];
908: ii += 6;
909: }
911: /* forward solve the U^T */
912: idx = 0;
913: for (i=0; i<n; i++) {
915: v = aa + 36*diag[i];
916: /* multiply by the inverse of the block diagonal */
917: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918: x6 = t[5+idx];
919: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
920: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
921: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925: v += 36;
927: vi = aj + diag[i] + 1;
928: nz = ai[i+1] - diag[i] - 1;
929: while (nz--) {
930: oidx = 6*(*vi++);
931: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
932: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
933: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937: v += 36;
938: }
939: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940: t[5+idx] = s6;
941: idx += 6;
942: }
943: /* backward solve the L^T */
944: for (i=n-1; i>=0; i--){
945: v = aa + 36*diag[i] - 36;
946: vi = aj + diag[i] - 1;
947: nz = diag[i] - ai[i];
948: idt = 6*i;
949: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950: s6 = t[5+idt];
951: while (nz--) {
952: idx = 6*(*vi--);
953: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
954: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
955: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959: v -= 36;
960: }
961: }
963: /* copy t into x according to permutation */
964: ii = 0;
965: for (i=0; i<n; i++) {
966: ir = 6*r[i];
967: x[ir] = t[ii];
968: x[ir+1] = t[ii+1];
969: x[ir+2] = t[ii+2];
970: x[ir+3] = t[ii+3];
971: x[ir+4] = t[ii+4];
972: x[ir+5] = t[ii+5];
973: ii += 6;
974: }
976: ISRestoreIndices(isrow,&rout);
977: ISRestoreIndices(iscol,&cout);
978: VecRestoreArray(bb,&b);
979: VecRestoreArray(xx,&x);
980: PetscLogFlops(2*36*(a->nz) - 6*A->n);
981: return(0);
982: }
986: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987: {
988: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
989: IS iscol=a->col,isrow=a->row;
990: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991: int *diag = a->diag,ii,ic,ir,oidx;
992: MatScalar *aa=a->a,*v;
993: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994: PetscScalar *x,*b,*t;
997: VecGetArray(bb,&b);
998: VecGetArray(xx,&x);
999: t = a->solve_work;
1001: ISGetIndices(isrow,&rout); r = rout;
1002: ISGetIndices(iscol,&cout); c = cout;
1004: /* copy the b into temp work space according to permutation */
1005: ii = 0;
1006: for (i=0; i<n; i++) {
1007: ic = 7*c[i];
1008: t[ii] = b[ic];
1009: t[ii+1] = b[ic+1];
1010: t[ii+2] = b[ic+2];
1011: t[ii+3] = b[ic+3];
1012: t[ii+4] = b[ic+4];
1013: t[ii+5] = b[ic+5];
1014: t[ii+6] = b[ic+6];
1015: ii += 7;
1016: }
1018: /* forward solve the U^T */
1019: idx = 0;
1020: for (i=0; i<n; i++) {
1022: v = aa + 49*diag[i];
1023: /* multiply by the inverse of the block diagonal */
1024: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025: x6 = t[5+idx]; x7 = t[6+idx];
1026: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1027: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033: v += 49;
1035: vi = aj + diag[i] + 1;
1036: nz = ai[i+1] - diag[i] - 1;
1037: while (nz--) {
1038: oidx = 7*(*vi++);
1039: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1040: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046: v += 49;
1047: }
1048: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049: t[5+idx] = s6;t[6+idx] = s7;
1050: idx += 7;
1051: }
1052: /* backward solve the L^T */
1053: for (i=n-1; i>=0; i--){
1054: v = aa + 49*diag[i] - 49;
1055: vi = aj + diag[i] - 1;
1056: nz = diag[i] - ai[i];
1057: idt = 7*i;
1058: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059: s6 = t[5+idt];s7 = t[6+idt];
1060: while (nz--) {
1061: idx = 7*(*vi--);
1062: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1063: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069: v -= 49;
1070: }
1071: }
1073: /* copy t into x according to permutation */
1074: ii = 0;
1075: for (i=0; i<n; i++) {
1076: ir = 7*r[i];
1077: x[ir] = t[ii];
1078: x[ir+1] = t[ii+1];
1079: x[ir+2] = t[ii+2];
1080: x[ir+3] = t[ii+3];
1081: x[ir+4] = t[ii+4];
1082: x[ir+5] = t[ii+5];
1083: x[ir+6] = t[ii+6];
1084: ii += 7;
1085: }
1087: ISRestoreIndices(isrow,&rout);
1088: ISRestoreIndices(iscol,&cout);
1089: VecRestoreArray(bb,&b);
1090: VecRestoreArray(xx,&x);
1091: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092: return(0);
1093: }
1095: /* ----------------------------------------------------------- */
1098: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099: {
1100: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1101: IS iscol=a->col,isrow=a->row;
1102: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103: int nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104: MatScalar *aa=a->a,*v;
1105: PetscScalar *x,*b,*s,*t,*ls;
1108: VecGetArray(bb,&b);
1109: VecGetArray(xx,&x);
1110: t = a->solve_work;
1112: ISGetIndices(isrow,&rout); r = rout;
1113: ISGetIndices(iscol,&cout); c = cout + (n-1);
1115: /* forward solve the lower triangular */
1116: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1117: for (i=1; i<n; i++) {
1118: v = aa + bs2*ai[i];
1119: vi = aj + ai[i];
1120: nz = a->diag[i] - ai[i];
1121: s = t + bs*i;
1122: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1123: while (nz--) {
1124: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125: v += bs2;
1126: }
1127: }
1128: /* backward solve the upper triangular */
1129: ls = a->solve_work + A->n;
1130: for (i=n-1; i>=0; i--){
1131: v = aa + bs2*(a->diag[i] + 1);
1132: vi = aj + a->diag[i] + 1;
1133: nz = ai[i+1] - a->diag[i] - 1;
1134: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1135: while (nz--) {
1136: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137: v += bs2;
1138: }
1139: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1141: }
1143: ISRestoreIndices(isrow,&rout);
1144: ISRestoreIndices(iscol,&cout);
1145: VecRestoreArray(bb,&b);
1146: VecRestoreArray(xx,&x);
1147: PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148: return(0);
1149: }
1153: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154: {
1155: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1156: IS iscol=a->col,isrow=a->row;
1157: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158: int *diag = a->diag;
1159: MatScalar *aa=a->a,*v;
1160: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161: PetscScalar *x,*b,*t;
1164: VecGetArray(bb,&b);
1165: VecGetArray(xx,&x);
1166: t = a->solve_work;
1168: ISGetIndices(isrow,&rout); r = rout;
1169: ISGetIndices(iscol,&cout); c = cout + (n-1);
1171: /* forward solve the lower triangular */
1172: idx = 7*(*r++);
1173: t[0] = b[idx]; t[1] = b[1+idx];
1174: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175: t[5] = b[5+idx]; t[6] = b[6+idx];
1177: for (i=1; i<n; i++) {
1178: v = aa + 49*ai[i];
1179: vi = aj + ai[i];
1180: nz = diag[i] - ai[i];
1181: idx = 7*(*r++);
1182: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184: while (nz--) {
1185: idx = 7*(*vi++);
1186: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1187: x4 = t[3+idx];x5 = t[4+idx];
1188: x6 = t[5+idx];x7 = t[6+idx];
1189: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196: v += 49;
1197: }
1198: idx = 7*i;
1199: t[idx] = s1;t[1+idx] = s2;
1200: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201: t[5+idx] = s6;t[6+idx] = s7;
1202: }
1203: /* backward solve the upper triangular */
1204: for (i=n-1; i>=0; i--){
1205: v = aa + 49*diag[i] + 49;
1206: vi = aj + diag[i] + 1;
1207: nz = ai[i+1] - diag[i] - 1;
1208: idt = 7*i;
1209: s1 = t[idt]; s2 = t[1+idt];
1210: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211: s6 = t[5+idt];s7 = t[6+idt];
1212: while (nz--) {
1213: idx = 7*(*vi++);
1214: x1 = t[idx]; x2 = t[1+idx];
1215: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216: x6 = t[5+idx]; x7 = t[6+idx];
1217: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224: v += 49;
1225: }
1226: idc = 7*(*c--);
1227: v = aa + 49*diag[i];
1228: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1229: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242: }
1244: ISRestoreIndices(isrow,&rout);
1245: ISRestoreIndices(iscol,&cout);
1246: VecRestoreArray(bb,&b);
1247: VecRestoreArray(xx,&x);
1248: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249: return(0);
1250: }
1254: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255: {
1256: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1257: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258: int ierr,*diag = a->diag,jdx;
1259: MatScalar *aa=a->a,*v;
1260: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1263: VecGetArray(bb,&b);
1264: VecGetArray(xx,&x);
1265: /* forward solve the lower triangular */
1266: idx = 0;
1267: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1268: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269: x[6] = b[6+idx];
1270: for (i=1; i<n; i++) {
1271: v = aa + 49*ai[i];
1272: vi = aj + ai[i];
1273: nz = diag[i] - ai[i];
1274: idx = 7*i;
1275: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1276: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277: s7 = b[6+idx];
1278: while (nz--) {
1279: jdx = 7*(*vi++);
1280: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1281: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282: x7 = x[6+jdx];
1283: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290: v += 49;
1291: }
1292: x[idx] = s1;
1293: x[1+idx] = s2;
1294: x[2+idx] = s3;
1295: x[3+idx] = s4;
1296: x[4+idx] = s5;
1297: x[5+idx] = s6;
1298: x[6+idx] = s7;
1299: }
1300: /* backward solve the upper triangular */
1301: for (i=n-1; i>=0; i--){
1302: v = aa + 49*diag[i] + 49;
1303: vi = aj + diag[i] + 1;
1304: nz = ai[i+1] - diag[i] - 1;
1305: idt = 7*i;
1306: s1 = x[idt]; s2 = x[1+idt];
1307: s3 = x[2+idt]; s4 = x[3+idt];
1308: s5 = x[4+idt]; s6 = x[5+idt];
1309: s7 = x[6+idt];
1310: while (nz--) {
1311: idx = 7*(*vi++);
1312: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1313: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314: x7 = x[6+idx];
1315: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322: v += 49;
1323: }
1324: v = aa + 49*diag[i];
1325: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1326: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1328: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1330: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1332: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1334: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1336: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1338: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339: }
1341: VecRestoreArray(bb,&b);
1342: VecRestoreArray(xx,&x);
1343: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344: return(0);
1345: }
1349: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350: {
1351: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1352: IS iscol=a->col,isrow=a->row;
1353: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354: int *diag = a->diag;
1355: MatScalar *aa=a->a,*v;
1356: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1359: VecGetArray(bb,&b);
1360: VecGetArray(xx,&x);
1361: t = a->solve_work;
1363: ISGetIndices(isrow,&rout); r = rout;
1364: ISGetIndices(iscol,&cout); c = cout + (n-1);
1366: /* forward solve the lower triangular */
1367: idx = 6*(*r++);
1368: t[0] = b[idx]; t[1] = b[1+idx];
1369: t[2] = b[2+idx]; t[3] = b[3+idx];
1370: t[4] = b[4+idx]; t[5] = b[5+idx];
1371: for (i=1; i<n; i++) {
1372: v = aa + 36*ai[i];
1373: vi = aj + ai[i];
1374: nz = diag[i] - ai[i];
1375: idx = 6*(*r++);
1376: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377: s5 = b[4+idx]; s6 = b[5+idx];
1378: while (nz--) {
1379: idx = 6*(*vi++);
1380: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1381: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388: v += 36;
1389: }
1390: idx = 6*i;
1391: t[idx] = s1;t[1+idx] = s2;
1392: t[2+idx] = s3;t[3+idx] = s4;
1393: t[4+idx] = s5;t[5+idx] = s6;
1394: }
1395: /* backward solve the upper triangular */
1396: for (i=n-1; i>=0; i--){
1397: v = aa + 36*diag[i] + 36;
1398: vi = aj + diag[i] + 1;
1399: nz = ai[i+1] - diag[i] - 1;
1400: idt = 6*i;
1401: s1 = t[idt]; s2 = t[1+idt];
1402: s3 = t[2+idt];s4 = t[3+idt];
1403: s5 = t[4+idt];s6 = t[5+idt];
1404: while (nz--) {
1405: idx = 6*(*vi++);
1406: x1 = t[idx]; x2 = t[1+idx];
1407: x3 = t[2+idx]; x4 = t[3+idx];
1408: x5 = t[4+idx]; x6 = t[5+idx];
1409: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415: v += 36;
1416: }
1417: idc = 6*(*c--);
1418: v = aa + 36*diag[i];
1419: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1420: v[18]*s4+v[24]*s5+v[30]*s6;
1421: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422: v[19]*s4+v[25]*s5+v[31]*s6;
1423: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424: v[20]*s4+v[26]*s5+v[32]*s6;
1425: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426: v[21]*s4+v[27]*s5+v[33]*s6;
1427: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428: v[22]*s4+v[28]*s5+v[34]*s6;
1429: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430: v[23]*s4+v[29]*s5+v[35]*s6;
1431: }
1433: ISRestoreIndices(isrow,&rout);
1434: ISRestoreIndices(iscol,&cout);
1435: VecRestoreArray(bb,&b);
1436: VecRestoreArray(xx,&x);
1437: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438: return(0);
1439: }
1443: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444: {
1445: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1446: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447: int ierr,*diag = a->diag,jdx;
1448: MatScalar *aa=a->a,*v;
1449: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1452: VecGetArray(bb,&b);
1453: VecGetArray(xx,&x);
1454: /* forward solve the lower triangular */
1455: idx = 0;
1456: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1457: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458: for (i=1; i<n; i++) {
1459: v = aa + 36*ai[i];
1460: vi = aj + ai[i];
1461: nz = diag[i] - ai[i];
1462: idx = 6*i;
1463: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1464: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465: while (nz--) {
1466: jdx = 6*(*vi++);
1467: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1468: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475: v += 36;
1476: }
1477: x[idx] = s1;
1478: x[1+idx] = s2;
1479: x[2+idx] = s3;
1480: x[3+idx] = s4;
1481: x[4+idx] = s5;
1482: x[5+idx] = s6;
1483: }
1484: /* backward solve the upper triangular */
1485: for (i=n-1; i>=0; i--){
1486: v = aa + 36*diag[i] + 36;
1487: vi = aj + diag[i] + 1;
1488: nz = ai[i+1] - diag[i] - 1;
1489: idt = 6*i;
1490: s1 = x[idt]; s2 = x[1+idt];
1491: s3 = x[2+idt]; s4 = x[3+idt];
1492: s5 = x[4+idt]; s6 = x[5+idt];
1493: while (nz--) {
1494: idx = 6*(*vi++);
1495: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1496: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503: v += 36;
1504: }
1505: v = aa + 36*diag[i];
1506: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512: }
1514: VecRestoreArray(bb,&b);
1515: VecRestoreArray(xx,&x);
1516: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517: return(0);
1518: }
1522: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523: {
1524: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1525: IS iscol=a->col,isrow=a->row;
1526: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527: int *diag = a->diag;
1528: MatScalar *aa=a->a,*v;
1529: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1532: VecGetArray(bb,&b);
1533: VecGetArray(xx,&x);
1534: t = a->solve_work;
1536: ISGetIndices(isrow,&rout); r = rout;
1537: ISGetIndices(iscol,&cout); c = cout + (n-1);
1539: /* forward solve the lower triangular */
1540: idx = 5*(*r++);
1541: t[0] = b[idx]; t[1] = b[1+idx];
1542: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543: for (i=1; i<n; i++) {
1544: v = aa + 25*ai[i];
1545: vi = aj + ai[i];
1546: nz = diag[i] - ai[i];
1547: idx = 5*(*r++);
1548: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549: s5 = b[4+idx];
1550: while (nz--) {
1551: idx = 5*(*vi++);
1552: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1553: x4 = t[3+idx];x5 = t[4+idx];
1554: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559: v += 25;
1560: }
1561: idx = 5*i;
1562: t[idx] = s1;t[1+idx] = s2;
1563: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564: }
1565: /* backward solve the upper triangular */
1566: for (i=n-1; i>=0; i--){
1567: v = aa + 25*diag[i] + 25;
1568: vi = aj + diag[i] + 1;
1569: nz = ai[i+1] - diag[i] - 1;
1570: idt = 5*i;
1571: s1 = t[idt]; s2 = t[1+idt];
1572: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573: while (nz--) {
1574: idx = 5*(*vi++);
1575: x1 = t[idx]; x2 = t[1+idx];
1576: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582: v += 25;
1583: }
1584: idc = 5*(*c--);
1585: v = aa + 25*diag[i];
1586: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1587: v[15]*s4+v[20]*s5;
1588: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589: v[16]*s4+v[21]*s5;
1590: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591: v[17]*s4+v[22]*s5;
1592: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593: v[18]*s4+v[23]*s5;
1594: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595: v[19]*s4+v[24]*s5;
1596: }
1598: ISRestoreIndices(isrow,&rout);
1599: ISRestoreIndices(iscol,&cout);
1600: VecRestoreArray(bb,&b);
1601: VecRestoreArray(xx,&x);
1602: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603: return(0);
1604: }
1608: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609: {
1610: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1611: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612: int ierr,*diag = a->diag,jdx;
1613: MatScalar *aa=a->a,*v;
1614: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1617: VecGetArray(bb,&b);
1618: VecGetArray(xx,&x);
1619: /* forward solve the lower triangular */
1620: idx = 0;
1621: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622: for (i=1; i<n; i++) {
1623: v = aa + 25*ai[i];
1624: vi = aj + ai[i];
1625: nz = diag[i] - ai[i];
1626: idx = 5*i;
1627: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628: while (nz--) {
1629: jdx = 5*(*vi++);
1630: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1632: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1633: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1634: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1635: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1636: v += 25;
1637: }
1638: x[idx] = s1;
1639: x[1+idx] = s2;
1640: x[2+idx] = s3;
1641: x[3+idx] = s4;
1642: x[4+idx] = s5;
1643: }
1644: /* backward solve the upper triangular */
1645: for (i=n-1; i>=0; i--){
1646: v = aa + 25*diag[i] + 25;
1647: vi = aj + diag[i] + 1;
1648: nz = ai[i+1] - diag[i] - 1;
1649: idt = 5*i;
1650: s1 = x[idt]; s2 = x[1+idt];
1651: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652: while (nz--) {
1653: idx = 5*(*vi++);
1654: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1656: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1657: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1658: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1659: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1660: v += 25;
1661: }
1662: v = aa + 25*diag[i];
1663: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1664: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1665: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1666: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1667: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1668: }
1670: VecRestoreArray(bb,&b);
1671: VecRestoreArray(xx,&x);
1672: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673: return(0);
1674: }
1678: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679: {
1680: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1681: IS iscol=a->col,isrow=a->row;
1682: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683: int *diag = a->diag;
1684: MatScalar *aa=a->a,*v;
1685: PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1688: VecGetArray(bb,&b);
1689: VecGetArray(xx,&x);
1690: t = a->solve_work;
1692: ISGetIndices(isrow,&rout); r = rout;
1693: ISGetIndices(iscol,&cout); c = cout + (n-1);
1695: /* forward solve the lower triangular */
1696: idx = 4*(*r++);
1697: t[0] = b[idx]; t[1] = b[1+idx];
1698: t[2] = b[2+idx]; t[3] = b[3+idx];
1699: for (i=1; i<n; i++) {
1700: v = aa + 16*ai[i];
1701: vi = aj + ai[i];
1702: nz = diag[i] - ai[i];
1703: idx = 4*(*r++);
1704: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705: while (nz--) {
1706: idx = 4*(*vi++);
1707: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1709: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1710: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712: v += 16;
1713: }
1714: idx = 4*i;
1715: t[idx] = s1;t[1+idx] = s2;
1716: t[2+idx] = s3;t[3+idx] = s4;
1717: }
1718: /* backward solve the upper triangular */
1719: for (i=n-1; i>=0; i--){
1720: v = aa + 16*diag[i] + 16;
1721: vi = aj + diag[i] + 1;
1722: nz = ai[i+1] - diag[i] - 1;
1723: idt = 4*i;
1724: s1 = t[idt]; s2 = t[1+idt];
1725: s3 = t[2+idt];s4 = t[3+idt];
1726: while (nz--) {
1727: idx = 4*(*vi++);
1728: x1 = t[idx]; x2 = t[1+idx];
1729: x3 = t[2+idx]; x4 = t[3+idx];
1730: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1731: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1732: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1733: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1734: v += 16;
1735: }
1736: idc = 4*(*c--);
1737: v = aa + 16*diag[i];
1738: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742: }
1744: ISRestoreIndices(isrow,&rout);
1745: ISRestoreIndices(iscol,&cout);
1746: VecRestoreArray(bb,&b);
1747: VecRestoreArray(xx,&x);
1748: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749: return(0);
1750: }
1754: int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755: {
1756: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1757: IS iscol=a->col,isrow=a->row;
1758: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759: int *diag = a->diag;
1760: MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761: PetscScalar *x,*b;
1764: VecGetArray(bb,&b);
1765: VecGetArray(xx,&x);
1766: t = (MatScalar *)a->solve_work;
1768: ISGetIndices(isrow,&rout); r = rout;
1769: ISGetIndices(iscol,&cout); c = cout + (n-1);
1771: /* forward solve the lower triangular */
1772: idx = 4*(*r++);
1773: t[0] = (MatScalar)b[idx];
1774: t[1] = (MatScalar)b[1+idx];
1775: t[2] = (MatScalar)b[2+idx];
1776: t[3] = (MatScalar)b[3+idx];
1777: for (i=1; i<n; i++) {
1778: v = aa + 16*ai[i];
1779: vi = aj + ai[i];
1780: nz = diag[i] - ai[i];
1781: idx = 4*(*r++);
1782: s1 = (MatScalar)b[idx];
1783: s2 = (MatScalar)b[1+idx];
1784: s3 = (MatScalar)b[2+idx];
1785: s4 = (MatScalar)b[3+idx];
1786: while (nz--) {
1787: idx = 4*(*vi++);
1788: x1 = t[idx];
1789: x2 = t[1+idx];
1790: x3 = t[2+idx];
1791: x4 = t[3+idx];
1792: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1793: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1794: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796: v += 16;
1797: }
1798: idx = 4*i;
1799: t[idx] = s1;
1800: t[1+idx] = s2;
1801: t[2+idx] = s3;
1802: t[3+idx] = s4;
1803: }
1804: /* backward solve the upper triangular */
1805: for (i=n-1; i>=0; i--){
1806: v = aa + 16*diag[i] + 16;
1807: vi = aj + diag[i] + 1;
1808: nz = ai[i+1] - diag[i] - 1;
1809: idt = 4*i;
1810: s1 = t[idt];
1811: s2 = t[1+idt];
1812: s3 = t[2+idt];
1813: s4 = t[3+idt];
1814: while (nz--) {
1815: idx = 4*(*vi++);
1816: x1 = t[idx];
1817: x2 = t[1+idx];
1818: x3 = t[2+idx];
1819: x4 = t[3+idx];
1820: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1821: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1822: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1823: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1824: v += 16;
1825: }
1826: idc = 4*(*c--);
1827: v = aa + 16*diag[i];
1828: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832: x[idc] = (PetscScalar)t[idt];
1833: x[1+idc] = (PetscScalar)t[1+idt];
1834: x[2+idc] = (PetscScalar)t[2+idt];
1835: x[3+idc] = (PetscScalar)t[3+idt];
1836: }
1838: ISRestoreIndices(isrow,&rout);
1839: ISRestoreIndices(iscol,&cout);
1840: VecRestoreArray(bb,&b);
1841: VecRestoreArray(xx,&x);
1842: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843: return(0);
1844: }
1846: #if defined (PETSC_HAVE_SSE)
1848: #include PETSC_HAVE_SSE
1852: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1853: {
1854: /*
1855: Note: This code uses demotion of double
1856: to float when performing the mixed-mode computation.
1857: This may not be numerically reasonable for all applications.
1858: */
1859: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1860: IS iscol=a->col,isrow=a->row;
1861: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1862: int *diag = a->diag,ai16;
1863: MatScalar *aa=a->a,*v;
1864: PetscScalar *x,*b,*t;
1866: /* Make space in temp stack for 16 Byte Aligned arrays */
1867: float ssealignedspace[11],*tmps,*tmpx;
1868: unsigned long offset;
1869:
1871: SSE_SCOPE_BEGIN;
1873: offset = (unsigned long)ssealignedspace % 16;
1874: if (offset) offset = (16 - offset)/4;
1875: tmps = &ssealignedspace[offset];
1876: tmpx = &ssealignedspace[offset+4];
1877: PREFETCH_NTA(aa+16*ai[1]);
1879: VecGetArray(bb,&b);
1880: VecGetArray(xx,&x);
1881: t = a->solve_work;
1883: ISGetIndices(isrow,&rout); r = rout;
1884: ISGetIndices(iscol,&cout); c = cout + (n-1);
1886: /* forward solve the lower triangular */
1887: idx = 4*(*r++);
1888: t[0] = b[idx]; t[1] = b[1+idx];
1889: t[2] = b[2+idx]; t[3] = b[3+idx];
1890: v = aa + 16*ai[1];
1892: for (i=1; i<n;) {
1893: PREFETCH_NTA(&v[8]);
1894: vi = aj + ai[i];
1895: nz = diag[i] - ai[i];
1896: idx = 4*(*r++);
1898: /* Demote sum from double to float */
1899: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1900: LOAD_PS(tmps,XMM7);
1902: while (nz--) {
1903: PREFETCH_NTA(&v[16]);
1904: idx = 4*(*vi++);
1905:
1906: /* Demote solution (so far) from double to float */
1907: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1909: /* 4x4 Matrix-Vector product with negative accumulation: */
1910: SSE_INLINE_BEGIN_2(tmpx,v)
1911: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1913: /* First Column */
1914: SSE_COPY_PS(XMM0,XMM6)
1915: SSE_SHUFFLE(XMM0,XMM0,0x00)
1916: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1917: SSE_SUB_PS(XMM7,XMM0)
1918:
1919: /* Second Column */
1920: SSE_COPY_PS(XMM1,XMM6)
1921: SSE_SHUFFLE(XMM1,XMM1,0x55)
1922: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1923: SSE_SUB_PS(XMM7,XMM1)
1924:
1925: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1926:
1927: /* Third Column */
1928: SSE_COPY_PS(XMM2,XMM6)
1929: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1930: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1931: SSE_SUB_PS(XMM7,XMM2)
1933: /* Fourth Column */
1934: SSE_COPY_PS(XMM3,XMM6)
1935: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1936: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1937: SSE_SUB_PS(XMM7,XMM3)
1938: SSE_INLINE_END_2
1939:
1940: v += 16;
1941: }
1942: idx = 4*i;
1943: v = aa + 16*ai[++i];
1944: PREFETCH_NTA(v);
1945: STORE_PS(tmps,XMM7);
1947: /* Promote result from float to double */
1948: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1949: }
1950: /* backward solve the upper triangular */
1951: idt = 4*(n-1);
1952: ai16 = 16*diag[n-1];
1953: v = aa + ai16 + 16;
1954: for (i=n-1; i>=0;){
1955: PREFETCH_NTA(&v[8]);
1956: vi = aj + diag[i] + 1;
1957: nz = ai[i+1] - diag[i] - 1;
1958:
1959: /* Demote accumulator from double to float */
1960: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1961: LOAD_PS(tmps,XMM7);
1963: while (nz--) {
1964: PREFETCH_NTA(&v[16]);
1965: idx = 4*(*vi++);
1967: /* Demote solution (so far) from double to float */
1968: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1970: /* 4x4 Matrix-Vector Product with negative accumulation: */
1971: SSE_INLINE_BEGIN_2(tmpx,v)
1972: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1974: /* First Column */
1975: SSE_COPY_PS(XMM0,XMM6)
1976: SSE_SHUFFLE(XMM0,XMM0,0x00)
1977: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1978: SSE_SUB_PS(XMM7,XMM0)
1980: /* Second Column */
1981: SSE_COPY_PS(XMM1,XMM6)
1982: SSE_SHUFFLE(XMM1,XMM1,0x55)
1983: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1984: SSE_SUB_PS(XMM7,XMM1)
1986: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1987:
1988: /* Third Column */
1989: SSE_COPY_PS(XMM2,XMM6)
1990: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1991: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1992: SSE_SUB_PS(XMM7,XMM2)
1994: /* Fourth Column */
1995: SSE_COPY_PS(XMM3,XMM6)
1996: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1997: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1998: SSE_SUB_PS(XMM7,XMM3)
1999: SSE_INLINE_END_2
2000: v += 16;
2001: }
2002: v = aa + ai16;
2003: ai16 = 16*diag[--i];
2004: PREFETCH_NTA(aa+ai16+16);
2005: /*
2006: Scale the result by the diagonal 4x4 block,
2007: which was inverted as part of the factorization
2008: */
2009: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2010: /* First Column */
2011: SSE_COPY_PS(XMM0,XMM7)
2012: SSE_SHUFFLE(XMM0,XMM0,0x00)
2013: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2015: /* Second Column */
2016: SSE_COPY_PS(XMM1,XMM7)
2017: SSE_SHUFFLE(XMM1,XMM1,0x55)
2018: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2019: SSE_ADD_PS(XMM0,XMM1)
2021: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2022:
2023: /* Third Column */
2024: SSE_COPY_PS(XMM2,XMM7)
2025: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2026: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2027: SSE_ADD_PS(XMM0,XMM2)
2029: /* Fourth Column */
2030: SSE_COPY_PS(XMM3,XMM7)
2031: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2032: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2033: SSE_ADD_PS(XMM0,XMM3)
2034:
2035: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2036: SSE_INLINE_END_3
2038: /* Promote solution from float to double */
2039: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2041: /* Apply reordering to t and stream into x. */
2042: /* This way, x doesn't pollute the cache. */
2043: /* Be careful with size: 2 doubles = 4 floats! */
2044: idc = 4*(*c--);
2045: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2046: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2047: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2048: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2049: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2050: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2051: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2052: SSE_INLINE_END_2
2053: v = aa + ai16 + 16;
2054: idt -= 4;
2055: }
2057: ISRestoreIndices(isrow,&rout);
2058: ISRestoreIndices(iscol,&cout);
2059: VecRestoreArray(bb,&b);
2060: VecRestoreArray(xx,&x);
2061: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2062: SSE_SCOPE_END;
2063: return(0);
2064: }
2066: #endif
2069: /*
2070: Special case where the matrix was ILU(0) factored in the natural
2071: ordering. This eliminates the need for the column and row permutation.
2072: */
2075: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2076: {
2077: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2078: int n=a->mbs,*ai=a->i,*aj=a->j;
2079: int ierr,*diag = a->diag;
2080: MatScalar *aa=a->a;
2081: PetscScalar *x,*b;
2084: VecGetArray(bb,&b);
2085: VecGetArray(xx,&x);
2087: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2088: {
2089: static PetscScalar w[2000]; /* very BAD need to fix */
2090: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2091: }
2092: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2093: {
2094: static PetscScalar w[2000]; /* very BAD need to fix */
2095: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2096: }
2097: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2098: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099: #else
2100: {
2101: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2102: MatScalar *v;
2103: int jdx,idt,idx,nz,*vi,i,ai16;
2105: /* forward solve the lower triangular */
2106: idx = 0;
2107: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2108: for (i=1; i<n; i++) {
2109: v = aa + 16*ai[i];
2110: vi = aj + ai[i];
2111: nz = diag[i] - ai[i];
2112: idx += 4;
2113: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2114: while (nz--) {
2115: jdx = 4*(*vi++);
2116: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2118: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2119: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2121: v += 16;
2122: }
2123: x[idx] = s1;
2124: x[1+idx] = s2;
2125: x[2+idx] = s3;
2126: x[3+idx] = s4;
2127: }
2128: /* backward solve the upper triangular */
2129: idt = 4*(n-1);
2130: for (i=n-1; i>=0; i--){
2131: ai16 = 16*diag[i];
2132: v = aa + ai16 + 16;
2133: vi = aj + diag[i] + 1;
2134: nz = ai[i+1] - diag[i] - 1;
2135: s1 = x[idt]; s2 = x[1+idt];
2136: s3 = x[2+idt];s4 = x[3+idt];
2137: while (nz--) {
2138: idx = 4*(*vi++);
2139: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2140: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2141: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2142: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2143: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2144: v += 16;
2145: }
2146: v = aa + ai16;
2147: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2148: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2149: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151: idt -= 4;
2152: }
2153: }
2154: #endif
2156: VecRestoreArray(bb,&b);
2157: VecRestoreArray(xx,&x);
2158: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2159: return(0);
2160: }
2164: int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165: {
2166: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2167: int n=a->mbs,*ai=a->i,*aj=a->j;
2168: int ierr,*diag = a->diag;
2169: MatScalar *aa=a->a;
2170: PetscScalar *x,*b;
2173: VecGetArray(bb,&b);
2174: VecGetArray(xx,&x);
2176: {
2177: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2178: MatScalar *v,*t=(MatScalar *)x;
2179: int jdx,idt,idx,nz,*vi,i,ai16;
2181: /* forward solve the lower triangular */
2182: idx = 0;
2183: t[0] = (MatScalar)b[0];
2184: t[1] = (MatScalar)b[1];
2185: t[2] = (MatScalar)b[2];
2186: t[3] = (MatScalar)b[3];
2187: for (i=1; i<n; i++) {
2188: v = aa + 16*ai[i];
2189: vi = aj + ai[i];
2190: nz = diag[i] - ai[i];
2191: idx += 4;
2192: s1 = (MatScalar)b[idx];
2193: s2 = (MatScalar)b[1+idx];
2194: s3 = (MatScalar)b[2+idx];
2195: s4 = (MatScalar)b[3+idx];
2196: while (nz--) {
2197: jdx = 4*(*vi++);
2198: x1 = t[jdx];
2199: x2 = t[1+jdx];
2200: x3 = t[2+jdx];
2201: x4 = t[3+jdx];
2202: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2203: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2204: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206: v += 16;
2207: }
2208: t[idx] = s1;
2209: t[1+idx] = s2;
2210: t[2+idx] = s3;
2211: t[3+idx] = s4;
2212: }
2213: /* backward solve the upper triangular */
2214: idt = 4*(n-1);
2215: for (i=n-1; i>=0; i--){
2216: ai16 = 16*diag[i];
2217: v = aa + ai16 + 16;
2218: vi = aj + diag[i] + 1;
2219: nz = ai[i+1] - diag[i] - 1;
2220: s1 = t[idt];
2221: s2 = t[1+idt];
2222: s3 = t[2+idt];
2223: s4 = t[3+idt];
2224: while (nz--) {
2225: idx = 4*(*vi++);
2226: x1 = (MatScalar)x[idx];
2227: x2 = (MatScalar)x[1+idx];
2228: x3 = (MatScalar)x[2+idx];
2229: x4 = (MatScalar)x[3+idx];
2230: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2231: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2232: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234: v += 16;
2235: }
2236: v = aa + ai16;
2237: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2238: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2239: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241: idt -= 4;
2242: }
2243: }
2245: VecRestoreArray(bb,&b);
2246: VecRestoreArray(xx,&x);
2247: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248: return(0);
2249: }
2251: #if defined (PETSC_HAVE_SSE)
2253: #include PETSC_HAVE_SSE
2254: #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */
2257: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2258: {
2259: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2260: unsigned short *aj=(unsigned short *)a->j;
2261: int ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2262: MatScalar *aa=a->a;
2263: PetscScalar *x,*b;
2266: SSE_SCOPE_BEGIN;
2267: /*
2268: Note: This code currently uses demotion of double
2269: to float when performing the mixed-mode computation.
2270: This may not be numerically reasonable for all applications.
2271: */
2272: PREFETCH_NTA(aa+16*ai[1]);
2274: VecGetArrayFast(bb,&b);
2275: VecGetArrayFast(xx,&x);
2276: {
2277: /* x will first be computed in single precision then promoted inplace to double */
2278: MatScalar *v,*t=(MatScalar *)x;
2279: int nz,i,idt,ai16;
2280: unsigned int jdx,idx;
2281: unsigned short *vi;
2282: /* Forward solve the lower triangular factor. */
2284: /* First block is the identity. */
2285: idx = 0;
2286: CONVERT_DOUBLE4_FLOAT4(t,b);
2287: v = aa + 16*((unsigned int)ai[1]);
2289: for (i=1; i<n;) {
2290: PREFETCH_NTA(&v[8]);
2291: vi = aj + ai[i];
2292: nz = diag[i] - ai[i];
2293: idx += 4;
2295: /* Demote RHS from double to float. */
2296: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2297: LOAD_PS(&t[idx],XMM7);
2299: while (nz--) {
2300: PREFETCH_NTA(&v[16]);
2301: jdx = 4*((unsigned int)(*vi++));
2302:
2303: /* 4x4 Matrix-Vector product with negative accumulation: */
2304: SSE_INLINE_BEGIN_2(&t[jdx],v)
2305: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2307: /* First Column */
2308: SSE_COPY_PS(XMM0,XMM6)
2309: SSE_SHUFFLE(XMM0,XMM0,0x00)
2310: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2311: SSE_SUB_PS(XMM7,XMM0)
2313: /* Second Column */
2314: SSE_COPY_PS(XMM1,XMM6)
2315: SSE_SHUFFLE(XMM1,XMM1,0x55)
2316: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2317: SSE_SUB_PS(XMM7,XMM1)
2319: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2320:
2321: /* Third Column */
2322: SSE_COPY_PS(XMM2,XMM6)
2323: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2324: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2325: SSE_SUB_PS(XMM7,XMM2)
2327: /* Fourth Column */
2328: SSE_COPY_PS(XMM3,XMM6)
2329: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2330: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2331: SSE_SUB_PS(XMM7,XMM3)
2332: SSE_INLINE_END_2
2333:
2334: v += 16;
2335: }
2336: v = aa + 16*ai[++i];
2337: PREFETCH_NTA(v);
2338: STORE_PS(&t[idx],XMM7);
2339: }
2341: /* Backward solve the upper triangular factor.*/
2343: idt = 4*(n-1);
2344: ai16 = 16*diag[n-1];
2345: v = aa + ai16 + 16;
2346: for (i=n-1; i>=0;){
2347: PREFETCH_NTA(&v[8]);
2348: vi = aj + diag[i] + 1;
2349: nz = ai[i+1] - diag[i] - 1;
2350:
2351: LOAD_PS(&t[idt],XMM7);
2353: while (nz--) {
2354: PREFETCH_NTA(&v[16]);
2355: idx = 4*((unsigned int)(*vi++));
2357: /* 4x4 Matrix-Vector Product with negative accumulation: */
2358: SSE_INLINE_BEGIN_2(&t[idx],v)
2359: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2361: /* First Column */
2362: SSE_COPY_PS(XMM0,XMM6)
2363: SSE_SHUFFLE(XMM0,XMM0,0x00)
2364: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2365: SSE_SUB_PS(XMM7,XMM0)
2367: /* Second Column */
2368: SSE_COPY_PS(XMM1,XMM6)
2369: SSE_SHUFFLE(XMM1,XMM1,0x55)
2370: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2371: SSE_SUB_PS(XMM7,XMM1)
2373: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2374:
2375: /* Third Column */
2376: SSE_COPY_PS(XMM2,XMM6)
2377: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2378: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2379: SSE_SUB_PS(XMM7,XMM2)
2381: /* Fourth Column */
2382: SSE_COPY_PS(XMM3,XMM6)
2383: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2384: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2385: SSE_SUB_PS(XMM7,XMM3)
2386: SSE_INLINE_END_2
2387: v += 16;
2388: }
2389: v = aa + ai16;
2390: ai16 = 16*diag[--i];
2391: PREFETCH_NTA(aa+ai16+16);
2392: /*
2393: Scale the result by the diagonal 4x4 block,
2394: which was inverted as part of the factorization
2395: */
2396: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2397: /* First Column */
2398: SSE_COPY_PS(XMM0,XMM7)
2399: SSE_SHUFFLE(XMM0,XMM0,0x00)
2400: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2402: /* Second Column */
2403: SSE_COPY_PS(XMM1,XMM7)
2404: SSE_SHUFFLE(XMM1,XMM1,0x55)
2405: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2406: SSE_ADD_PS(XMM0,XMM1)
2408: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2409:
2410: /* Third Column */
2411: SSE_COPY_PS(XMM2,XMM7)
2412: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2413: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2414: SSE_ADD_PS(XMM0,XMM2)
2416: /* Fourth Column */
2417: SSE_COPY_PS(XMM3,XMM7)
2418: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2419: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2420: SSE_ADD_PS(XMM0,XMM3)
2422: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2423: SSE_INLINE_END_3
2425: v = aa + ai16 + 16;
2426: idt -= 4;
2427: }
2429: /* Convert t from single precision back to double precision (inplace)*/
2430: idt = 4*(n-1);
2431: for (i=n-1;i>=0;i--) {
2432: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2433: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2434: PetscScalar *xtemp=&x[idt];
2435: MatScalar *ttemp=&t[idt];
2436: xtemp[3] = (PetscScalar)ttemp[3];
2437: xtemp[2] = (PetscScalar)ttemp[2];
2438: xtemp[1] = (PetscScalar)ttemp[1];
2439: xtemp[0] = (PetscScalar)ttemp[0];
2440: idt -= 4;
2441: }
2443: } /* End of artificial scope. */
2444: VecRestoreArrayFast(bb,&b);
2445: VecRestoreArrayFast(xx,&x);
2446: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2447: SSE_SCOPE_END;
2448: return(0);
2449: }
2453: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2454: {
2455: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2456: int *aj=a->j;
2457: int ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2458: MatScalar *aa=a->a;
2459: PetscScalar *x,*b;
2462: SSE_SCOPE_BEGIN;
2463: /*
2464: Note: This code currently uses demotion of double
2465: to float when performing the mixed-mode computation.
2466: This may not be numerically reasonable for all applications.
2467: */
2468: PREFETCH_NTA(aa+16*ai[1]);
2470: VecGetArrayFast(bb,&b);
2471: VecGetArrayFast(xx,&x);
2472: {
2473: /* x will first be computed in single precision then promoted inplace to double */
2474: MatScalar *v,*t=(MatScalar *)x;
2475: int nz,i,idt,ai16;
2476: int jdx,idx;
2477: int *vi;
2478: /* Forward solve the lower triangular factor. */
2480: /* First block is the identity. */
2481: idx = 0;
2482: CONVERT_DOUBLE4_FLOAT4(t,b);
2483: v = aa + 16*ai[1];
2485: for (i=1; i<n;) {
2486: PREFETCH_NTA(&v[8]);
2487: vi = aj + ai[i];
2488: nz = diag[i] - ai[i];
2489: idx += 4;
2491: /* Demote RHS from double to float. */
2492: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2493: LOAD_PS(&t[idx],XMM7);
2495: while (nz--) {
2496: PREFETCH_NTA(&v[16]);
2497: jdx = 4*(*vi++);
2498: /* jdx = *vi++; */
2499:
2500: /* 4x4 Matrix-Vector product with negative accumulation: */
2501: SSE_INLINE_BEGIN_2(&t[jdx],v)
2502: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2504: /* First Column */
2505: SSE_COPY_PS(XMM0,XMM6)
2506: SSE_SHUFFLE(XMM0,XMM0,0x00)
2507: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2508: SSE_SUB_PS(XMM7,XMM0)
2510: /* Second Column */
2511: SSE_COPY_PS(XMM1,XMM6)
2512: SSE_SHUFFLE(XMM1,XMM1,0x55)
2513: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2514: SSE_SUB_PS(XMM7,XMM1)
2516: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2517:
2518: /* Third Column */
2519: SSE_COPY_PS(XMM2,XMM6)
2520: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2521: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2522: SSE_SUB_PS(XMM7,XMM2)
2524: /* Fourth Column */
2525: SSE_COPY_PS(XMM3,XMM6)
2526: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2527: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2528: SSE_SUB_PS(XMM7,XMM3)
2529: SSE_INLINE_END_2
2530:
2531: v += 16;
2532: }
2533: v = aa + 16*ai[++i];
2534: PREFETCH_NTA(v);
2535: STORE_PS(&t[idx],XMM7);
2536: }
2538: /* Backward solve the upper triangular factor.*/
2540: idt = 4*(n-1);
2541: ai16 = 16*diag[n-1];
2542: v = aa + ai16 + 16;
2543: for (i=n-1; i>=0;){
2544: PREFETCH_NTA(&v[8]);
2545: vi = aj + diag[i] + 1;
2546: nz = ai[i+1] - diag[i] - 1;
2547:
2548: LOAD_PS(&t[idt],XMM7);
2550: while (nz--) {
2551: PREFETCH_NTA(&v[16]);
2552: idx = 4*(*vi++);
2553: /* idx = *vi++; */
2555: /* 4x4 Matrix-Vector Product with negative accumulation: */
2556: SSE_INLINE_BEGIN_2(&t[idx],v)
2557: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2559: /* First Column */
2560: SSE_COPY_PS(XMM0,XMM6)
2561: SSE_SHUFFLE(XMM0,XMM0,0x00)
2562: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2563: SSE_SUB_PS(XMM7,XMM0)
2565: /* Second Column */
2566: SSE_COPY_PS(XMM1,XMM6)
2567: SSE_SHUFFLE(XMM1,XMM1,0x55)
2568: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2569: SSE_SUB_PS(XMM7,XMM1)
2571: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2572:
2573: /* Third Column */
2574: SSE_COPY_PS(XMM2,XMM6)
2575: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2576: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2577: SSE_SUB_PS(XMM7,XMM2)
2579: /* Fourth Column */
2580: SSE_COPY_PS(XMM3,XMM6)
2581: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2582: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2583: SSE_SUB_PS(XMM7,XMM3)
2584: SSE_INLINE_END_2
2585: v += 16;
2586: }
2587: v = aa + ai16;
2588: ai16 = 16*diag[--i];
2589: PREFETCH_NTA(aa+ai16+16);
2590: /*
2591: Scale the result by the diagonal 4x4 block,
2592: which was inverted as part of the factorization
2593: */
2594: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2595: /* First Column */
2596: SSE_COPY_PS(XMM0,XMM7)
2597: SSE_SHUFFLE(XMM0,XMM0,0x00)
2598: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2600: /* Second Column */
2601: SSE_COPY_PS(XMM1,XMM7)
2602: SSE_SHUFFLE(XMM1,XMM1,0x55)
2603: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2604: SSE_ADD_PS(XMM0,XMM1)
2606: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2607:
2608: /* Third Column */
2609: SSE_COPY_PS(XMM2,XMM7)
2610: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2612: SSE_ADD_PS(XMM0,XMM2)
2614: /* Fourth Column */
2615: SSE_COPY_PS(XMM3,XMM7)
2616: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2618: SSE_ADD_PS(XMM0,XMM3)
2620: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2621: SSE_INLINE_END_3
2623: v = aa + ai16 + 16;
2624: idt -= 4;
2625: }
2627: /* Convert t from single precision back to double precision (inplace)*/
2628: idt = 4*(n-1);
2629: for (i=n-1;i>=0;i--) {
2630: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2631: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2632: PetscScalar *xtemp=&x[idt];
2633: MatScalar *ttemp=&t[idt];
2634: xtemp[3] = (PetscScalar)ttemp[3];
2635: xtemp[2] = (PetscScalar)ttemp[2];
2636: xtemp[1] = (PetscScalar)ttemp[1];
2637: xtemp[0] = (PetscScalar)ttemp[0];
2638: idt -= 4;
2639: }
2641: } /* End of artificial scope. */
2642: VecRestoreArrayFast(bb,&b);
2643: VecRestoreArrayFast(xx,&x);
2644: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2645: SSE_SCOPE_END;
2646: return(0);
2647: }
2649: #endif
2652: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2653: {
2654: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2655: IS iscol=a->col,isrow=a->row;
2656: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2657: int *diag = a->diag;
2658: MatScalar *aa=a->a,*v;
2659: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t;
2662: VecGetArray(bb,&b);
2663: VecGetArray(xx,&x);
2664: t = a->solve_work;
2666: ISGetIndices(isrow,&rout); r = rout;
2667: ISGetIndices(iscol,&cout); c = cout + (n-1);
2669: /* forward solve the lower triangular */
2670: idx = 3*(*r++);
2671: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2672: for (i=1; i<n; i++) {
2673: v = aa + 9*ai[i];
2674: vi = aj + ai[i];
2675: nz = diag[i] - ai[i];
2676: idx = 3*(*r++);
2677: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2678: while (nz--) {
2679: idx = 3*(*vi++);
2680: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2681: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2682: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2683: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2684: v += 9;
2685: }
2686: idx = 3*i;
2687: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2688: }
2689: /* backward solve the upper triangular */
2690: for (i=n-1; i>=0; i--){
2691: v = aa + 9*diag[i] + 9;
2692: vi = aj + diag[i] + 1;
2693: nz = ai[i+1] - diag[i] - 1;
2694: idt = 3*i;
2695: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2696: while (nz--) {
2697: idx = 3*(*vi++);
2698: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2699: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2700: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2701: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2702: v += 9;
2703: }
2704: idc = 3*(*c--);
2705: v = aa + 9*diag[i];
2706: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2707: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2708: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2709: }
2710: ISRestoreIndices(isrow,&rout);
2711: ISRestoreIndices(iscol,&cout);
2712: VecRestoreArray(bb,&b);
2713: VecRestoreArray(xx,&x);
2714: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2715: return(0);
2716: }
2718: /*
2719: Special case where the matrix was ILU(0) factored in the natural
2720: ordering. This eliminates the need for the column and row permutation.
2721: */
2724: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2725: {
2726: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2727: int n=a->mbs,*ai=a->i,*aj=a->j;
2728: int ierr,*diag = a->diag;
2729: MatScalar *aa=a->a,*v;
2730: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3;
2731: int jdx,idt,idx,nz,*vi,i;
2734: VecGetArray(bb,&b);
2735: VecGetArray(xx,&x);
2738: /* forward solve the lower triangular */
2739: idx = 0;
2740: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2741: for (i=1; i<n; i++) {
2742: v = aa + 9*ai[i];
2743: vi = aj + ai[i];
2744: nz = diag[i] - ai[i];
2745: idx += 3;
2746: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2747: while (nz--) {
2748: jdx = 3*(*vi++);
2749: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2750: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2751: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2752: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2753: v += 9;
2754: }
2755: x[idx] = s1;
2756: x[1+idx] = s2;
2757: x[2+idx] = s3;
2758: }
2759: /* backward solve the upper triangular */
2760: for (i=n-1; i>=0; i--){
2761: v = aa + 9*diag[i] + 9;
2762: vi = aj + diag[i] + 1;
2763: nz = ai[i+1] - diag[i] - 1;
2764: idt = 3*i;
2765: s1 = x[idt]; s2 = x[1+idt];
2766: s3 = x[2+idt];
2767: while (nz--) {
2768: idx = 3*(*vi++);
2769: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2770: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2771: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2772: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2773: v += 9;
2774: }
2775: v = aa + 9*diag[i];
2776: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2777: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2778: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2779: }
2781: VecRestoreArray(bb,&b);
2782: VecRestoreArray(xx,&x);
2783: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2784: return(0);
2785: }
2789: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2790: {
2791: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2792: IS iscol=a->col,isrow=a->row;
2793: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2794: int *diag = a->diag;
2795: MatScalar *aa=a->a,*v;
2796: PetscScalar *x,*b,s1,s2,x1,x2,*t;
2799: VecGetArray(bb,&b);
2800: VecGetArray(xx,&x);
2801: t = a->solve_work;
2803: ISGetIndices(isrow,&rout); r = rout;
2804: ISGetIndices(iscol,&cout); c = cout + (n-1);
2806: /* forward solve the lower triangular */
2807: idx = 2*(*r++);
2808: t[0] = b[idx]; t[1] = b[1+idx];
2809: for (i=1; i<n; i++) {
2810: v = aa + 4*ai[i];
2811: vi = aj + ai[i];
2812: nz = diag[i] - ai[i];
2813: idx = 2*(*r++);
2814: s1 = b[idx]; s2 = b[1+idx];
2815: while (nz--) {
2816: idx = 2*(*vi++);
2817: x1 = t[idx]; x2 = t[1+idx];
2818: s1 -= v[0]*x1 + v[2]*x2;
2819: s2 -= v[1]*x1 + v[3]*x2;
2820: v += 4;
2821: }
2822: idx = 2*i;
2823: t[idx] = s1; t[1+idx] = s2;
2824: }
2825: /* backward solve the upper triangular */
2826: for (i=n-1; i>=0; i--){
2827: v = aa + 4*diag[i] + 4;
2828: vi = aj + diag[i] + 1;
2829: nz = ai[i+1] - diag[i] - 1;
2830: idt = 2*i;
2831: s1 = t[idt]; s2 = t[1+idt];
2832: while (nz--) {
2833: idx = 2*(*vi++);
2834: x1 = t[idx]; x2 = t[1+idx];
2835: s1 -= v[0]*x1 + v[2]*x2;
2836: s2 -= v[1]*x1 + v[3]*x2;
2837: v += 4;
2838: }
2839: idc = 2*(*c--);
2840: v = aa + 4*diag[i];
2841: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2842: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2843: }
2844: ISRestoreIndices(isrow,&rout);
2845: ISRestoreIndices(iscol,&cout);
2846: VecRestoreArray(bb,&b);
2847: VecRestoreArray(xx,&x);
2848: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2849: return(0);
2850: }
2852: /*
2853: Special case where the matrix was ILU(0) factored in the natural
2854: ordering. This eliminates the need for the column and row permutation.
2855: */
2858: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859: {
2860: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2861: int n=a->mbs,*ai=a->i,*aj=a->j;
2862: int ierr,*diag = a->diag;
2863: MatScalar *aa=a->a,*v;
2864: PetscScalar *x,*b,s1,s2,x1,x2;
2865: int jdx,idt,idx,nz,*vi,i;
2868: VecGetArray(bb,&b);
2869: VecGetArray(xx,&x);
2871: /* forward solve the lower triangular */
2872: idx = 0;
2873: x[0] = b[0]; x[1] = b[1];
2874: for (i=1; i<n; i++) {
2875: v = aa + 4*ai[i];
2876: vi = aj + ai[i];
2877: nz = diag[i] - ai[i];
2878: idx += 2;
2879: s1 = b[idx];s2 = b[1+idx];
2880: while (nz--) {
2881: jdx = 2*(*vi++);
2882: x1 = x[jdx];x2 = x[1+jdx];
2883: s1 -= v[0]*x1 + v[2]*x2;
2884: s2 -= v[1]*x1 + v[3]*x2;
2885: v += 4;
2886: }
2887: x[idx] = s1;
2888: x[1+idx] = s2;
2889: }
2890: /* backward solve the upper triangular */
2891: for (i=n-1; i>=0; i--){
2892: v = aa + 4*diag[i] + 4;
2893: vi = aj + diag[i] + 1;
2894: nz = ai[i+1] - diag[i] - 1;
2895: idt = 2*i;
2896: s1 = x[idt]; s2 = x[1+idt];
2897: while (nz--) {
2898: idx = 2*(*vi++);
2899: x1 = x[idx]; x2 = x[1+idx];
2900: s1 -= v[0]*x1 + v[2]*x2;
2901: s2 -= v[1]*x1 + v[3]*x2;
2902: v += 4;
2903: }
2904: v = aa + 4*diag[i];
2905: x[idt] = v[0]*s1 + v[2]*s2;
2906: x[1+idt] = v[1]*s1 + v[3]*s2;
2907: }
2909: VecRestoreArray(bb,&b);
2910: VecRestoreArray(xx,&x);
2911: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2912: return(0);
2913: }
2917: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2918: {
2919: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2920: IS iscol=a->col,isrow=a->row;
2921: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2922: int *diag = a->diag;
2923: MatScalar *aa=a->a,*v;
2924: PetscScalar *x,*b,s1,*t;
2927: if (!n) return(0);
2929: VecGetArray(bb,&b);
2930: VecGetArray(xx,&x);
2931: t = a->solve_work;
2933: ISGetIndices(isrow,&rout); r = rout;
2934: ISGetIndices(iscol,&cout); c = cout + (n-1);
2936: /* forward solve the lower triangular */
2937: t[0] = b[*r++];
2938: for (i=1; i<n; i++) {
2939: v = aa + ai[i];
2940: vi = aj + ai[i];
2941: nz = diag[i] - ai[i];
2942: s1 = b[*r++];
2943: while (nz--) {
2944: s1 -= (*v++)*t[*vi++];
2945: }
2946: t[i] = s1;
2947: }
2948: /* backward solve the upper triangular */
2949: for (i=n-1; i>=0; i--){
2950: v = aa + diag[i] + 1;
2951: vi = aj + diag[i] + 1;
2952: nz = ai[i+1] - diag[i] - 1;
2953: s1 = t[i];
2954: while (nz--) {
2955: s1 -= (*v++)*t[*vi++];
2956: }
2957: x[*c--] = t[i] = aa[diag[i]]*s1;
2958: }
2960: ISRestoreIndices(isrow,&rout);
2961: ISRestoreIndices(iscol,&cout);
2962: VecRestoreArray(bb,&b);
2963: VecRestoreArray(xx,&x);
2964: PetscLogFlops(2*1*(a->nz) - A->n);
2965: return(0);
2966: }
2967: /*
2968: Special case where the matrix was ILU(0) factored in the natural
2969: ordering. This eliminates the need for the column and row permutation.
2970: */
2973: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2974: {
2975: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2976: int n=a->mbs,*ai=a->i,*aj=a->j;
2977: int ierr,*diag = a->diag;
2978: MatScalar *aa=a->a;
2979: PetscScalar *x,*b;
2980: PetscScalar s1,x1;
2981: MatScalar *v;
2982: int jdx,idt,idx,nz,*vi,i;
2985: VecGetArray(bb,&b);
2986: VecGetArray(xx,&x);
2988: /* forward solve the lower triangular */
2989: idx = 0;
2990: x[0] = b[0];
2991: for (i=1; i<n; i++) {
2992: v = aa + ai[i];
2993: vi = aj + ai[i];
2994: nz = diag[i] - ai[i];
2995: idx += 1;
2996: s1 = b[idx];
2997: while (nz--) {
2998: jdx = *vi++;
2999: x1 = x[jdx];
3000: s1 -= v[0]*x1;
3001: v += 1;
3002: }
3003: x[idx] = s1;
3004: }
3005: /* backward solve the upper triangular */
3006: for (i=n-1; i>=0; i--){
3007: v = aa + diag[i] + 1;
3008: vi = aj + diag[i] + 1;
3009: nz = ai[i+1] - diag[i] - 1;
3010: idt = i;
3011: s1 = x[idt];
3012: while (nz--) {
3013: idx = *vi++;
3014: x1 = x[idx];
3015: s1 -= v[0]*x1;
3016: v += 1;
3017: }
3018: v = aa + diag[i];
3019: x[idt] = v[0]*s1;
3020: }
3021: VecRestoreArray(bb,&b);
3022: VecRestoreArray(xx,&x);
3023: PetscLogFlops(2*(a->nz) - A->n);
3024: return(0);
3025: }
3027: /* ----------------------------------------------------------------*/
3028: /*
3029: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3030: except that the data structure of Mat_SeqAIJ is slightly different.
3031: Not a good example of code reuse.
3032: */
3033: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3037: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3038: {
3039: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3040: IS isicol;
3041: int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3042: int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3043: int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3044: int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3045: PetscTruth col_identity,row_identity;
3046: PetscReal f;
3049: f = info->fill;
3050: levels = (int)info->levels;
3051: diagonal_fill = (int)info->diagonal_fill;
3052: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3053: ISIdentity(isrow,&row_identity);
3054: ISIdentity(iscol,&col_identity);
3056: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
3057: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3058: (*fact)->factor = FACTOR_LU;
3059: b = (Mat_SeqBAIJ*)(*fact)->data;
3060: if (!b->diag) {
3061: MatMarkDiagonal_SeqBAIJ(*fact);
3062: }
3063: MatMissingDiagonal_SeqBAIJ(*fact);
3064: b->row = isrow;
3065: b->col = iscol;
3066: PetscObjectReference((PetscObject)isrow);
3067: PetscObjectReference((PetscObject)iscol);
3068: b->icol = isicol;
3069: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3070: PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
3071: } else { /* general case perform the symbolic factorization */
3072: ISGetIndices(isrow,&r);
3073: ISGetIndices(isicol,&ic);
3075: /* get new row pointers */
3076: PetscMalloc((n+1)*sizeof(int),&ainew);
3077: ainew[0] = 0;
3078: /* don't know how many column pointers are needed so estimate */
3079: jmax = (int)(f*ai[n] + 1);
3080: PetscMalloc((jmax)*sizeof(int),&ajnew);
3081: /* ajfill is level of fill for each fill entry */
3082: PetscMalloc((jmax)*sizeof(int),&ajfill);
3083: /* fill is a linked list of nonzeros in active row */
3084: PetscMalloc((n+1)*sizeof(int),&fill);
3085: /* im is level for each filled value */
3086: PetscMalloc((n+1)*sizeof(int),&im);
3087: /* dloc is location of diagonal in factor */
3088: PetscMalloc((n+1)*sizeof(int),&dloc);
3089: dloc[0] = 0;
3090: for (prow=0; prow<n; prow++) {
3092: /* copy prow into linked list */
3093: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3094: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3095: xi = aj + ai[r[prow]];
3096: fill[n] = n;
3097: fill[prow] = -1; /* marker for diagonal entry */
3098: while (nz--) {
3099: fm = n;
3100: idx = ic[*xi++];
3101: do {
3102: m = fm;
3103: fm = fill[m];
3104: } while (fm < idx);
3105: fill[m] = idx;
3106: fill[idx] = fm;
3107: im[idx] = 0;
3108: }
3110: /* make sure diagonal entry is included */
3111: if (diagonal_fill && fill[prow] == -1) {
3112: fm = n;
3113: while (fill[fm] < prow) fm = fill[fm];
3114: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3115: fill[fm] = prow;
3116: im[prow] = 0;
3117: nzf++;
3118: dcount++;
3119: }
3121: nzi = 0;
3122: row = fill[n];
3123: while (row < prow) {
3124: incrlev = im[row] + 1;
3125: nz = dloc[row];
3126: xi = ajnew + ainew[row] + nz + 1;
3127: flev = ajfill + ainew[row] + nz + 1;
3128: nnz = ainew[row+1] - ainew[row] - nz - 1;
3129: fm = row;
3130: while (nnz-- > 0) {
3131: idx = *xi++;
3132: if (*flev + incrlev > levels) {
3133: flev++;
3134: continue;
3135: }
3136: do {
3137: m = fm;
3138: fm = fill[m];
3139: } while (fm < idx);
3140: if (fm != idx) {
3141: im[idx] = *flev + incrlev;
3142: fill[m] = idx;
3143: fill[idx] = fm;
3144: fm = idx;
3145: nzf++;
3146: } else {
3147: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3148: }
3149: flev++;
3150: }
3151: row = fill[row];
3152: nzi++;
3153: }
3154: /* copy new filled row into permanent storage */
3155: ainew[prow+1] = ainew[prow] + nzf;
3156: if (ainew[prow+1] > jmax) {
3158: /* estimate how much additional space we will need */
3159: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3160: /* just double the memory each time */
3161: int maxadd = jmax;
3162: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3163: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3164: jmax += maxadd;
3166: /* allocate a longer ajnew and ajfill */
3167: PetscMalloc(jmax*sizeof(int),&xi);
3168: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
3169: PetscFree(ajnew);
3170: ajnew = xi;
3171: PetscMalloc(jmax*sizeof(int),&xi);
3172: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
3173: PetscFree(ajfill);
3174: ajfill = xi;
3175: reallocate++; /* count how many reallocations are needed */
3176: }
3177: xi = ajnew + ainew[prow];
3178: flev = ajfill + ainew[prow];
3179: dloc[prow] = nzi;
3180: fm = fill[n];
3181: while (nzf--) {
3182: *xi++ = fm;
3183: *flev++ = im[fm];
3184: fm = fill[fm];
3185: }
3186: /* make sure row has diagonal entry */
3187: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3188: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3189: try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3190: }
3191: }
3192: PetscFree(ajfill);
3193: ISRestoreIndices(isrow,&r);
3194: ISRestoreIndices(isicol,&ic);
3195: PetscFree(fill);
3196: PetscFree(im);
3198: {
3199: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3200: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3201: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3202: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3203: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3204: if (diagonal_fill) {
3205: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3206: }
3207: }
3209: /* put together the new matrix */
3210: MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
3211: PetscLogObjectParent(*fact,isicol);
3212: b = (Mat_SeqBAIJ*)(*fact)->data;
3213: PetscFree(b->imax);
3214: b->singlemalloc = PETSC_FALSE;
3215: len = bs2*ainew[n]*sizeof(MatScalar);
3216: /* the next line frees the default space generated by the Create() */
3217: PetscFree(b->a);
3218: PetscFree(b->ilen);
3219: PetscMalloc(len,&b->a);
3220: b->j = ajnew;
3221: b->i = ainew;
3222: for (i=0; i<n; i++) dloc[i] += ainew[i];
3223: b->diag = dloc;
3224: b->ilen = 0;
3225: b->imax = 0;
3226: b->row = isrow;
3227: b->col = iscol;
3228: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229: PetscObjectReference((PetscObject)isrow);
3230: PetscObjectReference((PetscObject)iscol);
3231: b->icol = isicol;
3232: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3233: /* In b structure: Free imax, ilen, old a, old j.
3234: Allocate dloc, solve_work, new a, new j */
3235: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3236: b->maxnz = b->nz = ainew[n];
3237: (*fact)->factor = FACTOR_LU;
3239: (*fact)->info.factor_mallocs = reallocate;
3240: (*fact)->info.fill_ratio_given = f;
3241: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242: }
3244: if (row_identity && col_identity) {
3245: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3246: }
3247: return(0);
3248: }
3252: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3253: {
3254: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3255: /* int i,*AJ=a->j,nz=a->nz; */
3257: /* Undo Column scaling */
3258: /* while (nz--) { */
3259: /* AJ[i] = AJ[i]/4; */
3260: /* } */
3261: /* This should really invoke a push/pop logic, but we don't have that yet. */
3262: A->ops->setunfactored = PETSC_NULL;
3263: return(0);
3264: }
3268: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3269: {
3270: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3271: int *AJ=a->j,nz=a->nz;
3272: unsigned short *aj=(unsigned short *)AJ;
3274: /* Is this really necessary? */
3275: while (nz--) {
3276: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3277: }
3278: A->ops->setunfactored = PETSC_NULL;
3279: return(0);
3280: }
3284: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3285: {
3286: /*
3287: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3288: with natural ordering
3289: */
3290: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3293: inA->ops->solve = MatSolve_SeqBAIJ_Update;
3294: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
3295: switch (a->bs) {
3296: case 1:
3297: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299: break;
3300: case 2:
3301: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303: break;
3304: case 3:
3305: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307: break;
3308: case 4:
3309: #if defined(PETSC_USE_MAT_SINGLE)
3310: {
3311: PetscTruth sse_enabled_local;
3312: int ierr;
3313: PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3314: if (sse_enabled_local) {
3315: # if defined(PETSC_HAVE_SSE)
3316: int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3317: if (n==(unsigned short)n) {
3318: unsigned short *aj=(unsigned short *)AJ;
3319: for (i=0;i<nz;i++) {
3320: aj[i] = (unsigned short)AJ[i];
3321: }
3322: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3324: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3325: } else {
3326: /* Scale the column indices for easier indexing in MatSolve. */
3327: /* for (i=0;i<nz;i++) { */
3328: /* AJ[i] = AJ[i]*4; */
3329: /* } */
3330: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3331: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3332: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3333: }
3334: # else
3335: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
3336: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337: # endif
3338: } else {
3339: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3341: }
3342: }
3343: #else
3344: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346: #endif
3347: break;
3348: case 5:
3349: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351: break;
3352: case 6:
3353: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355: break;
3356: case 7:
3357: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359: break;
3360: }
3361: return(0);
3362: }
3366: int MatSeqBAIJ_UpdateSolvers(Mat A)
3367: {
3368: /*
3369: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3370: with natural ordering
3371: */
3372: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3373: IS row = a->row, col = a->col;
3374: PetscTruth row_identity, col_identity;
3375: PetscTruth use_natural;
3376: int ierr;
3380: use_natural = PETSC_FALSE;
3382: ISIdentity(row,&row_identity);
3383: ISIdentity(col,&col_identity);
3385: if (row_identity && col_identity) {
3386: use_natural = PETSC_TRUE;
3387: } else {
3388: use_natural = PETSC_FALSE;
3389: }
3390: switch (a->bs) {
3391: case 1:
3392: if (use_natural) {
3393: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
3394: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3395: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3396: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3397: } else {
3398: A->ops->solve = MatSolve_SeqBAIJ_1;
3399: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
3400: }
3401: break;
3402: case 2:
3403: if (use_natural) {
3404: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
3405: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3406: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3407: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3408: } else {
3409: A->ops->solve = MatSolve_SeqBAIJ_2;
3410: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
3411: }
3412: break;
3413: case 3:
3414: if (use_natural) {
3415: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
3416: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3417: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3418: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3419: } else {
3420: A->ops->solve = MatSolve_SeqBAIJ_3;
3421: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
3422: }
3423: break;
3424: case 4:
3425: {
3426: PetscTruth sse_enabled_local;
3427: PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3428: if (use_natural) {
3429: #if defined(PETSC_USE_MAT_SINGLE)
3430: if (sse_enabled_local) { /* Natural + Single + SSE */
3431: # if defined(PETSC_HAVE_SSE)
3432: int n=a->mbs;
3433: if (n==(unsigned short)n) {
3434: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3435: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3436: } else {
3437: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3438: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3439: }
3440: # else
3441: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3442: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3443: # endif
3444: } else { /* Natural + Single */
3445: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3446: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3447: }
3448: #else
3449: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
3450: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3451: #endif
3452: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3453: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3454: } else { /* Arbitrary ordering */
3455: #if defined(PETSC_USE_MAT_SINGLE)
3456: if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3457: # if defined(PETSC_HAVE_SSE)
3458: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
3459: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3460: # else
3461: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3462: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3463: # endif
3464: } else { /* Arbitrary + Single */
3465: A->ops->solve = MatSolve_SeqBAIJ_4_Demotion;
3466: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3467: }
3468: #else
3469: A->ops->solve = MatSolve_SeqBAIJ_4;
3470: #endif
3471: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
3472: }
3473: }
3474: break;
3475: case 5:
3476: if (use_natural) {
3477: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
3478: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3479: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3480: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3481: } else {
3482: A->ops->solve = MatSolve_SeqBAIJ_5;
3483: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
3484: }
3485: break;
3486: case 6:
3487: if (use_natural) {
3488: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
3489: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3490: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3491: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3492: } else {
3493: A->ops->solve = MatSolve_SeqBAIJ_6;
3494: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
3495: }
3496: break;
3497: case 7:
3498: if (use_natural) {
3499: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
3500: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3501: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3502: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3503: } else {
3504: A->ops->solve = MatSolve_SeqBAIJ_7;
3505: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3506: }
3507: break;
3508: default:
3509: A->ops->solve = MatSolve_SeqBAIJ_N;
3510: break;
3511: }
3512: return(0);
3513: }
3517: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3521: MatSeqBAIJ_UpdateSolvers(A);
3522: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3523: (*A->ops->solve)(A,x,y);
3524: } else {
3525: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3526: }
3527: return(0);
3528: }
3532: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3536: MatSeqBAIJ_UpdateSolvers(A);
3537: (*A->ops->solvetranspose)(A,x,y);
3538: return(0);
3539: }