Actual source code: baijfact2.c
1: /*
2: Factorization code for BAIJ format.
3: */
5: #include src/mat/impls/baij/seq/baij.h
6: #include src/inline/ilu.h
7: #include src/inline/dot.h
11: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12: {
13: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
15: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16: PetscInt *diag = a->diag;
17: MatScalar *aa=a->a,*v;
18: PetscScalar s1,*x,*b;
21: VecCopy(bb,xx);
22: VecGetArray(bb,&b);
23: VecGetArray(xx,&x);
24:
25: /* forward solve the U^T */
26: for (i=0; i<n; i++) {
28: v = aa + diag[i];
29: /* multiply by the inverse of the block diagonal */
30: s1 = (*v++)*x[i];
31: vi = aj + diag[i] + 1;
32: nz = ai[i+1] - diag[i] - 1;
33: while (nz--) {
34: x[*vi++] -= (*v++)*s1;
35: }
36: x[i] = s1;
37: }
38: /* backward solve the L^T */
39: for (i=n-1; i>=0; i--){
40: v = aa + diag[i] - 1;
41: vi = aj + diag[i] - 1;
42: nz = diag[i] - ai[i];
43: s1 = x[i];
44: while (nz--) {
45: x[*vi--] -= (*v--)*s1;
46: }
47: }
48: VecRestoreArray(bb,&b);
49: VecRestoreArray(xx,&x);
50: PetscLogFlops(2*(a->nz) - A->n);
51: return(0);
52: }
56: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57: {
58: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
60: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61: PetscInt *diag = a->diag,oidx;
62: MatScalar *aa=a->a,*v;
63: PetscScalar s1,s2,x1,x2;
64: PetscScalar *x,*b;
67: VecCopy(bb,xx);
68: VecGetArray(bb,&b);
69: VecGetArray(xx,&x);
71: /* forward solve the U^T */
72: idx = 0;
73: for (i=0; i<n; i++) {
75: v = aa + 4*diag[i];
76: /* multiply by the inverse of the block diagonal */
77: x1 = x[idx]; x2 = x[1+idx];
78: s1 = v[0]*x1 + v[1]*x2;
79: s2 = v[2]*x1 + v[3]*x2;
80: v += 4;
82: vi = aj + diag[i] + 1;
83: nz = ai[i+1] - diag[i] - 1;
84: while (nz--) {
85: oidx = 2*(*vi++);
86: x[oidx] -= v[0]*s1 + v[1]*s2;
87: x[oidx+1] -= v[2]*s1 + v[3]*s2;
88: v += 4;
89: }
90: x[idx] = s1;x[1+idx] = s2;
91: idx += 2;
92: }
93: /* backward solve the L^T */
94: for (i=n-1; i>=0; i--){
95: v = aa + 4*diag[i] - 4;
96: vi = aj + diag[i] - 1;
97: nz = diag[i] - ai[i];
98: idt = 2*i;
99: s1 = x[idt]; s2 = x[1+idt];
100: while (nz--) {
101: idx = 2*(*vi--);
102: x[idx] -= v[0]*s1 + v[1]*s2;
103: x[idx+1] -= v[2]*s1 + v[3]*s2;
104: v -= 4;
105: }
106: }
107: VecRestoreArray(bb,&b);
108: VecRestoreArray(xx,&x);
109: PetscLogFlops(2*4*(a->nz) - 2*A->n);
110: return(0);
111: }
115: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116: {
117: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
119: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
120: PetscInt *diag = a->diag,oidx;
121: MatScalar *aa=a->a,*v;
122: PetscScalar s1,s2,s3,x1,x2,x3;
123: PetscScalar *x,*b;
126: VecCopy(bb,xx);
127: VecGetArray(bb,&b);
128: VecGetArray(xx,&x);
130: /* forward solve the U^T */
131: idx = 0;
132: for (i=0; i<n; i++) {
134: v = aa + 9*diag[i];
135: /* multiply by the inverse of the block diagonal */
136: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
137: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
138: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
139: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
140: v += 9;
142: vi = aj + diag[i] + 1;
143: nz = ai[i+1] - diag[i] - 1;
144: while (nz--) {
145: oidx = 3*(*vi++);
146: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
147: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
148: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
149: v += 9;
150: }
151: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
152: idx += 3;
153: }
154: /* backward solve the L^T */
155: for (i=n-1; i>=0; i--){
156: v = aa + 9*diag[i] - 9;
157: vi = aj + diag[i] - 1;
158: nz = diag[i] - ai[i];
159: idt = 3*i;
160: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
161: while (nz--) {
162: idx = 3*(*vi--);
163: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
164: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
165: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
166: v -= 9;
167: }
168: }
169: VecRestoreArray(bb,&b);
170: VecRestoreArray(xx,&x);
171: PetscLogFlops(2*9*(a->nz) - 3*A->n);
172: return(0);
173: }
177: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
178: {
179: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
181: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182: PetscInt *diag = a->diag,oidx;
183: MatScalar *aa=a->a,*v;
184: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
185: PetscScalar *x,*b;
188: VecCopy(bb,xx);
189: VecGetArray(bb,&b);
190: VecGetArray(xx,&x);
192: /* forward solve the U^T */
193: idx = 0;
194: for (i=0; i<n; i++) {
196: v = aa + 16*diag[i];
197: /* multiply by the inverse of the block diagonal */
198: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
199: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
200: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
201: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
202: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
203: v += 16;
205: vi = aj + diag[i] + 1;
206: nz = ai[i+1] - diag[i] - 1;
207: while (nz--) {
208: oidx = 4*(*vi++);
209: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
210: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
211: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
212: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
213: v += 16;
214: }
215: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
216: idx += 4;
217: }
218: /* backward solve the L^T */
219: for (i=n-1; i>=0; i--){
220: v = aa + 16*diag[i] - 16;
221: vi = aj + diag[i] - 1;
222: nz = diag[i] - ai[i];
223: idt = 4*i;
224: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
225: while (nz--) {
226: idx = 4*(*vi--);
227: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
228: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
229: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
230: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
231: v -= 16;
232: }
233: }
234: VecRestoreArray(bb,&b);
235: VecRestoreArray(xx,&x);
236: PetscLogFlops(2*16*(a->nz) - 4*A->n);
237: return(0);
238: }
242: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
243: {
244: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
246: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
247: PetscInt *diag = a->diag,oidx;
248: MatScalar *aa=a->a,*v;
249: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
250: PetscScalar *x,*b;
253: VecCopy(bb,xx);
254: VecGetArray(bb,&b);
255: VecGetArray(xx,&x);
257: /* forward solve the U^T */
258: idx = 0;
259: for (i=0; i<n; i++) {
261: v = aa + 25*diag[i];
262: /* multiply by the inverse of the block diagonal */
263: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
265: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
266: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
267: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
268: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
269: v += 25;
271: vi = aj + diag[i] + 1;
272: nz = ai[i+1] - diag[i] - 1;
273: while (nz--) {
274: oidx = 5*(*vi++);
275: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
276: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
277: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
278: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
279: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
280: v += 25;
281: }
282: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
283: idx += 5;
284: }
285: /* backward solve the L^T */
286: for (i=n-1; i>=0; i--){
287: v = aa + 25*diag[i] - 25;
288: vi = aj + diag[i] - 1;
289: nz = diag[i] - ai[i];
290: idt = 5*i;
291: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
292: while (nz--) {
293: idx = 5*(*vi--);
294: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
295: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
296: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
297: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
298: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
299: v -= 25;
300: }
301: }
302: VecRestoreArray(bb,&b);
303: VecRestoreArray(xx,&x);
304: PetscLogFlops(2*25*(a->nz) - 5*A->n);
305: return(0);
306: }
310: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
311: {
312: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
314: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
315: PetscInt *diag = a->diag,oidx;
316: MatScalar *aa=a->a,*v;
317: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
318: PetscScalar *x,*b;
321: VecCopy(bb,xx);
322: VecGetArray(bb,&b);
323: VecGetArray(xx,&x);
325: /* forward solve the U^T */
326: idx = 0;
327: for (i=0; i<n; i++) {
329: v = aa + 36*diag[i];
330: /* multiply by the inverse of the block diagonal */
331: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
332: x6 = x[5+idx];
333: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
334: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
335: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
336: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
337: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
338: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
339: v += 36;
341: vi = aj + diag[i] + 1;
342: nz = ai[i+1] - diag[i] - 1;
343: while (nz--) {
344: oidx = 6*(*vi++);
345: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
346: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
347: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351: v += 36;
352: }
353: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
354: x[5+idx] = s6;
355: idx += 6;
356: }
357: /* backward solve the L^T */
358: for (i=n-1; i>=0; i--){
359: v = aa + 36*diag[i] - 36;
360: vi = aj + diag[i] - 1;
361: nz = diag[i] - ai[i];
362: idt = 6*i;
363: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364: s6 = x[5+idt];
365: while (nz--) {
366: idx = 6*(*vi--);
367: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
368: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
369: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
370: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
371: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
372: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
373: v -= 36;
374: }
375: }
376: VecRestoreArray(bb,&b);
377: VecRestoreArray(xx,&x);
378: PetscLogFlops(2*36*(a->nz) - 6*A->n);
379: return(0);
380: }
384: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
385: {
386: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
388: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
389: PetscInt *diag = a->diag,oidx;
390: MatScalar *aa=a->a,*v;
391: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
392: PetscScalar *x,*b;
395: VecCopy(bb,xx);
396: VecGetArray(bb,&b);
397: VecGetArray(xx,&x);
399: /* forward solve the U^T */
400: idx = 0;
401: for (i=0; i<n; i++) {
403: v = aa + 49*diag[i];
404: /* multiply by the inverse of the block diagonal */
405: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
406: x6 = x[5+idx]; x7 = x[6+idx];
407: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
408: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
409: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
410: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
411: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
412: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
413: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
414: v += 49;
416: vi = aj + diag[i] + 1;
417: nz = ai[i+1] - diag[i] - 1;
418: while (nz--) {
419: oidx = 7*(*vi++);
420: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
421: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
422: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
423: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
424: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
425: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
426: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
427: v += 49;
428: }
429: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
430: x[5+idx] = s6;x[6+idx] = s7;
431: idx += 7;
432: }
433: /* backward solve the L^T */
434: for (i=n-1; i>=0; i--){
435: v = aa + 49*diag[i] - 49;
436: vi = aj + diag[i] - 1;
437: nz = diag[i] - ai[i];
438: idt = 7*i;
439: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
440: s6 = x[5+idt];s7 = x[6+idt];
441: while (nz--) {
442: idx = 7*(*vi--);
443: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
444: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
445: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
446: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
447: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
448: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
449: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
450: v -= 49;
451: }
452: }
453: VecRestoreArray(bb,&b);
454: VecRestoreArray(xx,&x);
455: PetscLogFlops(2*49*(a->nz) - 7*A->n);
456: return(0);
457: }
459: /*---------------------------------------------------------------------------------------------*/
462: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
463: {
464: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
465: IS iscol=a->col,isrow=a->row;
467: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
468: PetscInt *diag = a->diag;
469: MatScalar *aa=a->a,*v;
470: PetscScalar s1,*x,*b,*t;
473: VecGetArray(bb,&b);
474: VecGetArray(xx,&x);
475: t = a->solve_work;
477: ISGetIndices(isrow,&rout); r = rout;
478: ISGetIndices(iscol,&cout); c = cout;
480: /* copy the b into temp work space according to permutation */
481: for (i=0; i<n; i++) {
482: t[i] = b[c[i]];
483: }
485: /* forward solve the U^T */
486: for (i=0; i<n; i++) {
488: v = aa + diag[i];
489: /* multiply by the inverse of the block diagonal */
490: s1 = (*v++)*t[i];
491: vi = aj + diag[i] + 1;
492: nz = ai[i+1] - diag[i] - 1;
493: while (nz--) {
494: t[*vi++] -= (*v++)*s1;
495: }
496: t[i] = s1;
497: }
498: /* backward solve the L^T */
499: for (i=n-1; i>=0; i--){
500: v = aa + diag[i] - 1;
501: vi = aj + diag[i] - 1;
502: nz = diag[i] - ai[i];
503: s1 = t[i];
504: while (nz--) {
505: t[*vi--] -= (*v--)*s1;
506: }
507: }
509: /* copy t into x according to permutation */
510: for (i=0; i<n; i++) {
511: x[r[i]] = t[i];
512: }
514: ISRestoreIndices(isrow,&rout);
515: ISRestoreIndices(iscol,&cout);
516: VecRestoreArray(bb,&b);
517: VecRestoreArray(xx,&x);
518: PetscLogFlops(2*(a->nz) - A->n);
519: return(0);
520: }
524: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
525: {
526: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
527: IS iscol=a->col,isrow=a->row;
529: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
530: PetscInt *diag = a->diag,ii,ic,ir,oidx;
531: MatScalar *aa=a->a,*v;
532: PetscScalar s1,s2,x1,x2;
533: PetscScalar *x,*b,*t;
536: VecGetArray(bb,&b);
537: VecGetArray(xx,&x);
538: t = a->solve_work;
540: ISGetIndices(isrow,&rout); r = rout;
541: ISGetIndices(iscol,&cout); c = cout;
543: /* copy the b into temp work space according to permutation */
544: ii = 0;
545: for (i=0; i<n; i++) {
546: ic = 2*c[i];
547: t[ii] = b[ic];
548: t[ii+1] = b[ic+1];
549: ii += 2;
550: }
552: /* forward solve the U^T */
553: idx = 0;
554: for (i=0; i<n; i++) {
556: v = aa + 4*diag[i];
557: /* multiply by the inverse of the block diagonal */
558: x1 = t[idx]; x2 = t[1+idx];
559: s1 = v[0]*x1 + v[1]*x2;
560: s2 = v[2]*x1 + v[3]*x2;
561: v += 4;
563: vi = aj + diag[i] + 1;
564: nz = ai[i+1] - diag[i] - 1;
565: while (nz--) {
566: oidx = 2*(*vi++);
567: t[oidx] -= v[0]*s1 + v[1]*s2;
568: t[oidx+1] -= v[2]*s1 + v[3]*s2;
569: v += 4;
570: }
571: t[idx] = s1;t[1+idx] = s2;
572: idx += 2;
573: }
574: /* backward solve the L^T */
575: for (i=n-1; i>=0; i--){
576: v = aa + 4*diag[i] - 4;
577: vi = aj + diag[i] - 1;
578: nz = diag[i] - ai[i];
579: idt = 2*i;
580: s1 = t[idt]; s2 = t[1+idt];
581: while (nz--) {
582: idx = 2*(*vi--);
583: t[idx] -= v[0]*s1 + v[1]*s2;
584: t[idx+1] -= v[2]*s1 + v[3]*s2;
585: v -= 4;
586: }
587: }
589: /* copy t into x according to permutation */
590: ii = 0;
591: for (i=0; i<n; i++) {
592: ir = 2*r[i];
593: x[ir] = t[ii];
594: x[ir+1] = t[ii+1];
595: ii += 2;
596: }
598: ISRestoreIndices(isrow,&rout);
599: ISRestoreIndices(iscol,&cout);
600: VecRestoreArray(bb,&b);
601: VecRestoreArray(xx,&x);
602: PetscLogFlops(2*4*(a->nz) - 2*A->n);
603: return(0);
604: }
608: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
609: {
610: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
611: IS iscol=a->col,isrow=a->row;
613: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
614: PetscInt *diag = a->diag,ii,ic,ir,oidx;
615: MatScalar *aa=a->a,*v;
616: PetscScalar s1,s2,s3,x1,x2,x3;
617: PetscScalar *x,*b,*t;
620: VecGetArray(bb,&b);
621: VecGetArray(xx,&x);
622: t = a->solve_work;
624: ISGetIndices(isrow,&rout); r = rout;
625: ISGetIndices(iscol,&cout); c = cout;
627: /* copy the b into temp work space according to permutation */
628: ii = 0;
629: for (i=0; i<n; i++) {
630: ic = 3*c[i];
631: t[ii] = b[ic];
632: t[ii+1] = b[ic+1];
633: t[ii+2] = b[ic+2];
634: ii += 3;
635: }
637: /* forward solve the U^T */
638: idx = 0;
639: for (i=0; i<n; i++) {
641: v = aa + 9*diag[i];
642: /* multiply by the inverse of the block diagonal */
643: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
644: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
645: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
646: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
647: v += 9;
649: vi = aj + diag[i] + 1;
650: nz = ai[i+1] - diag[i] - 1;
651: while (nz--) {
652: oidx = 3*(*vi++);
653: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
654: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
655: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
656: v += 9;
657: }
658: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
659: idx += 3;
660: }
661: /* backward solve the L^T */
662: for (i=n-1; i>=0; i--){
663: v = aa + 9*diag[i] - 9;
664: vi = aj + diag[i] - 1;
665: nz = diag[i] - ai[i];
666: idt = 3*i;
667: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
668: while (nz--) {
669: idx = 3*(*vi--);
670: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
671: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
672: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
673: v -= 9;
674: }
675: }
677: /* copy t into x according to permutation */
678: ii = 0;
679: for (i=0; i<n; i++) {
680: ir = 3*r[i];
681: x[ir] = t[ii];
682: x[ir+1] = t[ii+1];
683: x[ir+2] = t[ii+2];
684: ii += 3;
685: }
687: ISRestoreIndices(isrow,&rout);
688: ISRestoreIndices(iscol,&cout);
689: VecRestoreArray(bb,&b);
690: VecRestoreArray(xx,&x);
691: PetscLogFlops(2*9*(a->nz) - 3*A->n);
692: return(0);
693: }
697: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
698: {
699: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
700: IS iscol=a->col,isrow=a->row;
702: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
703: PetscInt *diag = a->diag,ii,ic,ir,oidx;
704: MatScalar *aa=a->a,*v;
705: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
706: PetscScalar *x,*b,*t;
709: VecGetArray(bb,&b);
710: VecGetArray(xx,&x);
711: t = a->solve_work;
713: ISGetIndices(isrow,&rout); r = rout;
714: ISGetIndices(iscol,&cout); c = cout;
716: /* copy the b into temp work space according to permutation */
717: ii = 0;
718: for (i=0; i<n; i++) {
719: ic = 4*c[i];
720: t[ii] = b[ic];
721: t[ii+1] = b[ic+1];
722: t[ii+2] = b[ic+2];
723: t[ii+3] = b[ic+3];
724: ii += 4;
725: }
727: /* forward solve the U^T */
728: idx = 0;
729: for (i=0; i<n; i++) {
731: v = aa + 16*diag[i];
732: /* multiply by the inverse of the block diagonal */
733: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
734: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
735: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
736: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
737: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
738: v += 16;
740: vi = aj + diag[i] + 1;
741: nz = ai[i+1] - diag[i] - 1;
742: while (nz--) {
743: oidx = 4*(*vi++);
744: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
745: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
746: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
747: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
748: v += 16;
749: }
750: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
751: idx += 4;
752: }
753: /* backward solve the L^T */
754: for (i=n-1; i>=0; i--){
755: v = aa + 16*diag[i] - 16;
756: vi = aj + diag[i] - 1;
757: nz = diag[i] - ai[i];
758: idt = 4*i;
759: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
760: while (nz--) {
761: idx = 4*(*vi--);
762: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
763: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
764: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
765: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
766: v -= 16;
767: }
768: }
770: /* copy t into x according to permutation */
771: ii = 0;
772: for (i=0; i<n; i++) {
773: ir = 4*r[i];
774: x[ir] = t[ii];
775: x[ir+1] = t[ii+1];
776: x[ir+2] = t[ii+2];
777: x[ir+3] = t[ii+3];
778: ii += 4;
779: }
781: ISRestoreIndices(isrow,&rout);
782: ISRestoreIndices(iscol,&cout);
783: VecRestoreArray(bb,&b);
784: VecRestoreArray(xx,&x);
785: PetscLogFlops(2*16*(a->nz) - 4*A->n);
786: return(0);
787: }
791: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
792: {
793: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
794: IS iscol=a->col,isrow=a->row;
796: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
797: PetscInt *diag = a->diag,ii,ic,ir,oidx;
798: MatScalar *aa=a->a,*v;
799: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
800: PetscScalar *x,*b,*t;
803: VecGetArray(bb,&b);
804: VecGetArray(xx,&x);
805: t = a->solve_work;
807: ISGetIndices(isrow,&rout); r = rout;
808: ISGetIndices(iscol,&cout); c = cout;
810: /* copy the b into temp work space according to permutation */
811: ii = 0;
812: for (i=0; i<n; i++) {
813: ic = 5*c[i];
814: t[ii] = b[ic];
815: t[ii+1] = b[ic+1];
816: t[ii+2] = b[ic+2];
817: t[ii+3] = b[ic+3];
818: t[ii+4] = b[ic+4];
819: ii += 5;
820: }
822: /* forward solve the U^T */
823: idx = 0;
824: for (i=0; i<n; i++) {
826: v = aa + 25*diag[i];
827: /* multiply by the inverse of the block diagonal */
828: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
829: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
830: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
831: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
832: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
833: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
834: v += 25;
836: vi = aj + diag[i] + 1;
837: nz = ai[i+1] - diag[i] - 1;
838: while (nz--) {
839: oidx = 5*(*vi++);
840: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
841: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
842: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
843: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
844: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
845: v += 25;
846: }
847: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
848: idx += 5;
849: }
850: /* backward solve the L^T */
851: for (i=n-1; i>=0; i--){
852: v = aa + 25*diag[i] - 25;
853: vi = aj + diag[i] - 1;
854: nz = diag[i] - ai[i];
855: idt = 5*i;
856: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
857: while (nz--) {
858: idx = 5*(*vi--);
859: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
860: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
861: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
862: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
863: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
864: v -= 25;
865: }
866: }
868: /* copy t into x according to permutation */
869: ii = 0;
870: for (i=0; i<n; i++) {
871: ir = 5*r[i];
872: x[ir] = t[ii];
873: x[ir+1] = t[ii+1];
874: x[ir+2] = t[ii+2];
875: x[ir+3] = t[ii+3];
876: x[ir+4] = t[ii+4];
877: ii += 5;
878: }
880: ISRestoreIndices(isrow,&rout);
881: ISRestoreIndices(iscol,&cout);
882: VecRestoreArray(bb,&b);
883: VecRestoreArray(xx,&x);
884: PetscLogFlops(2*25*(a->nz) - 5*A->n);
885: return(0);
886: }
890: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
891: {
892: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
893: IS iscol=a->col,isrow=a->row;
895: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
896: PetscInt *diag = a->diag,ii,ic,ir,oidx;
897: MatScalar *aa=a->a,*v;
898: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
899: PetscScalar *x,*b,*t;
902: VecGetArray(bb,&b);
903: VecGetArray(xx,&x);
904: t = a->solve_work;
906: ISGetIndices(isrow,&rout); r = rout;
907: ISGetIndices(iscol,&cout); c = cout;
909: /* copy the b into temp work space according to permutation */
910: ii = 0;
911: for (i=0; i<n; i++) {
912: ic = 6*c[i];
913: t[ii] = b[ic];
914: t[ii+1] = b[ic+1];
915: t[ii+2] = b[ic+2];
916: t[ii+3] = b[ic+3];
917: t[ii+4] = b[ic+4];
918: t[ii+5] = b[ic+5];
919: ii += 6;
920: }
922: /* forward solve the U^T */
923: idx = 0;
924: for (i=0; i<n; i++) {
926: v = aa + 36*diag[i];
927: /* multiply by the inverse of the block diagonal */
928: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
929: x6 = t[5+idx];
930: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
931: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
932: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
933: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
934: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
935: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
936: v += 36;
938: vi = aj + diag[i] + 1;
939: nz = ai[i+1] - diag[i] - 1;
940: while (nz--) {
941: oidx = 6*(*vi++);
942: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
943: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
944: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
945: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
946: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
947: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
948: v += 36;
949: }
950: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
951: t[5+idx] = s6;
952: idx += 6;
953: }
954: /* backward solve the L^T */
955: for (i=n-1; i>=0; i--){
956: v = aa + 36*diag[i] - 36;
957: vi = aj + diag[i] - 1;
958: nz = diag[i] - ai[i];
959: idt = 6*i;
960: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
961: s6 = t[5+idt];
962: while (nz--) {
963: idx = 6*(*vi--);
964: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
965: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
966: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
967: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
968: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
969: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
970: v -= 36;
971: }
972: }
974: /* copy t into x according to permutation */
975: ii = 0;
976: for (i=0; i<n; i++) {
977: ir = 6*r[i];
978: x[ir] = t[ii];
979: x[ir+1] = t[ii+1];
980: x[ir+2] = t[ii+2];
981: x[ir+3] = t[ii+3];
982: x[ir+4] = t[ii+4];
983: x[ir+5] = t[ii+5];
984: ii += 6;
985: }
987: ISRestoreIndices(isrow,&rout);
988: ISRestoreIndices(iscol,&cout);
989: VecRestoreArray(bb,&b);
990: VecRestoreArray(xx,&x);
991: PetscLogFlops(2*36*(a->nz) - 6*A->n);
992: return(0);
993: }
997: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
998: {
999: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1000: IS iscol=a->col,isrow=a->row;
1002: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1003: PetscInt *diag = a->diag,ii,ic,ir,oidx;
1004: MatScalar *aa=a->a,*v;
1005: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1006: PetscScalar *x,*b,*t;
1009: VecGetArray(bb,&b);
1010: VecGetArray(xx,&x);
1011: t = a->solve_work;
1013: ISGetIndices(isrow,&rout); r = rout;
1014: ISGetIndices(iscol,&cout); c = cout;
1016: /* copy the b into temp work space according to permutation */
1017: ii = 0;
1018: for (i=0; i<n; i++) {
1019: ic = 7*c[i];
1020: t[ii] = b[ic];
1021: t[ii+1] = b[ic+1];
1022: t[ii+2] = b[ic+2];
1023: t[ii+3] = b[ic+3];
1024: t[ii+4] = b[ic+4];
1025: t[ii+5] = b[ic+5];
1026: t[ii+6] = b[ic+6];
1027: ii += 7;
1028: }
1030: /* forward solve the U^T */
1031: idx = 0;
1032: for (i=0; i<n; i++) {
1034: v = aa + 49*diag[i];
1035: /* multiply by the inverse of the block diagonal */
1036: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1037: x6 = t[5+idx]; x7 = t[6+idx];
1038: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1039: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1040: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1041: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1042: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1043: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1044: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1045: v += 49;
1047: vi = aj + diag[i] + 1;
1048: nz = ai[i+1] - diag[i] - 1;
1049: while (nz--) {
1050: oidx = 7*(*vi++);
1051: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1052: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1053: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1054: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1055: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1056: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1057: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1058: v += 49;
1059: }
1060: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1061: t[5+idx] = s6;t[6+idx] = s7;
1062: idx += 7;
1063: }
1064: /* backward solve the L^T */
1065: for (i=n-1; i>=0; i--){
1066: v = aa + 49*diag[i] - 49;
1067: vi = aj + diag[i] - 1;
1068: nz = diag[i] - ai[i];
1069: idt = 7*i;
1070: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1071: s6 = t[5+idt];s7 = t[6+idt];
1072: while (nz--) {
1073: idx = 7*(*vi--);
1074: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1075: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1076: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1077: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1078: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1079: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1080: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1081: v -= 49;
1082: }
1083: }
1085: /* copy t into x according to permutation */
1086: ii = 0;
1087: for (i=0; i<n; i++) {
1088: ir = 7*r[i];
1089: x[ir] = t[ii];
1090: x[ir+1] = t[ii+1];
1091: x[ir+2] = t[ii+2];
1092: x[ir+3] = t[ii+3];
1093: x[ir+4] = t[ii+4];
1094: x[ir+5] = t[ii+5];
1095: x[ir+6] = t[ii+6];
1096: ii += 7;
1097: }
1099: ISRestoreIndices(isrow,&rout);
1100: ISRestoreIndices(iscol,&cout);
1101: VecRestoreArray(bb,&b);
1102: VecRestoreArray(xx,&x);
1103: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1104: return(0);
1105: }
1107: /* ----------------------------------------------------------- */
1110: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1111: {
1112: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1113: IS iscol=a->col,isrow=a->row;
1115: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1116: PetscInt nz,bs=A->bs,bs2=a->bs2,*rout,*cout;
1117: MatScalar *aa=a->a,*v;
1118: PetscScalar *x,*b,*s,*t,*ls;
1121: VecGetArray(bb,&b);
1122: VecGetArray(xx,&x);
1123: t = a->solve_work;
1125: ISGetIndices(isrow,&rout); r = rout;
1126: ISGetIndices(iscol,&cout); c = cout + (n-1);
1128: /* forward solve the lower triangular */
1129: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1130: for (i=1; i<n; i++) {
1131: v = aa + bs2*ai[i];
1132: vi = aj + ai[i];
1133: nz = a->diag[i] - ai[i];
1134: s = t + bs*i;
1135: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1136: while (nz--) {
1137: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1138: v += bs2;
1139: }
1140: }
1141: /* backward solve the upper triangular */
1142: ls = a->solve_work + A->n;
1143: for (i=n-1; i>=0; i--){
1144: v = aa + bs2*(a->diag[i] + 1);
1145: vi = aj + a->diag[i] + 1;
1146: nz = ai[i+1] - a->diag[i] - 1;
1147: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1148: while (nz--) {
1149: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1150: v += bs2;
1151: }
1152: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1153: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1154: }
1156: ISRestoreIndices(isrow,&rout);
1157: ISRestoreIndices(iscol,&cout);
1158: VecRestoreArray(bb,&b);
1159: VecRestoreArray(xx,&x);
1160: PetscLogFlops(2*(a->bs2)*(a->nz) - A->bs*A->n);
1161: return(0);
1162: }
1166: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1167: {
1168: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1169: IS iscol=a->col,isrow=a->row;
1171: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1172: PetscInt *diag = a->diag;
1173: MatScalar *aa=a->a,*v;
1174: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1175: PetscScalar *x,*b,*t;
1178: VecGetArray(bb,&b);
1179: VecGetArray(xx,&x);
1180: t = a->solve_work;
1182: ISGetIndices(isrow,&rout); r = rout;
1183: ISGetIndices(iscol,&cout); c = cout + (n-1);
1185: /* forward solve the lower triangular */
1186: idx = 7*(*r++);
1187: t[0] = b[idx]; t[1] = b[1+idx];
1188: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1189: t[5] = b[5+idx]; t[6] = b[6+idx];
1191: for (i=1; i<n; i++) {
1192: v = aa + 49*ai[i];
1193: vi = aj + ai[i];
1194: nz = diag[i] - ai[i];
1195: idx = 7*(*r++);
1196: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1197: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1198: while (nz--) {
1199: idx = 7*(*vi++);
1200: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1201: x4 = t[3+idx];x5 = t[4+idx];
1202: x6 = t[5+idx];x7 = t[6+idx];
1203: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1204: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1205: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1206: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1207: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1208: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1209: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1210: v += 49;
1211: }
1212: idx = 7*i;
1213: t[idx] = s1;t[1+idx] = s2;
1214: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1215: t[5+idx] = s6;t[6+idx] = s7;
1216: }
1217: /* backward solve the upper triangular */
1218: for (i=n-1; i>=0; i--){
1219: v = aa + 49*diag[i] + 49;
1220: vi = aj + diag[i] + 1;
1221: nz = ai[i+1] - diag[i] - 1;
1222: idt = 7*i;
1223: s1 = t[idt]; s2 = t[1+idt];
1224: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1225: s6 = t[5+idt];s7 = t[6+idt];
1226: while (nz--) {
1227: idx = 7*(*vi++);
1228: x1 = t[idx]; x2 = t[1+idx];
1229: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1230: x6 = t[5+idx]; x7 = t[6+idx];
1231: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1232: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1233: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1234: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1235: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1236: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1237: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1238: v += 49;
1239: }
1240: idc = 7*(*c--);
1241: v = aa + 49*diag[i];
1242: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1243: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1244: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1245: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1246: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1247: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1248: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1249: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1250: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1251: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1252: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1253: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1254: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1255: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1256: }
1258: ISRestoreIndices(isrow,&rout);
1259: ISRestoreIndices(iscol,&cout);
1260: VecRestoreArray(bb,&b);
1261: VecRestoreArray(xx,&x);
1262: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1263: return(0);
1264: }
1268: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1269: {
1270: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1271: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1273: PetscInt *diag = a->diag,jdx;
1274: MatScalar *aa=a->a,*v;
1275: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278: VecGetArray(bb,&b);
1279: VecGetArray(xx,&x);
1280: /* forward solve the lower triangular */
1281: idx = 0;
1282: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1283: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1284: x[6] = b[6+idx];
1285: for (i=1; i<n; i++) {
1286: v = aa + 49*ai[i];
1287: vi = aj + ai[i];
1288: nz = diag[i] - ai[i];
1289: idx = 7*i;
1290: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1291: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1292: s7 = b[6+idx];
1293: while (nz--) {
1294: jdx = 7*(*vi++);
1295: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1296: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1297: x7 = x[6+jdx];
1298: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1299: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1300: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1301: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1302: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1303: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1304: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1305: v += 49;
1306: }
1307: x[idx] = s1;
1308: x[1+idx] = s2;
1309: x[2+idx] = s3;
1310: x[3+idx] = s4;
1311: x[4+idx] = s5;
1312: x[5+idx] = s6;
1313: x[6+idx] = s7;
1314: }
1315: /* backward solve the upper triangular */
1316: for (i=n-1; i>=0; i--){
1317: v = aa + 49*diag[i] + 49;
1318: vi = aj + diag[i] + 1;
1319: nz = ai[i+1] - diag[i] - 1;
1320: idt = 7*i;
1321: s1 = x[idt]; s2 = x[1+idt];
1322: s3 = x[2+idt]; s4 = x[3+idt];
1323: s5 = x[4+idt]; s6 = x[5+idt];
1324: s7 = x[6+idt];
1325: while (nz--) {
1326: idx = 7*(*vi++);
1327: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1328: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1329: x7 = x[6+idx];
1330: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1331: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1332: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1333: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1334: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1335: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1336: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1337: v += 49;
1338: }
1339: v = aa + 49*diag[i];
1340: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1341: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1342: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1343: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1344: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1345: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1346: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1347: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1348: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1349: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1350: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1351: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1352: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1353: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1354: }
1356: VecRestoreArray(bb,&b);
1357: VecRestoreArray(xx,&x);
1358: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1359: return(0);
1360: }
1364: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1365: {
1366: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1367: IS iscol=a->col,isrow=a->row;
1369: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1370: PetscInt *diag = a->diag;
1371: MatScalar *aa=a->a,*v;
1372: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1375: VecGetArray(bb,&b);
1376: VecGetArray(xx,&x);
1377: t = a->solve_work;
1379: ISGetIndices(isrow,&rout); r = rout;
1380: ISGetIndices(iscol,&cout); c = cout + (n-1);
1382: /* forward solve the lower triangular */
1383: idx = 6*(*r++);
1384: t[0] = b[idx]; t[1] = b[1+idx];
1385: t[2] = b[2+idx]; t[3] = b[3+idx];
1386: t[4] = b[4+idx]; t[5] = b[5+idx];
1387: for (i=1; i<n; i++) {
1388: v = aa + 36*ai[i];
1389: vi = aj + ai[i];
1390: nz = diag[i] - ai[i];
1391: idx = 6*(*r++);
1392: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1393: s5 = b[4+idx]; s6 = b[5+idx];
1394: while (nz--) {
1395: idx = 6*(*vi++);
1396: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1397: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1398: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1399: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1400: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1401: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1402: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1403: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1404: v += 36;
1405: }
1406: idx = 6*i;
1407: t[idx] = s1;t[1+idx] = s2;
1408: t[2+idx] = s3;t[3+idx] = s4;
1409: t[4+idx] = s5;t[5+idx] = s6;
1410: }
1411: /* backward solve the upper triangular */
1412: for (i=n-1; i>=0; i--){
1413: v = aa + 36*diag[i] + 36;
1414: vi = aj + diag[i] + 1;
1415: nz = ai[i+1] - diag[i] - 1;
1416: idt = 6*i;
1417: s1 = t[idt]; s2 = t[1+idt];
1418: s3 = t[2+idt];s4 = t[3+idt];
1419: s5 = t[4+idt];s6 = t[5+idt];
1420: while (nz--) {
1421: idx = 6*(*vi++);
1422: x1 = t[idx]; x2 = t[1+idx];
1423: x3 = t[2+idx]; x4 = t[3+idx];
1424: x5 = t[4+idx]; x6 = t[5+idx];
1425: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1426: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1427: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1428: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1429: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1430: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1431: v += 36;
1432: }
1433: idc = 6*(*c--);
1434: v = aa + 36*diag[i];
1435: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1436: v[18]*s4+v[24]*s5+v[30]*s6;
1437: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1438: v[19]*s4+v[25]*s5+v[31]*s6;
1439: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1440: v[20]*s4+v[26]*s5+v[32]*s6;
1441: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1442: v[21]*s4+v[27]*s5+v[33]*s6;
1443: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1444: v[22]*s4+v[28]*s5+v[34]*s6;
1445: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1446: v[23]*s4+v[29]*s5+v[35]*s6;
1447: }
1449: ISRestoreIndices(isrow,&rout);
1450: ISRestoreIndices(iscol,&cout);
1451: VecRestoreArray(bb,&b);
1452: VecRestoreArray(xx,&x);
1453: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1454: return(0);
1455: }
1459: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1460: {
1461: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1462: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1464: PetscInt *diag = a->diag,jdx;
1465: MatScalar *aa=a->a,*v;
1466: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1469: VecGetArray(bb,&b);
1470: VecGetArray(xx,&x);
1471: /* forward solve the lower triangular */
1472: idx = 0;
1473: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1474: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1475: for (i=1; i<n; i++) {
1476: v = aa + 36*ai[i];
1477: vi = aj + ai[i];
1478: nz = diag[i] - ai[i];
1479: idx = 6*i;
1480: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1481: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1482: while (nz--) {
1483: jdx = 6*(*vi++);
1484: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1485: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1486: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1487: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1488: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1489: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1490: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1491: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1492: v += 36;
1493: }
1494: x[idx] = s1;
1495: x[1+idx] = s2;
1496: x[2+idx] = s3;
1497: x[3+idx] = s4;
1498: x[4+idx] = s5;
1499: x[5+idx] = s6;
1500: }
1501: /* backward solve the upper triangular */
1502: for (i=n-1; i>=0; i--){
1503: v = aa + 36*diag[i] + 36;
1504: vi = aj + diag[i] + 1;
1505: nz = ai[i+1] - diag[i] - 1;
1506: idt = 6*i;
1507: s1 = x[idt]; s2 = x[1+idt];
1508: s3 = x[2+idt]; s4 = x[3+idt];
1509: s5 = x[4+idt]; s6 = x[5+idt];
1510: while (nz--) {
1511: idx = 6*(*vi++);
1512: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1513: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1514: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1515: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1516: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1517: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1518: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1519: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1520: v += 36;
1521: }
1522: v = aa + 36*diag[i];
1523: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1524: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1525: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1526: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1527: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1528: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1529: }
1531: VecRestoreArray(bb,&b);
1532: VecRestoreArray(xx,&x);
1533: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1534: return(0);
1535: }
1539: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1540: {
1541: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1542: IS iscol=a->col,isrow=a->row;
1544: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1545: PetscInt *diag = a->diag;
1546: MatScalar *aa=a->a,*v;
1547: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1550: VecGetArray(bb,&b);
1551: VecGetArray(xx,&x);
1552: t = a->solve_work;
1554: ISGetIndices(isrow,&rout); r = rout;
1555: ISGetIndices(iscol,&cout); c = cout + (n-1);
1557: /* forward solve the lower triangular */
1558: idx = 5*(*r++);
1559: t[0] = b[idx]; t[1] = b[1+idx];
1560: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1561: for (i=1; i<n; i++) {
1562: v = aa + 25*ai[i];
1563: vi = aj + ai[i];
1564: nz = diag[i] - ai[i];
1565: idx = 5*(*r++);
1566: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1567: s5 = b[4+idx];
1568: while (nz--) {
1569: idx = 5*(*vi++);
1570: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1571: x4 = t[3+idx];x5 = t[4+idx];
1572: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1573: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1574: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1575: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1576: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1577: v += 25;
1578: }
1579: idx = 5*i;
1580: t[idx] = s1;t[1+idx] = s2;
1581: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1582: }
1583: /* backward solve the upper triangular */
1584: for (i=n-1; i>=0; i--){
1585: v = aa + 25*diag[i] + 25;
1586: vi = aj + diag[i] + 1;
1587: nz = ai[i+1] - diag[i] - 1;
1588: idt = 5*i;
1589: s1 = t[idt]; s2 = t[1+idt];
1590: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1591: while (nz--) {
1592: idx = 5*(*vi++);
1593: x1 = t[idx]; x2 = t[1+idx];
1594: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1595: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1596: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1597: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1598: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1599: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1600: v += 25;
1601: }
1602: idc = 5*(*c--);
1603: v = aa + 25*diag[i];
1604: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1605: v[15]*s4+v[20]*s5;
1606: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1607: v[16]*s4+v[21]*s5;
1608: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1609: v[17]*s4+v[22]*s5;
1610: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1611: v[18]*s4+v[23]*s5;
1612: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1613: v[19]*s4+v[24]*s5;
1614: }
1616: ISRestoreIndices(isrow,&rout);
1617: ISRestoreIndices(iscol,&cout);
1618: VecRestoreArray(bb,&b);
1619: VecRestoreArray(xx,&x);
1620: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1621: return(0);
1622: }
1626: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1627: {
1628: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1629: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1631: PetscInt *diag = a->diag,jdx;
1632: MatScalar *aa=a->a,*v;
1633: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1636: VecGetArray(bb,&b);
1637: VecGetArray(xx,&x);
1638: /* forward solve the lower triangular */
1639: idx = 0;
1640: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1641: for (i=1; i<n; i++) {
1642: v = aa + 25*ai[i];
1643: vi = aj + ai[i];
1644: nz = diag[i] - ai[i];
1645: idx = 5*i;
1646: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1647: while (nz--) {
1648: jdx = 5*(*vi++);
1649: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1650: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1651: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1652: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1653: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1654: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1655: v += 25;
1656: }
1657: x[idx] = s1;
1658: x[1+idx] = s2;
1659: x[2+idx] = s3;
1660: x[3+idx] = s4;
1661: x[4+idx] = s5;
1662: }
1663: /* backward solve the upper triangular */
1664: for (i=n-1; i>=0; i--){
1665: v = aa + 25*diag[i] + 25;
1666: vi = aj + diag[i] + 1;
1667: nz = ai[i+1] - diag[i] - 1;
1668: idt = 5*i;
1669: s1 = x[idt]; s2 = x[1+idt];
1670: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1671: while (nz--) {
1672: idx = 5*(*vi++);
1673: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1674: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1675: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1676: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1677: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1678: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1679: v += 25;
1680: }
1681: v = aa + 25*diag[i];
1682: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1683: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1684: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1685: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1686: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1687: }
1689: VecRestoreArray(bb,&b);
1690: VecRestoreArray(xx,&x);
1691: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1692: return(0);
1693: }
1697: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1698: {
1699: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1700: IS iscol=a->col,isrow=a->row;
1702: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1703: PetscInt *diag = a->diag;
1704: MatScalar *aa=a->a,*v;
1705: PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1708: VecGetArray(bb,&b);
1709: VecGetArray(xx,&x);
1710: t = a->solve_work;
1712: ISGetIndices(isrow,&rout); r = rout;
1713: ISGetIndices(iscol,&cout); c = cout + (n-1);
1715: /* forward solve the lower triangular */
1716: idx = 4*(*r++);
1717: t[0] = b[idx]; t[1] = b[1+idx];
1718: t[2] = b[2+idx]; t[3] = b[3+idx];
1719: for (i=1; i<n; i++) {
1720: v = aa + 16*ai[i];
1721: vi = aj + ai[i];
1722: nz = diag[i] - ai[i];
1723: idx = 4*(*r++);
1724: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1725: while (nz--) {
1726: idx = 4*(*vi++);
1727: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1728: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1729: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1730: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1731: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1732: v += 16;
1733: }
1734: idx = 4*i;
1735: t[idx] = s1;t[1+idx] = s2;
1736: t[2+idx] = s3;t[3+idx] = s4;
1737: }
1738: /* backward solve the upper triangular */
1739: for (i=n-1; i>=0; i--){
1740: v = aa + 16*diag[i] + 16;
1741: vi = aj + diag[i] + 1;
1742: nz = ai[i+1] - diag[i] - 1;
1743: idt = 4*i;
1744: s1 = t[idt]; s2 = t[1+idt];
1745: s3 = t[2+idt];s4 = t[3+idt];
1746: while (nz--) {
1747: idx = 4*(*vi++);
1748: x1 = t[idx]; x2 = t[1+idx];
1749: x3 = t[2+idx]; x4 = t[3+idx];
1750: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1751: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1752: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1753: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1754: v += 16;
1755: }
1756: idc = 4*(*c--);
1757: v = aa + 16*diag[i];
1758: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1759: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1760: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1761: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1762: }
1764: ISRestoreIndices(isrow,&rout);
1765: ISRestoreIndices(iscol,&cout);
1766: VecRestoreArray(bb,&b);
1767: VecRestoreArray(xx,&x);
1768: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1769: return(0);
1770: }
1774: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1775: {
1776: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1777: IS iscol=a->col,isrow=a->row;
1779: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1780: PetscInt *diag = a->diag;
1781: MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1782: PetscScalar *x,*b;
1785: VecGetArray(bb,&b);
1786: VecGetArray(xx,&x);
1787: t = (MatScalar *)a->solve_work;
1789: ISGetIndices(isrow,&rout); r = rout;
1790: ISGetIndices(iscol,&cout); c = cout + (n-1);
1792: /* forward solve the lower triangular */
1793: idx = 4*(*r++);
1794: t[0] = (MatScalar)b[idx];
1795: t[1] = (MatScalar)b[1+idx];
1796: t[2] = (MatScalar)b[2+idx];
1797: t[3] = (MatScalar)b[3+idx];
1798: for (i=1; i<n; i++) {
1799: v = aa + 16*ai[i];
1800: vi = aj + ai[i];
1801: nz = diag[i] - ai[i];
1802: idx = 4*(*r++);
1803: s1 = (MatScalar)b[idx];
1804: s2 = (MatScalar)b[1+idx];
1805: s3 = (MatScalar)b[2+idx];
1806: s4 = (MatScalar)b[3+idx];
1807: while (nz--) {
1808: idx = 4*(*vi++);
1809: x1 = t[idx];
1810: x2 = t[1+idx];
1811: x3 = t[2+idx];
1812: x4 = t[3+idx];
1813: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1814: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1815: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1817: v += 16;
1818: }
1819: idx = 4*i;
1820: t[idx] = s1;
1821: t[1+idx] = s2;
1822: t[2+idx] = s3;
1823: t[3+idx] = s4;
1824: }
1825: /* backward solve the upper triangular */
1826: for (i=n-1; i>=0; i--){
1827: v = aa + 16*diag[i] + 16;
1828: vi = aj + diag[i] + 1;
1829: nz = ai[i+1] - diag[i] - 1;
1830: idt = 4*i;
1831: s1 = t[idt];
1832: s2 = t[1+idt];
1833: s3 = t[2+idt];
1834: s4 = t[3+idt];
1835: while (nz--) {
1836: idx = 4*(*vi++);
1837: x1 = t[idx];
1838: x2 = t[1+idx];
1839: x3 = t[2+idx];
1840: x4 = t[3+idx];
1841: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1842: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1843: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1844: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1845: v += 16;
1846: }
1847: idc = 4*(*c--);
1848: v = aa + 16*diag[i];
1849: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1850: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1851: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1852: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1853: x[idc] = (PetscScalar)t[idt];
1854: x[1+idc] = (PetscScalar)t[1+idt];
1855: x[2+idc] = (PetscScalar)t[2+idt];
1856: x[3+idc] = (PetscScalar)t[3+idt];
1857: }
1859: ISRestoreIndices(isrow,&rout);
1860: ISRestoreIndices(iscol,&cout);
1861: VecRestoreArray(bb,&b);
1862: VecRestoreArray(xx,&x);
1863: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1864: return(0);
1865: }
1867: #if defined (PETSC_HAVE_SSE)
1869: #include PETSC_HAVE_SSE
1873: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1874: {
1875: /*
1876: Note: This code uses demotion of double
1877: to float when performing the mixed-mode computation.
1878: This may not be numerically reasonable for all applications.
1879: */
1880: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1881: IS iscol=a->col,isrow=a->row;
1883: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1884: PetscInt *diag = a->diag,ai16;
1885: MatScalar *aa=a->a,*v;
1886: PetscScalar *x,*b,*t;
1888: /* Make space in temp stack for 16 Byte Aligned arrays */
1889: float ssealignedspace[11],*tmps,*tmpx;
1890: unsigned long offset;
1891:
1893: SSE_SCOPE_BEGIN;
1895: offset = (unsigned long)ssealignedspace % 16;
1896: if (offset) offset = (16 - offset)/4;
1897: tmps = &ssealignedspace[offset];
1898: tmpx = &ssealignedspace[offset+4];
1899: PREFETCH_NTA(aa+16*ai[1]);
1901: VecGetArray(bb,&b);
1902: VecGetArray(xx,&x);
1903: t = a->solve_work;
1905: ISGetIndices(isrow,&rout); r = rout;
1906: ISGetIndices(iscol,&cout); c = cout + (n-1);
1908: /* forward solve the lower triangular */
1909: idx = 4*(*r++);
1910: t[0] = b[idx]; t[1] = b[1+idx];
1911: t[2] = b[2+idx]; t[3] = b[3+idx];
1912: v = aa + 16*ai[1];
1914: for (i=1; i<n;) {
1915: PREFETCH_NTA(&v[8]);
1916: vi = aj + ai[i];
1917: nz = diag[i] - ai[i];
1918: idx = 4*(*r++);
1920: /* Demote sum from double to float */
1921: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1922: LOAD_PS(tmps,XMM7);
1924: while (nz--) {
1925: PREFETCH_NTA(&v[16]);
1926: idx = 4*(*vi++);
1927:
1928: /* Demote solution (so far) from double to float */
1929: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1931: /* 4x4 Matrix-Vector product with negative accumulation: */
1932: SSE_INLINE_BEGIN_2(tmpx,v)
1933: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1935: /* First Column */
1936: SSE_COPY_PS(XMM0,XMM6)
1937: SSE_SHUFFLE(XMM0,XMM0,0x00)
1938: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1939: SSE_SUB_PS(XMM7,XMM0)
1940:
1941: /* Second Column */
1942: SSE_COPY_PS(XMM1,XMM6)
1943: SSE_SHUFFLE(XMM1,XMM1,0x55)
1944: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1945: SSE_SUB_PS(XMM7,XMM1)
1946:
1947: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1948:
1949: /* Third Column */
1950: SSE_COPY_PS(XMM2,XMM6)
1951: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1952: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1953: SSE_SUB_PS(XMM7,XMM2)
1955: /* Fourth Column */
1956: SSE_COPY_PS(XMM3,XMM6)
1957: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1958: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1959: SSE_SUB_PS(XMM7,XMM3)
1960: SSE_INLINE_END_2
1961:
1962: v += 16;
1963: }
1964: idx = 4*i;
1965: v = aa + 16*ai[++i];
1966: PREFETCH_NTA(v);
1967: STORE_PS(tmps,XMM7);
1969: /* Promote result from float to double */
1970: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1971: }
1972: /* backward solve the upper triangular */
1973: idt = 4*(n-1);
1974: ai16 = 16*diag[n-1];
1975: v = aa + ai16 + 16;
1976: for (i=n-1; i>=0;){
1977: PREFETCH_NTA(&v[8]);
1978: vi = aj + diag[i] + 1;
1979: nz = ai[i+1] - diag[i] - 1;
1980:
1981: /* Demote accumulator from double to float */
1982: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1983: LOAD_PS(tmps,XMM7);
1985: while (nz--) {
1986: PREFETCH_NTA(&v[16]);
1987: idx = 4*(*vi++);
1989: /* Demote solution (so far) from double to float */
1990: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1992: /* 4x4 Matrix-Vector Product with negative accumulation: */
1993: SSE_INLINE_BEGIN_2(tmpx,v)
1994: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1996: /* First Column */
1997: SSE_COPY_PS(XMM0,XMM6)
1998: SSE_SHUFFLE(XMM0,XMM0,0x00)
1999: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2000: SSE_SUB_PS(XMM7,XMM0)
2002: /* Second Column */
2003: SSE_COPY_PS(XMM1,XMM6)
2004: SSE_SHUFFLE(XMM1,XMM1,0x55)
2005: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2006: SSE_SUB_PS(XMM7,XMM1)
2008: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2009:
2010: /* Third Column */
2011: SSE_COPY_PS(XMM2,XMM6)
2012: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2013: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2014: SSE_SUB_PS(XMM7,XMM2)
2016: /* Fourth Column */
2017: SSE_COPY_PS(XMM3,XMM6)
2018: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2019: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2020: SSE_SUB_PS(XMM7,XMM3)
2021: SSE_INLINE_END_2
2022: v += 16;
2023: }
2024: v = aa + ai16;
2025: ai16 = 16*diag[--i];
2026: PREFETCH_NTA(aa+ai16+16);
2027: /*
2028: Scale the result by the diagonal 4x4 block,
2029: which was inverted as part of the factorization
2030: */
2031: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2032: /* First Column */
2033: SSE_COPY_PS(XMM0,XMM7)
2034: SSE_SHUFFLE(XMM0,XMM0,0x00)
2035: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2037: /* Second Column */
2038: SSE_COPY_PS(XMM1,XMM7)
2039: SSE_SHUFFLE(XMM1,XMM1,0x55)
2040: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2041: SSE_ADD_PS(XMM0,XMM1)
2043: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2044:
2045: /* Third Column */
2046: SSE_COPY_PS(XMM2,XMM7)
2047: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2048: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2049: SSE_ADD_PS(XMM0,XMM2)
2051: /* Fourth Column */
2052: SSE_COPY_PS(XMM3,XMM7)
2053: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2054: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2055: SSE_ADD_PS(XMM0,XMM3)
2056:
2057: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2058: SSE_INLINE_END_3
2060: /* Promote solution from float to double */
2061: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2063: /* Apply reordering to t and stream into x. */
2064: /* This way, x doesn't pollute the cache. */
2065: /* Be careful with size: 2 doubles = 4 floats! */
2066: idc = 4*(*c--);
2067: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2068: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2069: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2070: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2071: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2072: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2073: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2074: SSE_INLINE_END_2
2075: v = aa + ai16 + 16;
2076: idt -= 4;
2077: }
2079: ISRestoreIndices(isrow,&rout);
2080: ISRestoreIndices(iscol,&cout);
2081: VecRestoreArray(bb,&b);
2082: VecRestoreArray(xx,&x);
2083: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2084: SSE_SCOPE_END;
2085: return(0);
2086: }
2088: #endif
2091: /*
2092: Special case where the matrix was ILU(0) factored in the natural
2093: ordering. This eliminates the need for the column and row permutation.
2094: */
2097: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2098: {
2099: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2100: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2102: PetscInt *diag = a->diag;
2103: MatScalar *aa=a->a;
2104: PetscScalar *x,*b;
2107: VecGetArray(bb,&b);
2108: VecGetArray(xx,&x);
2110: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2111: {
2112: static PetscScalar w[2000]; /* very BAD need to fix */
2113: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2114: }
2115: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2116: {
2117: static PetscScalar w[2000]; /* very BAD need to fix */
2118: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2119: }
2120: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2121: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2122: #else
2123: {
2124: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2125: MatScalar *v;
2126: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2128: /* forward solve the lower triangular */
2129: idx = 0;
2130: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2131: for (i=1; i<n; i++) {
2132: v = aa + 16*ai[i];
2133: vi = aj + ai[i];
2134: nz = diag[i] - ai[i];
2135: idx += 4;
2136: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2137: while (nz--) {
2138: jdx = 4*(*vi++);
2139: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2140: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2141: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2142: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2143: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2144: v += 16;
2145: }
2146: x[idx] = s1;
2147: x[1+idx] = s2;
2148: x[2+idx] = s3;
2149: x[3+idx] = s4;
2150: }
2151: /* backward solve the upper triangular */
2152: idt = 4*(n-1);
2153: for (i=n-1; i>=0; i--){
2154: ai16 = 16*diag[i];
2155: v = aa + ai16 + 16;
2156: vi = aj + diag[i] + 1;
2157: nz = ai[i+1] - diag[i] - 1;
2158: s1 = x[idt]; s2 = x[1+idt];
2159: s3 = x[2+idt];s4 = x[3+idt];
2160: while (nz--) {
2161: idx = 4*(*vi++);
2162: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2163: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2164: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2165: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2166: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2167: v += 16;
2168: }
2169: v = aa + ai16;
2170: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2171: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2172: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2173: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2174: idt -= 4;
2175: }
2176: }
2177: #endif
2179: VecRestoreArray(bb,&b);
2180: VecRestoreArray(xx,&x);
2181: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2182: return(0);
2183: }
2187: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2188: {
2189: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2190: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2192: PetscInt *diag = a->diag;
2193: MatScalar *aa=a->a;
2194: PetscScalar *x,*b;
2197: VecGetArray(bb,&b);
2198: VecGetArray(xx,&x);
2200: {
2201: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2202: MatScalar *v,*t=(MatScalar *)x;
2203: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2205: /* forward solve the lower triangular */
2206: idx = 0;
2207: t[0] = (MatScalar)b[0];
2208: t[1] = (MatScalar)b[1];
2209: t[2] = (MatScalar)b[2];
2210: t[3] = (MatScalar)b[3];
2211: for (i=1; i<n; i++) {
2212: v = aa + 16*ai[i];
2213: vi = aj + ai[i];
2214: nz = diag[i] - ai[i];
2215: idx += 4;
2216: s1 = (MatScalar)b[idx];
2217: s2 = (MatScalar)b[1+idx];
2218: s3 = (MatScalar)b[2+idx];
2219: s4 = (MatScalar)b[3+idx];
2220: while (nz--) {
2221: jdx = 4*(*vi++);
2222: x1 = t[jdx];
2223: x2 = t[1+jdx];
2224: x3 = t[2+jdx];
2225: x4 = t[3+jdx];
2226: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2227: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2228: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2229: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2230: v += 16;
2231: }
2232: t[idx] = s1;
2233: t[1+idx] = s2;
2234: t[2+idx] = s3;
2235: t[3+idx] = s4;
2236: }
2237: /* backward solve the upper triangular */
2238: idt = 4*(n-1);
2239: for (i=n-1; i>=0; i--){
2240: ai16 = 16*diag[i];
2241: v = aa + ai16 + 16;
2242: vi = aj + diag[i] + 1;
2243: nz = ai[i+1] - diag[i] - 1;
2244: s1 = t[idt];
2245: s2 = t[1+idt];
2246: s3 = t[2+idt];
2247: s4 = t[3+idt];
2248: while (nz--) {
2249: idx = 4*(*vi++);
2250: x1 = (MatScalar)x[idx];
2251: x2 = (MatScalar)x[1+idx];
2252: x3 = (MatScalar)x[2+idx];
2253: x4 = (MatScalar)x[3+idx];
2254: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2255: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2256: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2257: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2258: v += 16;
2259: }
2260: v = aa + ai16;
2261: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2262: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2263: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2264: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2265: idt -= 4;
2266: }
2267: }
2269: VecRestoreArray(bb,&b);
2270: VecRestoreArray(xx,&x);
2271: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2272: return(0);
2273: }
2275: #if defined (PETSC_HAVE_SSE)
2277: #include PETSC_HAVE_SSE
2280: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2281: {
2282: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2283: unsigned short *aj=(unsigned short *)a->j;
2285: int *ai=a->i,n=a->mbs,*diag = a->diag;
2286: MatScalar *aa=a->a;
2287: PetscScalar *x,*b;
2290: SSE_SCOPE_BEGIN;
2291: /*
2292: Note: This code currently uses demotion of double
2293: to float when performing the mixed-mode computation.
2294: This may not be numerically reasonable for all applications.
2295: */
2296: PREFETCH_NTA(aa+16*ai[1]);
2298: VecGetArray(bb,&b);
2299: VecGetArray(xx,&x);
2300: {
2301: /* x will first be computed in single precision then promoted inplace to double */
2302: MatScalar *v,*t=(MatScalar *)x;
2303: int nz,i,idt,ai16;
2304: unsigned int jdx,idx;
2305: unsigned short *vi;
2306: /* Forward solve the lower triangular factor. */
2308: /* First block is the identity. */
2309: idx = 0;
2310: CONVERT_DOUBLE4_FLOAT4(t,b);
2311: v = aa + 16*((unsigned int)ai[1]);
2313: for (i=1; i<n;) {
2314: PREFETCH_NTA(&v[8]);
2315: vi = aj + ai[i];
2316: nz = diag[i] - ai[i];
2317: idx += 4;
2319: /* Demote RHS from double to float. */
2320: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2321: LOAD_PS(&t[idx],XMM7);
2323: while (nz--) {
2324: PREFETCH_NTA(&v[16]);
2325: jdx = 4*((unsigned int)(*vi++));
2326:
2327: /* 4x4 Matrix-Vector product with negative accumulation: */
2328: SSE_INLINE_BEGIN_2(&t[jdx],v)
2329: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2331: /* First Column */
2332: SSE_COPY_PS(XMM0,XMM6)
2333: SSE_SHUFFLE(XMM0,XMM0,0x00)
2334: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2335: SSE_SUB_PS(XMM7,XMM0)
2337: /* Second Column */
2338: SSE_COPY_PS(XMM1,XMM6)
2339: SSE_SHUFFLE(XMM1,XMM1,0x55)
2340: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2341: SSE_SUB_PS(XMM7,XMM1)
2343: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2344:
2345: /* Third Column */
2346: SSE_COPY_PS(XMM2,XMM6)
2347: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2348: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2349: SSE_SUB_PS(XMM7,XMM2)
2351: /* Fourth Column */
2352: SSE_COPY_PS(XMM3,XMM6)
2353: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2354: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2355: SSE_SUB_PS(XMM7,XMM3)
2356: SSE_INLINE_END_2
2357:
2358: v += 16;
2359: }
2360: v = aa + 16*ai[++i];
2361: PREFETCH_NTA(v);
2362: STORE_PS(&t[idx],XMM7);
2363: }
2365: /* Backward solve the upper triangular factor.*/
2367: idt = 4*(n-1);
2368: ai16 = 16*diag[n-1];
2369: v = aa + ai16 + 16;
2370: for (i=n-1; i>=0;){
2371: PREFETCH_NTA(&v[8]);
2372: vi = aj + diag[i] + 1;
2373: nz = ai[i+1] - diag[i] - 1;
2374:
2375: LOAD_PS(&t[idt],XMM7);
2377: while (nz--) {
2378: PREFETCH_NTA(&v[16]);
2379: idx = 4*((unsigned int)(*vi++));
2381: /* 4x4 Matrix-Vector Product with negative accumulation: */
2382: SSE_INLINE_BEGIN_2(&t[idx],v)
2383: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2385: /* First Column */
2386: SSE_COPY_PS(XMM0,XMM6)
2387: SSE_SHUFFLE(XMM0,XMM0,0x00)
2388: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2389: SSE_SUB_PS(XMM7,XMM0)
2391: /* Second Column */
2392: SSE_COPY_PS(XMM1,XMM6)
2393: SSE_SHUFFLE(XMM1,XMM1,0x55)
2394: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2395: SSE_SUB_PS(XMM7,XMM1)
2397: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2398:
2399: /* Third Column */
2400: SSE_COPY_PS(XMM2,XMM6)
2401: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2402: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2403: SSE_SUB_PS(XMM7,XMM2)
2405: /* Fourth Column */
2406: SSE_COPY_PS(XMM3,XMM6)
2407: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2408: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2409: SSE_SUB_PS(XMM7,XMM3)
2410: SSE_INLINE_END_2
2411: v += 16;
2412: }
2413: v = aa + ai16;
2414: ai16 = 16*diag[--i];
2415: PREFETCH_NTA(aa+ai16+16);
2416: /*
2417: Scale the result by the diagonal 4x4 block,
2418: which was inverted as part of the factorization
2419: */
2420: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2421: /* First Column */
2422: SSE_COPY_PS(XMM0,XMM7)
2423: SSE_SHUFFLE(XMM0,XMM0,0x00)
2424: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2426: /* Second Column */
2427: SSE_COPY_PS(XMM1,XMM7)
2428: SSE_SHUFFLE(XMM1,XMM1,0x55)
2429: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2430: SSE_ADD_PS(XMM0,XMM1)
2432: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2433:
2434: /* Third Column */
2435: SSE_COPY_PS(XMM2,XMM7)
2436: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2437: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2438: SSE_ADD_PS(XMM0,XMM2)
2440: /* Fourth Column */
2441: SSE_COPY_PS(XMM3,XMM7)
2442: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2443: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2444: SSE_ADD_PS(XMM0,XMM3)
2446: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2447: SSE_INLINE_END_3
2449: v = aa + ai16 + 16;
2450: idt -= 4;
2451: }
2453: /* Convert t from single precision back to double precision (inplace)*/
2454: idt = 4*(n-1);
2455: for (i=n-1;i>=0;i--) {
2456: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2457: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2458: PetscScalar *xtemp=&x[idt];
2459: MatScalar *ttemp=&t[idt];
2460: xtemp[3] = (PetscScalar)ttemp[3];
2461: xtemp[2] = (PetscScalar)ttemp[2];
2462: xtemp[1] = (PetscScalar)ttemp[1];
2463: xtemp[0] = (PetscScalar)ttemp[0];
2464: idt -= 4;
2465: }
2467: } /* End of artificial scope. */
2468: VecRestoreArray(bb,&b);
2469: VecRestoreArray(xx,&x);
2470: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2471: SSE_SCOPE_END;
2472: return(0);
2473: }
2477: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2478: {
2479: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2480: int *aj=a->j;
2482: int *ai=a->i,n=a->mbs,*diag = a->diag;
2483: MatScalar *aa=a->a;
2484: PetscScalar *x,*b;
2487: SSE_SCOPE_BEGIN;
2488: /*
2489: Note: This code currently uses demotion of double
2490: to float when performing the mixed-mode computation.
2491: This may not be numerically reasonable for all applications.
2492: */
2493: PREFETCH_NTA(aa+16*ai[1]);
2495: VecGetArray(bb,&b);
2496: VecGetArray(xx,&x);
2497: {
2498: /* x will first be computed in single precision then promoted inplace to double */
2499: MatScalar *v,*t=(MatScalar *)x;
2500: int nz,i,idt,ai16;
2501: int jdx,idx;
2502: int *vi;
2503: /* Forward solve the lower triangular factor. */
2505: /* First block is the identity. */
2506: idx = 0;
2507: CONVERT_DOUBLE4_FLOAT4(t,b);
2508: v = aa + 16*ai[1];
2510: for (i=1; i<n;) {
2511: PREFETCH_NTA(&v[8]);
2512: vi = aj + ai[i];
2513: nz = diag[i] - ai[i];
2514: idx += 4;
2516: /* Demote RHS from double to float. */
2517: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2518: LOAD_PS(&t[idx],XMM7);
2520: while (nz--) {
2521: PREFETCH_NTA(&v[16]);
2522: jdx = 4*(*vi++);
2523: /* jdx = *vi++; */
2524:
2525: /* 4x4 Matrix-Vector product with negative accumulation: */
2526: SSE_INLINE_BEGIN_2(&t[jdx],v)
2527: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2529: /* First Column */
2530: SSE_COPY_PS(XMM0,XMM6)
2531: SSE_SHUFFLE(XMM0,XMM0,0x00)
2532: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2533: SSE_SUB_PS(XMM7,XMM0)
2535: /* Second Column */
2536: SSE_COPY_PS(XMM1,XMM6)
2537: SSE_SHUFFLE(XMM1,XMM1,0x55)
2538: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2539: SSE_SUB_PS(XMM7,XMM1)
2541: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2542:
2543: /* Third Column */
2544: SSE_COPY_PS(XMM2,XMM6)
2545: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2546: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2547: SSE_SUB_PS(XMM7,XMM2)
2549: /* Fourth Column */
2550: SSE_COPY_PS(XMM3,XMM6)
2551: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2552: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2553: SSE_SUB_PS(XMM7,XMM3)
2554: SSE_INLINE_END_2
2555:
2556: v += 16;
2557: }
2558: v = aa + 16*ai[++i];
2559: PREFETCH_NTA(v);
2560: STORE_PS(&t[idx],XMM7);
2561: }
2563: /* Backward solve the upper triangular factor.*/
2565: idt = 4*(n-1);
2566: ai16 = 16*diag[n-1];
2567: v = aa + ai16 + 16;
2568: for (i=n-1; i>=0;){
2569: PREFETCH_NTA(&v[8]);
2570: vi = aj + diag[i] + 1;
2571: nz = ai[i+1] - diag[i] - 1;
2572:
2573: LOAD_PS(&t[idt],XMM7);
2575: while (nz--) {
2576: PREFETCH_NTA(&v[16]);
2577: idx = 4*(*vi++);
2578: /* idx = *vi++; */
2580: /* 4x4 Matrix-Vector Product with negative accumulation: */
2581: SSE_INLINE_BEGIN_2(&t[idx],v)
2582: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2584: /* First Column */
2585: SSE_COPY_PS(XMM0,XMM6)
2586: SSE_SHUFFLE(XMM0,XMM0,0x00)
2587: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2588: SSE_SUB_PS(XMM7,XMM0)
2590: /* Second Column */
2591: SSE_COPY_PS(XMM1,XMM6)
2592: SSE_SHUFFLE(XMM1,XMM1,0x55)
2593: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2594: SSE_SUB_PS(XMM7,XMM1)
2596: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2597:
2598: /* Third Column */
2599: SSE_COPY_PS(XMM2,XMM6)
2600: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2601: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2602: SSE_SUB_PS(XMM7,XMM2)
2604: /* Fourth Column */
2605: SSE_COPY_PS(XMM3,XMM6)
2606: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2607: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2608: SSE_SUB_PS(XMM7,XMM3)
2609: SSE_INLINE_END_2
2610: v += 16;
2611: }
2612: v = aa + ai16;
2613: ai16 = 16*diag[--i];
2614: PREFETCH_NTA(aa+ai16+16);
2615: /*
2616: Scale the result by the diagonal 4x4 block,
2617: which was inverted as part of the factorization
2618: */
2619: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2620: /* First Column */
2621: SSE_COPY_PS(XMM0,XMM7)
2622: SSE_SHUFFLE(XMM0,XMM0,0x00)
2623: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2625: /* Second Column */
2626: SSE_COPY_PS(XMM1,XMM7)
2627: SSE_SHUFFLE(XMM1,XMM1,0x55)
2628: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2629: SSE_ADD_PS(XMM0,XMM1)
2631: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2632:
2633: /* Third Column */
2634: SSE_COPY_PS(XMM2,XMM7)
2635: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2636: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2637: SSE_ADD_PS(XMM0,XMM2)
2639: /* Fourth Column */
2640: SSE_COPY_PS(XMM3,XMM7)
2641: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2642: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2643: SSE_ADD_PS(XMM0,XMM3)
2645: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2646: SSE_INLINE_END_3
2648: v = aa + ai16 + 16;
2649: idt -= 4;
2650: }
2652: /* Convert t from single precision back to double precision (inplace)*/
2653: idt = 4*(n-1);
2654: for (i=n-1;i>=0;i--) {
2655: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2656: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2657: PetscScalar *xtemp=&x[idt];
2658: MatScalar *ttemp=&t[idt];
2659: xtemp[3] = (PetscScalar)ttemp[3];
2660: xtemp[2] = (PetscScalar)ttemp[2];
2661: xtemp[1] = (PetscScalar)ttemp[1];
2662: xtemp[0] = (PetscScalar)ttemp[0];
2663: idt -= 4;
2664: }
2666: } /* End of artificial scope. */
2667: VecRestoreArray(bb,&b);
2668: VecRestoreArray(xx,&x);
2669: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2670: SSE_SCOPE_END;
2671: return(0);
2672: }
2674: #endif
2677: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2678: {
2679: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2680: IS iscol=a->col,isrow=a->row;
2682: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2683: PetscInt *diag = a->diag;
2684: MatScalar *aa=a->a,*v;
2685: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t;
2688: VecGetArray(bb,&b);
2689: VecGetArray(xx,&x);
2690: t = a->solve_work;
2692: ISGetIndices(isrow,&rout); r = rout;
2693: ISGetIndices(iscol,&cout); c = cout + (n-1);
2695: /* forward solve the lower triangular */
2696: idx = 3*(*r++);
2697: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2698: for (i=1; i<n; i++) {
2699: v = aa + 9*ai[i];
2700: vi = aj + ai[i];
2701: nz = diag[i] - ai[i];
2702: idx = 3*(*r++);
2703: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2704: while (nz--) {
2705: idx = 3*(*vi++);
2706: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2707: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2708: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2709: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2710: v += 9;
2711: }
2712: idx = 3*i;
2713: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2714: }
2715: /* backward solve the upper triangular */
2716: for (i=n-1; i>=0; i--){
2717: v = aa + 9*diag[i] + 9;
2718: vi = aj + diag[i] + 1;
2719: nz = ai[i+1] - diag[i] - 1;
2720: idt = 3*i;
2721: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2722: while (nz--) {
2723: idx = 3*(*vi++);
2724: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2725: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2726: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2727: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2728: v += 9;
2729: }
2730: idc = 3*(*c--);
2731: v = aa + 9*diag[i];
2732: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2733: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2734: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2735: }
2736: ISRestoreIndices(isrow,&rout);
2737: ISRestoreIndices(iscol,&cout);
2738: VecRestoreArray(bb,&b);
2739: VecRestoreArray(xx,&x);
2740: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2741: return(0);
2742: }
2744: /*
2745: Special case where the matrix was ILU(0) factored in the natural
2746: ordering. This eliminates the need for the column and row permutation.
2747: */
2750: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2751: {
2752: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2753: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2755: PetscInt *diag = a->diag;
2756: MatScalar *aa=a->a,*v;
2757: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3;
2758: PetscInt jdx,idt,idx,nz,*vi,i;
2761: VecGetArray(bb,&b);
2762: VecGetArray(xx,&x);
2765: /* forward solve the lower triangular */
2766: idx = 0;
2767: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2768: for (i=1; i<n; i++) {
2769: v = aa + 9*ai[i];
2770: vi = aj + ai[i];
2771: nz = diag[i] - ai[i];
2772: idx += 3;
2773: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2774: while (nz--) {
2775: jdx = 3*(*vi++);
2776: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2777: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2778: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2779: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2780: v += 9;
2781: }
2782: x[idx] = s1;
2783: x[1+idx] = s2;
2784: x[2+idx] = s3;
2785: }
2786: /* backward solve the upper triangular */
2787: for (i=n-1; i>=0; i--){
2788: v = aa + 9*diag[i] + 9;
2789: vi = aj + diag[i] + 1;
2790: nz = ai[i+1] - diag[i] - 1;
2791: idt = 3*i;
2792: s1 = x[idt]; s2 = x[1+idt];
2793: s3 = x[2+idt];
2794: while (nz--) {
2795: idx = 3*(*vi++);
2796: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2797: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2798: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2799: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2800: v += 9;
2801: }
2802: v = aa + 9*diag[i];
2803: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2804: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2805: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2806: }
2808: VecRestoreArray(bb,&b);
2809: VecRestoreArray(xx,&x);
2810: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2811: return(0);
2812: }
2816: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2817: {
2818: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2819: IS iscol=a->col,isrow=a->row;
2821: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2822: PetscInt *diag = a->diag;
2823: MatScalar *aa=a->a,*v;
2824: PetscScalar *x,*b,s1,s2,x1,x2,*t;
2827: VecGetArray(bb,&b);
2828: VecGetArray(xx,&x);
2829: t = a->solve_work;
2831: ISGetIndices(isrow,&rout); r = rout;
2832: ISGetIndices(iscol,&cout); c = cout + (n-1);
2834: /* forward solve the lower triangular */
2835: idx = 2*(*r++);
2836: t[0] = b[idx]; t[1] = b[1+idx];
2837: for (i=1; i<n; i++) {
2838: v = aa + 4*ai[i];
2839: vi = aj + ai[i];
2840: nz = diag[i] - ai[i];
2841: idx = 2*(*r++);
2842: s1 = b[idx]; s2 = b[1+idx];
2843: while (nz--) {
2844: idx = 2*(*vi++);
2845: x1 = t[idx]; x2 = t[1+idx];
2846: s1 -= v[0]*x1 + v[2]*x2;
2847: s2 -= v[1]*x1 + v[3]*x2;
2848: v += 4;
2849: }
2850: idx = 2*i;
2851: t[idx] = s1; t[1+idx] = s2;
2852: }
2853: /* backward solve the upper triangular */
2854: for (i=n-1; i>=0; i--){
2855: v = aa + 4*diag[i] + 4;
2856: vi = aj + diag[i] + 1;
2857: nz = ai[i+1] - diag[i] - 1;
2858: idt = 2*i;
2859: s1 = t[idt]; s2 = t[1+idt];
2860: while (nz--) {
2861: idx = 2*(*vi++);
2862: x1 = t[idx]; x2 = t[1+idx];
2863: s1 -= v[0]*x1 + v[2]*x2;
2864: s2 -= v[1]*x1 + v[3]*x2;
2865: v += 4;
2866: }
2867: idc = 2*(*c--);
2868: v = aa + 4*diag[i];
2869: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2870: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2871: }
2872: ISRestoreIndices(isrow,&rout);
2873: ISRestoreIndices(iscol,&cout);
2874: VecRestoreArray(bb,&b);
2875: VecRestoreArray(xx,&x);
2876: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2877: return(0);
2878: }
2880: /*
2881: Special case where the matrix was ILU(0) factored in the natural
2882: ordering. This eliminates the need for the column and row permutation.
2883: */
2886: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2887: {
2888: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2889: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2891: PetscInt *diag = a->diag;
2892: MatScalar *aa=a->a,*v;
2893: PetscScalar *x,*b,s1,s2,x1,x2;
2894: PetscInt jdx,idt,idx,nz,*vi,i;
2897: VecGetArray(bb,&b);
2898: VecGetArray(xx,&x);
2900: /* forward solve the lower triangular */
2901: idx = 0;
2902: x[0] = b[0]; x[1] = b[1];
2903: for (i=1; i<n; i++) {
2904: v = aa + 4*ai[i];
2905: vi = aj + ai[i];
2906: nz = diag[i] - ai[i];
2907: idx += 2;
2908: s1 = b[idx];s2 = b[1+idx];
2909: while (nz--) {
2910: jdx = 2*(*vi++);
2911: x1 = x[jdx];x2 = x[1+jdx];
2912: s1 -= v[0]*x1 + v[2]*x2;
2913: s2 -= v[1]*x1 + v[3]*x2;
2914: v += 4;
2915: }
2916: x[idx] = s1;
2917: x[1+idx] = s2;
2918: }
2919: /* backward solve the upper triangular */
2920: for (i=n-1; i>=0; i--){
2921: v = aa + 4*diag[i] + 4;
2922: vi = aj + diag[i] + 1;
2923: nz = ai[i+1] - diag[i] - 1;
2924: idt = 2*i;
2925: s1 = x[idt]; s2 = x[1+idt];
2926: while (nz--) {
2927: idx = 2*(*vi++);
2928: x1 = x[idx]; x2 = x[1+idx];
2929: s1 -= v[0]*x1 + v[2]*x2;
2930: s2 -= v[1]*x1 + v[3]*x2;
2931: v += 4;
2932: }
2933: v = aa + 4*diag[i];
2934: x[idt] = v[0]*s1 + v[2]*s2;
2935: x[1+idt] = v[1]*s1 + v[3]*s2;
2936: }
2938: VecRestoreArray(bb,&b);
2939: VecRestoreArray(xx,&x);
2940: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2941: return(0);
2942: }
2946: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2947: {
2948: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2949: IS iscol=a->col,isrow=a->row;
2951: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2952: PetscInt *diag = a->diag;
2953: MatScalar *aa=a->a,*v;
2954: PetscScalar *x,*b,s1,*t;
2957: if (!n) return(0);
2959: VecGetArray(bb,&b);
2960: VecGetArray(xx,&x);
2961: t = a->solve_work;
2963: ISGetIndices(isrow,&rout); r = rout;
2964: ISGetIndices(iscol,&cout); c = cout + (n-1);
2966: /* forward solve the lower triangular */
2967: t[0] = b[*r++];
2968: for (i=1; i<n; i++) {
2969: v = aa + ai[i];
2970: vi = aj + ai[i];
2971: nz = diag[i] - ai[i];
2972: s1 = b[*r++];
2973: while (nz--) {
2974: s1 -= (*v++)*t[*vi++];
2975: }
2976: t[i] = s1;
2977: }
2978: /* backward solve the upper triangular */
2979: for (i=n-1; i>=0; i--){
2980: v = aa + diag[i] + 1;
2981: vi = aj + diag[i] + 1;
2982: nz = ai[i+1] - diag[i] - 1;
2983: s1 = t[i];
2984: while (nz--) {
2985: s1 -= (*v++)*t[*vi++];
2986: }
2987: x[*c--] = t[i] = aa[diag[i]]*s1;
2988: }
2990: ISRestoreIndices(isrow,&rout);
2991: ISRestoreIndices(iscol,&cout);
2992: VecRestoreArray(bb,&b);
2993: VecRestoreArray(xx,&x);
2994: PetscLogFlops(2*1*(a->nz) - A->n);
2995: return(0);
2996: }
2997: /*
2998: Special case where the matrix was ILU(0) factored in the natural
2999: ordering. This eliminates the need for the column and row permutation.
3000: */
3003: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3004: {
3005: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3006: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
3008: PetscInt *diag = a->diag;
3009: MatScalar *aa=a->a;
3010: PetscScalar *x,*b;
3011: PetscScalar s1,x1;
3012: MatScalar *v;
3013: PetscInt jdx,idt,idx,nz,*vi,i;
3016: VecGetArray(bb,&b);
3017: VecGetArray(xx,&x);
3019: /* forward solve the lower triangular */
3020: idx = 0;
3021: x[0] = b[0];
3022: for (i=1; i<n; i++) {
3023: v = aa + ai[i];
3024: vi = aj + ai[i];
3025: nz = diag[i] - ai[i];
3026: idx += 1;
3027: s1 = b[idx];
3028: while (nz--) {
3029: jdx = *vi++;
3030: x1 = x[jdx];
3031: s1 -= v[0]*x1;
3032: v += 1;
3033: }
3034: x[idx] = s1;
3035: }
3036: /* backward solve the upper triangular */
3037: for (i=n-1; i>=0; i--){
3038: v = aa + diag[i] + 1;
3039: vi = aj + diag[i] + 1;
3040: nz = ai[i+1] - diag[i] - 1;
3041: idt = i;
3042: s1 = x[idt];
3043: while (nz--) {
3044: idx = *vi++;
3045: x1 = x[idx];
3046: s1 -= v[0]*x1;
3047: v += 1;
3048: }
3049: v = aa + diag[i];
3050: x[idt] = v[0]*s1;
3051: }
3052: VecRestoreArray(bb,&b);
3053: VecRestoreArray(xx,&x);
3054: PetscLogFlops(2*(a->nz) - A->n);
3055: return(0);
3056: }
3058: /* ----------------------------------------------------------------*/
3059: /*
3060: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3061: except that the data structure of Mat_SeqAIJ is slightly different.
3062: Not a good example of code reuse.
3063: */
3064: EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3068: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3069: {
3070: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3071: IS isicol;
3073: PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3074: PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3075: PetscInt *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3076: PetscInt incrlev,nnz,i,bs = A->bs,bs2 = a->bs2,levels,diagonal_fill;
3077: PetscTruth col_identity,row_identity;
3078: PetscReal f;
3081: f = info->fill;
3082: levels = (PetscInt)info->levels;
3083: diagonal_fill = (PetscInt)info->diagonal_fill;
3084: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3085: ISIdentity(isrow,&row_identity);
3086: ISIdentity(iscol,&col_identity);
3088: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
3089: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3090: (*fact)->factor = FACTOR_LU;
3091: b = (Mat_SeqBAIJ*)(*fact)->data;
3092: if (!b->diag) {
3093: MatMarkDiagonal_SeqBAIJ(*fact);
3094: }
3095: MatMissingDiagonal_SeqBAIJ(*fact);
3096: b->row = isrow;
3097: b->col = iscol;
3098: PetscObjectReference((PetscObject)isrow);
3099: PetscObjectReference((PetscObject)iscol);
3100: b->icol = isicol;
3101: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3102: PetscMalloc(((*fact)->m+1+(*fact)->bs)*sizeof(PetscScalar),&b->solve_work);
3103: } else { /* general case perform the symbolic factorization */
3104: ISGetIndices(isrow,&r);
3105: ISGetIndices(isicol,&ic);
3107: /* get new row pointers */
3108: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3109: ainew[0] = 0;
3110: /* don't know how many column pointers are needed so estimate */
3111: jmax = (PetscInt)(f*ai[n] + 1);
3112: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3113: /* ajfill is level of fill for each fill entry */
3114: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3115: /* fill is a linked list of nonzeros in active row */
3116: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3117: /* im is level for each filled value */
3118: PetscMalloc((n+1)*sizeof(PetscInt),&im);
3119: /* dloc is location of diagonal in factor */
3120: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3121: dloc[0] = 0;
3122: for (prow=0; prow<n; prow++) {
3124: /* copy prow into linked list */
3125: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3126: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3127: xi = aj + ai[r[prow]];
3128: fill[n] = n;
3129: fill[prow] = -1; /* marker for diagonal entry */
3130: while (nz--) {
3131: fm = n;
3132: idx = ic[*xi++];
3133: do {
3134: m = fm;
3135: fm = fill[m];
3136: } while (fm < idx);
3137: fill[m] = idx;
3138: fill[idx] = fm;
3139: im[idx] = 0;
3140: }
3142: /* make sure diagonal entry is included */
3143: if (diagonal_fill && fill[prow] == -1) {
3144: fm = n;
3145: while (fill[fm] < prow) fm = fill[fm];
3146: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3147: fill[fm] = prow;
3148: im[prow] = 0;
3149: nzf++;
3150: dcount++;
3151: }
3153: nzi = 0;
3154: row = fill[n];
3155: while (row < prow) {
3156: incrlev = im[row] + 1;
3157: nz = dloc[row];
3158: xi = ajnew + ainew[row] + nz + 1;
3159: flev = ajfill + ainew[row] + nz + 1;
3160: nnz = ainew[row+1] - ainew[row] - nz - 1;
3161: fm = row;
3162: while (nnz-- > 0) {
3163: idx = *xi++;
3164: if (*flev + incrlev > levels) {
3165: flev++;
3166: continue;
3167: }
3168: do {
3169: m = fm;
3170: fm = fill[m];
3171: } while (fm < idx);
3172: if (fm != idx) {
3173: im[idx] = *flev + incrlev;
3174: fill[m] = idx;
3175: fill[idx] = fm;
3176: fm = idx;
3177: nzf++;
3178: } else {
3179: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3180: }
3181: flev++;
3182: }
3183: row = fill[row];
3184: nzi++;
3185: }
3186: /* copy new filled row into permanent storage */
3187: ainew[prow+1] = ainew[prow] + nzf;
3188: if (ainew[prow+1] > jmax) {
3190: /* estimate how much additional space we will need */
3191: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3192: /* just double the memory each time */
3193: PetscInt maxadd = jmax;
3194: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3195: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3196: jmax += maxadd;
3198: /* allocate a longer ajnew and ajfill */
3199: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3200: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3201: PetscFree(ajnew);
3202: ajnew = xi;
3203: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3204: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3205: PetscFree(ajfill);
3206: ajfill = xi;
3207: reallocate++; /* count how many reallocations are needed */
3208: }
3209: xi = ajnew + ainew[prow];
3210: flev = ajfill + ainew[prow];
3211: dloc[prow] = nzi;
3212: fm = fill[n];
3213: while (nzf--) {
3214: *xi++ = fm;
3215: *flev++ = im[fm];
3216: fm = fill[fm];
3217: }
3218: /* make sure row has diagonal entry */
3219: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3220: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3221: try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3222: }
3223: }
3224: PetscFree(ajfill);
3225: ISRestoreIndices(isrow,&r);
3226: ISRestoreIndices(isicol,&ic);
3227: PetscFree(fill);
3228: PetscFree(im);
3230: {
3231: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3232: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocate,f,af);
3233: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3234: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3235: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3236: if (diagonal_fill) {
3237: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %D missing diagonals",dcount);
3238: }
3239: }
3241: /* put together the new matrix */
3242: MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);
3243: MatSetType(*fact,A->type_name);
3244: MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);
3245: PetscLogObjectParent(*fact,isicol);
3246: b = (Mat_SeqBAIJ*)(*fact)->data;
3247: PetscFree(b->imax);
3248: b->singlemalloc = PETSC_FALSE;
3249: len = bs2*ainew[n]*sizeof(MatScalar);
3250: /* the next line frees the default space generated by the Create() */
3251: PetscFree(b->a);
3252: PetscFree(b->ilen);
3253: PetscMalloc(len,&b->a);
3254: b->j = ajnew;
3255: b->i = ainew;
3256: for (i=0; i<n; i++) dloc[i] += ainew[i];
3257: b->diag = dloc;
3258: b->ilen = 0;
3259: b->imax = 0;
3260: b->row = isrow;
3261: b->col = iscol;
3262: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263: PetscObjectReference((PetscObject)isrow);
3264: PetscObjectReference((PetscObject)iscol);
3265: b->icol = isicol;
3266: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3267: /* In b structure: Free imax, ilen, old a, old j.
3268: Allocate dloc, solve_work, new a, new j */
3269: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3270: b->maxnz = b->nz = ainew[n];
3271: (*fact)->factor = FACTOR_LU;
3273: (*fact)->info.factor_mallocs = reallocate;
3274: (*fact)->info.fill_ratio_given = f;
3275: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276: }
3278: if (row_identity && col_identity) {
3279: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3280: }
3281: return(0);
3282: }
3286: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3287: {
3288: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3289: /* int i,*AJ=a->j,nz=a->nz; */
3291: /* Undo Column scaling */
3292: /* while (nz--) { */
3293: /* AJ[i] = AJ[i]/4; */
3294: /* } */
3295: /* This should really invoke a push/pop logic, but we don't have that yet. */
3296: A->ops->setunfactored = PETSC_NULL;
3297: return(0);
3298: }
3302: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3303: {
3304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3305: PetscInt *AJ=a->j,nz=a->nz;
3306: unsigned short *aj=(unsigned short *)AJ;
3308: /* Is this really necessary? */
3309: while (nz--) {
3310: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3311: }
3312: A->ops->setunfactored = PETSC_NULL;
3313: return(0);
3314: }
3318: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3319: {
3320: /*
3321: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3322: with natural ordering
3323: */
3325: inA->ops->solve = MatSolve_SeqBAIJ_Update;
3326: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
3327: switch (inA->bs) {
3328: case 1:
3329: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3330: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3331: break;
3332: case 2:
3333: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3334: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3335: break;
3336: case 3:
3337: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3338: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3339: break;
3340: case 4:
3341: #if defined(PETSC_USE_MAT_SINGLE)
3342: {
3343: PetscTruth sse_enabled_local;
3345: PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3346: if (sse_enabled_local) {
3347: # if defined(PETSC_HAVE_SSE)
3348: int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3349: if (n==(unsigned short)n) {
3350: unsigned short *aj=(unsigned short *)AJ;
3351: for (i=0;i<nz;i++) {
3352: aj[i] = (unsigned short)AJ[i];
3353: }
3354: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3355: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3356: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3357: } else {
3358: /* Scale the column indices for easier indexing in MatSolve. */
3359: /* for (i=0;i<nz;i++) { */
3360: /* AJ[i] = AJ[i]*4; */
3361: /* } */
3362: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3363: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3364: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3365: }
3366: # else
3367: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
3368: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3369: # endif
3370: } else {
3371: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3372: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3373: }
3374: }
3375: #else
3376: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3377: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3378: #endif
3379: break;
3380: case 5:
3381: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3382: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3383: break;
3384: case 6:
3385: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3386: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3387: break;
3388: case 7:
3389: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3390: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3391: break;
3392: }
3393: return(0);
3394: }
3398: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3399: {
3400: /*
3401: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3402: with natural ordering
3403: */
3404: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3405: IS row = a->row, col = a->col;
3406: PetscTruth row_identity, col_identity;
3407: PetscTruth use_natural;
3412: use_natural = PETSC_FALSE;
3413: if (row && col) {
3414: ISIdentity(row,&row_identity);
3415: ISIdentity(col,&col_identity);
3417: if (row_identity && col_identity) {
3418: use_natural = PETSC_TRUE;
3419: }
3420: } else {
3421: use_natural = PETSC_TRUE;
3422: }
3424: switch (A->bs) {
3425: case 1:
3426: if (use_natural) {
3427: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
3428: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3429: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3430: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3431: } else {
3432: A->ops->solve = MatSolve_SeqBAIJ_1;
3433: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
3434: }
3435: break;
3436: case 2:
3437: if (use_natural) {
3438: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
3439: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3440: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3441: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3442: } else {
3443: A->ops->solve = MatSolve_SeqBAIJ_2;
3444: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
3445: }
3446: break;
3447: case 3:
3448: if (use_natural) {
3449: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
3450: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3451: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3452: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3453: } else {
3454: A->ops->solve = MatSolve_SeqBAIJ_3;
3455: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
3456: }
3457: break;
3458: case 4:
3459: {
3460: PetscTruth sse_enabled_local;
3461: PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3462: if (use_natural) {
3463: #if defined(PETSC_USE_MAT_SINGLE)
3464: if (sse_enabled_local) { /* Natural + Single + SSE */
3465: # if defined(PETSC_HAVE_SSE)
3466: PetscInt n=a->mbs;
3467: if (n==(unsigned short)n) {
3468: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3469: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3470: } else {
3471: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3472: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3473: }
3474: # else
3475: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3476: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3477: # endif
3478: } else { /* Natural + Single */
3479: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3480: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3481: }
3482: #else
3483: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
3484: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3485: #endif
3486: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3487: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3488: } else { /* Arbitrary ordering */
3489: #if defined(PETSC_USE_MAT_SINGLE)
3490: if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3491: # if defined(PETSC_HAVE_SSE)
3492: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
3493: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3494: # else
3495: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3496: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3497: # endif
3498: } else { /* Arbitrary + Single */
3499: A->ops->solve = MatSolve_SeqBAIJ_4_Demotion;
3500: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3501: }
3502: #else
3503: A->ops->solve = MatSolve_SeqBAIJ_4;
3504: #endif
3505: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
3506: }
3507: }
3508: break;
3509: case 5:
3510: if (use_natural) {
3511: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
3512: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3513: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3514: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3515: } else {
3516: A->ops->solve = MatSolve_SeqBAIJ_5;
3517: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
3518: }
3519: break;
3520: case 6:
3521: if (use_natural) {
3522: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
3523: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3524: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3525: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3526: } else {
3527: A->ops->solve = MatSolve_SeqBAIJ_6;
3528: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
3529: }
3530: break;
3531: case 7:
3532: if (use_natural) {
3533: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
3534: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3535: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3536: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3537: } else {
3538: A->ops->solve = MatSolve_SeqBAIJ_7;
3539: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3540: }
3541: break;
3542: default:
3543: A->ops->solve = MatSolve_SeqBAIJ_N;
3544: break;
3545: }
3546: return(0);
3547: }
3551: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3552: {
3556: MatSeqBAIJ_UpdateSolvers(A);
3557: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3558: (*A->ops->solve)(A,x,y);
3559: } else {
3560: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3561: }
3562: return(0);
3563: }
3567: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3568: {
3572: MatSeqBAIJ_UpdateSolvers(A);
3573: (*A->ops->solvetranspose)(A,x,y);
3574: return(0);
3575: }