Actual source code: inode.c
petsc-3.6.0 2015-06-09
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc1(n+1,&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds the symmetric version of the nonzero structure.
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc1(nslim_col+1,&tns);
81: PetscMalloc1(n+1,&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1) {
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc1(nslim_row+1,&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc1(nslim_row+1,&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: col = *j++ + ishift;
101: i2 = tvc[col];
102: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103: ia[i1+1]++;
104: ia[i2+1]++;
105: i2++; /* Start col of next node */
106: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107: i2 = tvc[col];
108: }
109: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
110: }
112: /* shift ia[i] to point to next row */
113: for (i1=1; i1<nslim_row+1; i1++) {
114: row = ia[i1-1];
115: ia[i1] += row;
116: work[i1-1] = row - oshift;
117: }
119: /* allocate space for column pointers */
120: nz = ia[nslim_row] + (!ishift);
121: PetscMalloc1(nz,&ja);
122: *jja = ja;
124: /* loop over lower triangular part putting into ja */
125: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126: j = aj + ai[row] + ishift;
127: jmax = aj + ai[row+1] + ishift;
128: col = *j++ + ishift;
129: i2 = tvc[col];
130: while (i2<i1 && j<jmax) {
131: ja[work[i2]++] = i1 + oshift;
132: ja[work[i1]++] = i2 + oshift;
133: ++i2;
134: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135: i2 = tvc[col];
136: }
137: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
139: }
140: PetscFree(work);
141: PetscFree(tns);
142: PetscFree(tvc);
143: return(0);
144: }
146: /*
147: This builds the nonsymmetric version of the nonzero structure.
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
155: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156: PetscInt *tns,*tvc,nsz,i1,i2;
157: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160: nslim_row = a->inode.node_count;
161: n = A->cmap->n;
163: /* Create The column_inode for this matrix */
164: Mat_CreateColInode(A,&nslim_col,&ns_col);
166: /* allocate space for reformated column_inode structure */
167: PetscMalloc1(nslim_col +1,&tns);
168: PetscMalloc1(n + 1,&tvc);
169: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171: for (i1=0,col=0; i1<nslim_col; ++i1) {
172: nsz = ns_col[i1];
173: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174: }
175: /* allocate space for row pointers */
176: PetscMalloc1(nslim_row+1,&ia);
177: *iia = ia;
178: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179: PetscMalloc1(nslim_row+1,&work);
181: /* determine the number of columns in each row */
182: ia[0] = oshift;
183: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184: j = aj + ai[row] + ishift;
185: col = *j++ + ishift;
186: i2 = tvc[col];
187: nz = ai[row+1] - ai[row];
188: while (nz-- > 0) { /* off-diagonal elemets */
189: ia[i1+1]++;
190: i2++; /* Start col of next node */
191: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192: if (nz > 0) i2 = tvc[col];
193: }
194: }
196: /* shift ia[i] to point to next row */
197: for (i1=1; i1<nslim_row+1; i1++) {
198: row = ia[i1-1];
199: ia[i1] += row;
200: work[i1-1] = row - oshift;
201: }
203: /* allocate space for column pointers */
204: nz = ia[nslim_row] + (!ishift);
205: PetscMalloc1(nz,&ja);
206: *jja = ja;
208: /* loop over matrix putting into ja */
209: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210: j = aj + ai[row] + ishift;
211: col = *j++ + ishift;
212: i2 = tvc[col];
213: nz = ai[row+1] - ai[row];
214: while (nz-- > 0) {
215: ja[work[i1]++] = i2 + oshift;
216: ++i2;
217: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218: if (nz > 0) i2 = tvc[col];
219: }
220: }
221: PetscFree(ns_col);
222: PetscFree(work);
223: PetscFree(tns);
224: PetscFree(tvc);
225: return(0);
226: }
230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
231: {
232: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
236: *n = a->inode.node_count;
237: if (!ia) return(0);
238: if (!blockcompressed) {
239: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240: } else if (symmetric) {
241: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242: } else {
243: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244: }
245: return(0);
246: }
250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
251: {
255: if (!ia) return(0);
257: if (!blockcompressed) {
258: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259: } else {
260: PetscFree(*ia);
261: PetscFree(*ja);
262: }
263: return(0);
264: }
266: /* ----------------------------------------------------------- */
270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
274: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
278: nslim_row = a->inode.node_count;
279: n = A->cmap->n;
281: /* Create The column_inode for this matrix */
282: Mat_CreateColInode(A,&nslim_col,&ns_col);
284: /* allocate space for reformated column_inode structure */
285: PetscMalloc1(nslim_col + 1,&tns);
286: PetscMalloc1(n + 1,&tvc);
287: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
289: for (i1=0,col=0; i1<nslim_col; ++i1) {
290: nsz = ns_col[i1];
291: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292: }
293: /* allocate space for column pointers */
294: PetscMalloc1(nslim_col+1,&ia);
295: *iia = ia;
296: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297: PetscMalloc1(nslim_col+1,&work);
299: /* determine the number of columns in each row */
300: ia[0] = oshift;
301: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302: j = aj + ai[row] + ishift;
303: col = *j++ + ishift;
304: i2 = tvc[col];
305: nz = ai[row+1] - ai[row];
306: while (nz-- > 0) { /* off-diagonal elemets */
307: /* ia[i1+1]++; */
308: ia[i2+1]++;
309: i2++;
310: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311: if (nz > 0) i2 = tvc[col];
312: }
313: }
315: /* shift ia[i] to point to next col */
316: for (i1=1; i1<nslim_col+1; i1++) {
317: col = ia[i1-1];
318: ia[i1] += col;
319: work[i1-1] = col - oshift;
320: }
322: /* allocate space for column pointers */
323: nz = ia[nslim_col] + (!ishift);
324: PetscMalloc1(nz,&ja);
325: *jja = ja;
327: /* loop over matrix putting into ja */
328: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329: j = aj + ai[row] + ishift;
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
351: {
355: Mat_CreateColInode(A,n,NULL);
356: if (!ia) return(0);
358: if (!blockcompressed) {
359: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360: } else if (symmetric) {
361: /* Since the indices are symmetric it does'nt matter */
362: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363: } else {
364: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365: }
366: return(0);
367: }
371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
372: {
376: if (!ia) return(0);
377: if (!blockcompressed) {
378: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379: } else {
380: PetscFree(*ia);
381: PetscFree(*ja);
382: }
383: return(0);
384: }
386: /* ----------------------------------------------------------- */
390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
393: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394: PetscScalar *y;
395: const PetscScalar *x;
396: const MatScalar *v1,*v2,*v3,*v4,*v5;
397: PetscErrorCode ierr;
398: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399: const PetscInt *idx,*ns,*ii;
401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif
406: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407: node_max = a->inode.node_count;
408: ns = a->inode.size; /* Node Size array */
409: VecGetArrayRead(xx,&x);
410: VecGetArray(yy,&y);
411: idx = a->j;
412: v1 = a->a;
413: ii = a->i;
415: for (i = 0,row = 0; i< node_max; ++i) {
416: nsz = ns[i];
417: n = ii[1] - ii[0];
418: nonzerorow += (n>0)*nsz;
419: ii += nsz;
420: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
421: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
422: sz = n; /* No of non zeros in this row */
423: /* Switch on the size of Node */
424: switch (nsz) { /* Each loop in 'case' is unrolled */
425: case 1:
426: sum1 = 0.;
428: for (n = 0; n< sz-1; n+=2) {
429: i1 = idx[0]; /* The instructions are ordered to */
430: i2 = idx[1]; /* make the compiler's job easy */
431: idx += 2;
432: tmp0 = x[i1];
433: tmp1 = x[i2];
434: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435: }
437: if (n == sz-1) { /* Take care of the last nonzero */
438: tmp0 = x[*idx++];
439: sum1 += *v1++ *tmp0;
440: }
441: y[row++]=sum1;
442: break;
443: case 2:
444: sum1 = 0.;
445: sum2 = 0.;
446: v2 = v1 + n;
448: for (n = 0; n< sz-1; n+=2) {
449: i1 = idx[0];
450: i2 = idx[1];
451: idx += 2;
452: tmp0 = x[i1];
453: tmp1 = x[i2];
454: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456: }
457: if (n == sz-1) {
458: tmp0 = x[*idx++];
459: sum1 += *v1++ * tmp0;
460: sum2 += *v2++ * tmp0;
461: }
462: y[row++]=sum1;
463: y[row++]=sum2;
464: v1 =v2; /* Since the next block to be processed starts there*/
465: idx +=sz;
466: break;
467: case 3:
468: sum1 = 0.;
469: sum2 = 0.;
470: sum3 = 0.;
471: v2 = v1 + n;
472: v3 = v2 + n;
474: for (n = 0; n< sz-1; n+=2) {
475: i1 = idx[0];
476: i2 = idx[1];
477: idx += 2;
478: tmp0 = x[i1];
479: tmp1 = x[i2];
480: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483: }
484: if (n == sz-1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: }
490: y[row++]=sum1;
491: y[row++]=sum2;
492: y[row++]=sum3;
493: v1 =v3; /* Since the next block to be processed starts there*/
494: idx +=2*sz;
495: break;
496: case 4:
497: sum1 = 0.;
498: sum2 = 0.;
499: sum3 = 0.;
500: sum4 = 0.;
501: v2 = v1 + n;
502: v3 = v2 + n;
503: v4 = v3 + n;
505: for (n = 0; n< sz-1; n+=2) {
506: i1 = idx[0];
507: i2 = idx[1];
508: idx += 2;
509: tmp0 = x[i1];
510: tmp1 = x[i2];
511: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515: }
516: if (n == sz-1) {
517: tmp0 = x[*idx++];
518: sum1 += *v1++ * tmp0;
519: sum2 += *v2++ * tmp0;
520: sum3 += *v3++ * tmp0;
521: sum4 += *v4++ * tmp0;
522: }
523: y[row++]=sum1;
524: y[row++]=sum2;
525: y[row++]=sum3;
526: y[row++]=sum4;
527: v1 =v4; /* Since the next block to be processed starts there*/
528: idx +=3*sz;
529: break;
530: case 5:
531: sum1 = 0.;
532: sum2 = 0.;
533: sum3 = 0.;
534: sum4 = 0.;
535: sum5 = 0.;
536: v2 = v1 + n;
537: v3 = v2 + n;
538: v4 = v3 + n;
539: v5 = v4 + n;
541: for (n = 0; n<sz-1; n+=2) {
542: i1 = idx[0];
543: i2 = idx[1];
544: idx += 2;
545: tmp0 = x[i1];
546: tmp1 = x[i2];
547: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552: }
553: if (n == sz-1) {
554: tmp0 = x[*idx++];
555: sum1 += *v1++ * tmp0;
556: sum2 += *v2++ * tmp0;
557: sum3 += *v3++ * tmp0;
558: sum4 += *v4++ * tmp0;
559: sum5 += *v5++ * tmp0;
560: }
561: y[row++]=sum1;
562: y[row++]=sum2;
563: y[row++]=sum3;
564: y[row++]=sum4;
565: y[row++]=sum5;
566: v1 =v5; /* Since the next block to be processed starts there */
567: idx +=4*sz;
568: break;
569: default:
570: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571: }
572: }
573: VecRestoreArrayRead(xx,&x);
574: VecRestoreArray(yy,&y);
575: PetscLogFlops(2.0*a->nz - nonzerorow);
576: return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
585: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586: const MatScalar *v1,*v2,*v3,*v4,*v5;
587: const PetscScalar *x;
588: PetscScalar *y,*z,*zt;
589: PetscErrorCode ierr;
590: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
591: const PetscInt *idx,*ns,*ii;
594: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595: node_max = a->inode.node_count;
596: ns = a->inode.size; /* Node Size array */
598: VecGetArrayRead(xx,&x);
599: VecGetArray(yy,&y);
600: if (zz != yy) {
601: VecGetArray(zz,&z);
602: } else {
603: z = y;
604: }
605: zt = z;
607: idx = a->j;
608: v1 = a->a;
609: ii = a->i;
611: for (i = 0,row = 0; i< node_max; ++i) {
612: nsz = ns[i];
613: n = ii[1] - ii[0];
614: ii += nsz;
615: sz = n; /* No of non zeros in this row */
616: /* Switch on the size of Node */
617: switch (nsz) { /* Each loop in 'case' is unrolled */
618: case 1:
619: sum1 = *zt++;
621: for (n = 0; n< sz-1; n+=2) {
622: i1 = idx[0]; /* The instructions are ordered to */
623: i2 = idx[1]; /* make the compiler's job easy */
624: idx += 2;
625: tmp0 = x[i1];
626: tmp1 = x[i2];
627: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628: }
630: if (n == sz-1) { /* Take care of the last nonzero */
631: tmp0 = x[*idx++];
632: sum1 += *v1++ * tmp0;
633: }
634: y[row++]=sum1;
635: break;
636: case 2:
637: sum1 = *zt++;
638: sum2 = *zt++;
639: v2 = v1 + n;
641: for (n = 0; n< sz-1; n+=2) {
642: i1 = idx[0];
643: i2 = idx[1];
644: idx += 2;
645: tmp0 = x[i1];
646: tmp1 = x[i2];
647: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649: }
650: if (n == sz-1) {
651: tmp0 = x[*idx++];
652: sum1 += *v1++ * tmp0;
653: sum2 += *v2++ * tmp0;
654: }
655: y[row++]=sum1;
656: y[row++]=sum2;
657: v1 =v2; /* Since the next block to be processed starts there*/
658: idx +=sz;
659: break;
660: case 3:
661: sum1 = *zt++;
662: sum2 = *zt++;
663: sum3 = *zt++;
664: v2 = v1 + n;
665: v3 = v2 + n;
667: for (n = 0; n< sz-1; n+=2) {
668: i1 = idx[0];
669: i2 = idx[1];
670: idx += 2;
671: tmp0 = x[i1];
672: tmp1 = x[i2];
673: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676: }
677: if (n == sz-1) {
678: tmp0 = x[*idx++];
679: sum1 += *v1++ * tmp0;
680: sum2 += *v2++ * tmp0;
681: sum3 += *v3++ * tmp0;
682: }
683: y[row++]=sum1;
684: y[row++]=sum2;
685: y[row++]=sum3;
686: v1 =v3; /* Since the next block to be processed starts there*/
687: idx +=2*sz;
688: break;
689: case 4:
690: sum1 = *zt++;
691: sum2 = *zt++;
692: sum3 = *zt++;
693: sum4 = *zt++;
694: v2 = v1 + n;
695: v3 = v2 + n;
696: v4 = v3 + n;
698: for (n = 0; n< sz-1; n+=2) {
699: i1 = idx[0];
700: i2 = idx[1];
701: idx += 2;
702: tmp0 = x[i1];
703: tmp1 = x[i2];
704: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708: }
709: if (n == sz-1) {
710: tmp0 = x[*idx++];
711: sum1 += *v1++ * tmp0;
712: sum2 += *v2++ * tmp0;
713: sum3 += *v3++ * tmp0;
714: sum4 += *v4++ * tmp0;
715: }
716: y[row++]=sum1;
717: y[row++]=sum2;
718: y[row++]=sum3;
719: y[row++]=sum4;
720: v1 =v4; /* Since the next block to be processed starts there*/
721: idx +=3*sz;
722: break;
723: case 5:
724: sum1 = *zt++;
725: sum2 = *zt++;
726: sum3 = *zt++;
727: sum4 = *zt++;
728: sum5 = *zt++;
729: v2 = v1 + n;
730: v3 = v2 + n;
731: v4 = v3 + n;
732: v5 = v4 + n;
734: for (n = 0; n<sz-1; n+=2) {
735: i1 = idx[0];
736: i2 = idx[1];
737: idx += 2;
738: tmp0 = x[i1];
739: tmp1 = x[i2];
740: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745: }
746: if (n == sz-1) {
747: tmp0 = x[*idx++];
748: sum1 += *v1++ * tmp0;
749: sum2 += *v2++ * tmp0;
750: sum3 += *v3++ * tmp0;
751: sum4 += *v4++ * tmp0;
752: sum5 += *v5++ * tmp0;
753: }
754: y[row++]=sum1;
755: y[row++]=sum2;
756: y[row++]=sum3;
757: y[row++]=sum4;
758: y[row++]=sum5;
759: v1 =v5; /* Since the next block to be processed starts there */
760: idx +=4*sz;
761: break;
762: default:
763: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764: }
765: }
766: VecRestoreArrayRead(xx,&x);
767: VecRestoreArray(yy,&y);
768: if (zz != yy) {
769: VecRestoreArray(zz,&z);
770: }
771: PetscLogFlops(2.0*a->nz);
772: return(0);
773: }
775: /* ----------------------------------------------------------- */
778: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779: {
780: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
781: IS iscol = a->col,isrow = a->row;
782: PetscErrorCode ierr;
783: const PetscInt *r,*c,*rout,*cout;
784: PetscInt i,j,n = A->rmap->n,nz;
785: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
786: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
788: PetscScalar sum1,sum2,sum3,sum4,sum5;
789: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790: const PetscScalar *b;
793: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794: node_max = a->inode.node_count;
795: ns = a->inode.size; /* Node Size array */
797: VecGetArrayRead(bb,&b);
798: VecGetArray(xx,&x);
799: tmp = a->solve_work;
801: ISGetIndices(isrow,&rout); r = rout;
802: ISGetIndices(iscol,&cout); c = cout + (n-1);
804: /* forward solve the lower triangular */
805: tmps = tmp;
806: aa = a_a;
807: aj = a_j;
808: ad = a->diag;
810: for (i = 0,row = 0; i< node_max; ++i) {
811: nsz = ns[i];
812: aii = ai[row];
813: v1 = aa + aii;
814: vi = aj + aii;
815: nz = ad[row]- aii;
816: if (i < node_max-1) {
817: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818: * but our indexing to determine it's size could. */
819: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823: }
825: switch (nsz) { /* Each loop in 'case' is unrolled */
826: case 1:
827: sum1 = b[*r++];
828: for (j=0; j<nz-1; j+=2) {
829: i0 = vi[0];
830: i1 = vi[1];
831: vi +=2;
832: tmp0 = tmps[i0];
833: tmp1 = tmps[i1];
834: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835: }
836: if (j == nz-1) {
837: tmp0 = tmps[*vi++];
838: sum1 -= *v1++ *tmp0;
839: }
840: tmp[row++]=sum1;
841: break;
842: case 2:
843: sum1 = b[*r++];
844: sum2 = b[*r++];
845: v2 = aa + ai[row+1];
847: for (j=0; j<nz-1; j+=2) {
848: i0 = vi[0];
849: i1 = vi[1];
850: vi +=2;
851: tmp0 = tmps[i0];
852: tmp1 = tmps[i1];
853: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855: }
856: if (j == nz-1) {
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: }
861: sum2 -= *v2++ *sum1;
862: tmp[row++]=sum1;
863: tmp[row++]=sum2;
864: break;
865: case 3:
866: sum1 = b[*r++];
867: sum2 = b[*r++];
868: sum3 = b[*r++];
869: v2 = aa + ai[row+1];
870: v3 = aa + ai[row+2];
872: for (j=0; j<nz-1; j+=2) {
873: i0 = vi[0];
874: i1 = vi[1];
875: vi +=2;
876: tmp0 = tmps[i0];
877: tmp1 = tmps[i1];
878: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881: }
882: if (j == nz-1) {
883: tmp0 = tmps[*vi++];
884: sum1 -= *v1++ *tmp0;
885: sum2 -= *v2++ *tmp0;
886: sum3 -= *v3++ *tmp0;
887: }
888: sum2 -= *v2++ * sum1;
889: sum3 -= *v3++ * sum1;
890: sum3 -= *v3++ * sum2;
892: tmp[row++]=sum1;
893: tmp[row++]=sum2;
894: tmp[row++]=sum3;
895: break;
897: case 4:
898: sum1 = b[*r++];
899: sum2 = b[*r++];
900: sum3 = b[*r++];
901: sum4 = b[*r++];
902: v2 = aa + ai[row+1];
903: v3 = aa + ai[row+2];
904: v4 = aa + ai[row+3];
906: for (j=0; j<nz-1; j+=2) {
907: i0 = vi[0];
908: i1 = vi[1];
909: vi +=2;
910: tmp0 = tmps[i0];
911: tmp1 = tmps[i1];
912: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916: }
917: if (j == nz-1) {
918: tmp0 = tmps[*vi++];
919: sum1 -= *v1++ *tmp0;
920: sum2 -= *v2++ *tmp0;
921: sum3 -= *v3++ *tmp0;
922: sum4 -= *v4++ *tmp0;
923: }
924: sum2 -= *v2++ * sum1;
925: sum3 -= *v3++ * sum1;
926: sum4 -= *v4++ * sum1;
927: sum3 -= *v3++ * sum2;
928: sum4 -= *v4++ * sum2;
929: sum4 -= *v4++ * sum3;
931: tmp[row++]=sum1;
932: tmp[row++]=sum2;
933: tmp[row++]=sum3;
934: tmp[row++]=sum4;
935: break;
936: case 5:
937: sum1 = b[*r++];
938: sum2 = b[*r++];
939: sum3 = b[*r++];
940: sum4 = b[*r++];
941: sum5 = b[*r++];
942: v2 = aa + ai[row+1];
943: v3 = aa + ai[row+2];
944: v4 = aa + ai[row+3];
945: v5 = aa + ai[row+4];
947: for (j=0; j<nz-1; j+=2) {
948: i0 = vi[0];
949: i1 = vi[1];
950: vi +=2;
951: tmp0 = tmps[i0];
952: tmp1 = tmps[i1];
953: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958: }
959: if (j == nz-1) {
960: tmp0 = tmps[*vi++];
961: sum1 -= *v1++ *tmp0;
962: sum2 -= *v2++ *tmp0;
963: sum3 -= *v3++ *tmp0;
964: sum4 -= *v4++ *tmp0;
965: sum5 -= *v5++ *tmp0;
966: }
968: sum2 -= *v2++ * sum1;
969: sum3 -= *v3++ * sum1;
970: sum4 -= *v4++ * sum1;
971: sum5 -= *v5++ * sum1;
972: sum3 -= *v3++ * sum2;
973: sum4 -= *v4++ * sum2;
974: sum5 -= *v5++ * sum2;
975: sum4 -= *v4++ * sum3;
976: sum5 -= *v5++ * sum3;
977: sum5 -= *v5++ * sum4;
979: tmp[row++]=sum1;
980: tmp[row++]=sum2;
981: tmp[row++]=sum3;
982: tmp[row++]=sum4;
983: tmp[row++]=sum5;
984: break;
985: default:
986: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987: }
988: }
989: /* backward solve the upper triangular */
990: for (i=node_max -1,row = n-1; i>=0; i--) {
991: nsz = ns[i];
992: aii = ai[row+1] -1;
993: v1 = aa + aii;
994: vi = aj + aii;
995: nz = aii- ad[row];
996: switch (nsz) { /* Each loop in 'case' is unrolled */
997: case 1:
998: sum1 = tmp[row];
1000: for (j=nz; j>1; j-=2) {
1001: vi -=2;
1002: i0 = vi[2];
1003: i1 = vi[1];
1004: tmp0 = tmps[i0];
1005: tmp1 = tmps[i1];
1006: v1 -= 2;
1007: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008: }
1009: if (j==1) {
1010: tmp0 = tmps[*vi--];
1011: sum1 -= *v1-- * tmp0;
1012: }
1013: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014: break;
1015: case 2:
1016: sum1 = tmp[row];
1017: sum2 = tmp[row -1];
1018: v2 = aa + ai[row]-1;
1019: for (j=nz; j>1; j-=2) {
1020: vi -=2;
1021: i0 = vi[2];
1022: i1 = vi[1];
1023: tmp0 = tmps[i0];
1024: tmp1 = tmps[i1];
1025: v1 -= 2;
1026: v2 -= 2;
1027: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029: }
1030: if (j==1) {
1031: tmp0 = tmps[*vi--];
1032: sum1 -= *v1-- * tmp0;
1033: sum2 -= *v2-- * tmp0;
1034: }
1036: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037: sum2 -= *v2-- * tmp0;
1038: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039: break;
1040: case 3:
1041: sum1 = tmp[row];
1042: sum2 = tmp[row -1];
1043: sum3 = tmp[row -2];
1044: v2 = aa + ai[row]-1;
1045: v3 = aa + ai[row -1]-1;
1046: for (j=nz; j>1; j-=2) {
1047: vi -=2;
1048: i0 = vi[2];
1049: i1 = vi[1];
1050: tmp0 = tmps[i0];
1051: tmp1 = tmps[i1];
1052: v1 -= 2;
1053: v2 -= 2;
1054: v3 -= 2;
1055: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058: }
1059: if (j==1) {
1060: tmp0 = tmps[*vi--];
1061: sum1 -= *v1-- * tmp0;
1062: sum2 -= *v2-- * tmp0;
1063: sum3 -= *v3-- * tmp0;
1064: }
1065: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066: sum2 -= *v2-- * tmp0;
1067: sum3 -= *v3-- * tmp0;
1068: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069: sum3 -= *v3-- * tmp0;
1070: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072: break;
1073: case 4:
1074: sum1 = tmp[row];
1075: sum2 = tmp[row -1];
1076: sum3 = tmp[row -2];
1077: sum4 = tmp[row -3];
1078: v2 = aa + ai[row]-1;
1079: v3 = aa + ai[row -1]-1;
1080: v4 = aa + ai[row -2]-1;
1082: for (j=nz; j>1; j-=2) {
1083: vi -=2;
1084: i0 = vi[2];
1085: i1 = vi[1];
1086: tmp0 = tmps[i0];
1087: tmp1 = tmps[i1];
1088: v1 -= 2;
1089: v2 -= 2;
1090: v3 -= 2;
1091: v4 -= 2;
1092: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096: }
1097: if (j==1) {
1098: tmp0 = tmps[*vi--];
1099: sum1 -= *v1-- * tmp0;
1100: sum2 -= *v2-- * tmp0;
1101: sum3 -= *v3-- * tmp0;
1102: sum4 -= *v4-- * tmp0;
1103: }
1105: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106: sum2 -= *v2-- * tmp0;
1107: sum3 -= *v3-- * tmp0;
1108: sum4 -= *v4-- * tmp0;
1109: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113: sum4 -= *v4-- * tmp0;
1114: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115: break;
1116: case 5:
1117: sum1 = tmp[row];
1118: sum2 = tmp[row -1];
1119: sum3 = tmp[row -2];
1120: sum4 = tmp[row -3];
1121: sum5 = tmp[row -4];
1122: v2 = aa + ai[row]-1;
1123: v3 = aa + ai[row -1]-1;
1124: v4 = aa + ai[row -2]-1;
1125: v5 = aa + ai[row -3]-1;
1126: for (j=nz; j>1; j-=2) {
1127: vi -= 2;
1128: i0 = vi[2];
1129: i1 = vi[1];
1130: tmp0 = tmps[i0];
1131: tmp1 = tmps[i1];
1132: v1 -= 2;
1133: v2 -= 2;
1134: v3 -= 2;
1135: v4 -= 2;
1136: v5 -= 2;
1137: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142: }
1143: if (j==1) {
1144: tmp0 = tmps[*vi--];
1145: sum1 -= *v1-- * tmp0;
1146: sum2 -= *v2-- * tmp0;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: }
1152: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153: sum2 -= *v2-- * tmp0;
1154: sum3 -= *v3-- * tmp0;
1155: sum4 -= *v4-- * tmp0;
1156: sum5 -= *v5-- * tmp0;
1157: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158: sum3 -= *v3-- * tmp0;
1159: sum4 -= *v4-- * tmp0;
1160: sum5 -= *v5-- * tmp0;
1161: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162: sum4 -= *v4-- * tmp0;
1163: sum5 -= *v5-- * tmp0;
1164: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165: sum5 -= *v5-- * tmp0;
1166: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167: break;
1168: default:
1169: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170: }
1171: }
1172: ISRestoreIndices(isrow,&rout);
1173: ISRestoreIndices(iscol,&cout);
1174: VecRestoreArrayRead(bb,&b);
1175: VecRestoreArray(xx,&x);
1176: PetscLogFlops(2.0*a->nz - A->cmap->n);
1177: return(0);
1178: }
/*
   MatLUFactorNumeric_SeqAIJ_Inode - numeric LU factorization of a SeqAIJ matrix that
   eliminates rows in groups ("inodes") of 1 to 4 rows at a time.

   Parameters:
     B    - the factor matrix. Its Mat_SeqAIJ data supplies the index arrays (i, j, diag)
            and the orderings (row, icol); the factor values are written into b->a.
     A    - the matrix to factor; a->inode.size must be non-NULL.
     info - factorization options; info->shifttype is tested here and info is handed
            to MatPivotCheck().

   Returns 0 on the normal path; SETERRQ is invoked when A carries no inode information
   or when an inode size outside 1..4 is met.

   Storage layout of the factor, as used by the indexing below:
     L part of row i          : positions bi[i] .. bi[i+1]-1
     U part of row i (no diag): positions bdiag[i+1]+1 .. bdiag[i]-1
     diagonal of row i        : position bdiag[i], stored as its reciprocal

   The rows of one inode are loaded with the column indices and row length of the first
   row of the inode (ajtmp, nz); only the value pointers v1..v4 differ.

   The whole factorization sits in a do/while loop: whenever sctx.newshift is set
   (by MatPivotCheck() or by the refinement step at the bottom) every row is recomputed.
   NOTE(review): MatPivotCheck() is defined elsewhere; presumably it is what updates
   sctx.shift_amount / sctx.nshift when a pivot is rejected - confirm against its definition.
*/
1182: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183: {
1184: Mat C =B;
1185: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186: IS isrow = b->row,isicol = b->icol;
1187: PetscErrorCode ierr;
1188: const PetscInt *r,*ic,*ics;
1189: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190: PetscInt i,j,k,nz,nzL,row,*pj;
1191: const PetscInt *ajtmp,*bjtmp;
1192: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194: FactorShiftCtx sctx;
1195: const PetscInt *ddiag;
1196: PetscReal rs;
1197: MatScalar d;
1198: PetscInt inod,nodesz,node_max,col;
1199: const PetscInt *ns;
1200: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1203: /* MatPivotSetUp(): initialize shift context sctx */
1204: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1206: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207: ddiag = a->diag;
1208: sctx.shift_top = info->zeropivot;
1209: for (i=0; i<n; i++) {
1210: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
/* rs starts at -|d| - Re(d); the loop below adds |v[j]| for every entry of the row, diagonal included */
1211: d = (aa)[ddiag[i]];
1212: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213: v = aa+ai[i];
1214: nz = ai[i+1] - ai[i];
1215: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216: if (rs>sctx.shift_top) sctx.shift_top = rs;
1217: }
1218: sctx.shift_top *= 1.1;
1219: sctx.nshift_max = 5;
1220: sctx.shift_lo = 0.;
1221: sctx.shift_hi = 1.;
1222: }
1224: ISGetIndices(isrow,&r);
1225: ISGetIndices(isicol,&ic);
/* four zero-initialized dense work rows of length n, one per row of the largest supported inode */
1227: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1228: ics = ic;
1230: node_max = a->inode.node_count;
1231: ns = a->inode.size;
1232: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1234: /* If max inode size > 4, split it into two inodes.*/
1235: /* also map the inode sizes according to the ordering */
/* The split is 4 + (ns[i]-4); a remainder larger than 4 (ns[i] > 8) is not split again and would reach the default branch of the switch below */
1236: PetscMalloc1(n+1,&tmp_vec1);
1237: for (i=0,j=0; i<node_max; ++i,++j) {
1238: if (ns[i] > 4) {
1239: tmp_vec1[j] = 4;
1240: ++j;
1241: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242: } else {
1243: tmp_vec1[j] = ns[i];
1244: }
1245: }
1246: /* Use the correct node_max */
1247: node_max = j;
1249: /* Now reorder the inode info based on mat re-ordering info */
1250: /* First create a row -> inode_size_array_index map */
1251: PetscMalloc1(n+1,&nsmap);
1252: PetscMalloc1(node_max+1,&tmp_vec2);
1253: for (i=0,row=0; i<node_max; i++) {
1254: nodesz = tmp_vec1[i];
1255: for (j=0; j<nodesz; j++,row++) {
1256: nsmap[row] = i;
1257: }
1258: }
1259: /* Using nsmap, create a reordered ns structure */
/* j walks the permuted rows; the size of the inode containing original row r[j] becomes the size of the i-th inode in factor order */
1260: for (i=0,j=0; i< node_max; i++) {
1261: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262: tmp_vec2[i] = nodesz;
1263: j += nodesz;
1264: }
1265: PetscFree(nsmap);
1266: PetscFree(tmp_vec1);
1268: /* Now use the correct ns */
1269: ns = tmp_vec2;
/* Retry loop: repeated from the first row for as long as sctx.newshift ends up set */
1271: do {
1272: sctx.newshift = PETSC_FALSE;
1273: /* Now loop over each block-row, and do the factorization */
1274: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275: nodesz = ns[inod];
1277: switch (nodesz) {
1278: case 1:
1279: /*----------*/
1280: /* zero rtmp1 */
1281: /* L part */
1282: nz = bi[i+1] - bi[i];
1283: bjtmp = bj + bi[i];
1284: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286: /* U part */
/* nz here counts the diagonal as well: positions bdiag[i+1]+1 .. bdiag[i] */
1287: nz = bdiag[i]-bdiag[i+1];
1288: bjtmp = bj + bdiag[i+1]+1;
1289: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291: /* load in initial (unfactored row) */
1292: nz = ai[r[i]+1] - ai[r[i]];
1293: ajtmp = aj + ai[r[i]];
1294: v = aa + ai[r[i]];
1295: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1297: /* ZeropivotApply() */
1298: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1300: /* elimination */
/* walk the L-part column indices of row i; each one names an already factored pivot row */
1301: bjtmp = bj + bi[i];
1302: row = *bjtmp++;
1303: nzL = bi[i+1] - bi[i];
1304: for (k=0; k < nzL; k++) {
1305: pc = rtmp1 + row;
1306: if (*pc != 0.0) {
1307: pv = b->a + bdiag[row];
/* the stored diagonal is a reciprocal, so the multiplier is formed by multiplication */
1308: mul1 = *pc * (*pv);
1309: *pc = mul1;
1310: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1311: pv = b->a + bdiag[row+1]+1;
1312: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1313: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1314: PetscLogFlops(1+2*nz);
1315: }
1316: row = *bjtmp++;
1317: }
1319: /* finished row so stick it into b->a */
1320: rs = 0.0;
1321: /* L part */
1322: pv = b->a + bi[i];
1323: pj = b->j + bi[i];
1324: nz = bi[i+1] - bi[i];
1325: for (j=0; j<nz; j++) {
1326: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* U part */
1330: pv = b->a + bdiag[i+1]+1;
1331: pj = b->j + bdiag[i+1]+1;
1332: nz = bdiag[i] - bdiag[i+1]-1;
1333: for (j=0; j<nz; j++) {
1334: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335: }
1337: /* Check zero pivot */
1338: sctx.rs = rs;
1339: sctx.pv = rtmp1[i];
1340: MatPivotCheck(A,info,&sctx,i);
1341: if (sctx.newshift) break;
1343: /* Mark diagonal and invert diagonal for simpler triangular solves */
1344: pv = b->a + bdiag[i];
1345: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1346: break;
1348: case 2:
1349: /*----------*/
1350: /* zero rtmp1 and rtmp2 */
1351: /* L part */
1352: nz = bi[i+1] - bi[i];
1353: bjtmp = bj + bi[i];
1354: for (j=0; j<nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357: }
1359: /* U part */
1360: nz = bdiag[i]-bdiag[i+1];
1361: bjtmp = bj + bdiag[i+1]+1;
1362: for (j=0; j<nz; j++) {
1363: col = bjtmp[j];
1364: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365: }
1367: /* load in initial (unfactored row) */
/* both rows are read with the column indices and length of row r[i]; v2 starts at the beginning of row r[i]+1 */
1368: nz = ai[r[i]+1] - ai[r[i]];
1369: ajtmp = aj + ai[r[i]];
1370: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1371: for (j=0; j<nz; j++) {
1372: col = ics[ajtmp[j]];
1373: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1374: }
1375: /* ZeropivotApply(): shift the diagonal of the matrix */
1376: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1378: /* elimination */
1379: bjtmp = bj + bi[i];
1380: row = *bjtmp++; /* pivot row */
1381: nzL = bi[i+1] - bi[i];
1382: for (k=0; k < nzL; k++) {
1383: pc1 = rtmp1 + row;
1384: pc2 = rtmp2 + row;
1385: if (*pc1 != 0.0 || *pc2 != 0.0) {
1386: pv = b->a + bdiag[row];
1387: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1388: *pc1 = mul1; *pc2 = mul2;
1390: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1391: pv = b->a + bdiag[row+1]+1;
1392: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1393: for (j=0; j<nz; j++) {
1394: col = pj[j];
1395: rtmp1[col] -= mul1 * pv[j];
1396: rtmp2[col] -= mul2 * pv[j];
1397: }
1398: PetscLogFlops(2+4*nz);
1399: }
1400: row = *bjtmp++;
1401: }
1403: /* finished row i; check zero pivot, then stick row i into b->a */
1404: rs = 0.0;
1405: /* L part */
1406: pc1 = b->a + bi[i];
1407: pj = b->j + bi[i];
1408: nz = bi[i+1] - bi[i];
1409: for (j=0; j<nz; j++) {
1410: col = pj[j];
1411: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412: }
1413: /* U part */
1414: pc1 = b->a + bdiag[i+1]+1;
1415: pj = b->j + bdiag[i+1]+1;
1416: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1417: for (j=0; j<nz; j++) {
1418: col = pj[j];
1419: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420: }
1422: sctx.rs = rs;
1423: sctx.pv = rtmp1[i];
1424: MatPivotCheck(A,info,&sctx,i);
1425: if (sctx.newshift) break;
1426: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1427: *pc1 = 1.0/sctx.pv;
1429: /* Now take care of diagonal 2x2 block. */
/* row i is final now; eliminate column i from row i+1 using the dense copy of row i in rtmp1 */
1430: pc2 = rtmp2 + i;
1431: if (*pc2 != 0.0) {
1432: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1433: *pc2 = mul1; /* insert L entry */
1434: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1435: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1436: for (j=0; j<nz; j++) {
1437: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1438: }
1439: PetscLogFlops(1+2*nz);
1440: }
1442: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1443: rs = 0.0;
1444: /* L part */
1445: pc2 = b->a + bi[i+1];
1446: pj = b->j + bi[i+1];
1447: nz = bi[i+2] - bi[i+1];
1448: for (j=0; j<nz; j++) {
1449: col = pj[j];
1450: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451: }
1452: /* U part */
1453: pc2 = b->a + bdiag[i+2]+1;
1454: pj = b->j + bdiag[i+2]+1;
1455: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1456: for (j=0; j<nz; j++) {
1457: col = pj[j];
1458: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459: }
1461: sctx.rs = rs;
1462: sctx.pv = rtmp2[i+1];
1463: MatPivotCheck(A,info,&sctx,i+1);
1464: if (sctx.newshift) break;
1465: pc2 = b->a + bdiag[i+1];
1466: *pc2 = 1.0/sctx.pv;
1467: break;
1469: case 3:
1470: /*----------*/
1471: /* zero rtmp */
1472: /* L part */
1473: nz = bi[i+1] - bi[i];
1474: bjtmp = bj + bi[i];
1475: for (j=0; j<nz; j++) {
1476: col = bjtmp[j];
1477: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478: }
1480: /* U part */
1481: nz = bdiag[i]-bdiag[i+1];
1482: bjtmp = bj + bdiag[i+1]+1;
1483: for (j=0; j<nz; j++) {
1484: col = bjtmp[j];
1485: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486: }
1488: /* load in initial (unfactored row) */
1489: nz = ai[r[i]+1] - ai[r[i]];
1490: ajtmp = aj + ai[r[i]];
1491: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1492: for (j=0; j<nz; j++) {
1493: col = ics[ajtmp[j]];
1494: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1495: }
1496: /* ZeropivotApply(): shift the diagonal of the matrix */
1497: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1499: /* elimination */
1500: bjtmp = bj + bi[i];
1501: row = *bjtmp++; /* pivot row */
1502: nzL = bi[i+1] - bi[i];
1503: for (k=0; k < nzL; k++) {
1504: pc1 = rtmp1 + row;
1505: pc2 = rtmp2 + row;
1506: pc3 = rtmp3 + row;
1507: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1508: pv = b->a + bdiag[row];
1509: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1510: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1512: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1513: pv = b->a + bdiag[row+1]+1;
1514: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1515: for (j=0; j<nz; j++) {
1516: col = pj[j];
1517: rtmp1[col] -= mul1 * pv[j];
1518: rtmp2[col] -= mul2 * pv[j];
1519: rtmp3[col] -= mul3 * pv[j];
1520: }
1521: PetscLogFlops(3+6*nz);
1522: }
1523: row = *bjtmp++;
1524: }
1526: /* finished row i; check zero pivot, then stick row i into b->a */
1527: rs = 0.0;
1528: /* L part */
1529: pc1 = b->a + bi[i];
1530: pj = b->j + bi[i];
1531: nz = bi[i+1] - bi[i];
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535: }
1536: /* U part */
1537: pc1 = b->a + bdiag[i+1]+1;
1538: pj = b->j + bdiag[i+1]+1;
1539: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1540: for (j=0; j<nz; j++) {
1541: col = pj[j];
1542: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543: }
1545: sctx.rs = rs;
1546: sctx.pv = rtmp1[i];
1547: MatPivotCheck(A,info,&sctx,i);
1548: if (sctx.newshift) break;
1549: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1550: *pc1 = 1.0/sctx.pv;
1552: /* Now take care of 1st column of diagonal 3x3 block. */
1553: pc2 = rtmp2 + i;
1554: pc3 = rtmp3 + i;
1555: if (*pc2 != 0.0 || *pc3 != 0.0) {
1556: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1557: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1558: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1559: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1560: for (j=0; j<nz; j++) {
1561: col = pj[j];
1562: rtmp2[col] -= mul2 * rtmp1[col];
1563: rtmp3[col] -= mul3 * rtmp1[col];
1564: }
1565: PetscLogFlops(2+4*nz);
1566: }
1568: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1569: rs = 0.0;
1570: /* L part */
1571: pc2 = b->a + bi[i+1];
1572: pj = b->j + bi[i+1];
1573: nz = bi[i+2] - bi[i+1];
1574: for (j=0; j<nz; j++) {
1575: col = pj[j];
1576: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577: }
1578: /* U part */
1579: pc2 = b->a + bdiag[i+2]+1;
1580: pj = b->j + bdiag[i+2]+1;
1581: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1582: for (j=0; j<nz; j++) {
1583: col = pj[j];
1584: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp2[i+1];
1589: MatPivotCheck(A,info,&sctx,i+1);
1590: if (sctx.newshift) break;
1591: pc2 = b->a + bdiag[i+1];
1592: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1594: /* Now take care of 2nd column of diagonal 3x3 block. */
1595: pc3 = rtmp3 + i+1;
1596: if (*pc3 != 0.0) {
1597: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1598: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1599: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1600: for (j=0; j<nz; j++) {
1601: col = pj[j];
1602: rtmp3[col] -= mul3 * rtmp2[col];
1603: }
1604: PetscLogFlops(1+2*nz);
1605: }
1607: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1608: rs = 0.0;
1609: /* L part */
1610: pc3 = b->a + bi[i+2];
1611: pj = b->j + bi[i+2];
1612: nz = bi[i+3] - bi[i+2];
1613: for (j=0; j<nz; j++) {
1614: col = pj[j];
1615: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616: }
1617: /* U part */
1618: pc3 = b->a + bdiag[i+3]+1;
1619: pj = b->j + bdiag[i+3]+1;
1620: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1621: for (j=0; j<nz; j++) {
1622: col = pj[j];
1623: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624: }
1626: sctx.rs = rs;
1627: sctx.pv = rtmp3[i+2];
1628: MatPivotCheck(A,info,&sctx,i+2);
1629: if (sctx.newshift) break;
1630: pc3 = b->a + bdiag[i+2];
1631: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1632: break;
1633: case 4:
1634: /*----------*/
1635: /* zero rtmp */
1636: /* L part */
1637: nz = bi[i+1] - bi[i];
1638: bjtmp = bj + bi[i];
1639: for (j=0; j<nz; j++) {
1640: col = bjtmp[j];
1641: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1642: }
1644: /* U part */
1645: nz = bdiag[i]-bdiag[i+1];
1646: bjtmp = bj + bdiag[i+1]+1;
1647: for (j=0; j<nz; j++) {
1648: col = bjtmp[j];
1649: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1650: }
1652: /* load in initial (unfactored row) */
1653: nz = ai[r[i]+1] - ai[r[i]];
1654: ajtmp = aj + ai[r[i]];
1655: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1656: for (j=0; j<nz; j++) {
1657: col = ics[ajtmp[j]];
1658: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1659: }
1660: /* ZeropivotApply(): shift the diagonal of the matrix */
1661: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1663: /* elimination */
1664: bjtmp = bj + bi[i];
1665: row = *bjtmp++; /* pivot row */
1666: nzL = bi[i+1] - bi[i];
1667: for (k=0; k < nzL; k++) {
1668: pc1 = rtmp1 + row;
1669: pc2 = rtmp2 + row;
1670: pc3 = rtmp3 + row;
1671: pc4 = rtmp4 + row;
1672: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1673: pv = b->a + bdiag[row];
1674: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1675: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1677: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1678: pv = b->a + bdiag[row+1]+1;
1679: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1680: for (j=0; j<nz; j++) {
1681: col = pj[j];
1682: rtmp1[col] -= mul1 * pv[j];
1683: rtmp2[col] -= mul2 * pv[j];
1684: rtmp3[col] -= mul3 * pv[j];
1685: rtmp4[col] -= mul4 * pv[j];
1686: }
1687: PetscLogFlops(4+8*nz);
1688: }
1689: row = *bjtmp++;
1690: }
1692: /* finished row i; check zero pivot, then stick row i into b->a */
1693: rs = 0.0;
1694: /* L part */
1695: pc1 = b->a + bi[i];
1696: pj = b->j + bi[i];
1697: nz = bi[i+1] - bi[i];
1698: for (j=0; j<nz; j++) {
1699: col = pj[j];
1700: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701: }
1702: /* U part */
1703: pc1 = b->a + bdiag[i+1]+1;
1704: pj = b->j + bdiag[i+1]+1;
1705: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1706: for (j=0; j<nz; j++) {
1707: col = pj[j];
1708: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709: }
1711: sctx.rs = rs;
1712: sctx.pv = rtmp1[i];
1713: MatPivotCheck(A,info,&sctx,i);
1714: if (sctx.newshift) break;
1715: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1716: *pc1 = 1.0/sctx.pv;
1718: /* Now take care of 1st column of diagonal 4x4 block. */
1719: pc2 = rtmp2 + i;
1720: pc3 = rtmp3 + i;
1721: pc4 = rtmp4 + i;
1722: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1724: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1725: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1726: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1727: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1728: for (j=0; j<nz; j++) {
1729: col = pj[j];
1730: rtmp2[col] -= mul2 * rtmp1[col];
1731: rtmp3[col] -= mul3 * rtmp1[col];
1732: rtmp4[col] -= mul4 * rtmp1[col];
1733: }
1734: PetscLogFlops(3+6*nz);
1735: }
1737: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1738: rs = 0.0;
1739: /* L part */
1740: pc2 = b->a + bi[i+1];
1741: pj = b->j + bi[i+1];
1742: nz = bi[i+2] - bi[i+1];
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746: }
1747: /* U part */
1748: pc2 = b->a + bdiag[i+2]+1;
1749: pj = b->j + bdiag[i+2]+1;
1750: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1751: for (j=0; j<nz; j++) {
1752: col = pj[j];
1753: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754: }
1756: sctx.rs = rs;
1757: sctx.pv = rtmp2[i+1];
1758: MatPivotCheck(A,info,&sctx,i+1);
1759: if (sctx.newshift) break;
1760: pc2 = b->a + bdiag[i+1];
1761: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1763: /* Now take care of 2nd column of diagonal 4x4 block. */
1764: pc3 = rtmp3 + i+1;
1765: pc4 = rtmp4 + i+1;
1766: if (*pc3 != 0.0 || *pc4 != 0.0) {
1767: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1768: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1769: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1770: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1771: for (j=0; j<nz; j++) {
1772: col = pj[j];
1773: rtmp3[col] -= mul3 * rtmp2[col];
1774: rtmp4[col] -= mul4 * rtmp2[col];
1775: }
/* 2 multiplier scalings (mul3, mul4) plus one multiply and one subtract for each of the two rows per updated column */
1776: PetscLogFlops(2+4*nz);
1777: }
1779: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1780: rs = 0.0;
1781: /* L part */
1782: pc3 = b->a + bi[i+2];
1783: pj = b->j + bi[i+2];
1784: nz = bi[i+3] - bi[i+2];
1785: for (j=0; j<nz; j++) {
1786: col = pj[j];
1787: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788: }
1789: /* U part */
1790: pc3 = b->a + bdiag[i+3]+1;
1791: pj = b->j + bdiag[i+3]+1;
1792: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1793: for (j=0; j<nz; j++) {
1794: col = pj[j];
1795: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796: }
1798: sctx.rs = rs;
1799: sctx.pv = rtmp3[i+2];
1800: MatPivotCheck(A,info,&sctx,i+2);
1801: if (sctx.newshift) break;
1802: pc3 = b->a + bdiag[i+2];
1803: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1805: /* Now take care of 3rd column of diagonal 4x4 block. */
1806: pc4 = rtmp4 + i+2;
1807: if (*pc4 != 0.0) {
1808: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1810: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811: for (j=0; j<nz; j++) {
1812: col = pj[j];
1813: rtmp4[col] -= mul4 * rtmp3[col];
1814: }
1815: PetscLogFlops(1+2*nz);
1816: }
1818: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819: rs = 0.0;
1820: /* L part */
1821: pc4 = b->a + bi[i+3];
1822: pj = b->j + bi[i+3];
1823: nz = bi[i+4] - bi[i+3];
1824: for (j=0; j<nz; j++) {
1825: col = pj[j];
1826: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827: }
1828: /* U part */
1829: pc4 = b->a + bdiag[i+4]+1;
1830: pj = b->j + bdiag[i+4]+1;
1831: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832: for (j=0; j<nz; j++) {
1833: col = pj[j];
1834: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835: }
1837: sctx.rs = rs;
1838: sctx.pv = rtmp4[i+3];
1839: MatPivotCheck(A,info,&sctx,i+3);
1840: if (sctx.newshift) break;
1841: pc4 = b->a + bdiag[i+3];
1842: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1843: break;
1845: default:
1846: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1847: }
/* a break inside the switch only leaves the switch; this second test leaves the inode loop so the do/while can restart */
1848: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1849: i += nodesz; /* Update the row */
1850: }
1852: /* MatPivotRefine() */
1853: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1854: /*
1855: * if no shift in this attempt & shifting & started shifting & can refine,
1856: * then try lower shift
1857: */
/* bisection: the fraction that just succeeded becomes the upper bound, and the midpoint of [shift_lo, shift_hi] is tried next */
1858: sctx.shift_hi = sctx.shift_fraction;
1859: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1860: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1861: sctx.newshift = PETSC_TRUE;
1862: sctx.nshift++;
1863: }
1864: } while (sctx.newshift);
1866: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1867: PetscFree(tmp_vec2);
1868: ISRestoreIndices(isicol,&ic);
1869: ISRestoreIndices(isrow,&r);
/* pick the inode-aware solve only when the factor itself carries inode information */
1871: if (b->inode.size) {
1872: C->ops->solve = MatSolve_SeqAIJ_Inode;
1873: } else {
1874: C->ops->solve = MatSolve_SeqAIJ;
1875: }
1876: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1877: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1878: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879: C->ops->matsolve = MatMatSolve_SeqAIJ;
1880: C->assembled = PETSC_TRUE;
1881: C->preallocated = PETSC_TRUE;
1883: PetscLogFlops(C->cmap->n);
1885: /* MatShiftView(A,info,&sctx) */
1886: if (sctx.nshift) {
1887: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1889: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1891: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1893: }
1894: }
1895: return(0);
1896: }
/*
   MatLUFactorNumeric_SeqAIJ_Inode_inplace - numeric LU factorization of a SeqAIJ matrix
   for the "inplace" factor layout, eliminating rows in groups (inodes) of 1 to 3 rows.

   Parameters:
     B    - the factor matrix; its Mat_SeqAIJ data supplies i, j, diag and the orderings
            (row, col, icol) and receives the factor values in b->a.
     A    - the matrix to factor; a->inode.size must be non-NULL.
     info - factorization options; info->shifttype is tested here and info is handed
            to MatPivotCheck().

   Returns 0 on the normal path; SETERRQ is invoked when A carries no inode information
   or when an inode size outside 1..3 is met.

   Layout as used by the indexing below: row k of the factor occupies positions
   bi[k] .. bi[k+1]-1 with its diagonal at position bd[k]; the entries after the diagonal
   are the U part. The diagonal is stored as its reciprocal.

   Whenever MatPivotCheck() leaves sctx.newshift set, control jumps to the endofwhile
   label and the do/while loop recomputes every row.
   NOTE(review): MatPivotCheck() is defined elsewhere; presumably it is what updates
   sctx.shift_amount / sctx.nshift - confirm against its definition.
   NOTE(review): the indices c of iscol are obtained and restored but not otherwise
   referenced in this function.
*/
1900: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1901: {
1902: Mat C = B;
1903: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1904: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1905: PetscErrorCode ierr;
1906: const PetscInt *r,*ic,*c,*ics;
1907: PetscInt n = A->rmap->n,*bi = b->i;
/* nbj is bj shifted by one entry, so nbj + bd[k] addresses the column index just after the diagonal of row k */
1908: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1909: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1910: PetscInt *ai = a->i,*aj = a->j;
1911: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912: PetscScalar mul1,mul2,mul3,tmp;
1913: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915: PetscReal rs=0.0;
1916: FactorShiftCtx sctx;
/* the shift context is initialized field by field here and further below (shift_amount, nshift) */
1919: sctx.shift_top = 0;
1920: sctx.nshift_max = 0;
1921: sctx.shift_lo = 0;
1922: sctx.shift_hi = 0;
1923: sctx.shift_fraction = 0;
1925: /* positive-definite shift requested: precompute the largest per-row shift estimate */
1926: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1927: sctx.shift_top = 0;
1928: for (i=0; i<n; i++) {
1929: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1930: rs = 0.0;
1931: ajtmp = aj + ai[i];
1932: rtmp1 = aa + ai[i];
1933: nz = ai[i+1] - ai[i];
1934: for (j=0; j<nz; j++) {
1935: if (*ajtmp != i) {
1936: rs += PetscAbsScalar(*rtmp1++);
1937: } else {
1938: rs -= PetscRealPart(*rtmp1++);
1939: }
1940: ajtmp++;
1941: }
1942: if (rs>sctx.shift_top) sctx.shift_top = rs;
1943: }
/* keep shift_top strictly positive before it is scaled */
1944: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1945: sctx.shift_top *= 1.1;
1946: sctx.nshift_max = 5;
1947: sctx.shift_lo = 0.;
1948: sctx.shift_hi = 1.;
1949: }
1950: sctx.shift_amount = 0;
1951: sctx.nshift = 0;
1953: ISGetIndices(isrow,&r);
1954: ISGetIndices(iscol,&c);
1955: ISGetIndices(isicol,&ic);
/* three zero-initialized dense work rows of length n, one per row of the largest supported inode */
1956: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1957: ics = ic;
1959: node_max = a->inode.node_count;
1960: ns = a->inode.size;
1961: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1963: /* If max inode size > 3, split it into two inodes.*/
1964: /* also map the inode sizes according to the ordering */
/* The split is ns[i]/2 and the remainder (4 -> 2+2, 5 -> 2+3); a piece larger than 3 rows (ns[i] > 6) is not split again and would reach the default branch of the switch below */
1965: PetscMalloc1(n+1,&tmp_vec1);
1966: for (i=0,j=0; i<node_max; ++i,++j) {
1967: if (ns[i]>3) {
1968: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1969: ++j;
1970: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971: } else {
1972: tmp_vec1[j] = ns[i];
1973: }
1974: }
1975: /* Use the correct node_max */
1976: node_max = j;
1978: /* Now reorder the inode info based on mat re-ordering info */
1979: /* First create a row -> inode_size_array_index map */
1980: PetscMalloc1(n+1,&nsmap);
1981: PetscMalloc1(node_max+1,&tmp_vec2);
1982: for (i=0,row=0; i<node_max; i++) {
1983: nodesz = tmp_vec1[i];
1984: for (j=0; j<nodesz; j++,row++) {
1985: nsmap[row] = i;
1986: }
1987: }
1988: /* Using nsmap, create a reordered ns structure */
/* j walks the permuted rows; the size of the inode containing original row r[j] becomes the size of the i-th inode in factor order */
1989: for (i=0,j=0; i< node_max; i++) {
1990: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1991: tmp_vec2[i] = nodesz;
1992: j += nodesz;
1993: }
1994: PetscFree(nsmap);
1995: PetscFree(tmp_vec1);
1996: /* Now use the correct ns */
1997: ns = tmp_vec2;
/* Retry loop: repeated from the first row for as long as sctx.newshift ends up set */
1999: do {
2000: sctx.newshift = PETSC_FALSE;
2001: /* Now loop over each block-row, and do the factorization */
/* i counts inodes, row is the first row of the current inode; nz and bjtmp describe the whole stored row (L, diagonal and U) */
2002: for (i=0,row=0; i<node_max; i++) {
2003: nodesz = ns[i];
2004: nz = bi[row+1] - bi[row];
2005: bjtmp = bj + bi[row];
2007: switch (nodesz) {
2008: case 1:
/* clear the work row at every column present in the factor row */
2009: for (j=0; j<nz; j++) {
2010: idx = bjtmp[j];
2011: rtmp11[idx] = 0.0;
2012: }
2014: /* load in initial (unfactored row) */
2015: idx = r[row];
2016: nz_tmp = ai[idx+1] - ai[idx];
2017: ajtmp = aj + ai[idx];
2018: v1 = aa + ai[idx];
2020: for (j=0; j<nz_tmp; j++) {
2021: idx = ics[ajtmp[j]];
2022: rtmp11[idx] = v1[j];
2023: }
/* apply the current shift at the permuted position of original entry (r[row], r[row]) */
2024: rtmp11[ics[r[row]]] += sctx.shift_amount;
/* eliminate with every pivot row whose index appears before the diagonal in this row */
2026: prow = *bjtmp++;
2027: while (prow < row) {
2028: pc1 = rtmp11 + prow;
2029: if (*pc1 != 0.0) {
2030: pv = ba + bd[prow];
2031: pj = nbj + bd[prow];
/* *pv is the stored (reciprocal) diagonal of the pivot row; after the increment pv points at its U entries */
2032: mul1 = *pc1 * *pv++;
2033: *pc1 = mul1;
/* number of entries after the diagonal in the pivot row */
2034: nz_tmp = bi[prow+1] - bd[prow] - 1;
2035: PetscLogFlops(1+2*nz_tmp);
2036: for (j=0; j<nz_tmp; j++) {
2037: tmp = pv[j];
2038: idx = pj[j];
2039: rtmp11[idx] -= mul1 * tmp;
2040: }
2041: }
2042: prow = *bjtmp++;
2043: }
2044: pj = bj + bi[row];
2045: pc1 = ba + bi[row];
/* remember the pivot before it is overwritten by its reciprocal; the pivot check follows the copy below */
2047: sctx.pv = rtmp11[row];
2048: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049: rs = 0.0;
2050: for (j=0; j<nz; j++) {
2051: idx = pj[j];
2052: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054: }
2055: sctx.rs = rs;
2056: MatPivotCheck(A,info,&sctx,row);
2057: if (sctx.newshift) goto endofwhile;
2058: break;
2060: case 2:
2061: for (j=0; j<nz; j++) {
2062: idx = bjtmp[j];
2063: rtmp11[idx] = 0.0;
2064: rtmp22[idx] = 0.0;
2065: }
2067: /* load in initial (unfactored row) */
/* both rows are read with the column indices and length of row r[row]; v2 starts at the beginning of the next row of A */
2068: idx = r[row];
2069: nz_tmp = ai[idx+1] - ai[idx];
2070: ajtmp = aj + ai[idx];
2071: v1 = aa + ai[idx];
2072: v2 = aa + ai[idx+1];
2073: for (j=0; j<nz_tmp; j++) {
2074: idx = ics[ajtmp[j]];
2075: rtmp11[idx] = v1[j];
2076: rtmp22[idx] = v2[j];
2077: }
2078: rtmp11[ics[r[row]]] += sctx.shift_amount;
2079: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2081: prow = *bjtmp++;
2082: while (prow < row) {
2083: pc1 = rtmp11 + prow;
2084: pc2 = rtmp22 + prow;
2085: if (*pc1 != 0.0 || *pc2 != 0.0) {
2086: pv = ba + bd[prow];
2087: pj = nbj + bd[prow];
2088: mul1 = *pc1 * *pv;
2089: mul2 = *pc2 * *pv;
2090: ++pv;
2091: *pc1 = mul1;
2092: *pc2 = mul2;
2094: nz_tmp = bi[prow+1] - bd[prow] - 1;
2095: for (j=0; j<nz_tmp; j++) {
2096: tmp = pv[j];
2097: idx = pj[j];
2098: rtmp11[idx] -= mul1 * tmp;
2099: rtmp22[idx] -= mul2 * tmp;
2100: }
2101: PetscLogFlops(2+4*nz_tmp);
2102: }
2103: prow = *bjtmp++;
2104: }
2106: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107: pc1 = rtmp11 + prow;
2108: pc2 = rtmp22 + prow;
/* pivot check for the first row of the block, done on the dense work row before anything is stored */
2110: sctx.pv = *pc1;
2111: pj = bj + bi[prow];
2112: rs = 0.0;
2113: for (j=0; j<nz; j++) {
2114: idx = pj[j];
2115: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116: }
2117: sctx.rs = rs;
2118: MatPivotCheck(A,info,&sctx,row);
2119: if (sctx.newshift) goto endofwhile;
2121: if (*pc2 != 0.0) {
2122: pj = nbj + bd[prow];
2123: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124: *pc2 = mul2;
2125: nz_tmp = bi[prow+1] - bd[prow] - 1;
2126: for (j=0; j<nz_tmp; j++) {
2127: idx = pj[j];
2128: tmp = rtmp11[idx];
2129: rtmp22[idx] -= mul2 * tmp;
2130: }
2131: PetscLogFlops(1+2*nz_tmp);
2132: }
2134: pj = bj + bi[row];
2135: pc1 = ba + bi[row];
2136: pc2 = ba + bi[row+1];
2138: sctx.pv = rtmp22[row+1];
2139: rs = 0.0;
/* both diagonals are replaced by their reciprocals before the rows are copied out */
2140: rtmp11[row] = 1.0/rtmp11[row];
2141: rtmp22[row+1] = 1.0/rtmp22[row+1];
2142: /* copy row entries from dense representation to sparse */
/* the same column list pj and count nz are used for both rows */
2143: for (j=0; j<nz; j++) {
2144: idx = pj[j];
2145: pc1[j] = rtmp11[idx];
2146: pc2[j] = rtmp22[idx];
2147: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148: }
2149: sctx.rs = rs;
2150: MatPivotCheck(A,info,&sctx,row+1);
2151: if (sctx.newshift) goto endofwhile;
2152: break;
2154: case 3:
2155: for (j=0; j<nz; j++) {
2156: idx = bjtmp[j];
2157: rtmp11[idx] = 0.0;
2158: rtmp22[idx] = 0.0;
2159: rtmp33[idx] = 0.0;
2160: }
2161: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162: idx = r[row];
2163: nz_tmp = ai[idx+1] - ai[idx];
2164: ajtmp = aj + ai[idx];
2165: v1 = aa + ai[idx];
2166: v2 = aa + ai[idx+1];
2167: v3 = aa + ai[idx+2];
2168: for (j=0; j<nz_tmp; j++) {
2169: idx = ics[ajtmp[j]];
2170: rtmp11[idx] = v1[j];
2171: rtmp22[idx] = v2[j];
2172: rtmp33[idx] = v3[j];
2173: }
2174: rtmp11[ics[r[row]]] += sctx.shift_amount;
2175: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2178: /* loop over all pivot row blocks above this row block */
2179: prow = *bjtmp++;
2180: while (prow < row) {
2181: pc1 = rtmp11 + prow;
2182: pc2 = rtmp22 + prow;
2183: pc3 = rtmp33 + prow;
2184: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185: pv = ba + bd[prow];
2186: pj = nbj + bd[prow];
2187: mul1 = *pc1 * *pv;
2188: mul2 = *pc2 * *pv;
2189: mul3 = *pc3 * *pv;
2190: ++pv;
2191: *pc1 = mul1;
2192: *pc2 = mul2;
2193: *pc3 = mul3;
2195: nz_tmp = bi[prow+1] - bd[prow] - 1;
2196: /* update this row based on pivot row */
2197: for (j=0; j<nz_tmp; j++) {
2198: tmp = pv[j];
2199: idx = pj[j];
2200: rtmp11[idx] -= mul1 * tmp;
2201: rtmp22[idx] -= mul2 * tmp;
2202: rtmp33[idx] -= mul3 * tmp;
2203: }
2204: PetscLogFlops(3+6*nz_tmp);
2205: }
2206: prow = *bjtmp++;
2207: }
2209: /* Now take care of diagonal 3x3 block in this set of rows */
2210: /* note: prow = row here */
2211: pc1 = rtmp11 + prow;
2212: pc2 = rtmp22 + prow;
2213: pc3 = rtmp33 + prow;
2215: sctx.pv = *pc1;
2216: pj = bj + bi[prow];
2217: rs = 0.0;
2218: for (j=0; j<nz; j++) {
2219: idx = pj[j];
2220: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221: }
2222: sctx.rs = rs;
2223: MatPivotCheck(A,info,&sctx,row);
2224: if (sctx.newshift) goto endofwhile;
/* eliminate the first block column from rows row+1 and row+2; division is used because the diagonal in rtmp11 is not inverted yet */
2226: if (*pc2 != 0.0 || *pc3 != 0.0) {
2227: mul2 = (*pc2)/(*pc1);
2228: mul3 = (*pc3)/(*pc1);
2229: *pc2 = mul2;
2230: *pc3 = mul3;
2231: nz_tmp = bi[prow+1] - bd[prow] - 1;
2232: pj = nbj + bd[prow];
2233: for (j=0; j<nz_tmp; j++) {
2234: idx = pj[j];
2235: tmp = rtmp11[idx];
2236: rtmp22[idx] -= mul2 * tmp;
2237: rtmp33[idx] -= mul3 * tmp;
2238: }
2239: PetscLogFlops(2+4*nz_tmp);
2240: }
/* move on to the second row of the block (prow = row+1) */
2241: ++prow;
2243: pc2 = rtmp22 + prow;
2244: pc3 = rtmp33 + prow;
2245: sctx.pv = *pc2;
2246: pj = bj + bi[prow];
2247: rs = 0.0;
2248: for (j=0; j<nz; j++) {
2249: idx = pj[j];
2250: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251: }
2252: sctx.rs = rs;
2253: MatPivotCheck(A,info,&sctx,row+1);
2254: if (sctx.newshift) goto endofwhile;
2256: if (*pc3 != 0.0) {
2257: mul3 = (*pc3)/(*pc2);
2258: *pc3 = mul3;
2259: pj = nbj + bd[prow];
2260: nz_tmp = bi[prow+1] - bd[prow] - 1;
2261: for (j=0; j<nz_tmp; j++) {
2262: idx = pj[j];
2263: tmp = rtmp22[idx];
2264: rtmp33[idx] -= mul3 * tmp;
2265: }
2266: PetscLogFlops(1+2*nz_tmp);
2267: }
2269: pj = bj + bi[row];
2270: pc1 = ba + bi[row];
2271: pc2 = ba + bi[row+1];
2272: pc3 = ba + bi[row+2];
2274: sctx.pv = rtmp33[row+2];
2275: rs = 0.0;
2276: rtmp11[row] = 1.0/rtmp11[row];
2277: rtmp22[row+1] = 1.0/rtmp22[row+1];
2278: rtmp33[row+2] = 1.0/rtmp33[row+2];
2279: /* copy row entries from dense representation to sparse */
2280: for (j=0; j<nz; j++) {
2281: idx = pj[j];
2282: pc1[j] = rtmp11[idx];
2283: pc2[j] = rtmp22[idx];
2284: pc3[j] = rtmp33[idx];
2285: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286: }
2288: sctx.rs = rs;
2289: MatPivotCheck(A,info,&sctx,row+2);
2290: if (sctx.newshift) goto endofwhile;
2291: break;
2293: default:
2294: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295: }
2296: row += nodesz; /* Update the row */
2297: }
/* target of the goto statements above: abandons the current sweep so the loop condition can restart it */
2298: endofwhile:;
2299: } while (sctx.newshift);
2300: PetscFree3(rtmp11,rtmp22,rtmp33);
2301: PetscFree(tmp_vec2);
2302: ISRestoreIndices(isicol,&ic);
2303: ISRestoreIndices(isrow,&r);
2304: ISRestoreIndices(iscol,&c);
2306: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2307: /* do not set solve add, since MatSolve_Inode + Add is faster */
2308: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2309: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2310: C->assembled = PETSC_TRUE;
2311: C->preallocated = PETSC_TRUE;
/* report how many shift attempts were made, if any */
2312: if (sctx.nshift) {
2313: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2314: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2315: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2316: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2317: }
2318: }
2319: PetscLogFlops(C->cmap->n);
2320: MatSeqAIJCheckInode(C);
2321: return(0);
2322: }
2325: /* ----------------------------------------------------------- */
2328: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2329: {
2330: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2331: IS iscol = a->col,isrow = a->row;
2332: PetscErrorCode ierr;
2333: const PetscInt *r,*c,*rout,*cout;
2334: PetscInt i,j,n = A->rmap->n;
2335: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2336: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2337: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2338: PetscScalar sum1,sum2,sum3,sum4,sum5;
2339: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2340: const PetscScalar *b;
2343: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2344: node_max = a->inode.node_count;
2345: ns = a->inode.size; /* Node Size array */
2347: VecGetArrayRead(bb,&b);
2348: VecGetArray(xx,&x);
2349: tmp = a->solve_work;
2351: ISGetIndices(isrow,&rout); r = rout;
2352: ISGetIndices(iscol,&cout); c = cout;
2354: /* forward solve the lower triangular */
2355: tmps = tmp;
2356: aa = a_a;
2357: aj = a_j;
2358: ad = a->diag;
2360: for (i = 0,row = 0; i< node_max; ++i) {
2361: nsz = ns[i];
2362: aii = ai[row];
2363: v1 = aa + aii;
2364: vi = aj + aii;
2365: nz = ai[row+1]- ai[row];
2367: if (i < node_max-1) {
2368: /* Prefetch the indices for the next block */
2369: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2370: /* Prefetch the data for the next block */
2371: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2372: }
2374: switch (nsz) { /* Each loop in 'case' is unrolled */
2375: case 1:
2376: sum1 = b[r[row]];
2377: for (j=0; j<nz-1; j+=2) {
2378: i0 = vi[j];
2379: i1 = vi[j+1];
2380: tmp0 = tmps[i0];
2381: tmp1 = tmps[i1];
2382: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2383: }
2384: if (j == nz-1) {
2385: tmp0 = tmps[vi[j]];
2386: sum1 -= v1[j]*tmp0;
2387: }
2388: tmp[row++]=sum1;
2389: break;
2390: case 2:
2391: sum1 = b[r[row]];
2392: sum2 = b[r[row+1]];
2393: v2 = aa + ai[row+1];
2395: for (j=0; j<nz-1; j+=2) {
2396: i0 = vi[j];
2397: i1 = vi[j+1];
2398: tmp0 = tmps[i0];
2399: tmp1 = tmps[i1];
2400: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2401: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2402: }
2403: if (j == nz-1) {
2404: tmp0 = tmps[vi[j]];
2405: sum1 -= v1[j] *tmp0;
2406: sum2 -= v2[j] *tmp0;
2407: }
2408: sum2 -= v2[nz] * sum1;
2409: tmp[row++]=sum1;
2410: tmp[row++]=sum2;
2411: break;
2412: case 3:
2413: sum1 = b[r[row]];
2414: sum2 = b[r[row+1]];
2415: sum3 = b[r[row+2]];
2416: v2 = aa + ai[row+1];
2417: v3 = aa + ai[row+2];
2419: for (j=0; j<nz-1; j+=2) {
2420: i0 = vi[j];
2421: i1 = vi[j+1];
2422: tmp0 = tmps[i0];
2423: tmp1 = tmps[i1];
2424: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427: }
2428: if (j == nz-1) {
2429: tmp0 = tmps[vi[j]];
2430: sum1 -= v1[j] *tmp0;
2431: sum2 -= v2[j] *tmp0;
2432: sum3 -= v3[j] *tmp0;
2433: }
2434: sum2 -= v2[nz] * sum1;
2435: sum3 -= v3[nz] * sum1;
2436: sum3 -= v3[nz+1] * sum2;
2437: tmp[row++]=sum1;
2438: tmp[row++]=sum2;
2439: tmp[row++]=sum3;
2440: break;
2442: case 4:
2443: sum1 = b[r[row]];
2444: sum2 = b[r[row+1]];
2445: sum3 = b[r[row+2]];
2446: sum4 = b[r[row+3]];
2447: v2 = aa + ai[row+1];
2448: v3 = aa + ai[row+2];
2449: v4 = aa + ai[row+3];
2451: for (j=0; j<nz-1; j+=2) {
2452: i0 = vi[j];
2453: i1 = vi[j+1];
2454: tmp0 = tmps[i0];
2455: tmp1 = tmps[i1];
2456: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2457: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2458: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2459: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2460: }
2461: if (j == nz-1) {
2462: tmp0 = tmps[vi[j]];
2463: sum1 -= v1[j] *tmp0;
2464: sum2 -= v2[j] *tmp0;
2465: sum3 -= v3[j] *tmp0;
2466: sum4 -= v4[j] *tmp0;
2467: }
2468: sum2 -= v2[nz] * sum1;
2469: sum3 -= v3[nz] * sum1;
2470: sum4 -= v4[nz] * sum1;
2471: sum3 -= v3[nz+1] * sum2;
2472: sum4 -= v4[nz+1] * sum2;
2473: sum4 -= v4[nz+2] * sum3;
2475: tmp[row++]=sum1;
2476: tmp[row++]=sum2;
2477: tmp[row++]=sum3;
2478: tmp[row++]=sum4;
2479: break;
2480: case 5:
2481: sum1 = b[r[row]];
2482: sum2 = b[r[row+1]];
2483: sum3 = b[r[row+2]];
2484: sum4 = b[r[row+3]];
2485: sum5 = b[r[row+4]];
2486: v2 = aa + ai[row+1];
2487: v3 = aa + ai[row+2];
2488: v4 = aa + ai[row+3];
2489: v5 = aa + ai[row+4];
2491: for (j=0; j<nz-1; j+=2) {
2492: i0 = vi[j];
2493: i1 = vi[j+1];
2494: tmp0 = tmps[i0];
2495: tmp1 = tmps[i1];
2496: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2497: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2498: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2499: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2500: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2501: }
2502: if (j == nz-1) {
2503: tmp0 = tmps[vi[j]];
2504: sum1 -= v1[j] *tmp0;
2505: sum2 -= v2[j] *tmp0;
2506: sum3 -= v3[j] *tmp0;
2507: sum4 -= v4[j] *tmp0;
2508: sum5 -= v5[j] *tmp0;
2509: }
2511: sum2 -= v2[nz] * sum1;
2512: sum3 -= v3[nz] * sum1;
2513: sum4 -= v4[nz] * sum1;
2514: sum5 -= v5[nz] * sum1;
2515: sum3 -= v3[nz+1] * sum2;
2516: sum4 -= v4[nz+1] * sum2;
2517: sum5 -= v5[nz+1] * sum2;
2518: sum4 -= v4[nz+2] * sum3;
2519: sum5 -= v5[nz+2] * sum3;
2520: sum5 -= v5[nz+3] * sum4;
2522: tmp[row++]=sum1;
2523: tmp[row++]=sum2;
2524: tmp[row++]=sum3;
2525: tmp[row++]=sum4;
2526: tmp[row++]=sum5;
2527: break;
2528: default:
2529: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2530: }
2531: }
2532: /* backward solve the upper triangular */
2533: for (i=node_max -1,row = n-1; i>=0; i--) {
2534: nsz = ns[i];
2535: aii = ad[row+1] + 1;
2536: v1 = aa + aii;
2537: vi = aj + aii;
2538: nz = ad[row]- ad[row+1] - 1;
2540: if (i > 0) {
2541: /* Prefetch the indices for the next block */
2542: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543: /* Prefetch the data for the next block */
2544: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545: }
2547: switch (nsz) { /* Each loop in 'case' is unrolled */
2548: case 1:
2549: sum1 = tmp[row];
2551: for (j=0; j<nz-1; j+=2) {
2552: i0 = vi[j];
2553: i1 = vi[j+1];
2554: tmp0 = tmps[i0];
2555: tmp1 = tmps[i1];
2556: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557: }
2558: if (j == nz-1) {
2559: tmp0 = tmps[vi[j]];
2560: sum1 -= v1[j]*tmp0;
2561: }
2562: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2563: break;
2564: case 2:
2565: sum1 = tmp[row];
2566: sum2 = tmp[row-1];
2567: v2 = aa + ad[row] + 1;
2568: for (j=0; j<nz-1; j+=2) {
2569: i0 = vi[j];
2570: i1 = vi[j+1];
2571: tmp0 = tmps[i0];
2572: tmp1 = tmps[i1];
2573: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2574: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2575: }
2576: if (j == nz-1) {
2577: tmp0 = tmps[vi[j]];
2578: sum1 -= v1[j]* tmp0;
2579: sum2 -= v2[j+1]* tmp0;
2580: }
2582: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2583: sum2 -= v2[0] * tmp0;
2584: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2585: break;
2586: case 3:
2587: sum1 = tmp[row];
2588: sum2 = tmp[row -1];
2589: sum3 = tmp[row -2];
2590: v2 = aa + ad[row] + 1;
2591: v3 = aa + ad[row -1] + 1;
2592: for (j=0; j<nz-1; j+=2) {
2593: i0 = vi[j];
2594: i1 = vi[j+1];
2595: tmp0 = tmps[i0];
2596: tmp1 = tmps[i1];
2597: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600: }
2601: if (j== nz-1) {
2602: tmp0 = tmps[vi[j]];
2603: sum1 -= v1[j] * tmp0;
2604: sum2 -= v2[j+1] * tmp0;
2605: sum3 -= v3[j+2] * tmp0;
2606: }
2607: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2608: sum2 -= v2[0]* tmp0;
2609: sum3 -= v3[1] * tmp0;
2610: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2611: sum3 -= v3[0]* tmp0;
2612: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2614: break;
2615: case 4:
2616: sum1 = tmp[row];
2617: sum2 = tmp[row -1];
2618: sum3 = tmp[row -2];
2619: sum4 = tmp[row -3];
2620: v2 = aa + ad[row]+1;
2621: v3 = aa + ad[row -1]+1;
2622: v4 = aa + ad[row -2]+1;
2624: for (j=0; j<nz-1; j+=2) {
2625: i0 = vi[j];
2626: i1 = vi[j+1];
2627: tmp0 = tmps[i0];
2628: tmp1 = tmps[i1];
2629: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633: }
2634: if (j== nz-1) {
2635: tmp0 = tmps[vi[j]];
2636: sum1 -= v1[j] * tmp0;
2637: sum2 -= v2[j+1] * tmp0;
2638: sum3 -= v3[j+2] * tmp0;
2639: sum4 -= v4[j+3] * tmp0;
2640: }
2642: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643: sum2 -= v2[0] * tmp0;
2644: sum3 -= v3[1] * tmp0;
2645: sum4 -= v4[2] * tmp0;
2646: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2647: sum3 -= v3[0] * tmp0;
2648: sum4 -= v4[1] * tmp0;
2649: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650: sum4 -= v4[0] * tmp0;
2651: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2652: break;
2653: case 5:
2654: sum1 = tmp[row];
2655: sum2 = tmp[row -1];
2656: sum3 = tmp[row -2];
2657: sum4 = tmp[row -3];
2658: sum5 = tmp[row -4];
2659: v2 = aa + ad[row]+1;
2660: v3 = aa + ad[row -1]+1;
2661: v4 = aa + ad[row -2]+1;
2662: v5 = aa + ad[row -3]+1;
2663: for (j=0; j<nz-1; j+=2) {
2664: i0 = vi[j];
2665: i1 = vi[j+1];
2666: tmp0 = tmps[i0];
2667: tmp1 = tmps[i1];
2668: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2669: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2670: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2671: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2672: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2673: }
2674: if (j==nz-1) {
2675: tmp0 = tmps[vi[j]];
2676: sum1 -= v1[j] * tmp0;
2677: sum2 -= v2[j+1] * tmp0;
2678: sum3 -= v3[j+2] * tmp0;
2679: sum4 -= v4[j+3] * tmp0;
2680: sum5 -= v5[j+4] * tmp0;
2681: }
2683: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2684: sum2 -= v2[0] * tmp0;
2685: sum3 -= v3[1] * tmp0;
2686: sum4 -= v4[2] * tmp0;
2687: sum5 -= v5[3] * tmp0;
2688: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2689: sum3 -= v3[0] * tmp0;
2690: sum4 -= v4[1] * tmp0;
2691: sum5 -= v5[2] * tmp0;
2692: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2693: sum4 -= v4[0] * tmp0;
2694: sum5 -= v5[1] * tmp0;
2695: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2696: sum5 -= v5[0] * tmp0;
2697: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2698: break;
2699: default:
2700: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2701: }
2702: }
2703: ISRestoreIndices(isrow,&rout);
2704: ISRestoreIndices(iscol,&cout);
2705: VecRestoreArrayRead(bb,&b);
2706: VecRestoreArray(xx,&x);
2707: PetscLogFlops(2.0*a->nz - A->cmap->n);
2708: return(0);
2709: }
2712: /*
2713: Makes a longer coloring[] array and calls the usual code with that
2714: */
2717: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2718: {
2719: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2720: PetscErrorCode ierr;
2721: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2722: PetscInt *colorused,i;
2723: ISColoringValue *newcolor;
2726: PetscMalloc1(n+1,&newcolor);
2727: /* loop over inodes, marking a color for each column*/
2728: row = 0;
2729: for (i=0; i<m; i++) {
2730: for (j=0; j<ns[i]; j++) {
2731: newcolor[row++] = coloring[i] + j*ncolors;
2732: }
2733: }
2735: /* eliminate unneeded colors */
2736: PetscCalloc1(5*ncolors,&colorused);
2737: for (i=0; i<n; i++) {
2738: colorused[newcolor[i]] = 1;
2739: }
2741: for (i=1; i<5*ncolors; i++) {
2742: colorused[i] += colorused[i-1];
2743: }
2744: ncolors = colorused[5*ncolors-1];
2745: for (i=0; i<n; i++) {
2746: newcolor[i] = colorused[newcolor[i]]-1;
2747: }
2748: PetscFree(colorused);
2749: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2750: PetscFree(coloring);
2751: return(0);
2752: }
2754: #include <petsc/private/kernels/blockinvert.h>
2758: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2759: {
2760: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2761: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2762: MatScalar *ibdiag,*bdiag,work[25],*t;
2763: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2764: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2765: const PetscScalar *xb, *b;
2766: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2767: PetscErrorCode ierr;
2768: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2769: PetscInt sz,k,ipvt[5];
2770: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2773: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2774: if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2775: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777: if (!a->inode.ibdiagvalid) {
2778: if (!a->inode.ibdiag) {
2779: /* calculate space needed for diagonal blocks */
2780: for (i=0; i<m; i++) {
2781: cnt += sizes[i]*sizes[i];
2782: }
2783: a->inode.bdiagsize = cnt;
2785: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2786: }
2788: /* copy over the diagonal blocks and invert them */
2789: ibdiag = a->inode.ibdiag;
2790: bdiag = a->inode.bdiag;
2791: cnt = 0;
2792: for (i=0, row = 0; i<m; i++) {
2793: for (j=0; j<sizes[i]; j++) {
2794: for (k=0; k<sizes[i]; k++) {
2795: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796: }
2797: }
2798: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2800: switch (sizes[i]) {
2801: case 1:
2802: /* Create matrix data structure */
2803: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804: ibdiag[cnt] = 1.0/ibdiag[cnt];
2805: break;
2806: case 2:
2807: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2808: break;
2809: case 3:
2810: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2811: break;
2812: case 4:
2813: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2814: break;
2815: case 5:
2816: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2817: break;
2818: default:
2819: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820: }
2821: cnt += sizes[i]*sizes[i];
2822: row += sizes[i];
2823: }
2824: a->inode.ibdiagvalid = PETSC_TRUE;
2825: }
2826: ibdiag = a->inode.ibdiag;
2827: bdiag = a->inode.bdiag;
2828: t = a->inode.ssor_work;
2830: VecGetArray(xx,&x);
2831: VecGetArrayRead(bb,&b);
2832: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2833: if (flag & SOR_ZERO_INITIAL_GUESS) {
2834: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2836: for (i=0, row=0; i<m; i++) {
2837: sz = diag[row] - ii[row];
2838: v1 = a->a + ii[row];
2839: idx = a->j + ii[row];
2841: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2842: switch (sizes[i]) {
2843: case 1:
2845: sum1 = b[row];
2846: for (n = 0; n<sz-1; n+=2) {
2847: i1 = idx[0];
2848: i2 = idx[1];
2849: idx += 2;
2850: tmp0 = x[i1];
2851: tmp1 = x[i2];
2852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2853: }
2855: if (n == sz-1) {
2856: tmp0 = x[*idx];
2857: sum1 -= *v1 * tmp0;
2858: }
2859: t[row] = sum1;
2860: x[row++] = sum1*(*ibdiag++);
2861: break;
2862: case 2:
2863: v2 = a->a + ii[row+1];
2864: sum1 = b[row];
2865: sum2 = b[row+1];
2866: for (n = 0; n<sz-1; n+=2) {
2867: i1 = idx[0];
2868: i2 = idx[1];
2869: idx += 2;
2870: tmp0 = x[i1];
2871: tmp1 = x[i2];
2872: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2873: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2874: }
2876: if (n == sz-1) {
2877: tmp0 = x[*idx];
2878: sum1 -= v1[0] * tmp0;
2879: sum2 -= v2[0] * tmp0;
2880: }
2881: t[row] = sum1;
2882: t[row+1] = sum2;
2883: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2884: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2885: ibdiag += 4;
2886: break;
2887: case 3:
2888: v2 = a->a + ii[row+1];
2889: v3 = a->a + ii[row+2];
2890: sum1 = b[row];
2891: sum2 = b[row+1];
2892: sum3 = b[row+2];
2893: for (n = 0; n<sz-1; n+=2) {
2894: i1 = idx[0];
2895: i2 = idx[1];
2896: idx += 2;
2897: tmp0 = x[i1];
2898: tmp1 = x[i2];
2899: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2900: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2901: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2902: }
2904: if (n == sz-1) {
2905: tmp0 = x[*idx];
2906: sum1 -= v1[0] * tmp0;
2907: sum2 -= v2[0] * tmp0;
2908: sum3 -= v3[0] * tmp0;
2909: }
2910: t[row] = sum1;
2911: t[row+1] = sum2;
2912: t[row+2] = sum3;
2913: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916: ibdiag += 9;
2917: break;
2918: case 4:
2919: v2 = a->a + ii[row+1];
2920: v3 = a->a + ii[row+2];
2921: v4 = a->a + ii[row+3];
2922: sum1 = b[row];
2923: sum2 = b[row+1];
2924: sum3 = b[row+2];
2925: sum4 = b[row+3];
2926: for (n = 0; n<sz-1; n+=2) {
2927: i1 = idx[0];
2928: i2 = idx[1];
2929: idx += 2;
2930: tmp0 = x[i1];
2931: tmp1 = x[i2];
2932: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936: }
2938: if (n == sz-1) {
2939: tmp0 = x[*idx];
2940: sum1 -= v1[0] * tmp0;
2941: sum2 -= v2[0] * tmp0;
2942: sum3 -= v3[0] * tmp0;
2943: sum4 -= v4[0] * tmp0;
2944: }
2945: t[row] = sum1;
2946: t[row+1] = sum2;
2947: t[row+2] = sum3;
2948: t[row+3] = sum4;
2949: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2950: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2951: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2952: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2953: ibdiag += 16;
2954: break;
2955: case 5:
2956: v2 = a->a + ii[row+1];
2957: v3 = a->a + ii[row+2];
2958: v4 = a->a + ii[row+3];
2959: v5 = a->a + ii[row+4];
2960: sum1 = b[row];
2961: sum2 = b[row+1];
2962: sum3 = b[row+2];
2963: sum4 = b[row+3];
2964: sum5 = b[row+4];
2965: for (n = 0; n<sz-1; n+=2) {
2966: i1 = idx[0];
2967: i2 = idx[1];
2968: idx += 2;
2969: tmp0 = x[i1];
2970: tmp1 = x[i2];
2971: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2972: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2973: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2974: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2975: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2976: }
2978: if (n == sz-1) {
2979: tmp0 = x[*idx];
2980: sum1 -= v1[0] * tmp0;
2981: sum2 -= v2[0] * tmp0;
2982: sum3 -= v3[0] * tmp0;
2983: sum4 -= v4[0] * tmp0;
2984: sum5 -= v5[0] * tmp0;
2985: }
2986: t[row] = sum1;
2987: t[row+1] = sum2;
2988: t[row+2] = sum3;
2989: t[row+3] = sum4;
2990: t[row+4] = sum5;
2991: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2992: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2993: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2994: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2995: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2996: ibdiag += 25;
2997: break;
2998: default:
2999: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3000: }
3001: }
3003: xb = t;
3004: PetscLogFlops(a->nz);
3005: } else xb = b;
3006: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3008: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3009: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3010: ibdiag -= sizes[i]*sizes[i];
3011: sz = ii[row+1] - diag[row] - 1;
3012: v1 = a->a + diag[row] + 1;
3013: idx = a->j + diag[row] + 1;
3015: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3016: switch (sizes[i]) {
3017: case 1:
3019: sum1 = xb[row];
3020: for (n = 0; n<sz-1; n+=2) {
3021: i1 = idx[0];
3022: i2 = idx[1];
3023: idx += 2;
3024: tmp0 = x[i1];
3025: tmp1 = x[i2];
3026: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3027: }
3029: if (n == sz-1) {
3030: tmp0 = x[*idx];
3031: sum1 -= *v1*tmp0;
3032: }
3033: x[row--] = sum1*(*ibdiag);
3034: break;
3036: case 2:
3038: sum1 = xb[row];
3039: sum2 = xb[row-1];
3040: /* note that sum1 is associated with the second of the two rows */
3041: v2 = a->a + diag[row-1] + 2;
3042: for (n = 0; n<sz-1; n+=2) {
3043: i1 = idx[0];
3044: i2 = idx[1];
3045: idx += 2;
3046: tmp0 = x[i1];
3047: tmp1 = x[i2];
3048: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3049: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3050: }
3052: if (n == sz-1) {
3053: tmp0 = x[*idx];
3054: sum1 -= *v1*tmp0;
3055: sum2 -= *v2*tmp0;
3056: }
3057: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3058: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3059: break;
3060: case 3:
3062: sum1 = xb[row];
3063: sum2 = xb[row-1];
3064: sum3 = xb[row-2];
3065: v2 = a->a + diag[row-1] + 2;
3066: v3 = a->a + diag[row-2] + 3;
3067: for (n = 0; n<sz-1; n+=2) {
3068: i1 = idx[0];
3069: i2 = idx[1];
3070: idx += 2;
3071: tmp0 = x[i1];
3072: tmp1 = x[i2];
3073: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3074: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3075: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3076: }
3078: if (n == sz-1) {
3079: tmp0 = x[*idx];
3080: sum1 -= *v1*tmp0;
3081: sum2 -= *v2*tmp0;
3082: sum3 -= *v3*tmp0;
3083: }
3084: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3085: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3086: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3087: break;
3088: case 4:
3090: sum1 = xb[row];
3091: sum2 = xb[row-1];
3092: sum3 = xb[row-2];
3093: sum4 = xb[row-3];
3094: v2 = a->a + diag[row-1] + 2;
3095: v3 = a->a + diag[row-2] + 3;
3096: v4 = a->a + diag[row-3] + 4;
3097: for (n = 0; n<sz-1; n+=2) {
3098: i1 = idx[0];
3099: i2 = idx[1];
3100: idx += 2;
3101: tmp0 = x[i1];
3102: tmp1 = x[i2];
3103: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107: }
3109: if (n == sz-1) {
3110: tmp0 = x[*idx];
3111: sum1 -= *v1*tmp0;
3112: sum2 -= *v2*tmp0;
3113: sum3 -= *v3*tmp0;
3114: sum4 -= *v4*tmp0;
3115: }
3116: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3117: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3118: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3119: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3120: break;
3121: case 5:
3123: sum1 = xb[row];
3124: sum2 = xb[row-1];
3125: sum3 = xb[row-2];
3126: sum4 = xb[row-3];
3127: sum5 = xb[row-4];
3128: v2 = a->a + diag[row-1] + 2;
3129: v3 = a->a + diag[row-2] + 3;
3130: v4 = a->a + diag[row-3] + 4;
3131: v5 = a->a + diag[row-4] + 5;
3132: for (n = 0; n<sz-1; n+=2) {
3133: i1 = idx[0];
3134: i2 = idx[1];
3135: idx += 2;
3136: tmp0 = x[i1];
3137: tmp1 = x[i2];
3138: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3139: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3140: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3141: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3142: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3143: }
3145: if (n == sz-1) {
3146: tmp0 = x[*idx];
3147: sum1 -= *v1*tmp0;
3148: sum2 -= *v2*tmp0;
3149: sum3 -= *v3*tmp0;
3150: sum4 -= *v4*tmp0;
3151: sum5 -= *v5*tmp0;
3152: }
3153: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3154: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3155: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3156: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3157: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3158: break;
3159: default:
3160: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3161: }
3162: }
3164: PetscLogFlops(a->nz);
3165: }
3166: its--;
3167: }
3168: while (its--) {
3170: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3171: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3172: i<m;
3173: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3175: sz = diag[row] - ii[row];
3176: v1 = a->a + ii[row];
3177: idx = a->j + ii[row];
3178: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3179: switch (sizes[i]) {
3180: case 1:
3181: sum1 = b[row];
3182: for (n = 0; n<sz-1; n+=2) {
3183: i1 = idx[0];
3184: i2 = idx[1];
3185: idx += 2;
3186: tmp0 = x[i1];
3187: tmp1 = x[i2];
3188: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189: }
3190: if (n == sz-1) {
3191: tmp0 = x[*idx++];
3192: sum1 -= *v1 * tmp0;
3193: v1++;
3194: }
3195: t[row] = sum1;
3196: sz = ii[row+1] - diag[row] - 1;
3197: idx = a->j + diag[row] + 1;
3198: v1 += 1;
3199: for (n = 0; n<sz-1; n+=2) {
3200: i1 = idx[0];
3201: i2 = idx[1];
3202: idx += 2;
3203: tmp0 = x[i1];
3204: tmp1 = x[i2];
3205: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3206: }
3207: if (n == sz-1) {
3208: tmp0 = x[*idx++];
3209: sum1 -= *v1 * tmp0;
3210: }
3211: /* in MatSOR_SeqAIJ this line would be
3212: *
3213: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3214: *
3215: * but omega == 1, so this becomes
3216: *
3217: * x[row] = sum1*(*ibdiag++);
3218: *
3219: */
3220: x[row] = sum1*(*ibdiag);
3221: break;
3222: case 2:
3223: v2 = a->a + ii[row+1];
3224: sum1 = b[row];
3225: sum2 = b[row+1];
3226: for (n = 0; n<sz-1; n+=2) {
3227: i1 = idx[0];
3228: i2 = idx[1];
3229: idx += 2;
3230: tmp0 = x[i1];
3231: tmp1 = x[i2];
3232: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3233: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3234: }
3235: if (n == sz-1) {
3236: tmp0 = x[*idx++];
3237: sum1 -= v1[0] * tmp0;
3238: sum2 -= v2[0] * tmp0;
3239: v1++; v2++;
3240: }
3241: t[row] = sum1;
3242: t[row+1] = sum2;
3243: sz = ii[row+1] - diag[row] - 2;
3244: idx = a->j + diag[row] + 2;
3245: v1 += 2;
3246: v2 += 2;
3247: for (n = 0; n<sz-1; n+=2) {
3248: i1 = idx[0];
3249: i2 = idx[1];
3250: idx += 2;
3251: tmp0 = x[i1];
3252: tmp1 = x[i2];
3253: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255: }
3256: if (n == sz-1) {
3257: tmp0 = x[*idx];
3258: sum1 -= v1[0] * tmp0;
3259: sum2 -= v2[0] * tmp0;
3260: }
3261: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3262: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3263: break;
3264: case 3:
3265: v2 = a->a + ii[row+1];
3266: v3 = a->a + ii[row+2];
3267: sum1 = b[row];
3268: sum2 = b[row+1];
3269: sum3 = b[row+2];
3270: for (n = 0; n<sz-1; n+=2) {
3271: i1 = idx[0];
3272: i2 = idx[1];
3273: idx += 2;
3274: tmp0 = x[i1];
3275: tmp1 = x[i2];
3276: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3277: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3278: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3279: }
3280: if (n == sz-1) {
3281: tmp0 = x[*idx++];
3282: sum1 -= v1[0] * tmp0;
3283: sum2 -= v2[0] * tmp0;
3284: sum3 -= v3[0] * tmp0;
3285: v1++; v2++; v3++;
3286: }
3287: t[row] = sum1;
3288: t[row+1] = sum2;
3289: t[row+2] = sum3;
3290: sz = ii[row+1] - diag[row] - 3;
3291: idx = a->j + diag[row] + 3;
3292: v1 += 3;
3293: v2 += 3;
3294: v3 += 3;
3295: for (n = 0; n<sz-1; n+=2) {
3296: i1 = idx[0];
3297: i2 = idx[1];
3298: idx += 2;
3299: tmp0 = x[i1];
3300: tmp1 = x[i2];
3301: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3302: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3303: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3304: }
3305: if (n == sz-1) {
3306: tmp0 = x[*idx];
3307: sum1 -= v1[0] * tmp0;
3308: sum2 -= v2[0] * tmp0;
3309: sum3 -= v3[0] * tmp0;
3310: }
3311: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3312: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3313: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3314: break;
3315: case 4:
3316: v2 = a->a + ii[row+1];
3317: v3 = a->a + ii[row+2];
3318: v4 = a->a + ii[row+3];
3319: sum1 = b[row];
3320: sum2 = b[row+1];
3321: sum3 = b[row+2];
3322: sum4 = b[row+3];
3323: for (n = 0; n<sz-1; n+=2) {
3324: i1 = idx[0];
3325: i2 = idx[1];
3326: idx += 2;
3327: tmp0 = x[i1];
3328: tmp1 = x[i2];
3329: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333: }
3334: if (n == sz-1) {
3335: tmp0 = x[*idx++];
3336: sum1 -= v1[0] * tmp0;
3337: sum2 -= v2[0] * tmp0;
3338: sum3 -= v3[0] * tmp0;
3339: sum4 -= v4[0] * tmp0;
3340: v1++; v2++; v3++; v4++;
3341: }
3342: t[row] = sum1;
3343: t[row+1] = sum2;
3344: t[row+2] = sum3;
3345: t[row+3] = sum4;
3346: sz = ii[row+1] - diag[row] - 4;
3347: idx = a->j + diag[row] + 4;
3348: v1 += 4;
3349: v2 += 4;
3350: v3 += 4;
3351: v4 += 4;
3352: for (n = 0; n<sz-1; n+=2) {
3353: i1 = idx[0];
3354: i2 = idx[1];
3355: idx += 2;
3356: tmp0 = x[i1];
3357: tmp1 = x[i2];
3358: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3359: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3360: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3361: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3362: }
3363: if (n == sz-1) {
3364: tmp0 = x[*idx];
3365: sum1 -= v1[0] * tmp0;
3366: sum2 -= v2[0] * tmp0;
3367: sum3 -= v3[0] * tmp0;
3368: sum4 -= v4[0] * tmp0;
3369: }
3370: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3371: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3372: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3373: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3374: break;
3375: case 5:
3376: v2 = a->a + ii[row+1];
3377: v3 = a->a + ii[row+2];
3378: v4 = a->a + ii[row+3];
3379: v5 = a->a + ii[row+4];
3380: sum1 = b[row];
3381: sum2 = b[row+1];
3382: sum3 = b[row+2];
3383: sum4 = b[row+3];
3384: sum5 = b[row+4];
3385: for (n = 0; n<sz-1; n+=2) {
3386: i1 = idx[0];
3387: i2 = idx[1];
3388: idx += 2;
3389: tmp0 = x[i1];
3390: tmp1 = x[i2];
3391: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3392: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3393: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3394: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3395: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3396: }
3397: if (n == sz-1) {
3398: tmp0 = x[*idx++];
3399: sum1 -= v1[0] * tmp0;
3400: sum2 -= v2[0] * tmp0;
3401: sum3 -= v3[0] * tmp0;
3402: sum4 -= v4[0] * tmp0;
3403: sum5 -= v5[0] * tmp0;
3404: v1++; v2++; v3++; v4++; v5++;
3405: }
3406: t[row] = sum1;
3407: t[row+1] = sum2;
3408: t[row+2] = sum3;
3409: t[row+3] = sum4;
3410: t[row+4] = sum5;
3411: sz = ii[row+1] - diag[row] - 5;
3412: idx = a->j + diag[row] + 5;
3413: v1 += 5;
3414: v2 += 5;
3415: v3 += 5;
3416: v4 += 5;
3417: v5 += 5;
3418: for (n = 0; n<sz-1; n+=2) {
3419: i1 = idx[0];
3420: i2 = idx[1];
3421: idx += 2;
3422: tmp0 = x[i1];
3423: tmp1 = x[i2];
3424: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3425: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3426: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3427: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3428: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3429: }
3430: if (n == sz-1) {
3431: tmp0 = x[*idx];
3432: sum1 -= v1[0] * tmp0;
3433: sum2 -= v2[0] * tmp0;
3434: sum3 -= v3[0] * tmp0;
3435: sum4 -= v4[0] * tmp0;
3436: sum5 -= v5[0] * tmp0;
3437: }
3438: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3439: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3440: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3441: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3442: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3443: break;
3444: default:
3445: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3446: }
3447: }
3448: xb = t;
3449: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3450: } else xb = b;
3452: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3454: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3455: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3456: ibdiag -= sizes[i]*sizes[i];
3458: /* set RHS */
3459: if (xb == b) {
3460: /* whole (old way) */
3461: sz = ii[row+1] - ii[row];
3462: idx = a->j + ii[row];
3463: switch (sizes[i]) {
3464: case 5:
3465: v5 = a->a + ii[row-4];
3466: case 4: /* fall through */
3467: v4 = a->a + ii[row-3];
3468: case 3:
3469: v3 = a->a + ii[row-2];
3470: case 2:
3471: v2 = a->a + ii[row-1];
3472: case 1:
3473: v1 = a->a + ii[row];
3474: break;
3475: default:
3476: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3477: }
3478: } else {
3479: /* upper, no diag */
3480: sz = ii[row+1] - diag[row] - 1;
3481: idx = a->j + diag[row] + 1;
3482: switch (sizes[i]) {
3483: case 5:
3484: v5 = a->a + diag[row-4] + 5;
3485: case 4: /* fall through */
3486: v4 = a->a + diag[row-3] + 4;
3487: case 3:
3488: v3 = a->a + diag[row-2] + 3;
3489: case 2:
3490: v2 = a->a + diag[row-1] + 2;
3491: case 1:
3492: v1 = a->a + diag[row] + 1;
3493: }
3494: }
3495: /* set sum */
3496: switch (sizes[i]) {
3497: case 5:
3498: sum5 = xb[row-4];
3499: case 4: /* fall through */
3500: sum4 = xb[row-3];
3501: case 3:
3502: sum3 = xb[row-2];
3503: case 2:
3504: sum2 = xb[row-1];
3505: case 1:
3506: /* note that sum1 is associated with the last row */
3507: sum1 = xb[row];
3508: }
3509: /* do sums */
3510: for (n = 0; n<sz-1; n+=2) {
3511: i1 = idx[0];
3512: i2 = idx[1];
3513: idx += 2;
3514: tmp0 = x[i1];
3515: tmp1 = x[i2];
3516: switch (sizes[i]) {
3517: case 5:
3518: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3519: case 4: /* fall through */
3520: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3521: case 3:
3522: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3523: case 2:
3524: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3525: case 1:
3526: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3527: }
3528: }
3529: /* ragged edge */
3530: if (n == sz-1) {
3531: tmp0 = x[*idx];
3532: switch (sizes[i]) {
3533: case 5:
3534: sum5 -= *v5*tmp0;
3535: case 4: /* fall through */
3536: sum4 -= *v4*tmp0;
3537: case 3:
3538: sum3 -= *v3*tmp0;
3539: case 2:
3540: sum2 -= *v2*tmp0;
3541: case 1:
3542: sum1 -= *v1*tmp0;
3543: }
3544: }
3545: /* update */
3546: if (xb == b) {
3547: /* whole (old way) w/ diag */
3548: switch (sizes[i]) {
3549: case 5:
3550: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555: break;
3556: case 4:
3557: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561: break;
3562: case 3:
3563: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566: break;
3567: case 2:
3568: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3569: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3570: break;
3571: case 1:
3572: x[row--] += sum1*(*ibdiag);
3573: break;
3574: }
3575: } else {
3576: /* no diag so set = */
3577: switch (sizes[i]) {
3578: case 5:
3579: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3580: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3581: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3582: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3583: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3584: break;
3585: case 4:
3586: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3587: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3588: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3589: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3590: break;
3591: case 3:
3592: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3593: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3594: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3595: break;
3596: case 2:
3597: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3598: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3599: break;
3600: case 1:
3601: x[row--] = sum1*(*ibdiag);
3602: break;
3603: }
3604: }
3605: }
3606: if (xb == b) {
3607: PetscLogFlops(2.0*a->nz);
3608: } else {
3609: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3610: }
3611: }
3612: }
3613: if (flag & SOR_EISENSTAT) {
3614: /*
3615: Apply (U + D)^-1 where D is now the block diagonal
3616: */
3617: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3618: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3619: ibdiag -= sizes[i]*sizes[i];
3620: sz = ii[row+1] - diag[row] - 1;
3621: v1 = a->a + diag[row] + 1;
3622: idx = a->j + diag[row] + 1;
3623: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3624: switch (sizes[i]) {
3625: case 1:
3627: sum1 = b[row];
3628: for (n = 0; n<sz-1; n+=2) {
3629: i1 = idx[0];
3630: i2 = idx[1];
3631: idx += 2;
3632: tmp0 = x[i1];
3633: tmp1 = x[i2];
3634: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3635: }
3637: if (n == sz-1) {
3638: tmp0 = x[*idx];
3639: sum1 -= *v1*tmp0;
3640: }
3641: x[row] = sum1*(*ibdiag);row--;
3642: break;
3644: case 2:
3646: sum1 = b[row];
3647: sum2 = b[row-1];
3648: /* note that sum1 is associated with the second of the two rows */
3649: v2 = a->a + diag[row-1] + 2;
3650: for (n = 0; n<sz-1; n+=2) {
3651: i1 = idx[0];
3652: i2 = idx[1];
3653: idx += 2;
3654: tmp0 = x[i1];
3655: tmp1 = x[i2];
3656: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3657: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3658: }
3660: if (n == sz-1) {
3661: tmp0 = x[*idx];
3662: sum1 -= *v1*tmp0;
3663: sum2 -= *v2*tmp0;
3664: }
3665: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3666: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3667: row -= 2;
3668: break;
3669: case 3:
3671: sum1 = b[row];
3672: sum2 = b[row-1];
3673: sum3 = b[row-2];
3674: v2 = a->a + diag[row-1] + 2;
3675: v3 = a->a + diag[row-2] + 3;
3676: for (n = 0; n<sz-1; n+=2) {
3677: i1 = idx[0];
3678: i2 = idx[1];
3679: idx += 2;
3680: tmp0 = x[i1];
3681: tmp1 = x[i2];
3682: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3683: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3684: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3685: }
3687: if (n == sz-1) {
3688: tmp0 = x[*idx];
3689: sum1 -= *v1*tmp0;
3690: sum2 -= *v2*tmp0;
3691: sum3 -= *v3*tmp0;
3692: }
3693: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3694: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3695: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3696: row -= 3;
3697: break;
3698: case 4:
3700: sum1 = b[row];
3701: sum2 = b[row-1];
3702: sum3 = b[row-2];
3703: sum4 = b[row-3];
3704: v2 = a->a + diag[row-1] + 2;
3705: v3 = a->a + diag[row-2] + 3;
3706: v4 = a->a + diag[row-3] + 4;
3707: for (n = 0; n<sz-1; n+=2) {
3708: i1 = idx[0];
3709: i2 = idx[1];
3710: idx += 2;
3711: tmp0 = x[i1];
3712: tmp1 = x[i2];
3713: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3714: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3715: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3716: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3717: }
3719: if (n == sz-1) {
3720: tmp0 = x[*idx];
3721: sum1 -= *v1*tmp0;
3722: sum2 -= *v2*tmp0;
3723: sum3 -= *v3*tmp0;
3724: sum4 -= *v4*tmp0;
3725: }
3726: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3727: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3728: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3729: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3730: row -= 4;
3731: break;
3732: case 5:
3734: sum1 = b[row];
3735: sum2 = b[row-1];
3736: sum3 = b[row-2];
3737: sum4 = b[row-3];
3738: sum5 = b[row-4];
3739: v2 = a->a + diag[row-1] + 2;
3740: v3 = a->a + diag[row-2] + 3;
3741: v4 = a->a + diag[row-3] + 4;
3742: v5 = a->a + diag[row-4] + 5;
3743: for (n = 0; n<sz-1; n+=2) {
3744: i1 = idx[0];
3745: i2 = idx[1];
3746: idx += 2;
3747: tmp0 = x[i1];
3748: tmp1 = x[i2];
3749: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3750: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3751: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3752: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3753: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3754: }
3756: if (n == sz-1) {
3757: tmp0 = x[*idx];
3758: sum1 -= *v1*tmp0;
3759: sum2 -= *v2*tmp0;
3760: sum3 -= *v3*tmp0;
3761: sum4 -= *v4*tmp0;
3762: sum5 -= *v5*tmp0;
3763: }
3764: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3765: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3766: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3767: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3768: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3769: row -= 5;
3770: break;
3771: default:
3772: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3773: }
3774: }
3775: PetscLogFlops(a->nz);
3777: /*
3778: t = b - D x where D is the block diagonal
3779: */
3780: cnt = 0;
3781: for (i=0, row=0; i<m; i++) {
3782: switch (sizes[i]) {
3783: case 1:
3784: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3785: break;
3786: case 2:
3787: x1 = x[row]; x2 = x[row+1];
3788: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3789: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3790: t[row] = b[row] - tmp1;
3791: t[row+1] = b[row+1] - tmp2; row += 2;
3792: cnt += 4;
3793: break;
3794: case 3:
3795: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3796: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3797: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3798: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3799: t[row] = b[row] - tmp1;
3800: t[row+1] = b[row+1] - tmp2;
3801: t[row+2] = b[row+2] - tmp3; row += 3;
3802: cnt += 9;
3803: break;
3804: case 4:
3805: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3806: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3807: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3808: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3809: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3810: t[row] = b[row] - tmp1;
3811: t[row+1] = b[row+1] - tmp2;
3812: t[row+2] = b[row+2] - tmp3;
3813: t[row+3] = b[row+3] - tmp4; row += 4;
3814: cnt += 16;
3815: break;
3816: case 5:
3817: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3818: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3819: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3820: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3821: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3822: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3823: t[row] = b[row] - tmp1;
3824: t[row+1] = b[row+1] - tmp2;
3825: t[row+2] = b[row+2] - tmp3;
3826: t[row+3] = b[row+3] - tmp4;
3827: t[row+4] = b[row+4] - tmp5;row += 5;
3828: cnt += 25;
3829: break;
3830: default:
3831: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3832: }
3833: }
3834: PetscLogFlops(m);
3838: /*
3839: Apply (L + D)^-1 where D is the block diagonal
3840: */
3841: for (i=0, row=0; i<m; i++) {
3842: sz = diag[row] - ii[row];
3843: v1 = a->a + ii[row];
3844: idx = a->j + ii[row];
3845: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3846: switch (sizes[i]) {
3847: case 1:
3849: sum1 = t[row];
3850: for (n = 0; n<sz-1; n+=2) {
3851: i1 = idx[0];
3852: i2 = idx[1];
3853: idx += 2;
3854: tmp0 = t[i1];
3855: tmp1 = t[i2];
3856: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3857: }
3859: if (n == sz-1) {
3860: tmp0 = t[*idx];
3861: sum1 -= *v1 * tmp0;
3862: }
3863: x[row] += t[row] = sum1*(*ibdiag++); row++;
3864: break;
3865: case 2:
3866: v2 = a->a + ii[row+1];
3867: sum1 = t[row];
3868: sum2 = t[row+1];
3869: for (n = 0; n<sz-1; n+=2) {
3870: i1 = idx[0];
3871: i2 = idx[1];
3872: idx += 2;
3873: tmp0 = t[i1];
3874: tmp1 = t[i2];
3875: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3876: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3877: }
3879: if (n == sz-1) {
3880: tmp0 = t[*idx];
3881: sum1 -= v1[0] * tmp0;
3882: sum2 -= v2[0] * tmp0;
3883: }
3884: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3885: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3886: ibdiag += 4; row += 2;
3887: break;
3888: case 3:
3889: v2 = a->a + ii[row+1];
3890: v3 = a->a + ii[row+2];
3891: sum1 = t[row];
3892: sum2 = t[row+1];
3893: sum3 = t[row+2];
3894: for (n = 0; n<sz-1; n+=2) {
3895: i1 = idx[0];
3896: i2 = idx[1];
3897: idx += 2;
3898: tmp0 = t[i1];
3899: tmp1 = t[i2];
3900: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3901: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3902: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3903: }
3905: if (n == sz-1) {
3906: tmp0 = t[*idx];
3907: sum1 -= v1[0] * tmp0;
3908: sum2 -= v2[0] * tmp0;
3909: sum3 -= v3[0] * tmp0;
3910: }
3911: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3912: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3913: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3914: ibdiag += 9; row += 3;
3915: break;
3916: case 4:
3917: v2 = a->a + ii[row+1];
3918: v3 = a->a + ii[row+2];
3919: v4 = a->a + ii[row+3];
3920: sum1 = t[row];
3921: sum2 = t[row+1];
3922: sum3 = t[row+2];
3923: sum4 = t[row+3];
3924: for (n = 0; n<sz-1; n+=2) {
3925: i1 = idx[0];
3926: i2 = idx[1];
3927: idx += 2;
3928: tmp0 = t[i1];
3929: tmp1 = t[i2];
3930: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934: }
3936: if (n == sz-1) {
3937: tmp0 = t[*idx];
3938: sum1 -= v1[0] * tmp0;
3939: sum2 -= v2[0] * tmp0;
3940: sum3 -= v3[0] * tmp0;
3941: sum4 -= v4[0] * tmp0;
3942: }
3943: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3944: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3945: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3946: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3947: ibdiag += 16; row += 4;
3948: break;
3949: case 5:
3950: v2 = a->a + ii[row+1];
3951: v3 = a->a + ii[row+2];
3952: v4 = a->a + ii[row+3];
3953: v5 = a->a + ii[row+4];
3954: sum1 = t[row];
3955: sum2 = t[row+1];
3956: sum3 = t[row+2];
3957: sum4 = t[row+3];
3958: sum5 = t[row+4];
3959: for (n = 0; n<sz-1; n+=2) {
3960: i1 = idx[0];
3961: i2 = idx[1];
3962: idx += 2;
3963: tmp0 = t[i1];
3964: tmp1 = t[i2];
3965: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3966: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3967: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3968: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3969: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3970: }
3972: if (n == sz-1) {
3973: tmp0 = t[*idx];
3974: sum1 -= v1[0] * tmp0;
3975: sum2 -= v2[0] * tmp0;
3976: sum3 -= v3[0] * tmp0;
3977: sum4 -= v4[0] * tmp0;
3978: sum5 -= v5[0] * tmp0;
3979: }
3980: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3981: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3982: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3983: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3984: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3985: ibdiag += 25; row += 5;
3986: break;
3987: default:
3988: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3989: }
3990: }
3991: PetscLogFlops(a->nz);
3992: }
3993: VecRestoreArray(xx,&x);
3994: VecRestoreArrayRead(bb,&b);
3995: return(0);
3996: }
4000: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4001: {
4002: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4003: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4004: const MatScalar *bdiag = a->inode.bdiag;
4005: const PetscScalar *b;
4006: PetscErrorCode ierr;
4007: PetscInt m = a->inode.node_count,cnt = 0,i,row;
4008: const PetscInt *sizes = a->inode.size;
4011: VecGetArray(xx,&x);
4012: VecGetArrayRead(bb,&b);
4013: cnt = 0;
4014: for (i=0, row=0; i<m; i++) {
4015: switch (sizes[i]) {
4016: case 1:
4017: x[row] = b[row]*bdiag[cnt++];row++;
4018: break;
4019: case 2:
4020: x1 = b[row]; x2 = b[row+1];
4021: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4022: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4023: x[row++] = tmp1;
4024: x[row++] = tmp2;
4025: cnt += 4;
4026: break;
4027: case 3:
4028: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4029: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4030: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4031: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4032: x[row++] = tmp1;
4033: x[row++] = tmp2;
4034: x[row++] = tmp3;
4035: cnt += 9;
4036: break;
4037: case 4:
4038: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4039: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4040: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4041: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4042: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4043: x[row++] = tmp1;
4044: x[row++] = tmp2;
4045: x[row++] = tmp3;
4046: x[row++] = tmp4;
4047: cnt += 16;
4048: break;
4049: case 5:
4050: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4051: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4052: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4053: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4054: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4055: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4056: x[row++] = tmp1;
4057: x[row++] = tmp2;
4058: x[row++] = tmp3;
4059: x[row++] = tmp4;
4060: x[row++] = tmp5;
4061: cnt += 25;
4062: break;
4063: default:
4064: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4065: }
4066: }
4067: PetscLogFlops(2*cnt);
4068: VecRestoreArray(xx,&x);
4069: VecRestoreArrayRead(bb,&b);
4070: return(0);
4071: }
4073: /*
4074: samestructure indicates that the matrix has not changed its nonzero structure so we
4075: do not need to recompute the inodes
4076: */
4079: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4080: {
4081: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4083: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4084: PetscBool flag;
4085: const PetscInt *idx,*idy,*ii;
4088: if (!a->inode.use) return(0);
4089: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4091: m = A->rmap->n;
4092: if (a->inode.size) ns = a->inode.size;
4093: else {
4094: PetscMalloc1(m+1,&ns);
4095: }
4097: i = 0;
4098: node_count = 0;
4099: idx = a->j;
4100: ii = a->i;
4101: while (i < m) { /* For each row */
4102: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4103: /* Limits the number of elements in a node to 'a->inode.limit' */
4104: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4105: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4106: if (nzy != nzx) break;
4107: idy += nzx; /* Same nonzero pattern */
4108: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4109: if (!flag) break;
4110: }
4111: ns[node_count++] = blk_size;
4112: idx += blk_size*nzx;
4113: i = j;
4114: }
4115: /* If not enough inodes found,, do not use inode version of the routines */
4116: if (!m || node_count > .8*m) {
4117: PetscFree(ns);
4119: a->inode.node_count = 0;
4120: a->inode.size = NULL;
4121: a->inode.use = PETSC_FALSE;
4122: A->ops->mult = MatMult_SeqAIJ;
4123: A->ops->sor = MatSOR_SeqAIJ;
4124: A->ops->multadd = MatMultAdd_SeqAIJ;
4125: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4126: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4127: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4128: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4129: A->ops->coloringpatch = 0;
4130: A->ops->multdiagonalblock = 0;
4132: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4133: } else {
4134: if (!A->factortype) {
4135: A->ops->mult = MatMult_SeqAIJ_Inode;
4136: A->ops->sor = MatSOR_SeqAIJ_Inode;
4137: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4138: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139: if (A->rmap->n == A->cmap->n) {
4140: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4141: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4142: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4143: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4144: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4145: }
4146: } else {
4147: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4148: }
4149: a->inode.node_count = node_count;
4150: a->inode.size = ns;
4151: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4152: }
4153: a->inode.checked = PETSC_TRUE;
4154: a->inode.mat_nonzerostate = A->nonzerostate;
4155: return(0);
4156: }
4160: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4161: {
4162: Mat B =*C;
4163: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4165: PetscInt m=A->rmap->n;
4168: c->inode.use = a->inode.use;
4169: c->inode.limit = a->inode.limit;
4170: c->inode.max_limit = a->inode.max_limit;
4171: if (a->inode.size) {
4172: PetscMalloc1(m+1,&c->inode.size);
4173: c->inode.node_count = a->inode.node_count;
4174: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4175: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4176: if (!B->factortype) {
4177: B->ops->mult = MatMult_SeqAIJ_Inode;
4178: B->ops->sor = MatSOR_SeqAIJ_Inode;
4179: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4180: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4181: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4182: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4183: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4184: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4185: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4186: } else {
4187: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4188: }
4189: } else {
4190: c->inode.size = 0;
4191: c->inode.node_count = 0;
4192: }
4193: c->inode.ibdiagvalid = PETSC_FALSE;
4194: c->inode.ibdiag = 0;
4195: c->inode.bdiag = 0;
4196: return(0);
4197: }
4201: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4202: {
4203: PetscInt k;
4204: const PetscInt *vi;
4207: vi = aj + ai[row];
4208: for (k=0; k<nzl; k++) cols[k] = vi[k];
4209: vi = aj + adiag[row];
4210: cols[nzl] = vi[0];
4211: vi = aj + adiag[row+1]+1;
4212: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4213: return(0);
4214: }
4215: /*
4216: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4217: Modified from MatSeqAIJCheckInode().
4219: Input Parameters:
4220: . Mat A - ILU or LU matrix factor
4222: */
4225: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4226: {
4227: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4229: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4230: PetscInt *cols1,*cols2,*ns;
4231: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4232: PetscBool flag;
4235: if (!a->inode.use) return(0);
4236: if (a->inode.checked) return(0);
4238: m = A->rmap->n;
4239: if (a->inode.size) ns = a->inode.size;
4240: else {
4241: PetscMalloc1(m+1,&ns);
4242: }
4244: i = 0;
4245: node_count = 0;
4246: PetscMalloc2(m,&cols1,m,&cols2);
4247: while (i < m) { /* For each row */
4248: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4249: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4250: nzx = nzl1 + nzu1 + 1;
4251: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4253: /* Limits the number of elements in a node to 'a->inode.limit' */
4254: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4255: nzl2 = ai[j+1] - ai[j];
4256: nzu2 = adiag[j] - adiag[j+1] - 1;
4257: nzy = nzl2 + nzu2 + 1;
4258: if (nzy != nzx) break;
4259: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4260: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4261: if (!flag) break;
4262: }
4263: ns[node_count++] = blk_size;
4264: i = j;
4265: }
4266: PetscFree2(cols1,cols2);
4267: /* If not enough inodes found,, do not use inode version of the routines */
4268: if (!m || node_count > .8*m) {
4269: PetscFree(ns);
4271: a->inode.node_count = 0;
4272: a->inode.size = NULL;
4273: a->inode.use = PETSC_FALSE;
4275: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4276: } else {
4277: A->ops->mult = 0;
4278: A->ops->sor = 0;
4279: A->ops->multadd = 0;
4280: A->ops->getrowij = 0;
4281: A->ops->restorerowij = 0;
4282: A->ops->getcolumnij = 0;
4283: A->ops->restorecolumnij = 0;
4284: A->ops->coloringpatch = 0;
4285: A->ops->multdiagonalblock = 0;
4286: a->inode.node_count = node_count;
4287: a->inode.size = ns;
4289: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4290: }
4291: a->inode.checked = PETSC_TRUE;
4292: return(0);
4293: }
4297: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4298: {
4299: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4302: a->inode.ibdiagvalid = PETSC_FALSE;
4303: return(0);
4304: }
4306: /*
4307: This is really ugly. if inodes are used this replaces the
4308: permutations with ones that correspond to rows/cols of the matrix
4309: rather then inode blocks
4310: */
4313: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4314: {
4318: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4319: return(0);
4320: }
4324: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4325: {
4326: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4328: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4329: const PetscInt *ridx,*cidx;
4330: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4331: PetscInt nslim_col,*ns_col;
4332: IS ris = *rperm,cis = *cperm;
4335: if (!a->inode.size) return(0); /* no inodes so return */
4336: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4338: Mat_CreateColInode(A,&nslim_col,&ns_col);
4339: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4340: PetscMalloc2(m,&permr,n,&permc);
4342: ISGetIndices(ris,&ridx);
4343: ISGetIndices(cis,&cidx);
4345: /* Form the inode structure for the rows of permuted matric using inv perm*/
4346: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4348: /* Construct the permutations for rows*/
4349: for (i=0,row = 0; i<nslim_row; ++i) {
4350: indx = ridx[i];
4351: start_val = tns[indx];
4352: end_val = tns[indx + 1];
4353: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4354: }
4356: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4357: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4359: /* Construct permutations for columns */
4360: for (i=0,col=0; i<nslim_col; ++i) {
4361: indx = cidx[i];
4362: start_val = tns[indx];
4363: end_val = tns[indx + 1];
4364: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4365: }
4367: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4368: ISSetPermutation(*rperm);
4369: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4370: ISSetPermutation(*cperm);
4372: ISRestoreIndices(ris,&ridx);
4373: ISRestoreIndices(cis,&cidx);
4375: PetscFree(ns_col);
4376: PetscFree2(permr,permc);
4377: ISDestroy(&cis);
4378: ISDestroy(&ris);
4379: PetscFree(tns);
4380: return(0);
4381: }
4385: /*@C
4386: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4388: Not Collective
4390: Input Parameter:
4391: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4393: Output Parameter:
4394: + node_count - no of inodes present in the matrix.
4395: . sizes - an array of size node_count,with sizes of each inode.
4396: - limit - the max size used to generate the inodes.
4398: Level: advanced
4400: Notes: This routine returns some internal storage information
4401: of the matrix, it is intended to be used by advanced users.
4402: It should be called after the matrix is assembled.
4403: The contents of the sizes[] array should not be changed.
4404: NULL may be passed for information not requested.
4406: .keywords: matrix, seqaij, get, inode
4408: .seealso: MatGetInfo()
4409: @*/
4410: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4411: {
4412: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4415: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4416: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4417: if (f) {
4418: (*f)(A,node_count,sizes,limit);
4419: }
4420: return(0);
4421: }
4425: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4426: {
4427: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4430: if (node_count) *node_count = a->inode.node_count;
4431: if (sizes) *sizes = a->inode.size;
4432: if (limit) *limit = a->inode.limit;
4433: return(0);
4434: }