Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap.n;
19: m = A->rmap.n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap.n;
72: n = A->cmap.n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap.n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
268: return(0);
269: }
271: /* ----------------------------------------------------------- */
275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
279: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
283: nslim_row = a->inode.node_count;
284: n = A->cmap.n;
286: /* Create The column_inode for this matrix */
287: Mat_CreateColInode(A,&nslim_col,&ns_col);
288:
289: /* allocate space for reformated column_inode structure */
290: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
294: for (i1=0,col=0; i1<nslim_col; ++i1){
295: nsz = ns_col[i1];
296: for (i2=0; i2<nsz; ++i2,++col)
297: tvc[col] = i1;
298: }
299: /* allocate space for column pointers */
300: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301: *iia = ia;
302: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
305: /* determine the number of columns in each row */
306: ia[0] = oshift;
307: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row+1] - ai[row];
312: while (nz-- > 0) { /* off-diagonal elemets */
313: /* ia[i1+1]++; */
314: ia[i2+1]++;
315: i2++;
316: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
321: /* shift ia[i] to point to next col */
322: for (i1=1; i1<nslim_col+1; i1++) {
323: col = ia[i1-1];
324: ia[i1] += col;
325: work[i1-1] = col - oshift;
326: }
328: /* allocate space for column pointers */
329: nz = ia[nslim_col] + (!ishift);
330: PetscMalloc(nz*sizeof(PetscInt),&ja);
331: *jja = ja;
333: /* loop over matrix putting into ja */
334: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335: j = aj + ai[row] + ishift;
336: i2 = 0; /* Col inode index */
337: col = *j++ + ishift;
338: i2 = tvc[col];
339: nz = ai[row+1] - ai[row];
340: while (nz-- > 0) {
341: /* ja[work[i1]++] = i2 + oshift; */
342: ja[work[i2]++] = i1 + oshift;
343: i2++;
344: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345: if (nz > 0) i2 = tvc[col];
346: }
347: }
348: PetscFree(ns_col);
349: PetscFree(work);
350: PetscFree(tns);
351: PetscFree(tvc);
352: return(0);
353: }
357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {
362: Mat_CreateColInode(A,n,PETSC_NULL);
363: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: if (!blockcompressed) {
385: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386: } else {
387: PetscFree(*ia);
388: PetscFree(*ja);
389: }
390: return(0);
391: }
393: /* ----------------------------------------------------------- */
397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
400: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
403: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
404:
405: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407: #endif
410: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411: node_max = a->inode.node_count;
412: ns = a->inode.size; /* Node Size array */
413: VecGetArray(xx,&x);
414: VecGetArray(yy,&y);
415: idx = a->j;
416: v1 = a->a;
417: ii = a->i;
419: for (i = 0,row = 0; i< node_max; ++i){
420: nsz = ns[i];
421: n = ii[1] - ii[0];
422: nonzerorow += (n>0)*nsz;
423: ii += nsz;
424: sz = n; /* No of non zeros in this row */
425: /* Switch on the size of Node */
426: switch (nsz){ /* Each loop in 'case' is unrolled */
427: case 1 :
428: sum1 = 0;
429:
430: for(n = 0; n< sz-1; n+=2) {
431: i1 = idx[0]; /* The instructions are ordered to */
432: i2 = idx[1]; /* make the compiler's job easy */
433: idx += 2;
434: tmp0 = x[i1];
435: tmp1 = x[i2];
436: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
437: }
438:
439: if (n == sz-1){ /* Take care of the last nonzero */
440: tmp0 = x[*idx++];
441: sum1 += *v1++ * tmp0;
442: }
443: y[row++]=sum1;
444: break;
445: case 2:
446: sum1 = 0;
447: sum2 = 0;
448: v2 = v1 + n;
449:
450: for (n = 0; n< sz-1; n+=2) {
451: i1 = idx[0];
452: i2 = idx[1];
453: idx += 2;
454: tmp0 = x[i1];
455: tmp1 = x[i2];
456: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
457: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
458: }
459: if (n == sz-1){
460: tmp0 = x[*idx++];
461: sum1 += *v1++ * tmp0;
462: sum2 += *v2++ * tmp0;
463: }
464: y[row++]=sum1;
465: y[row++]=sum2;
466: v1 =v2; /* Since the next block to be processed starts there*/
467: idx +=sz;
468: break;
469: case 3:
470: sum1 = 0;
471: sum2 = 0;
472: sum3 = 0;
473: v2 = v1 + n;
474: v3 = v2 + n;
475:
476: for (n = 0; n< sz-1; n+=2) {
477: i1 = idx[0];
478: i2 = idx[1];
479: idx += 2;
480: tmp0 = x[i1];
481: tmp1 = x[i2];
482: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
483: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
484: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
485: }
486: if (n == sz-1){
487: tmp0 = x[*idx++];
488: sum1 += *v1++ * tmp0;
489: sum2 += *v2++ * tmp0;
490: sum3 += *v3++ * tmp0;
491: }
492: y[row++]=sum1;
493: y[row++]=sum2;
494: y[row++]=sum3;
495: v1 =v3; /* Since the next block to be processed starts there*/
496: idx +=2*sz;
497: break;
498: case 4:
499: sum1 = 0;
500: sum2 = 0;
501: sum3 = 0;
502: sum4 = 0;
503: v2 = v1 + n;
504: v3 = v2 + n;
505: v4 = v3 + n;
506:
507: for (n = 0; n< sz-1; n+=2) {
508: i1 = idx[0];
509: i2 = idx[1];
510: idx += 2;
511: tmp0 = x[i1];
512: tmp1 = x[i2];
513: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
514: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
515: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
516: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
517: }
518: if (n == sz-1){
519: tmp0 = x[*idx++];
520: sum1 += *v1++ * tmp0;
521: sum2 += *v2++ * tmp0;
522: sum3 += *v3++ * tmp0;
523: sum4 += *v4++ * tmp0;
524: }
525: y[row++]=sum1;
526: y[row++]=sum2;
527: y[row++]=sum3;
528: y[row++]=sum4;
529: v1 =v4; /* Since the next block to be processed starts there*/
530: idx +=3*sz;
531: break;
532: case 5:
533: sum1 = 0;
534: sum2 = 0;
535: sum3 = 0;
536: sum4 = 0;
537: sum5 = 0;
538: v2 = v1 + n;
539: v3 = v2 + n;
540: v4 = v3 + n;
541: v5 = v4 + n;
542:
543: for (n = 0; n<sz-1; n+=2) {
544: i1 = idx[0];
545: i2 = idx[1];
546: idx += 2;
547: tmp0 = x[i1];
548: tmp1 = x[i2];
549: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
550: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
551: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
552: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
553: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
554: }
555: if (n == sz-1){
556: tmp0 = x[*idx++];
557: sum1 += *v1++ * tmp0;
558: sum2 += *v2++ * tmp0;
559: sum3 += *v3++ * tmp0;
560: sum4 += *v4++ * tmp0;
561: sum5 += *v5++ * tmp0;
562: }
563: y[row++]=sum1;
564: y[row++]=sum2;
565: y[row++]=sum3;
566: y[row++]=sum4;
567: y[row++]=sum5;
568: v1 =v5; /* Since the next block to be processed starts there */
569: idx +=4*sz;
570: break;
571: default :
572: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
573: }
574: }
575: VecRestoreArray(xx,&x);
576: VecRestoreArray(yy,&y);
577: PetscLogFlops(2*a->nz - nonzerorow);
578: return(0);
579: }
580: /* ----------------------------------------------------------- */
581: /* Almost same code as the MatMult_Inode() */
584: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
585: {
586: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
587: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
588: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
590: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
591:
593: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
594: node_max = a->inode.node_count;
595: ns = a->inode.size; /* Node Size array */
596: VecGetArray(xx,&x);
597: VecGetArray(yy,&y);
598: if (zz != yy) {
599: VecGetArray(zz,&z);
600: } else {
601: z = y;
602: }
603: zt = z;
605: idx = a->j;
606: v1 = a->a;
607: ii = a->i;
609: for (i = 0,row = 0; i< node_max; ++i){
610: nsz = ns[i];
611: n = ii[1] - ii[0];
612: ii += nsz;
613: sz = n; /* No of non zeros in this row */
614: /* Switch on the size of Node */
615: switch (nsz){ /* Each loop in 'case' is unrolled */
616: case 1 :
617: sum1 = *zt++;
618:
619: for(n = 0; n< sz-1; n+=2) {
620: i1 = idx[0]; /* The instructions are ordered to */
621: i2 = idx[1]; /* make the compiler's job easy */
622: idx += 2;
623: tmp0 = x[i1];
624: tmp1 = x[i2];
625: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626: }
627:
628: if(n == sz-1){ /* Take care of the last nonzero */
629: tmp0 = x[*idx++];
630: sum1 += *v1++ * tmp0;
631: }
632: y[row++]=sum1;
633: break;
634: case 2:
635: sum1 = *zt++;
636: sum2 = *zt++;
637: v2 = v1 + n;
638:
639: for(n = 0; n< sz-1; n+=2) {
640: i1 = idx[0];
641: i2 = idx[1];
642: idx += 2;
643: tmp0 = x[i1];
644: tmp1 = x[i2];
645: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
646: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
647: }
648: if(n == sz-1){
649: tmp0 = x[*idx++];
650: sum1 += *v1++ * tmp0;
651: sum2 += *v2++ * tmp0;
652: }
653: y[row++]=sum1;
654: y[row++]=sum2;
655: v1 =v2; /* Since the next block to be processed starts there*/
656: idx +=sz;
657: break;
658: case 3:
659: sum1 = *zt++;
660: sum2 = *zt++;
661: sum3 = *zt++;
662: v2 = v1 + n;
663: v3 = v2 + n;
664:
665: for (n = 0; n< sz-1; n+=2) {
666: i1 = idx[0];
667: i2 = idx[1];
668: idx += 2;
669: tmp0 = x[i1];
670: tmp1 = x[i2];
671: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
672: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
673: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
674: }
675: if (n == sz-1){
676: tmp0 = x[*idx++];
677: sum1 += *v1++ * tmp0;
678: sum2 += *v2++ * tmp0;
679: sum3 += *v3++ * tmp0;
680: }
681: y[row++]=sum1;
682: y[row++]=sum2;
683: y[row++]=sum3;
684: v1 =v3; /* Since the next block to be processed starts there*/
685: idx +=2*sz;
686: break;
687: case 4:
688: sum1 = *zt++;
689: sum2 = *zt++;
690: sum3 = *zt++;
691: sum4 = *zt++;
692: v2 = v1 + n;
693: v3 = v2 + n;
694: v4 = v3 + n;
695:
696: for (n = 0; n< sz-1; n+=2) {
697: i1 = idx[0];
698: i2 = idx[1];
699: idx += 2;
700: tmp0 = x[i1];
701: tmp1 = x[i2];
702: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
703: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
704: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
705: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
706: }
707: if (n == sz-1){
708: tmp0 = x[*idx++];
709: sum1 += *v1++ * tmp0;
710: sum2 += *v2++ * tmp0;
711: sum3 += *v3++ * tmp0;
712: sum4 += *v4++ * tmp0;
713: }
714: y[row++]=sum1;
715: y[row++]=sum2;
716: y[row++]=sum3;
717: y[row++]=sum4;
718: v1 =v4; /* Since the next block to be processed starts there*/
719: idx +=3*sz;
720: break;
721: case 5:
722: sum1 = *zt++;
723: sum2 = *zt++;
724: sum3 = *zt++;
725: sum4 = *zt++;
726: sum5 = *zt++;
727: v2 = v1 + n;
728: v3 = v2 + n;
729: v4 = v3 + n;
730: v5 = v4 + n;
731:
732: for (n = 0; n<sz-1; n+=2) {
733: i1 = idx[0];
734: i2 = idx[1];
735: idx += 2;
736: tmp0 = x[i1];
737: tmp1 = x[i2];
738: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
739: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
740: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
741: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
742: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
743: }
744: if(n == sz-1){
745: tmp0 = x[*idx++];
746: sum1 += *v1++ * tmp0;
747: sum2 += *v2++ * tmp0;
748: sum3 += *v3++ * tmp0;
749: sum4 += *v4++ * tmp0;
750: sum5 += *v5++ * tmp0;
751: }
752: y[row++]=sum1;
753: y[row++]=sum2;
754: y[row++]=sum3;
755: y[row++]=sum4;
756: y[row++]=sum5;
757: v1 =v5; /* Since the next block to be processed starts there */
758: idx +=4*sz;
759: break;
760: default :
761: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
762: }
763: }
764: VecRestoreArray(xx,&x);
765: VecRestoreArray(yy,&y);
766: if (zz != yy) {
767: VecRestoreArray(zz,&z);
768: }
769: PetscLogFlops(2*a->nz);
770: return(0);
771: }
773: /* ----------------------------------------------------------- */
776: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
777: {
778: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
779: IS iscol = a->col,isrow = a->row;
781: PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
782: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
783: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
784: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
787: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
788: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
789: node_max = a->inode.node_count;
790: ns = a->inode.size; /* Node Size array */
792: VecGetArray(bb,&b);
793: VecGetArray(xx,&x);
794: tmp = a->solve_work;
795:
796: ISGetIndices(isrow,&rout); r = rout;
797: ISGetIndices(iscol,&cout); c = cout + (n-1);
798:
799: /* forward solve the lower triangular */
800: tmps = tmp ;
801: aa = a_a ;
802: aj = a_j ;
803: ad = a->diag;
805: for (i = 0,row = 0; i< node_max; ++i){
806: nsz = ns[i];
807: aii = ai[row];
808: v1 = aa + aii;
809: vi = aj + aii;
810: nz = ad[row]- aii;
811:
812: switch (nsz){ /* Each loop in 'case' is unrolled */
813: case 1 :
814: sum1 = b[*r++];
815: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
816: for(j=0; j<nz-1; j+=2){
817: i0 = vi[0];
818: i1 = vi[1];
819: vi +=2;
820: tmp0 = tmps[i0];
821: tmp1 = tmps[i1];
822: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823: }
824: if(j == nz-1){
825: tmp0 = tmps[*vi++];
826: sum1 -= *v1++ *tmp0;
827: }
828: tmp[row ++]=sum1;
829: break;
830: case 2:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: v2 = aa + ai[row+1];
835: for(j=0; j<nz-1; j+=2){
836: i0 = vi[0];
837: i1 = vi[1];
838: vi +=2;
839: tmp0 = tmps[i0];
840: tmp1 = tmps[i1];
841: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
842: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
843: }
844: if(j == nz-1){
845: tmp0 = tmps[*vi++];
846: sum1 -= *v1++ *tmp0;
847: sum2 -= *v2++ *tmp0;
848: }
849: sum2 -= *v2++ * sum1;
850: tmp[row ++]=sum1;
851: tmp[row ++]=sum2;
852: break;
853: case 3:
854: sum1 = b[*r++];
855: sum2 = b[*r++];
856: sum3 = b[*r++];
857: v2 = aa + ai[row+1];
858: v3 = aa + ai[row+2];
859:
860: for (j=0; j<nz-1; j+=2){
861: i0 = vi[0];
862: i1 = vi[1];
863: vi +=2;
864: tmp0 = tmps[i0];
865: tmp1 = tmps[i1];
866: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
867: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
868: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
869: }
870: if (j == nz-1){
871: tmp0 = tmps[*vi++];
872: sum1 -= *v1++ *tmp0;
873: sum2 -= *v2++ *tmp0;
874: sum3 -= *v3++ *tmp0;
875: }
876: sum2 -= *v2++ * sum1;
877: sum3 -= *v3++ * sum1;
878: sum3 -= *v3++ * sum2;
879: tmp[row ++]=sum1;
880: tmp[row ++]=sum2;
881: tmp[row ++]=sum3;
882: break;
883:
884: case 4:
885: sum1 = b[*r++];
886: sum2 = b[*r++];
887: sum3 = b[*r++];
888: sum4 = b[*r++];
889: v2 = aa + ai[row+1];
890: v3 = aa + ai[row+2];
891: v4 = aa + ai[row+3];
892:
893: for (j=0; j<nz-1; j+=2){
894: i0 = vi[0];
895: i1 = vi[1];
896: vi +=2;
897: tmp0 = tmps[i0];
898: tmp1 = tmps[i1];
899: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
900: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
901: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
902: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
903: }
904: if (j == nz-1){
905: tmp0 = tmps[*vi++];
906: sum1 -= *v1++ *tmp0;
907: sum2 -= *v2++ *tmp0;
908: sum3 -= *v3++ *tmp0;
909: sum4 -= *v4++ *tmp0;
910: }
911: sum2 -= *v2++ * sum1;
912: sum3 -= *v3++ * sum1;
913: sum4 -= *v4++ * sum1;
914: sum3 -= *v3++ * sum2;
915: sum4 -= *v4++ * sum2;
916: sum4 -= *v4++ * sum3;
917:
918: tmp[row ++]=sum1;
919: tmp[row ++]=sum2;
920: tmp[row ++]=sum3;
921: tmp[row ++]=sum4;
922: break;
923: case 5:
924: sum1 = b[*r++];
925: sum2 = b[*r++];
926: sum3 = b[*r++];
927: sum4 = b[*r++];
928: sum5 = b[*r++];
929: v2 = aa + ai[row+1];
930: v3 = aa + ai[row+2];
931: v4 = aa + ai[row+3];
932: v5 = aa + ai[row+4];
933:
934: for (j=0; j<nz-1; j+=2){
935: i0 = vi[0];
936: i1 = vi[1];
937: vi +=2;
938: tmp0 = tmps[i0];
939: tmp1 = tmps[i1];
940: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
941: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
942: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
943: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
944: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
945: }
946: if (j == nz-1){
947: tmp0 = tmps[*vi++];
948: sum1 -= *v1++ *tmp0;
949: sum2 -= *v2++ *tmp0;
950: sum3 -= *v3++ *tmp0;
951: sum4 -= *v4++ *tmp0;
952: sum5 -= *v5++ *tmp0;
953: }
955: sum2 -= *v2++ * sum1;
956: sum3 -= *v3++ * sum1;
957: sum4 -= *v4++ * sum1;
958: sum5 -= *v5++ * sum1;
959: sum3 -= *v3++ * sum2;
960: sum4 -= *v4++ * sum2;
961: sum5 -= *v5++ * sum2;
962: sum4 -= *v4++ * sum3;
963: sum5 -= *v5++ * sum3;
964: sum5 -= *v5++ * sum4;
965:
966: tmp[row ++]=sum1;
967: tmp[row ++]=sum2;
968: tmp[row ++]=sum3;
969: tmp[row ++]=sum4;
970: tmp[row ++]=sum5;
971: break;
972: default:
973: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
974: }
975: }
976: /* backward solve the upper triangular */
977: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
978: nsz = ns[i];
979: aii = ai[row+1] -1;
980: v1 = aa + aii;
981: vi = aj + aii;
982: nz = aii- ad[row];
983: switch (nsz){ /* Each loop in 'case' is unrolled */
984: case 1 :
985: sum1 = tmp[row];
987: for(j=nz ; j>1; j-=2){
988: vi -=2;
989: i0 = vi[2];
990: i1 = vi[1];
991: tmp0 = tmps[i0];
992: tmp1 = tmps[i1];
993: v1 -= 2;
994: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
995: }
996: if (j==1){
997: tmp0 = tmps[*vi--];
998: sum1 -= *v1-- * tmp0;
999: }
1000: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1001: break;
1002: case 2 :
1003: sum1 = tmp[row];
1004: sum2 = tmp[row -1];
1005: v2 = aa + ai[row]-1;
1006: for (j=nz ; j>1; j-=2){
1007: vi -=2;
1008: i0 = vi[2];
1009: i1 = vi[1];
1010: tmp0 = tmps[i0];
1011: tmp1 = tmps[i1];
1012: v1 -= 2;
1013: v2 -= 2;
1014: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1015: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1016: }
1017: if (j==1){
1018: tmp0 = tmps[*vi--];
1019: sum1 -= *v1-- * tmp0;
1020: sum2 -= *v2-- * tmp0;
1021: }
1022:
1023: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1024: sum2 -= *v2-- * tmp0;
1025: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1026: break;
1027: case 3 :
1028: sum1 = tmp[row];
1029: sum2 = tmp[row -1];
1030: sum3 = tmp[row -2];
1031: v2 = aa + ai[row]-1;
1032: v3 = aa + ai[row -1]-1;
1033: for (j=nz ; j>1; j-=2){
1034: vi -=2;
1035: i0 = vi[2];
1036: i1 = vi[1];
1037: tmp0 = tmps[i0];
1038: tmp1 = tmps[i1];
1039: v1 -= 2;
1040: v2 -= 2;
1041: v3 -= 2;
1042: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1043: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1044: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1045: }
1046: if (j==1){
1047: tmp0 = tmps[*vi--];
1048: sum1 -= *v1-- * tmp0;
1049: sum2 -= *v2-- * tmp0;
1050: sum3 -= *v3-- * tmp0;
1051: }
1052: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1053: sum2 -= *v2-- * tmp0;
1054: sum3 -= *v3-- * tmp0;
1055: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1056: sum3 -= *v3-- * tmp0;
1057: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1058:
1059: break;
1060: case 4 :
1061: sum1 = tmp[row];
1062: sum2 = tmp[row -1];
1063: sum3 = tmp[row -2];
1064: sum4 = tmp[row -3];
1065: v2 = aa + ai[row]-1;
1066: v3 = aa + ai[row -1]-1;
1067: v4 = aa + ai[row -2]-1;
1069: for (j=nz ; j>1; j-=2){
1070: vi -=2;
1071: i0 = vi[2];
1072: i1 = vi[1];
1073: tmp0 = tmps[i0];
1074: tmp1 = tmps[i1];
1075: v1 -= 2;
1076: v2 -= 2;
1077: v3 -= 2;
1078: v4 -= 2;
1079: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1080: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1081: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1082: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1083: }
1084: if (j==1){
1085: tmp0 = tmps[*vi--];
1086: sum1 -= *v1-- * tmp0;
1087: sum2 -= *v2-- * tmp0;
1088: sum3 -= *v3-- * tmp0;
1089: sum4 -= *v4-- * tmp0;
1090: }
1092: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1093: sum2 -= *v2-- * tmp0;
1094: sum3 -= *v3-- * tmp0;
1095: sum4 -= *v4-- * tmp0;
1096: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1097: sum3 -= *v3-- * tmp0;
1098: sum4 -= *v4-- * tmp0;
1099: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1100: sum4 -= *v4-- * tmp0;
1101: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1102: break;
1103: case 5 :
1104: sum1 = tmp[row];
1105: sum2 = tmp[row -1];
1106: sum3 = tmp[row -2];
1107: sum4 = tmp[row -3];
1108: sum5 = tmp[row -4];
1109: v2 = aa + ai[row]-1;
1110: v3 = aa + ai[row -1]-1;
1111: v4 = aa + ai[row -2]-1;
1112: v5 = aa + ai[row -3]-1;
1113: for (j=nz ; j>1; j-=2){
1114: vi -= 2;
1115: i0 = vi[2];
1116: i1 = vi[1];
1117: tmp0 = tmps[i0];
1118: tmp1 = tmps[i1];
1119: v1 -= 2;
1120: v2 -= 2;
1121: v3 -= 2;
1122: v4 -= 2;
1123: v5 -= 2;
1124: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1125: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1126: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1127: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1128: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1129: }
1130: if (j==1){
1131: tmp0 = tmps[*vi--];
1132: sum1 -= *v1-- * tmp0;
1133: sum2 -= *v2-- * tmp0;
1134: sum3 -= *v3-- * tmp0;
1135: sum4 -= *v4-- * tmp0;
1136: sum5 -= *v5-- * tmp0;
1137: }
1139: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1140: sum2 -= *v2-- * tmp0;
1141: sum3 -= *v3-- * tmp0;
1142: sum4 -= *v4-- * tmp0;
1143: sum5 -= *v5-- * tmp0;
1144: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1145: sum3 -= *v3-- * tmp0;
1146: sum4 -= *v4-- * tmp0;
1147: sum5 -= *v5-- * tmp0;
1148: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1149: sum4 -= *v4-- * tmp0;
1150: sum5 -= *v5-- * tmp0;
1151: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1152: sum5 -= *v5-- * tmp0;
1153: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1154: break;
1155: default:
1156: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1157: }
1158: }
1159: ISRestoreIndices(isrow,&rout);
1160: ISRestoreIndices(iscol,&cout);
1161: VecRestoreArray(bb,&b);
1162: VecRestoreArray(xx,&x);
1163: PetscLogFlops(2*a->nz - A->cmap.n);
1164: return(0);
1165: }
1169: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1170: {
1171: Mat C = *B;
1172: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1173: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1175: PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1176: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1177: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1178: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1179: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1180: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv;
1181: PetscReal rs=0.0;
1182: LUShift_Ctx sctx;
1183: PetscInt newshift;
1186: sctx.shift_top = 0;
1187: sctx.nshift_max = 0;
1188: sctx.shift_lo = 0;
1189: sctx.shift_hi = 0;
1191: /* if both shift schemes are chosen by user, only use info->shiftpd */
1192: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1193: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1194: sctx.shift_top = 0;
1195: for (i=0; i<n; i++) {
1196: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1197: rs = 0.0;
1198: ajtmp = aj + ai[i];
1199: rtmp1 = aa + ai[i];
1200: nz = ai[i+1] - ai[i];
1201: for (j=0; j<nz; j++){
1202: if (*ajtmp != i){
1203: rs += PetscAbsScalar(*rtmp1++);
1204: } else {
1205: rs -= PetscRealPart(*rtmp1++);
1206: }
1207: ajtmp++;
1208: }
1209: if (rs>sctx.shift_top) sctx.shift_top = rs;
1210: }
1211: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1212: sctx.shift_top *= 1.1;
1213: sctx.nshift_max = 5;
1214: sctx.shift_lo = 0.;
1215: sctx.shift_hi = 1.;
1216: }
1217: sctx.shift_amount = 0;
1218: sctx.nshift = 0;
1220: ISGetIndices(isrow,&r);
1221: ISGetIndices(iscol,&c);
1222: ISGetIndices(isicol,&ic);
1223: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1224: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1225: ics = ic ;
1226: rtmp2 = rtmp1 + n;
1227: rtmp3 = rtmp2 + n;
1228:
1229: node_max = a->inode.node_count;
1230: ns = a->inode.size ;
1231: if (!ns){
1232: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1233: }
1235: /* If max inode size > 3, split it into two inodes.*/
1236: /* also map the inode sizes according to the ordering */
1237: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1238: for (i=0,j=0; i<node_max; ++i,++j){
1239: if (ns[i]>3) {
1240: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1241: ++j;
1242: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1243: } else {
1244: tmp_vec1[j] = ns[i];
1245: }
1246: }
1247: /* Use the correct node_max */
1248: node_max = j;
1250: /* Now reorder the inode info based on mat re-ordering info */
1251: /* First create a row -> inode_size_array_index map */
1252: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1253: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1254: for (i=0,row=0; i<node_max; i++) {
1255: nodesz = tmp_vec1[i];
1256: for (j=0; j<nodesz; j++,row++) {
1257: nsmap[row] = i;
1258: }
1259: }
1260: /* Using nsmap, create a reordered ns structure */
1261: for (i=0,j=0; i< node_max; i++) {
1262: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1263: tmp_vec2[i] = nodesz;
1264: j += nodesz;
1265: }
1266: PetscFree(nsmap);
1267: PetscFree(tmp_vec1);
1268: /* Now use the correct ns */
1269: ns = tmp_vec2;
1271: do {
1272: sctx.lushift = PETSC_FALSE;
1273: /* Now loop over each block-row, and do the factorization */
1274: for (i=0,row=0; i<node_max; i++) {
1275: nodesz = ns[i];
1276: nz = bi[row+1] - bi[row];
1277: bjtmp = bj + bi[row];
1279: switch (nodesz){
1280: case 1:
1281: for (j=0; j<nz; j++){
1282: idx = bjtmp[j];
1283: rtmp1[idx] = 0.0;
1284: }
1285:
1286: /* load in initial (unfactored row) */
1287: idx = r[row];
1288: nz_tmp = ai[idx+1] - ai[idx];
1289: ajtmp = aj + ai[idx];
1290: v1 = aa + ai[idx];
1292: for (j=0; j<nz_tmp; j++) {
1293: idx = ics[ajtmp[j]];
1294: rtmp1[idx] = v1[j];
1295: }
1296: rtmp1[ics[r[row]]] += sctx.shift_amount;
1298: prow = *bjtmp++ ;
1299: while (prow < row) {
1300: pc1 = rtmp1 + prow;
1301: if (*pc1 != 0.0){
1302: pv = ba + bd[prow];
1303: pj = nbj + bd[prow];
1304: mul1 = *pc1 * *pv++;
1305: *pc1 = mul1;
1306: nz_tmp = bi[prow+1] - bd[prow] - 1;
1307: PetscLogFlops(2*nz_tmp);
1308: for (j=0; j<nz_tmp; j++) {
1309: tmp = pv[j];
1310: idx = pj[j];
1311: rtmp1[idx] -= mul1 * tmp;
1312: }
1313: }
1314: prow = *bjtmp++ ;
1315: }
1316: pj = bj + bi[row];
1317: pc1 = ba + bi[row];
1319: sctx.pv = rtmp1[row];
1320: rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1321: rs = 0.0;
1322: for (j=0; j<nz; j++) {
1323: idx = pj[j];
1324: pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1325: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1326: }
1327: sctx.rs = rs;
1328: MatLUCheckShift_inline(info,sctx,row,newshift);
1329: if (newshift == 1) goto endofwhile;
1330: break;
1331:
1332: case 2:
1333: for (j=0; j<nz; j++) {
1334: idx = bjtmp[j];
1335: rtmp1[idx] = 0.0;
1336: rtmp2[idx] = 0.0;
1337: }
1338:
1339: /* load in initial (unfactored row) */
1340: idx = r[row];
1341: nz_tmp = ai[idx+1] - ai[idx];
1342: ajtmp = aj + ai[idx];
1343: v1 = aa + ai[idx];
1344: v2 = aa + ai[idx+1];
1345: for (j=0; j<nz_tmp; j++) {
1346: idx = ics[ajtmp[j]];
1347: rtmp1[idx] = v1[j];
1348: rtmp2[idx] = v2[j];
1349: }
1350: rtmp1[ics[r[row]]] += sctx.shift_amount;
1351: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1353: prow = *bjtmp++ ;
1354: while (prow < row) {
1355: pc1 = rtmp1 + prow;
1356: pc2 = rtmp2 + prow;
1357: if (*pc1 != 0.0 || *pc2 != 0.0){
1358: pv = ba + bd[prow];
1359: pj = nbj + bd[prow];
1360: mul1 = *pc1 * *pv;
1361: mul2 = *pc2 * *pv;
1362: ++pv;
1363: *pc1 = mul1;
1364: *pc2 = mul2;
1365:
1366: nz_tmp = bi[prow+1] - bd[prow] - 1;
1367: for (j=0; j<nz_tmp; j++) {
1368: tmp = pv[j];
1369: idx = pj[j];
1370: rtmp1[idx] -= mul1 * tmp;
1371: rtmp2[idx] -= mul2 * tmp;
1372: }
1373: PetscLogFlops(4*nz_tmp);
1374: }
1375: prow = *bjtmp++ ;
1376: }
1378: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1379: pc1 = rtmp1 + prow;
1380: pc2 = rtmp2 + prow;
1382: sctx.pv = *pc1;
1383: pj = bj + bi[prow];
1384: rs = 0.0;
1385: for (j=0; j<nz; j++){
1386: idx = pj[j];
1387: if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1388: }
1389: sctx.rs = rs;
1390: MatLUCheckShift_inline(info,sctx,row,newshift);
1391: if (newshift == 1) goto endofwhile;
1393: if (*pc2 != 0.0){
1394: pj = nbj + bd[prow];
1395: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1396: *pc2 = mul2;
1397: nz_tmp = bi[prow+1] - bd[prow] - 1;
1398: for (j=0; j<nz_tmp; j++) {
1399: idx = pj[j] ;
1400: tmp = rtmp1[idx];
1401: rtmp2[idx] -= mul2 * tmp;
1402: }
1403: PetscLogFlops(2*nz_tmp);
1404: }
1405:
1406: pj = bj + bi[row];
1407: pc1 = ba + bi[row];
1408: pc2 = ba + bi[row+1];
1410: sctx.pv = rtmp2[row+1];
1411: rs = 0.0;
1412: rtmp1[row] = 1.0/rtmp1[row];
1413: rtmp2[row+1] = 1.0/rtmp2[row+1];
1414: /* copy row entries from dense representation to sparse */
1415: for (j=0; j<nz; j++) {
1416: idx = pj[j];
1417: pc1[j] = rtmp1[idx];
1418: pc2[j] = rtmp2[idx];
1419: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1420: }
1421: sctx.rs = rs;
1422: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1423: if (newshift == 1) goto endofwhile;
1424: break;
1426: case 3:
1427: for (j=0; j<nz; j++) {
1428: idx = bjtmp[j];
1429: rtmp1[idx] = 0.0;
1430: rtmp2[idx] = 0.0;
1431: rtmp3[idx] = 0.0;
1432: }
1433: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1434: idx = r[row];
1435: nz_tmp = ai[idx+1] - ai[idx];
1436: ajtmp = aj + ai[idx];
1437: v1 = aa + ai[idx];
1438: v2 = aa + ai[idx+1];
1439: v3 = aa + ai[idx+2];
1440: for (j=0; j<nz_tmp; j++) {
1441: idx = ics[ajtmp[j]];
1442: rtmp1[idx] = v1[j];
1443: rtmp2[idx] = v2[j];
1444: rtmp3[idx] = v3[j];
1445: }
1446: rtmp1[ics[r[row]]] += sctx.shift_amount;
1447: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1448: rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1450: /* loop over all pivot row blocks above this row block */
1451: prow = *bjtmp++ ;
1452: while (prow < row) {
1453: pc1 = rtmp1 + prow;
1454: pc2 = rtmp2 + prow;
1455: pc3 = rtmp3 + prow;
1456: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1457: pv = ba + bd[prow];
1458: pj = nbj + bd[prow];
1459: mul1 = *pc1 * *pv;
1460: mul2 = *pc2 * *pv;
1461: mul3 = *pc3 * *pv;
1462: ++pv;
1463: *pc1 = mul1;
1464: *pc2 = mul2;
1465: *pc3 = mul3;
1466:
1467: nz_tmp = bi[prow+1] - bd[prow] - 1;
1468: /* update this row based on pivot row */
1469: for (j=0; j<nz_tmp; j++) {
1470: tmp = pv[j];
1471: idx = pj[j];
1472: rtmp1[idx] -= mul1 * tmp;
1473: rtmp2[idx] -= mul2 * tmp;
1474: rtmp3[idx] -= mul3 * tmp;
1475: }
1476: PetscLogFlops(6*nz_tmp);
1477: }
1478: prow = *bjtmp++ ;
1479: }
1481: /* Now take care of diagonal 3x3 block in this set of rows */
1482: /* note: prow = row here */
1483: pc1 = rtmp1 + prow;
1484: pc2 = rtmp2 + prow;
1485: pc3 = rtmp3 + prow;
1487: sctx.pv = *pc1;
1488: pj = bj + bi[prow];
1489: rs = 0.0;
1490: for (j=0; j<nz; j++){
1491: idx = pj[j];
1492: if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1493: }
1494: sctx.rs = rs;
1495: MatLUCheckShift_inline(info,sctx,row,newshift);
1496: if (newshift == 1) goto endofwhile;
1498: if (*pc2 != 0.0 || *pc3 != 0.0){
1499: mul2 = (*pc2)/(*pc1);
1500: mul3 = (*pc3)/(*pc1);
1501: *pc2 = mul2;
1502: *pc3 = mul3;
1503: nz_tmp = bi[prow+1] - bd[prow] - 1;
1504: pj = nbj + bd[prow];
1505: for (j=0; j<nz_tmp; j++) {
1506: idx = pj[j] ;
1507: tmp = rtmp1[idx];
1508: rtmp2[idx] -= mul2 * tmp;
1509: rtmp3[idx] -= mul3 * tmp;
1510: }
1511: PetscLogFlops(4*nz_tmp);
1512: }
1513: ++prow;
1515: pc2 = rtmp2 + prow;
1516: pc3 = rtmp3 + prow;
1517: sctx.pv = *pc2;
1518: pj = bj + bi[prow];
1519: rs = 0.0;
1520: for (j=0; j<nz; j++){
1521: idx = pj[j];
1522: if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1523: }
1524: sctx.rs = rs;
1525: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1526: if (newshift == 1) goto endofwhile;
1528: if (*pc3 != 0.0){
1529: mul3 = (*pc3)/(*pc2);
1530: *pc3 = mul3;
1531: pj = nbj + bd[prow];
1532: nz_tmp = bi[prow+1] - bd[prow] - 1;
1533: for (j=0; j<nz_tmp; j++) {
1534: idx = pj[j] ;
1535: tmp = rtmp2[idx];
1536: rtmp3[idx] -= mul3 * tmp;
1537: }
1538: PetscLogFlops(4*nz_tmp);
1539: }
1541: pj = bj + bi[row];
1542: pc1 = ba + bi[row];
1543: pc2 = ba + bi[row+1];
1544: pc3 = ba + bi[row+2];
1546: sctx.pv = rtmp3[row+2];
1547: rs = 0.0;
1548: rtmp1[row] = 1.0/rtmp1[row];
1549: rtmp2[row+1] = 1.0/rtmp2[row+1];
1550: rtmp3[row+2] = 1.0/rtmp3[row+2];
1551: /* copy row entries from dense representation to sparse */
1552: for (j=0; j<nz; j++) {
1553: idx = pj[j];
1554: pc1[j] = rtmp1[idx];
1555: pc2[j] = rtmp2[idx];
1556: pc3[j] = rtmp3[idx];
1557: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1558: }
1560: sctx.rs = rs;
1561: MatLUCheckShift_inline(info,sctx,row+2,newshift);
1562: if (newshift == 1) goto endofwhile;
1563: break;
1565: default:
1566: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1567: }
1568: row += nodesz; /* Update the row */
1569: }
1570: endofwhile:;
1571: } while (sctx.lushift);
1572: PetscFree(rtmp1);
1573: PetscFree(tmp_vec2);
1574: ISRestoreIndices(isicol,&ic);
1575: ISRestoreIndices(isrow,&r);
1576: ISRestoreIndices(iscol,&c);
1577: C->factor = FACTOR_LU;
1578: C->assembled = PETSC_TRUE;
1579: if (sctx.nshift) {
1580: if (info->shiftnz) {
1581: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1582: } else if (info->shiftpd) {
1583: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1584: }
1585: }
1586: PetscLogFlops(C->cmap.n);
1587: return(0);
1588: }
1590: /*
1591: Makes a longer coloring[] array and calls the usual code with that
1592: */
1595: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1596: {
1597: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
1598: PetscErrorCode ierr;
1599: PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1600: PetscInt *colorused,i;
1601: ISColoringValue *newcolor;
1604: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1605: /* loop over inodes, marking a color for each column*/
1606: row = 0;
1607: for (i=0; i<m; i++){
1608: for (j=0; j<ns[i]; j++) {
1609: newcolor[row++] = coloring[i] + j*ncolors;
1610: }
1611: }
1613: /* eliminate unneeded colors */
1614: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1615: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1616: for (i=0; i<n; i++) {
1617: colorused[newcolor[i]] = 1;
1618: }
1620: for (i=1; i<5*ncolors; i++) {
1621: colorused[i] += colorused[i-1];
1622: }
1623: ncolors = colorused[5*ncolors-1];
1624: for (i=0; i<n; i++) {
1625: newcolor[i] = colorused[newcolor[i]]-1;
1626: }
1627: PetscFree(colorused);
1628: ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
1629: PetscFree(coloring);
1630: return(0);
1631: }
1633: #include src/inline/ilu.h
1637: PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1638: {
1639: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1640: PetscScalar *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5;
1641: PetscScalar *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1642: PetscReal zeropivot = 1.0e-15;
1643: PetscErrorCode ierr;
1644: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1645: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k;
1648: if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1649: if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1650: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1652: if (!a->inode.ibdiagvalid) {
1653: if (!a->inode.ibdiag) {
1654: /* calculate space needed for diagonal blocks */
1655: for (i=0; i<m; i++) {
1656: cnt += sizes[i]*sizes[i];
1657: }
1658: a->inode.bdiagsize = cnt;
1659: PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);
1660: }
1662: /* copy over the diagonal blocks and invert them */
1663: ibdiag = a->inode.ibdiag;
1664: bdiag = a->inode.bdiag;
1665: cnt = 0;
1666: for (i=0, row = 0; i<m; i++) {
1667: for (j=0; j<sizes[i]; j++) {
1668: for (k=0; k<sizes[i]; k++) {
1669: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1670: }
1671: }
1672: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));
1673:
1674: switch(sizes[i]) {
1675: case 1:
1676: /* Create matrix data structure */
1677: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1678: ibdiag[cnt] = 1.0/ibdiag[cnt];
1679: break;
1680: case 2:
1681: Kernel_A_gets_inverse_A_2(ibdiag+cnt);
1682: break;
1683: case 3:
1684: Kernel_A_gets_inverse_A_3(ibdiag+cnt);
1685: break;
1686: case 4:
1687: Kernel_A_gets_inverse_A_4(ibdiag+cnt);
1688: break;
1689: case 5:
1690: Kernel_A_gets_inverse_A_5(ibdiag+cnt);
1691: break;
1692: default:
1693: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1694: }
1695: cnt += sizes[i]*sizes[i];
1696: row += sizes[i];
1697: }
1698: a->inode.ibdiagvalid = PETSC_TRUE;
1699: }
1700: ibdiag = a->inode.ibdiag;
1701: bdiag = a->inode.bdiag;
1703: VecGetArray(xx,&x);
1704: if (xx != bb) {
1705: VecGetArray(bb,(PetscScalar**)&b);
1706: } else {
1707: b = x;
1708: }
1710: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1711: xs = x;
1712: if (flag & SOR_ZERO_INITIAL_GUESS) {
1713: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1715: for (i=0, row=0; i<m; i++) {
1716: sz = diag[row] - ii[row];
1717: v1 = a->a + ii[row];
1718: idx = a->j + ii[row];
1720: /* see comments for MatMult_Inode() for how this is coded */
1721: switch (sizes[i]){
1722: case 1:
1723:
1724: sum1 = b[row];
1725: for(n = 0; n<sz-1; n+=2) {
1726: i1 = idx[0];
1727: i2 = idx[1];
1728: idx += 2;
1729: tmp0 = x[i1];
1730: tmp1 = x[i2];
1731: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1732: }
1733:
1734: if (n == sz-1){
1735: tmp0 = x[*idx];
1736: sum1 -= *v1 * tmp0;
1737: }
1738: x[row++] = sum1*(*ibdiag++);
1739: break;
1740: case 2:
1741: v2 = a->a + ii[row+1];
1742: sum1 = b[row];
1743: sum2 = b[row+1];
1744: for(n = 0; n<sz-1; n+=2) {
1745: i1 = idx[0];
1746: i2 = idx[1];
1747: idx += 2;
1748: tmp0 = x[i1];
1749: tmp1 = x[i2];
1750: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1751: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1752: }
1753:
1754: if (n == sz-1){
1755: tmp0 = x[*idx];
1756: sum1 -= v1[0] * tmp0;
1757: sum2 -= v2[0] * tmp0;
1758: }
1759: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1760: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1761: ibdiag += 4;
1762: break;
1763: case 3:
1764: v2 = a->a + ii[row+1];
1765: v3 = a->a + ii[row+2];
1766: sum1 = b[row];
1767: sum2 = b[row+1];
1768: sum3 = b[row+2];
1769: for(n = 0; n<sz-1; n+=2) {
1770: i1 = idx[0];
1771: i2 = idx[1];
1772: idx += 2;
1773: tmp0 = x[i1];
1774: tmp1 = x[i2];
1775: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1776: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1777: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1778: }
1779:
1780: if (n == sz-1){
1781: tmp0 = x[*idx];
1782: sum1 -= v1[0] * tmp0;
1783: sum2 -= v2[0] * tmp0;
1784: sum3 -= v3[0] * tmp0;
1785: }
1786: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1787: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1788: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1789: ibdiag += 9;
1790: break;
1791: case 4:
1792: v2 = a->a + ii[row+1];
1793: v3 = a->a + ii[row+2];
1794: v4 = a->a + ii[row+3];
1795: sum1 = b[row];
1796: sum2 = b[row+1];
1797: sum3 = b[row+2];
1798: sum4 = b[row+3];
1799: for(n = 0; n<sz-1; n+=2) {
1800: i1 = idx[0];
1801: i2 = idx[1];
1802: idx += 2;
1803: tmp0 = x[i1];
1804: tmp1 = x[i2];
1805: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1806: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1807: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1808: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1809: }
1810:
1811: if (n == sz-1){
1812: tmp0 = x[*idx];
1813: sum1 -= v1[0] * tmp0;
1814: sum2 -= v2[0] * tmp0;
1815: sum3 -= v3[0] * tmp0;
1816: sum4 -= v4[0] * tmp0;
1817: }
1818: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1819: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1820: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1821: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1822: ibdiag += 16;
1823: break;
1824: case 5:
1825: v2 = a->a + ii[row+1];
1826: v3 = a->a + ii[row+2];
1827: v4 = a->a + ii[row+3];
1828: v5 = a->a + ii[row+4];
1829: sum1 = b[row];
1830: sum2 = b[row+1];
1831: sum3 = b[row+2];
1832: sum4 = b[row+3];
1833: sum5 = b[row+4];
1834: for(n = 0; n<sz-1; n+=2) {
1835: i1 = idx[0];
1836: i2 = idx[1];
1837: idx += 2;
1838: tmp0 = x[i1];
1839: tmp1 = x[i2];
1840: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1841: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1842: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1843: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1844: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1845: }
1846:
1847: if (n == sz-1){
1848: tmp0 = x[*idx];
1849: sum1 -= v1[0] * tmp0;
1850: sum2 -= v2[0] * tmp0;
1851: sum3 -= v3[0] * tmp0;
1852: sum4 -= v4[0] * tmp0;
1853: sum5 -= v5[0] * tmp0;
1854: }
1855: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1856: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1857: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1858: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1859: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1860: ibdiag += 25;
1861: break;
1862: default:
1863: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1864: }
1865: }
1867: xb = x;
1868: PetscLogFlops(a->nz);
1869: } else xb = b;
1870: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1871: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1872: cnt = 0;
1873: for (i=0, row=0; i<m; i++) {
1875: switch (sizes[i]){
1876: case 1:
1877: x[row++] *= bdiag[cnt++];
1878: break;
1879: case 2:
1880: x1 = x[row]; x2 = x[row+1];
1881: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1882: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1883: x[row++] = tmp1;
1884: x[row++] = tmp2;
1885: cnt += 4;
1886: break;
1887: case 3:
1888: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
1889: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1890: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1891: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1892: x[row++] = tmp1;
1893: x[row++] = tmp2;
1894: x[row++] = tmp3;
1895: cnt += 9;
1896: break;
1897: case 4:
1898: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1899: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1900: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1901: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1902: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1903: x[row++] = tmp1;
1904: x[row++] = tmp2;
1905: x[row++] = tmp3;
1906: x[row++] = tmp4;
1907: cnt += 16;
1908: break;
1909: case 5:
1910: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1911: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1912: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1913: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1914: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1915: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1916: x[row++] = tmp1;
1917: x[row++] = tmp2;
1918: x[row++] = tmp3;
1919: x[row++] = tmp4;
1920: x[row++] = tmp5;
1921: cnt += 25;
1922: break;
1923: default:
1924: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1925: }
1926: }
1927: PetscLogFlops(m);
1928: }
1929: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1931: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1932: for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1933: ibdiag -= sizes[i]*sizes[i];
1934: sz = ii[row+1] - diag[row] - 1;
1935: v1 = a->a + diag[row] + 1;
1936: idx = a->j + diag[row] + 1;
1938: /* see comments for MatMult_Inode() for how this is coded */
1939: switch (sizes[i]){
1940: case 1:
1941:
1942: sum1 = xb[row];
1943: for(n = 0; n<sz-1; n+=2) {
1944: i1 = idx[0];
1945: i2 = idx[1];
1946: idx += 2;
1947: tmp0 = x[i1];
1948: tmp1 = x[i2];
1949: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1950: }
1951:
1952: if (n == sz-1){
1953: tmp0 = x[*idx];
1954: sum1 -= *v1*tmp0;
1955: }
1956: x[row--] = sum1*(*ibdiag);
1957: break;
1959: case 2:
1960:
1961: sum1 = xb[row];
1962: sum2 = xb[row-1];
1963: /* note that sum1 is associated with the second of the two rows */
1964: v2 = a->a + diag[row-1] + 2;
1965: for(n = 0; n<sz-1; n+=2) {
1966: i1 = idx[0];
1967: i2 = idx[1];
1968: idx += 2;
1969: tmp0 = x[i1];
1970: tmp1 = x[i2];
1971: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1972: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1973: }
1974:
1975: if (n == sz-1){
1976: tmp0 = x[*idx];
1977: sum1 -= *v1*tmp0;
1978: sum2 -= *v2*tmp0;
1979: }
1980: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1981: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1982: break;
1983: case 3:
1984:
1985: sum1 = xb[row];
1986: sum2 = xb[row-1];
1987: sum3 = xb[row-2];
1988: v2 = a->a + diag[row-1] + 2;
1989: v3 = a->a + diag[row-2] + 3;
1990: for(n = 0; n<sz-1; n+=2) {
1991: i1 = idx[0];
1992: i2 = idx[1];
1993: idx += 2;
1994: tmp0 = x[i1];
1995: tmp1 = x[i2];
1996: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1997: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1998: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1999: }
2000:
2001: if (n == sz-1){
2002: tmp0 = x[*idx];
2003: sum1 -= *v1*tmp0;
2004: sum2 -= *v2*tmp0;
2005: sum3 -= *v3*tmp0;
2006: }
2007: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2008: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2009: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2010: break;
2011: case 4:
2012:
2013: sum1 = xb[row];
2014: sum2 = xb[row-1];
2015: sum3 = xb[row-2];
2016: sum4 = xb[row-3];
2017: v2 = a->a + diag[row-1] + 2;
2018: v3 = a->a + diag[row-2] + 3;
2019: v4 = a->a + diag[row-3] + 4;
2020: for(n = 0; n<sz-1; n+=2) {
2021: i1 = idx[0];
2022: i2 = idx[1];
2023: idx += 2;
2024: tmp0 = x[i1];
2025: tmp1 = x[i2];
2026: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2027: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2028: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2029: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2030: }
2031:
2032: if (n == sz-1){
2033: tmp0 = x[*idx];
2034: sum1 -= *v1*tmp0;
2035: sum2 -= *v2*tmp0;
2036: sum3 -= *v3*tmp0;
2037: sum4 -= *v4*tmp0;
2038: }
2039: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2040: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2041: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2042: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2043: break;
2044: case 5:
2045:
2046: sum1 = xb[row];
2047: sum2 = xb[row-1];
2048: sum3 = xb[row-2];
2049: sum4 = xb[row-3];
2050: sum5 = xb[row-4];
2051: v2 = a->a + diag[row-1] + 2;
2052: v3 = a->a + diag[row-2] + 3;
2053: v4 = a->a + diag[row-3] + 4;
2054: v5 = a->a + diag[row-4] + 5;
2055: for(n = 0; n<sz-1; n+=2) {
2056: i1 = idx[0];
2057: i2 = idx[1];
2058: idx += 2;
2059: tmp0 = x[i1];
2060: tmp1 = x[i2];
2061: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2062: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2063: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2064: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2065: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2066: }
2067:
2068: if (n == sz-1){
2069: tmp0 = x[*idx];
2070: sum1 -= *v1*tmp0;
2071: sum2 -= *v2*tmp0;
2072: sum3 -= *v3*tmp0;
2073: sum4 -= *v4*tmp0;
2074: sum5 -= *v5*tmp0;
2075: }
2076: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2077: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2078: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2079: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2080: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2081: break;
2082: default:
2083: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2084: }
2085: }
2087: PetscLogFlops(a->nz);
2088: }
2089: its--;
2090: }
2091: if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2092: VecRestoreArray(xx,&x);
2093: if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
2094: return(0);
2095: }
2098: /*
2099: samestructure indicates that the matrix has not changed its nonzero structure so we
2100: do not need to recompute the inodes
2101: */
2104: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2105: {
2106: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2108: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2109: PetscTruth flag;
2112: if (!a->inode.use) return(0);
2113: if (a->inode.checked && samestructure) return(0);
2116: m = A->rmap.n;
2117: if (a->inode.size) {ns = a->inode.size;}
2118: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
2120: i = 0;
2121: node_count = 0;
2122: idx = a->j;
2123: ii = a->i;
2124: while (i < m){ /* For each row */
2125: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
2126: /* Limits the number of elements in a node to 'a->inode.limit' */
2127: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2128: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
2129: if (nzy != nzx) break;
2130: idy += nzx; /* Same nonzero pattern */
2131: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
2132: if (!flag) break;
2133: }
2134: ns[node_count++] = blk_size;
2135: idx += blk_size*nzx;
2136: i = j;
2137: }
2138: /* If not enough inodes found,, do not use inode version of the routines */
2139: if (!a->inode.size && m && node_count > .9*m) {
2140: PetscFree(ns);
2141: a->inode.node_count = 0;
2142: a->inode.size = PETSC_NULL;
2143: a->inode.use = PETSC_FALSE;
2144: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
2145: } else {
2146: A->ops->mult = MatMult_Inode;
2147: A->ops->relax = MatRelax_Inode;
2148: A->ops->multadd = MatMultAdd_Inode;
2149: A->ops->solve = MatSolve_Inode;
2150: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2151: A->ops->getrowij = MatGetRowIJ_Inode;
2152: A->ops->restorerowij = MatRestoreRowIJ_Inode;
2153: A->ops->getcolumnij = MatGetColumnIJ_Inode;
2154: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2155: A->ops->coloringpatch = MatColoringPatch_Inode;
2156: a->inode.node_count = node_count;
2157: a->inode.size = ns;
2158: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
2159: }
2160: return(0);
2161: }
2163: /*
2164: This is really ugly. if inodes are used this replaces the
2165: permutations with ones that correspond to rows/cols of the matrix
2166: rather then inode blocks
2167: */
2170: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2171: {
2172: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2175: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
2176: if (f) {
2177: (*f)(A,rperm,cperm);
2178: }
2179: return(0);
2180: }
2185: PetscErrorCode MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2186: {
2187: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
2189: PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2190: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
2191: PetscInt nslim_col,*ns_col;
2192: IS ris = *rperm,cis = *cperm;
2195: if (!a->inode.size) return(0); /* no inodes so return */
2196: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
2198: Mat_CreateColInode(A,&nslim_col,&ns_col);
2199: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
2200: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
2201: permc = permr + m;
2203: ISGetIndices(ris,&ridx);
2204: ISGetIndices(cis,&cidx);
2206: /* Form the inode structure for the rows of permuted matric using inv perm*/
2207: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2209: /* Construct the permutations for rows*/
2210: for (i=0,row = 0; i<nslim_row; ++i){
2211: indx = ridx[i];
2212: start_val = tns[indx];
2213: end_val = tns[indx + 1];
2214: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2215: }
2217: /* Form the inode structure for the columns of permuted matrix using inv perm*/
2218: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2220: /* Construct permutations for columns */
2221: for (i=0,col=0; i<nslim_col; ++i){
2222: indx = cidx[i];
2223: start_val = tns[indx];
2224: end_val = tns[indx + 1];
2225: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2226: }
2228: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
2229: ISSetPermutation(*rperm);
2230: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
2231: ISSetPermutation(*cperm);
2232:
2233: ISRestoreIndices(ris,&ridx);
2234: ISRestoreIndices(cis,&cidx);
2236: PetscFree(ns_col);
2237: PetscFree(permr);
2238: ISDestroy(cis);
2239: ISDestroy(ris);
2240: PetscFree(tns);
2241: return(0);
2242: }
2247: /*@C
2248: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2250: Collective on Mat
2252: Input Parameter:
2253: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2255: Output Parameter:
2256: + node_count - no of inodes present in the matrix.
2257: . sizes - an array of size node_count,with sizes of each inode.
2258: - limit - the max size used to generate the inodes.
2260: Level: advanced
2262: Notes: This routine returns some internal storage information
2263: of the matrix, it is intended to be used by advanced users.
2264: It should be called after the matrix is assembled.
2265: The contents of the sizes[] array should not be changed.
2266: PETSC_NULL may be passed for information not requested.
2268: .keywords: matrix, seqaij, get, inode
2270: .seealso: MatGetInfo()
2271: @*/
2272: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2273: {
2274: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2277: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2278: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
2279: if (f) {
2280: (*f)(A,node_count,sizes,limit);
2281: }
2282: return(0);
2283: }
2288: PetscErrorCode MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2289: {
2290: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2293: if (node_count) *node_count = a->inode.node_count;
2294: if (sizes) *sizes = a->inode.size;
2295: if (limit) *limit = a->inode.limit;
2296: return(0);
2297: }