Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap.n;
19: m = A->rmap.n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap.n;
72: n = A->cmap.n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap.n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
268: return(0);
269: }
271: /* ----------------------------------------------------------- */
275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
279: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
283: nslim_row = a->inode.node_count;
284: n = A->cmap.n;
286: /* Create The column_inode for this matrix */
287: Mat_CreateColInode(A,&nslim_col,&ns_col);
288:
289: /* allocate space for reformated column_inode structure */
290: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
294: for (i1=0,col=0; i1<nslim_col; ++i1){
295: nsz = ns_col[i1];
296: for (i2=0; i2<nsz; ++i2,++col)
297: tvc[col] = i1;
298: }
299: /* allocate space for column pointers */
300: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301: *iia = ia;
302: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
305: /* determine the number of columns in each row */
306: ia[0] = oshift;
307: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row+1] - ai[row];
312: while (nz-- > 0) { /* off-diagonal elemets */
313: /* ia[i1+1]++; */
314: ia[i2+1]++;
315: i2++;
316: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
321: /* shift ia[i] to point to next col */
322: for (i1=1; i1<nslim_col+1; i1++) {
323: col = ia[i1-1];
324: ia[i1] += col;
325: work[i1-1] = col - oshift;
326: }
328: /* allocate space for column pointers */
329: nz = ia[nslim_col] + (!ishift);
330: PetscMalloc(nz*sizeof(PetscInt),&ja);
331: *jja = ja;
333: /* loop over matrix putting into ja */
334: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335: j = aj + ai[row] + ishift;
336: i2 = 0; /* Col inode index */
337: col = *j++ + ishift;
338: i2 = tvc[col];
339: nz = ai[row+1] - ai[row];
340: while (nz-- > 0) {
341: /* ja[work[i1]++] = i2 + oshift; */
342: ja[work[i2]++] = i1 + oshift;
343: i2++;
344: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345: if (nz > 0) i2 = tvc[col];
346: }
347: }
348: PetscFree(ns_col);
349: PetscFree(work);
350: PetscFree(tns);
351: PetscFree(tvc);
352: return(0);
353: }
357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {
362: Mat_CreateColInode(A,n,PETSC_NULL);
363: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: if (!blockcompressed) {
385: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386: } else {
387: PetscFree(*ia);
388: PetscFree(*ja);
389: }
390: return(0);
391: }
393: /* ----------------------------------------------------------- */
397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
400: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401: PetscScalar *y;
402: const PetscScalar *x,*v1,*v2,*v3,*v4,*v5;
403: PetscErrorCode ierr;
404: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405:
406: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408: #endif
411: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
412: node_max = a->inode.node_count;
413: ns = a->inode.size; /* Node Size array */
414: VecGetArray(xx,(PetscScalar**)&x);
415: VecGetArray(yy,&y);
416: idx = a->j;
417: v1 = a->a;
418: ii = a->i;
420: for (i = 0,row = 0; i< node_max; ++i){
421: nsz = ns[i];
422: n = ii[1] - ii[0];
423: nonzerorow += (n>0)*nsz;
424: ii += nsz;
425: sz = n; /* No of non zeros in this row */
426: /* Switch on the size of Node */
427: switch (nsz){ /* Each loop in 'case' is unrolled */
428: case 1 :
429: sum1 = 0;
430:
431: for(n = 0; n< sz-1; n+=2) {
432: i1 = idx[0]; /* The instructions are ordered to */
433: i2 = idx[1]; /* make the compiler's job easy */
434: idx += 2;
435: tmp0 = x[i1];
436: tmp1 = x[i2];
437: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438: }
439:
440: if (n == sz-1){ /* Take care of the last nonzero */
441: tmp0 = x[*idx++];
442: sum1 += *v1++ * tmp0;
443: }
444: y[row++]=sum1;
445: break;
446: case 2:
447: sum1 = 0;
448: sum2 = 0;
449: v2 = v1 + n;
450:
451: for (n = 0; n< sz-1; n+=2) {
452: i1 = idx[0];
453: i2 = idx[1];
454: idx += 2;
455: tmp0 = x[i1];
456: tmp1 = x[i2];
457: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459: }
460: if (n == sz-1){
461: tmp0 = x[*idx++];
462: sum1 += *v1++ * tmp0;
463: sum2 += *v2++ * tmp0;
464: }
465: y[row++]=sum1;
466: y[row++]=sum2;
467: v1 =v2; /* Since the next block to be processed starts there*/
468: idx +=sz;
469: break;
470: case 3:
471: sum1 = 0;
472: sum2 = 0;
473: sum3 = 0;
474: v2 = v1 + n;
475: v3 = v2 + n;
476:
477: for (n = 0; n< sz-1; n+=2) {
478: i1 = idx[0];
479: i2 = idx[1];
480: idx += 2;
481: tmp0 = x[i1];
482: tmp1 = x[i2];
483: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
484: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
485: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
486: }
487: if (n == sz-1){
488: tmp0 = x[*idx++];
489: sum1 += *v1++ * tmp0;
490: sum2 += *v2++ * tmp0;
491: sum3 += *v3++ * tmp0;
492: }
493: y[row++]=sum1;
494: y[row++]=sum2;
495: y[row++]=sum3;
496: v1 =v3; /* Since the next block to be processed starts there*/
497: idx +=2*sz;
498: break;
499: case 4:
500: sum1 = 0;
501: sum2 = 0;
502: sum3 = 0;
503: sum4 = 0;
504: v2 = v1 + n;
505: v3 = v2 + n;
506: v4 = v3 + n;
507:
508: for (n = 0; n< sz-1; n+=2) {
509: i1 = idx[0];
510: i2 = idx[1];
511: idx += 2;
512: tmp0 = x[i1];
513: tmp1 = x[i2];
514: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
515: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
516: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
517: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
518: }
519: if (n == sz-1){
520: tmp0 = x[*idx++];
521: sum1 += *v1++ * tmp0;
522: sum2 += *v2++ * tmp0;
523: sum3 += *v3++ * tmp0;
524: sum4 += *v4++ * tmp0;
525: }
526: y[row++]=sum1;
527: y[row++]=sum2;
528: y[row++]=sum3;
529: y[row++]=sum4;
530: v1 =v4; /* Since the next block to be processed starts there*/
531: idx +=3*sz;
532: break;
533: case 5:
534: sum1 = 0;
535: sum2 = 0;
536: sum3 = 0;
537: sum4 = 0;
538: sum5 = 0;
539: v2 = v1 + n;
540: v3 = v2 + n;
541: v4 = v3 + n;
542: v5 = v4 + n;
543:
544: for (n = 0; n<sz-1; n+=2) {
545: i1 = idx[0];
546: i2 = idx[1];
547: idx += 2;
548: tmp0 = x[i1];
549: tmp1 = x[i2];
550: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
551: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
552: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
553: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
554: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
555: }
556: if (n == sz-1){
557: tmp0 = x[*idx++];
558: sum1 += *v1++ * tmp0;
559: sum2 += *v2++ * tmp0;
560: sum3 += *v3++ * tmp0;
561: sum4 += *v4++ * tmp0;
562: sum5 += *v5++ * tmp0;
563: }
564: y[row++]=sum1;
565: y[row++]=sum2;
566: y[row++]=sum3;
567: y[row++]=sum4;
568: y[row++]=sum5;
569: v1 =v5; /* Since the next block to be processed starts there */
570: idx +=4*sz;
571: break;
572: default :
573: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
574: }
575: }
576: VecRestoreArray(xx,(PetscScalar**)&x);
577: VecRestoreArray(yy,&y);
578: PetscLogFlops(2*a->nz - nonzerorow);
579: return(0);
580: }
581: /* ----------------------------------------------------------- */
582: /* Almost same code as the MatMult_Inode() */
585: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
586: {
587: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
588: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
589: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
591: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
592:
594: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
595: node_max = a->inode.node_count;
596: ns = a->inode.size; /* Node Size array */
597: VecGetArray(xx,&x);
598: VecGetArray(yy,&y);
599: if (zz != yy) {
600: VecGetArray(zz,&z);
601: } else {
602: z = y;
603: }
604: zt = z;
606: idx = a->j;
607: v1 = a->a;
608: ii = a->i;
610: for (i = 0,row = 0; i< node_max; ++i){
611: nsz = ns[i];
612: n = ii[1] - ii[0];
613: ii += nsz;
614: sz = n; /* No of non zeros in this row */
615: /* Switch on the size of Node */
616: switch (nsz){ /* Each loop in 'case' is unrolled */
617: case 1 :
618: sum1 = *zt++;
619:
620: for(n = 0; n< sz-1; n+=2) {
621: i1 = idx[0]; /* The instructions are ordered to */
622: i2 = idx[1]; /* make the compiler's job easy */
623: idx += 2;
624: tmp0 = x[i1];
625: tmp1 = x[i2];
626: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627: }
628:
629: if(n == sz-1){ /* Take care of the last nonzero */
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: }
633: y[row++]=sum1;
634: break;
635: case 2:
636: sum1 = *zt++;
637: sum2 = *zt++;
638: v2 = v1 + n;
639:
640: for(n = 0; n< sz-1; n+=2) {
641: i1 = idx[0];
642: i2 = idx[1];
643: idx += 2;
644: tmp0 = x[i1];
645: tmp1 = x[i2];
646: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648: }
649: if(n == sz-1){
650: tmp0 = x[*idx++];
651: sum1 += *v1++ * tmp0;
652: sum2 += *v2++ * tmp0;
653: }
654: y[row++]=sum1;
655: y[row++]=sum2;
656: v1 =v2; /* Since the next block to be processed starts there*/
657: idx +=sz;
658: break;
659: case 3:
660: sum1 = *zt++;
661: sum2 = *zt++;
662: sum3 = *zt++;
663: v2 = v1 + n;
664: v3 = v2 + n;
665:
666: for (n = 0; n< sz-1; n+=2) {
667: i1 = idx[0];
668: i2 = idx[1];
669: idx += 2;
670: tmp0 = x[i1];
671: tmp1 = x[i2];
672: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675: }
676: if (n == sz-1){
677: tmp0 = x[*idx++];
678: sum1 += *v1++ * tmp0;
679: sum2 += *v2++ * tmp0;
680: sum3 += *v3++ * tmp0;
681: }
682: y[row++]=sum1;
683: y[row++]=sum2;
684: y[row++]=sum3;
685: v1 =v3; /* Since the next block to be processed starts there*/
686: idx +=2*sz;
687: break;
688: case 4:
689: sum1 = *zt++;
690: sum2 = *zt++;
691: sum3 = *zt++;
692: sum4 = *zt++;
693: v2 = v1 + n;
694: v3 = v2 + n;
695: v4 = v3 + n;
696:
697: for (n = 0; n< sz-1; n+=2) {
698: i1 = idx[0];
699: i2 = idx[1];
700: idx += 2;
701: tmp0 = x[i1];
702: tmp1 = x[i2];
703: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707: }
708: if (n == sz-1){
709: tmp0 = x[*idx++];
710: sum1 += *v1++ * tmp0;
711: sum2 += *v2++ * tmp0;
712: sum3 += *v3++ * tmp0;
713: sum4 += *v4++ * tmp0;
714: }
715: y[row++]=sum1;
716: y[row++]=sum2;
717: y[row++]=sum3;
718: y[row++]=sum4;
719: v1 =v4; /* Since the next block to be processed starts there*/
720: idx +=3*sz;
721: break;
722: case 5:
723: sum1 = *zt++;
724: sum2 = *zt++;
725: sum3 = *zt++;
726: sum4 = *zt++;
727: sum5 = *zt++;
728: v2 = v1 + n;
729: v3 = v2 + n;
730: v4 = v3 + n;
731: v5 = v4 + n;
732:
733: for (n = 0; n<sz-1; n+=2) {
734: i1 = idx[0];
735: i2 = idx[1];
736: idx += 2;
737: tmp0 = x[i1];
738: tmp1 = x[i2];
739: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744: }
745: if(n == sz-1){
746: tmp0 = x[*idx++];
747: sum1 += *v1++ * tmp0;
748: sum2 += *v2++ * tmp0;
749: sum3 += *v3++ * tmp0;
750: sum4 += *v4++ * tmp0;
751: sum5 += *v5++ * tmp0;
752: }
753: y[row++]=sum1;
754: y[row++]=sum2;
755: y[row++]=sum3;
756: y[row++]=sum4;
757: y[row++]=sum5;
758: v1 =v5; /* Since the next block to be processed starts there */
759: idx +=4*sz;
760: break;
761: default :
762: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
763: }
764: }
765: VecRestoreArray(xx,&x);
766: VecRestoreArray(yy,&y);
767: if (zz != yy) {
768: VecRestoreArray(zz,&z);
769: }
770: PetscLogFlops(2*a->nz);
771: return(0);
772: }
774: /* ----------------------------------------------------------- */
777: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
778: {
779: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
780: IS iscol = a->col,isrow = a->row;
781: PetscErrorCode ierr;
782: PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
783: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
784: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
785: PetscScalar sum1,sum2,sum3,sum4,sum5;
786: const PetscScalar *v1,*v2,*v3,*v4,*v5,*b,*a_a = a->a,*aa;
789: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
790: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
791: node_max = a->inode.node_count;
792: ns = a->inode.size; /* Node Size array */
794: VecGetArray(bb,(PetscScalar**)&b);
795: VecGetArray(xx,&x);
796: tmp = a->solve_work;
797:
798: ISGetIndices(isrow,&rout); r = rout;
799: ISGetIndices(iscol,&cout); c = cout + (n-1);
800:
801: /* forward solve the lower triangular */
802: tmps = tmp ;
803: aa = a_a ;
804: aj = a_j ;
805: ad = a->diag;
807: for (i = 0,row = 0; i< node_max; ++i){
808: nsz = ns[i];
809: aii = ai[row];
810: v1 = aa + aii;
811: vi = aj + aii;
812: nz = ad[row]- aii;
813:
814: switch (nsz){ /* Each loop in 'case' is unrolled */
815: case 1 :
816: sum1 = b[*r++];
817: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
818: for(j=0; j<nz-1; j+=2){
819: i0 = vi[0];
820: i1 = vi[1];
821: vi +=2;
822: tmp0 = tmps[i0];
823: tmp1 = tmps[i1];
824: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
825: }
826: if(j == nz-1){
827: tmp0 = tmps[*vi++];
828: sum1 -= *v1++ *tmp0;
829: }
830: tmp[row ++]=sum1;
831: break;
832: case 2:
833: sum1 = b[*r++];
834: sum2 = b[*r++];
835: v2 = aa + ai[row+1];
837: for(j=0; j<nz-1; j+=2){
838: i0 = vi[0];
839: i1 = vi[1];
840: vi +=2;
841: tmp0 = tmps[i0];
842: tmp1 = tmps[i1];
843: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845: }
846: if(j == nz-1){
847: tmp0 = tmps[*vi++];
848: sum1 -= *v1++ *tmp0;
849: sum2 -= *v2++ *tmp0;
850: }
851: sum2 -= *v2++ * sum1;
852: tmp[row ++]=sum1;
853: tmp[row ++]=sum2;
854: break;
855: case 3:
856: sum1 = b[*r++];
857: sum2 = b[*r++];
858: sum3 = b[*r++];
859: v2 = aa + ai[row+1];
860: v3 = aa + ai[row+2];
861:
862: for (j=0; j<nz-1; j+=2){
863: i0 = vi[0];
864: i1 = vi[1];
865: vi +=2;
866: tmp0 = tmps[i0];
867: tmp1 = tmps[i1];
868: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
869: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
870: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
871: }
872: if (j == nz-1){
873: tmp0 = tmps[*vi++];
874: sum1 -= *v1++ *tmp0;
875: sum2 -= *v2++ *tmp0;
876: sum3 -= *v3++ *tmp0;
877: }
878: sum2 -= *v2++ * sum1;
879: sum3 -= *v3++ * sum1;
880: sum3 -= *v3++ * sum2;
881: tmp[row ++]=sum1;
882: tmp[row ++]=sum2;
883: tmp[row ++]=sum3;
884: break;
885:
886: case 4:
887: sum1 = b[*r++];
888: sum2 = b[*r++];
889: sum3 = b[*r++];
890: sum4 = b[*r++];
891: v2 = aa + ai[row+1];
892: v3 = aa + ai[row+2];
893: v4 = aa + ai[row+3];
894:
895: for (j=0; j<nz-1; j+=2){
896: i0 = vi[0];
897: i1 = vi[1];
898: vi +=2;
899: tmp0 = tmps[i0];
900: tmp1 = tmps[i1];
901: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
902: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
903: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
904: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
905: }
906: if (j == nz-1){
907: tmp0 = tmps[*vi++];
908: sum1 -= *v1++ *tmp0;
909: sum2 -= *v2++ *tmp0;
910: sum3 -= *v3++ *tmp0;
911: sum4 -= *v4++ *tmp0;
912: }
913: sum2 -= *v2++ * sum1;
914: sum3 -= *v3++ * sum1;
915: sum4 -= *v4++ * sum1;
916: sum3 -= *v3++ * sum2;
917: sum4 -= *v4++ * sum2;
918: sum4 -= *v4++ * sum3;
919:
920: tmp[row ++]=sum1;
921: tmp[row ++]=sum2;
922: tmp[row ++]=sum3;
923: tmp[row ++]=sum4;
924: break;
925: case 5:
926: sum1 = b[*r++];
927: sum2 = b[*r++];
928: sum3 = b[*r++];
929: sum4 = b[*r++];
930: sum5 = b[*r++];
931: v2 = aa + ai[row+1];
932: v3 = aa + ai[row+2];
933: v4 = aa + ai[row+3];
934: v5 = aa + ai[row+4];
935:
936: for (j=0; j<nz-1; j+=2){
937: i0 = vi[0];
938: i1 = vi[1];
939: vi +=2;
940: tmp0 = tmps[i0];
941: tmp1 = tmps[i1];
942: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
943: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
944: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
945: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
946: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
947: }
948: if (j == nz-1){
949: tmp0 = tmps[*vi++];
950: sum1 -= *v1++ *tmp0;
951: sum2 -= *v2++ *tmp0;
952: sum3 -= *v3++ *tmp0;
953: sum4 -= *v4++ *tmp0;
954: sum5 -= *v5++ *tmp0;
955: }
957: sum2 -= *v2++ * sum1;
958: sum3 -= *v3++ * sum1;
959: sum4 -= *v4++ * sum1;
960: sum5 -= *v5++ * sum1;
961: sum3 -= *v3++ * sum2;
962: sum4 -= *v4++ * sum2;
963: sum5 -= *v5++ * sum2;
964: sum4 -= *v4++ * sum3;
965: sum5 -= *v5++ * sum3;
966: sum5 -= *v5++ * sum4;
967:
968: tmp[row ++]=sum1;
969: tmp[row ++]=sum2;
970: tmp[row ++]=sum3;
971: tmp[row ++]=sum4;
972: tmp[row ++]=sum5;
973: break;
974: default:
975: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
976: }
977: }
978: /* backward solve the upper triangular */
979: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
980: nsz = ns[i];
981: aii = ai[row+1] -1;
982: v1 = aa + aii;
983: vi = aj + aii;
984: nz = aii- ad[row];
985: switch (nsz){ /* Each loop in 'case' is unrolled */
986: case 1 :
987: sum1 = tmp[row];
989: for(j=nz ; j>1; j-=2){
990: vi -=2;
991: i0 = vi[2];
992: i1 = vi[1];
993: tmp0 = tmps[i0];
994: tmp1 = tmps[i1];
995: v1 -= 2;
996: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997: }
998: if (j==1){
999: tmp0 = tmps[*vi--];
1000: sum1 -= *v1-- * tmp0;
1001: }
1002: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1003: break;
1004: case 2 :
1005: sum1 = tmp[row];
1006: sum2 = tmp[row -1];
1007: v2 = aa + ai[row]-1;
1008: for (j=nz ; j>1; j-=2){
1009: vi -=2;
1010: i0 = vi[2];
1011: i1 = vi[1];
1012: tmp0 = tmps[i0];
1013: tmp1 = tmps[i1];
1014: v1 -= 2;
1015: v2 -= 2;
1016: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1017: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1018: }
1019: if (j==1){
1020: tmp0 = tmps[*vi--];
1021: sum1 -= *v1-- * tmp0;
1022: sum2 -= *v2-- * tmp0;
1023: }
1024:
1025: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1026: sum2 -= *v2-- * tmp0;
1027: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1028: break;
1029: case 3 :
1030: sum1 = tmp[row];
1031: sum2 = tmp[row -1];
1032: sum3 = tmp[row -2];
1033: v2 = aa + ai[row]-1;
1034: v3 = aa + ai[row -1]-1;
1035: for (j=nz ; j>1; j-=2){
1036: vi -=2;
1037: i0 = vi[2];
1038: i1 = vi[1];
1039: tmp0 = tmps[i0];
1040: tmp1 = tmps[i1];
1041: v1 -= 2;
1042: v2 -= 2;
1043: v3 -= 2;
1044: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1045: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1046: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1047: }
1048: if (j==1){
1049: tmp0 = tmps[*vi--];
1050: sum1 -= *v1-- * tmp0;
1051: sum2 -= *v2-- * tmp0;
1052: sum3 -= *v3-- * tmp0;
1053: }
1054: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1055: sum2 -= *v2-- * tmp0;
1056: sum3 -= *v3-- * tmp0;
1057: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1058: sum3 -= *v3-- * tmp0;
1059: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1060:
1061: break;
1062: case 4 :
1063: sum1 = tmp[row];
1064: sum2 = tmp[row -1];
1065: sum3 = tmp[row -2];
1066: sum4 = tmp[row -3];
1067: v2 = aa + ai[row]-1;
1068: v3 = aa + ai[row -1]-1;
1069: v4 = aa + ai[row -2]-1;
1071: for (j=nz ; j>1; j-=2){
1072: vi -=2;
1073: i0 = vi[2];
1074: i1 = vi[1];
1075: tmp0 = tmps[i0];
1076: tmp1 = tmps[i1];
1077: v1 -= 2;
1078: v2 -= 2;
1079: v3 -= 2;
1080: v4 -= 2;
1081: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1082: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1083: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1084: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1085: }
1086: if (j==1){
1087: tmp0 = tmps[*vi--];
1088: sum1 -= *v1-- * tmp0;
1089: sum2 -= *v2-- * tmp0;
1090: sum3 -= *v3-- * tmp0;
1091: sum4 -= *v4-- * tmp0;
1092: }
1094: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1095: sum2 -= *v2-- * tmp0;
1096: sum3 -= *v3-- * tmp0;
1097: sum4 -= *v4-- * tmp0;
1098: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1099: sum3 -= *v3-- * tmp0;
1100: sum4 -= *v4-- * tmp0;
1101: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1102: sum4 -= *v4-- * tmp0;
1103: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1104: break;
1105: case 5 :
1106: sum1 = tmp[row];
1107: sum2 = tmp[row -1];
1108: sum3 = tmp[row -2];
1109: sum4 = tmp[row -3];
1110: sum5 = tmp[row -4];
1111: v2 = aa + ai[row]-1;
1112: v3 = aa + ai[row -1]-1;
1113: v4 = aa + ai[row -2]-1;
1114: v5 = aa + ai[row -3]-1;
1115: for (j=nz ; j>1; j-=2){
1116: vi -= 2;
1117: i0 = vi[2];
1118: i1 = vi[1];
1119: tmp0 = tmps[i0];
1120: tmp1 = tmps[i1];
1121: v1 -= 2;
1122: v2 -= 2;
1123: v3 -= 2;
1124: v4 -= 2;
1125: v5 -= 2;
1126: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1127: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1128: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1129: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1130: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1131: }
1132: if (j==1){
1133: tmp0 = tmps[*vi--];
1134: sum1 -= *v1-- * tmp0;
1135: sum2 -= *v2-- * tmp0;
1136: sum3 -= *v3-- * tmp0;
1137: sum4 -= *v4-- * tmp0;
1138: sum5 -= *v5-- * tmp0;
1139: }
1141: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1142: sum2 -= *v2-- * tmp0;
1143: sum3 -= *v3-- * tmp0;
1144: sum4 -= *v4-- * tmp0;
1145: sum5 -= *v5-- * tmp0;
1146: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1151: sum4 -= *v4-- * tmp0;
1152: sum5 -= *v5-- * tmp0;
1153: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1154: sum5 -= *v5-- * tmp0;
1155: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1156: break;
1157: default:
1158: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1159: }
1160: }
1161: ISRestoreIndices(isrow,&rout);
1162: ISRestoreIndices(iscol,&cout);
1163: VecRestoreArray(bb,(PetscScalar**)&b);
1164: VecRestoreArray(xx,&x);
1165: PetscLogFlops(2*a->nz - A->cmap.n);
1166: return(0);
1167: }
1171: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1172: {
1173: Mat C = *B;
1174: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1175: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1176: PetscErrorCode ierr;
1177: PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1178: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1179: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1180: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1181: PetscScalar *pc1,*pc2,*pc3,mul1,mul2,mul3;
1182: PetscScalar tmp,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1183: const PetscScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1184: PetscReal rs=0.0;
1185: LUShift_Ctx sctx;
1186: PetscInt newshift;
1189: sctx.shift_top = 0;
1190: sctx.nshift_max = 0;
1191: sctx.shift_lo = 0;
1192: sctx.shift_hi = 0;
1194: /* if both shift schemes are chosen by user, only use info->shiftpd */
1195: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1196: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1197: sctx.shift_top = 0;
1198: for (i=0; i<n; i++) {
1199: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1200: rs = 0.0;
1201: ajtmp = aj + ai[i];
1202: rtmp1 = aa + ai[i];
1203: nz = ai[i+1] - ai[i];
1204: for (j=0; j<nz; j++){
1205: if (*ajtmp != i){
1206: rs += PetscAbsScalar(*rtmp1++);
1207: } else {
1208: rs -= PetscRealPart(*rtmp1++);
1209: }
1210: ajtmp++;
1211: }
1212: if (rs>sctx.shift_top) sctx.shift_top = rs;
1213: }
1214: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1215: sctx.shift_top *= 1.1;
1216: sctx.nshift_max = 5;
1217: sctx.shift_lo = 0.;
1218: sctx.shift_hi = 1.;
1219: }
1220: sctx.shift_amount = 0;
1221: sctx.nshift = 0;
1223: ISGetIndices(isrow,&r);
1224: ISGetIndices(iscol,&c);
1225: ISGetIndices(isicol,&ic);
1226: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1227: PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1228: ics = ic ;
1229: rtmp22 = rtmp11 + n;
1230: rtmp33 = rtmp22 + n;
1231:
1232: node_max = a->inode.node_count;
1233: ns = a->inode.size ;
1234: if (!ns){
1235: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1236: }
1238: /* If max inode size > 3, split it into two inodes.*/
1239: /* also map the inode sizes according to the ordering */
1240: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1241: for (i=0,j=0; i<node_max; ++i,++j){
1242: if (ns[i]>3) {
1243: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1244: ++j;
1245: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246: } else {
1247: tmp_vec1[j] = ns[i];
1248: }
1249: }
1250: /* Use the correct node_max */
1251: node_max = j;
1253: /* Now reorder the inode info based on mat re-ordering info */
1254: /* First create a row -> inode_size_array_index map */
1255: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1256: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1257: for (i=0,row=0; i<node_max; i++) {
1258: nodesz = tmp_vec1[i];
1259: for (j=0; j<nodesz; j++,row++) {
1260: nsmap[row] = i;
1261: }
1262: }
1263: /* Using nsmap, create a reordered ns structure */
1264: for (i=0,j=0; i< node_max; i++) {
1265: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1266: tmp_vec2[i] = nodesz;
1267: j += nodesz;
1268: }
1269: PetscFree(nsmap);
1270: PetscFree(tmp_vec1);
1271: /* Now use the correct ns */
1272: ns = tmp_vec2;
1274: do {
1275: sctx.lushift = PETSC_FALSE;
1276: /* Now loop over each block-row, and do the factorization */
1277: for (i=0,row=0; i<node_max; i++) {
1278: nodesz = ns[i];
1279: nz = bi[row+1] - bi[row];
1280: bjtmp = bj + bi[row];
1282: switch (nodesz){
1283: case 1:
1284: for (j=0; j<nz; j++){
1285: idx = bjtmp[j];
1286: rtmp11[idx] = 0.0;
1287: }
1288:
1289: /* load in initial (unfactored row) */
1290: idx = r[row];
1291: nz_tmp = ai[idx+1] - ai[idx];
1292: ajtmp = aj + ai[idx];
1293: v1 = aa + ai[idx];
1295: for (j=0; j<nz_tmp; j++) {
1296: idx = ics[ajtmp[j]];
1297: rtmp11[idx] = v1[j];
1298: }
1299: rtmp11[ics[r[row]]] += sctx.shift_amount;
1301: prow = *bjtmp++ ;
1302: while (prow < row) {
1303: pc1 = rtmp11 + prow;
1304: if (*pc1 != 0.0){
1305: pv = ba + bd[prow];
1306: pj = nbj + bd[prow];
1307: mul1 = *pc1 * *pv++;
1308: *pc1 = mul1;
1309: nz_tmp = bi[prow+1] - bd[prow] - 1;
1310: PetscLogFlops(2*nz_tmp);
1311: for (j=0; j<nz_tmp; j++) {
1312: tmp = pv[j];
1313: idx = pj[j];
1314: rtmp11[idx] -= mul1 * tmp;
1315: }
1316: }
1317: prow = *bjtmp++ ;
1318: }
1319: pj = bj + bi[row];
1320: pc1 = ba + bi[row];
1322: sctx.pv = rtmp11[row];
1323: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1324: rs = 0.0;
1325: for (j=0; j<nz; j++) {
1326: idx = pj[j];
1327: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1328: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1329: }
1330: sctx.rs = rs;
1331: MatLUCheckShift_inline(info,sctx,row,newshift);
1332: if (newshift == 1) goto endofwhile;
1333: break;
1334:
1335: case 2:
1336: for (j=0; j<nz; j++) {
1337: idx = bjtmp[j];
1338: rtmp11[idx] = 0.0;
1339: rtmp22[idx] = 0.0;
1340: }
1341:
1342: /* load in initial (unfactored row) */
1343: idx = r[row];
1344: nz_tmp = ai[idx+1] - ai[idx];
1345: ajtmp = aj + ai[idx];
1346: v1 = aa + ai[idx];
1347: v2 = aa + ai[idx+1];
1348: for (j=0; j<nz_tmp; j++) {
1349: idx = ics[ajtmp[j]];
1350: rtmp11[idx] = v1[j];
1351: rtmp22[idx] = v2[j];
1352: }
1353: rtmp11[ics[r[row]]] += sctx.shift_amount;
1354: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1356: prow = *bjtmp++ ;
1357: while (prow < row) {
1358: pc1 = rtmp11 + prow;
1359: pc2 = rtmp22 + prow;
1360: if (*pc1 != 0.0 || *pc2 != 0.0){
1361: pv = ba + bd[prow];
1362: pj = nbj + bd[prow];
1363: mul1 = *pc1 * *pv;
1364: mul2 = *pc2 * *pv;
1365: ++pv;
1366: *pc1 = mul1;
1367: *pc2 = mul2;
1368:
1369: nz_tmp = bi[prow+1] - bd[prow] - 1;
1370: for (j=0; j<nz_tmp; j++) {
1371: tmp = pv[j];
1372: idx = pj[j];
1373: rtmp11[idx] -= mul1 * tmp;
1374: rtmp22[idx] -= mul2 * tmp;
1375: }
1376: PetscLogFlops(4*nz_tmp);
1377: }
1378: prow = *bjtmp++ ;
1379: }
1381: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1382: pc1 = rtmp11 + prow;
1383: pc2 = rtmp22 + prow;
1385: sctx.pv = *pc1;
1386: pj = bj + bi[prow];
1387: rs = 0.0;
1388: for (j=0; j<nz; j++){
1389: idx = pj[j];
1390: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1391: }
1392: sctx.rs = rs;
1393: MatLUCheckShift_inline(info,sctx,row,newshift);
1394: if (newshift == 1) goto endofwhile;
1396: if (*pc2 != 0.0){
1397: pj = nbj + bd[prow];
1398: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1399: *pc2 = mul2;
1400: nz_tmp = bi[prow+1] - bd[prow] - 1;
1401: for (j=0; j<nz_tmp; j++) {
1402: idx = pj[j] ;
1403: tmp = rtmp11[idx];
1404: rtmp22[idx] -= mul2 * tmp;
1405: }
1406: PetscLogFlops(2*nz_tmp);
1407: }
1408:
1409: pj = bj + bi[row];
1410: pc1 = ba + bi[row];
1411: pc2 = ba + bi[row+1];
1413: sctx.pv = rtmp22[row+1];
1414: rs = 0.0;
1415: rtmp11[row] = 1.0/rtmp11[row];
1416: rtmp22[row+1] = 1.0/rtmp22[row+1];
1417: /* copy row entries from dense representation to sparse */
1418: for (j=0; j<nz; j++) {
1419: idx = pj[j];
1420: pc1[j] = rtmp11[idx];
1421: pc2[j] = rtmp22[idx];
1422: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1423: }
1424: sctx.rs = rs;
1425: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1426: if (newshift == 1) goto endofwhile;
1427: break;
1429: case 3:
1430: for (j=0; j<nz; j++) {
1431: idx = bjtmp[j];
1432: rtmp11[idx] = 0.0;
1433: rtmp22[idx] = 0.0;
1434: rtmp33[idx] = 0.0;
1435: }
1436: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1437: idx = r[row];
1438: nz_tmp = ai[idx+1] - ai[idx];
1439: ajtmp = aj + ai[idx];
1440: v1 = aa + ai[idx];
1441: v2 = aa + ai[idx+1];
1442: v3 = aa + ai[idx+2];
1443: for (j=0; j<nz_tmp; j++) {
1444: idx = ics[ajtmp[j]];
1445: rtmp11[idx] = v1[j];
1446: rtmp22[idx] = v2[j];
1447: rtmp33[idx] = v3[j];
1448: }
1449: rtmp11[ics[r[row]]] += sctx.shift_amount;
1450: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1451: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1453: /* loop over all pivot row blocks above this row block */
1454: prow = *bjtmp++ ;
1455: while (prow < row) {
1456: pc1 = rtmp11 + prow;
1457: pc2 = rtmp22 + prow;
1458: pc3 = rtmp33 + prow;
1459: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1460: pv = ba + bd[prow];
1461: pj = nbj + bd[prow];
1462: mul1 = *pc1 * *pv;
1463: mul2 = *pc2 * *pv;
1464: mul3 = *pc3 * *pv;
1465: ++pv;
1466: *pc1 = mul1;
1467: *pc2 = mul2;
1468: *pc3 = mul3;
1469:
1470: nz_tmp = bi[prow+1] - bd[prow] - 1;
1471: /* update this row based on pivot row */
1472: for (j=0; j<nz_tmp; j++) {
1473: tmp = pv[j];
1474: idx = pj[j];
1475: rtmp11[idx] -= mul1 * tmp;
1476: rtmp22[idx] -= mul2 * tmp;
1477: rtmp33[idx] -= mul3 * tmp;
1478: }
1479: PetscLogFlops(6*nz_tmp);
1480: }
1481: prow = *bjtmp++ ;
1482: }
1484: /* Now take care of diagonal 3x3 block in this set of rows */
1485: /* note: prow = row here */
1486: pc1 = rtmp11 + prow;
1487: pc2 = rtmp22 + prow;
1488: pc3 = rtmp33 + prow;
1490: sctx.pv = *pc1;
1491: pj = bj + bi[prow];
1492: rs = 0.0;
1493: for (j=0; j<nz; j++){
1494: idx = pj[j];
1495: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1496: }
1497: sctx.rs = rs;
1498: MatLUCheckShift_inline(info,sctx,row,newshift);
1499: if (newshift == 1) goto endofwhile;
1501: if (*pc2 != 0.0 || *pc3 != 0.0){
1502: mul2 = (*pc2)/(*pc1);
1503: mul3 = (*pc3)/(*pc1);
1504: *pc2 = mul2;
1505: *pc3 = mul3;
1506: nz_tmp = bi[prow+1] - bd[prow] - 1;
1507: pj = nbj + bd[prow];
1508: for (j=0; j<nz_tmp; j++) {
1509: idx = pj[j] ;
1510: tmp = rtmp11[idx];
1511: rtmp22[idx] -= mul2 * tmp;
1512: rtmp33[idx] -= mul3 * tmp;
1513: }
1514: PetscLogFlops(4*nz_tmp);
1515: }
1516: ++prow;
1518: pc2 = rtmp22 + prow;
1519: pc3 = rtmp33 + prow;
1520: sctx.pv = *pc2;
1521: pj = bj + bi[prow];
1522: rs = 0.0;
1523: for (j=0; j<nz; j++){
1524: idx = pj[j];
1525: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1526: }
1527: sctx.rs = rs;
1528: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1529: if (newshift == 1) goto endofwhile;
1531: if (*pc3 != 0.0){
1532: mul3 = (*pc3)/(*pc2);
1533: *pc3 = mul3;
1534: pj = nbj + bd[prow];
1535: nz_tmp = bi[prow+1] - bd[prow] - 1;
1536: for (j=0; j<nz_tmp; j++) {
1537: idx = pj[j] ;
1538: tmp = rtmp22[idx];
1539: rtmp33[idx] -= mul3 * tmp;
1540: }
1541: PetscLogFlops(4*nz_tmp);
1542: }
1544: pj = bj + bi[row];
1545: pc1 = ba + bi[row];
1546: pc2 = ba + bi[row+1];
1547: pc3 = ba + bi[row+2];
1549: sctx.pv = rtmp33[row+2];
1550: rs = 0.0;
1551: rtmp11[row] = 1.0/rtmp11[row];
1552: rtmp22[row+1] = 1.0/rtmp22[row+1];
1553: rtmp33[row+2] = 1.0/rtmp33[row+2];
1554: /* copy row entries from dense representation to sparse */
1555: for (j=0; j<nz; j++) {
1556: idx = pj[j];
1557: pc1[j] = rtmp11[idx];
1558: pc2[j] = rtmp22[idx];
1559: pc3[j] = rtmp33[idx];
1560: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1561: }
1563: sctx.rs = rs;
1564: MatLUCheckShift_inline(info,sctx,row+2,newshift);
1565: if (newshift == 1) goto endofwhile;
1566: break;
1568: default:
1569: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1570: }
1571: row += nodesz; /* Update the row */
1572: }
1573: endofwhile:;
1574: } while (sctx.lushift);
1575: PetscFree(rtmp11);
1576: PetscFree(tmp_vec2);
1577: ISRestoreIndices(isicol,&ic);
1578: ISRestoreIndices(isrow,&r);
1579: ISRestoreIndices(iscol,&c);
1580: C->factor = FACTOR_LU;
1581: C->assembled = PETSC_TRUE;
1582: if (sctx.nshift) {
1583: if (info->shiftnz) {
1584: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1585: } else if (info->shiftpd) {
1586: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1587: }
1588: }
1589: PetscLogFlops(C->cmap.n);
1590: return(0);
1591: }
1593: /*
1594: Makes a longer coloring[] array and calls the usual code with that
1595: */
1598: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1599: {
1600: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
1601: PetscErrorCode ierr;
1602: PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1603: PetscInt *colorused,i;
1604: ISColoringValue *newcolor;
1607: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1608: /* loop over inodes, marking a color for each column*/
1609: row = 0;
1610: for (i=0; i<m; i++){
1611: for (j=0; j<ns[i]; j++) {
1612: newcolor[row++] = coloring[i] + j*ncolors;
1613: }
1614: }
1616: /* eliminate unneeded colors */
1617: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1618: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1619: for (i=0; i<n; i++) {
1620: colorused[newcolor[i]] = 1;
1621: }
1623: for (i=1; i<5*ncolors; i++) {
1624: colorused[i] += colorused[i-1];
1625: }
1626: ncolors = colorused[5*ncolors-1];
1627: for (i=0; i<n; i++) {
1628: newcolor[i] = colorused[newcolor[i]]-1;
1629: }
1630: PetscFree(colorused);
1631: ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
1632: PetscFree(coloring);
1633: return(0);
1634: }
1636: #include src/inline/ilu.h
1640: PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1641: {
1642: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1643: PetscScalar *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1644: PetscScalar *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1645: const PetscScalar *v = a->a,*v1,*v2,*v3,*v4,*v5;
1646: PetscReal zeropivot = 1.0e-15;
1647: PetscErrorCode ierr;
1648: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1649: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k;
1652: if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1653: if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1654: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1656: if (!a->inode.ibdiagvalid) {
1657: if (!a->inode.ibdiag) {
1658: /* calculate space needed for diagonal blocks */
1659: for (i=0; i<m; i++) {
1660: cnt += sizes[i]*sizes[i];
1661: }
1662: a->inode.bdiagsize = cnt;
1663: PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);
1664: }
1666: /* copy over the diagonal blocks and invert them */
1667: ibdiag = a->inode.ibdiag;
1668: bdiag = a->inode.bdiag;
1669: cnt = 0;
1670: for (i=0, row = 0; i<m; i++) {
1671: for (j=0; j<sizes[i]; j++) {
1672: for (k=0; k<sizes[i]; k++) {
1673: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1674: }
1675: }
1676: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));
1677:
1678: switch(sizes[i]) {
1679: case 1:
1680: /* Create matrix data structure */
1681: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1682: ibdiag[cnt] = 1.0/ibdiag[cnt];
1683: break;
1684: case 2:
1685: Kernel_A_gets_inverse_A_2(ibdiag+cnt);
1686: break;
1687: case 3:
1688: Kernel_A_gets_inverse_A_3(ibdiag+cnt);
1689: break;
1690: case 4:
1691: Kernel_A_gets_inverse_A_4(ibdiag+cnt);
1692: break;
1693: case 5:
1694: Kernel_A_gets_inverse_A_5(ibdiag+cnt);
1695: break;
1696: default:
1697: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1698: }
1699: cnt += sizes[i]*sizes[i];
1700: row += sizes[i];
1701: }
1702: a->inode.ibdiagvalid = PETSC_TRUE;
1703: }
1704: ibdiag = a->inode.ibdiag;
1705: bdiag = a->inode.bdiag;
1707: VecGetArray(xx,&x);
1708: if (xx != bb) {
1709: VecGetArray(bb,(PetscScalar**)&b);
1710: } else {
1711: b = x;
1712: }
1714: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1715: xs = x;
1716: if (flag & SOR_ZERO_INITIAL_GUESS) {
1717: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1719: for (i=0, row=0; i<m; i++) {
1720: sz = diag[row] - ii[row];
1721: v1 = a->a + ii[row];
1722: idx = a->j + ii[row];
1724: /* see comments for MatMult_Inode() for how this is coded */
1725: switch (sizes[i]){
1726: case 1:
1727:
1728: sum1 = b[row];
1729: for(n = 0; n<sz-1; n+=2) {
1730: i1 = idx[0];
1731: i2 = idx[1];
1732: idx += 2;
1733: tmp0 = x[i1];
1734: tmp1 = x[i2];
1735: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1736: }
1737:
1738: if (n == sz-1){
1739: tmp0 = x[*idx];
1740: sum1 -= *v1 * tmp0;
1741: }
1742: x[row++] = sum1*(*ibdiag++);
1743: break;
1744: case 2:
1745: v2 = a->a + ii[row+1];
1746: sum1 = b[row];
1747: sum2 = b[row+1];
1748: for(n = 0; n<sz-1; n+=2) {
1749: i1 = idx[0];
1750: i2 = idx[1];
1751: idx += 2;
1752: tmp0 = x[i1];
1753: tmp1 = x[i2];
1754: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1755: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1756: }
1757:
1758: if (n == sz-1){
1759: tmp0 = x[*idx];
1760: sum1 -= v1[0] * tmp0;
1761: sum2 -= v2[0] * tmp0;
1762: }
1763: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1764: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1765: ibdiag += 4;
1766: break;
1767: case 3:
1768: v2 = a->a + ii[row+1];
1769: v3 = a->a + ii[row+2];
1770: sum1 = b[row];
1771: sum2 = b[row+1];
1772: sum3 = b[row+2];
1773: for(n = 0; n<sz-1; n+=2) {
1774: i1 = idx[0];
1775: i2 = idx[1];
1776: idx += 2;
1777: tmp0 = x[i1];
1778: tmp1 = x[i2];
1779: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1780: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1781: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1782: }
1783:
1784: if (n == sz-1){
1785: tmp0 = x[*idx];
1786: sum1 -= v1[0] * tmp0;
1787: sum2 -= v2[0] * tmp0;
1788: sum3 -= v3[0] * tmp0;
1789: }
1790: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1791: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1792: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1793: ibdiag += 9;
1794: break;
1795: case 4:
1796: v2 = a->a + ii[row+1];
1797: v3 = a->a + ii[row+2];
1798: v4 = a->a + ii[row+3];
1799: sum1 = b[row];
1800: sum2 = b[row+1];
1801: sum3 = b[row+2];
1802: sum4 = b[row+3];
1803: for(n = 0; n<sz-1; n+=2) {
1804: i1 = idx[0];
1805: i2 = idx[1];
1806: idx += 2;
1807: tmp0 = x[i1];
1808: tmp1 = x[i2];
1809: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1810: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1811: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1812: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1813: }
1814:
1815: if (n == sz-1){
1816: tmp0 = x[*idx];
1817: sum1 -= v1[0] * tmp0;
1818: sum2 -= v2[0] * tmp0;
1819: sum3 -= v3[0] * tmp0;
1820: sum4 -= v4[0] * tmp0;
1821: }
1822: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1823: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1824: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1825: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1826: ibdiag += 16;
1827: break;
1828: case 5:
1829: v2 = a->a + ii[row+1];
1830: v3 = a->a + ii[row+2];
1831: v4 = a->a + ii[row+3];
1832: v5 = a->a + ii[row+4];
1833: sum1 = b[row];
1834: sum2 = b[row+1];
1835: sum3 = b[row+2];
1836: sum4 = b[row+3];
1837: sum5 = b[row+4];
1838: for(n = 0; n<sz-1; n+=2) {
1839: i1 = idx[0];
1840: i2 = idx[1];
1841: idx += 2;
1842: tmp0 = x[i1];
1843: tmp1 = x[i2];
1844: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1845: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1846: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1847: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1848: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1849: }
1850:
1851: if (n == sz-1){
1852: tmp0 = x[*idx];
1853: sum1 -= v1[0] * tmp0;
1854: sum2 -= v2[0] * tmp0;
1855: sum3 -= v3[0] * tmp0;
1856: sum4 -= v4[0] * tmp0;
1857: sum5 -= v5[0] * tmp0;
1858: }
1859: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1860: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1861: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1862: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1863: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1864: ibdiag += 25;
1865: break;
1866: default:
1867: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1868: }
1869: }
1871: xb = x;
1872: PetscLogFlops(a->nz);
1873: } else xb = b;
1874: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1875: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1876: cnt = 0;
1877: for (i=0, row=0; i<m; i++) {
1879: switch (sizes[i]){
1880: case 1:
1881: x[row++] *= bdiag[cnt++];
1882: break;
1883: case 2:
1884: x1 = x[row]; x2 = x[row+1];
1885: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1886: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1887: x[row++] = tmp1;
1888: x[row++] = tmp2;
1889: cnt += 4;
1890: break;
1891: case 3:
1892: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
1893: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1894: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1895: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1896: x[row++] = tmp1;
1897: x[row++] = tmp2;
1898: x[row++] = tmp3;
1899: cnt += 9;
1900: break;
1901: case 4:
1902: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1903: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1904: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1905: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1906: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1907: x[row++] = tmp1;
1908: x[row++] = tmp2;
1909: x[row++] = tmp3;
1910: x[row++] = tmp4;
1911: cnt += 16;
1912: break;
1913: case 5:
1914: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1915: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1916: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1917: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1918: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1919: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1920: x[row++] = tmp1;
1921: x[row++] = tmp2;
1922: x[row++] = tmp3;
1923: x[row++] = tmp4;
1924: x[row++] = tmp5;
1925: cnt += 25;
1926: break;
1927: default:
1928: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1929: }
1930: }
1931: PetscLogFlops(m);
1932: }
1933: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1935: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1936: for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1937: ibdiag -= sizes[i]*sizes[i];
1938: sz = ii[row+1] - diag[row] - 1;
1939: v1 = a->a + diag[row] + 1;
1940: idx = a->j + diag[row] + 1;
1942: /* see comments for MatMult_Inode() for how this is coded */
1943: switch (sizes[i]){
1944: case 1:
1945:
1946: sum1 = xb[row];
1947: for(n = 0; n<sz-1; n+=2) {
1948: i1 = idx[0];
1949: i2 = idx[1];
1950: idx += 2;
1951: tmp0 = x[i1];
1952: tmp1 = x[i2];
1953: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1954: }
1955:
1956: if (n == sz-1){
1957: tmp0 = x[*idx];
1958: sum1 -= *v1*tmp0;
1959: }
1960: x[row--] = sum1*(*ibdiag);
1961: break;
1963: case 2:
1964:
1965: sum1 = xb[row];
1966: sum2 = xb[row-1];
1967: /* note that sum1 is associated with the second of the two rows */
1968: v2 = a->a + diag[row-1] + 2;
1969: for(n = 0; n<sz-1; n+=2) {
1970: i1 = idx[0];
1971: i2 = idx[1];
1972: idx += 2;
1973: tmp0 = x[i1];
1974: tmp1 = x[i2];
1975: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1976: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1977: }
1978:
1979: if (n == sz-1){
1980: tmp0 = x[*idx];
1981: sum1 -= *v1*tmp0;
1982: sum2 -= *v2*tmp0;
1983: }
1984: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1985: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1986: break;
1987: case 3:
1988:
1989: sum1 = xb[row];
1990: sum2 = xb[row-1];
1991: sum3 = xb[row-2];
1992: v2 = a->a + diag[row-1] + 2;
1993: v3 = a->a + diag[row-2] + 3;
1994: for(n = 0; n<sz-1; n+=2) {
1995: i1 = idx[0];
1996: i2 = idx[1];
1997: idx += 2;
1998: tmp0 = x[i1];
1999: tmp1 = x[i2];
2000: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2001: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2002: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2003: }
2004:
2005: if (n == sz-1){
2006: tmp0 = x[*idx];
2007: sum1 -= *v1*tmp0;
2008: sum2 -= *v2*tmp0;
2009: sum3 -= *v3*tmp0;
2010: }
2011: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2012: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2013: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2014: break;
2015: case 4:
2016:
2017: sum1 = xb[row];
2018: sum2 = xb[row-1];
2019: sum3 = xb[row-2];
2020: sum4 = xb[row-3];
2021: v2 = a->a + diag[row-1] + 2;
2022: v3 = a->a + diag[row-2] + 3;
2023: v4 = a->a + diag[row-3] + 4;
2024: for(n = 0; n<sz-1; n+=2) {
2025: i1 = idx[0];
2026: i2 = idx[1];
2027: idx += 2;
2028: tmp0 = x[i1];
2029: tmp1 = x[i2];
2030: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2031: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2032: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2033: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2034: }
2035:
2036: if (n == sz-1){
2037: tmp0 = x[*idx];
2038: sum1 -= *v1*tmp0;
2039: sum2 -= *v2*tmp0;
2040: sum3 -= *v3*tmp0;
2041: sum4 -= *v4*tmp0;
2042: }
2043: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2044: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2045: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2046: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2047: break;
2048: case 5:
2049:
2050: sum1 = xb[row];
2051: sum2 = xb[row-1];
2052: sum3 = xb[row-2];
2053: sum4 = xb[row-3];
2054: sum5 = xb[row-4];
2055: v2 = a->a + diag[row-1] + 2;
2056: v3 = a->a + diag[row-2] + 3;
2057: v4 = a->a + diag[row-3] + 4;
2058: v5 = a->a + diag[row-4] + 5;
2059: for(n = 0; n<sz-1; n+=2) {
2060: i1 = idx[0];
2061: i2 = idx[1];
2062: idx += 2;
2063: tmp0 = x[i1];
2064: tmp1 = x[i2];
2065: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2066: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2067: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2068: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2069: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2070: }
2071:
2072: if (n == sz-1){
2073: tmp0 = x[*idx];
2074: sum1 -= *v1*tmp0;
2075: sum2 -= *v2*tmp0;
2076: sum3 -= *v3*tmp0;
2077: sum4 -= *v4*tmp0;
2078: sum5 -= *v5*tmp0;
2079: }
2080: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2081: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2082: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2083: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2084: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2085: break;
2086: default:
2087: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2088: }
2089: }
2091: PetscLogFlops(a->nz);
2092: }
2093: its--;
2094: }
2095: if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2096: VecRestoreArray(xx,&x);
2097: if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
2098: return(0);
2099: }
2102: /*
2103: samestructure indicates that the matrix has not changed its nonzero structure so we
2104: do not need to recompute the inodes
2105: */
2108: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2109: {
2110: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2112: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2113: PetscTruth flag;
2116: if (!a->inode.use) return(0);
2117: if (a->inode.checked && samestructure) return(0);
2120: m = A->rmap.n;
2121: if (a->inode.size) {ns = a->inode.size;}
2122: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
2124: i = 0;
2125: node_count = 0;
2126: idx = a->j;
2127: ii = a->i;
2128: while (i < m){ /* For each row */
2129: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
2130: /* Limits the number of elements in a node to 'a->inode.limit' */
2131: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2132: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
2133: if (nzy != nzx) break;
2134: idy += nzx; /* Same nonzero pattern */
2135: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
2136: if (!flag) break;
2137: }
2138: ns[node_count++] = blk_size;
2139: idx += blk_size*nzx;
2140: i = j;
2141: }
2142: /* If not enough inodes found,, do not use inode version of the routines */
2143: if (!a->inode.size && m && node_count > .9*m) {
2144: PetscFree(ns);
2145: a->inode.node_count = 0;
2146: a->inode.size = PETSC_NULL;
2147: a->inode.use = PETSC_FALSE;
2148: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
2149: } else {
2150: A->ops->mult = MatMult_Inode;
2151: A->ops->relax = MatRelax_Inode;
2152: A->ops->multadd = MatMultAdd_Inode;
2153: A->ops->solve = MatSolve_Inode;
2154: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2155: A->ops->getrowij = MatGetRowIJ_Inode;
2156: A->ops->restorerowij = MatRestoreRowIJ_Inode;
2157: A->ops->getcolumnij = MatGetColumnIJ_Inode;
2158: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2159: A->ops->coloringpatch = MatColoringPatch_Inode;
2160: a->inode.node_count = node_count;
2161: a->inode.size = ns;
2162: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
2163: }
2164: return(0);
2165: }
2167: /*
2168: This is really ugly. if inodes are used this replaces the
2169: permutations with ones that correspond to rows/cols of the matrix
2170: rather then inode blocks
2171: */
2174: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2175: {
2176: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2179: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
2180: if (f) {
2181: (*f)(A,rperm,cperm);
2182: }
2183: return(0);
2184: }
2189: PetscErrorCode MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2190: {
2191: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
2193: PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2194: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
2195: PetscInt nslim_col,*ns_col;
2196: IS ris = *rperm,cis = *cperm;
2199: if (!a->inode.size) return(0); /* no inodes so return */
2200: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
2202: Mat_CreateColInode(A,&nslim_col,&ns_col);
2203: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
2204: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
2205: permc = permr + m;
2207: ISGetIndices(ris,&ridx);
2208: ISGetIndices(cis,&cidx);
2210: /* Form the inode structure for the rows of permuted matric using inv perm*/
2211: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2213: /* Construct the permutations for rows*/
2214: for (i=0,row = 0; i<nslim_row; ++i){
2215: indx = ridx[i];
2216: start_val = tns[indx];
2217: end_val = tns[indx + 1];
2218: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2219: }
2221: /* Form the inode structure for the columns of permuted matrix using inv perm*/
2222: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2224: /* Construct permutations for columns */
2225: for (i=0,col=0; i<nslim_col; ++i){
2226: indx = cidx[i];
2227: start_val = tns[indx];
2228: end_val = tns[indx + 1];
2229: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2230: }
2232: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
2233: ISSetPermutation(*rperm);
2234: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
2235: ISSetPermutation(*cperm);
2236:
2237: ISRestoreIndices(ris,&ridx);
2238: ISRestoreIndices(cis,&cidx);
2240: PetscFree(ns_col);
2241: PetscFree(permr);
2242: ISDestroy(cis);
2243: ISDestroy(ris);
2244: PetscFree(tns);
2245: return(0);
2246: }
2251: /*@C
2252: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2254: Collective on Mat
2256: Input Parameter:
2257: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2259: Output Parameter:
2260: + node_count - no of inodes present in the matrix.
2261: . sizes - an array of size node_count,with sizes of each inode.
2262: - limit - the max size used to generate the inodes.
2264: Level: advanced
2266: Notes: This routine returns some internal storage information
2267: of the matrix, it is intended to be used by advanced users.
2268: It should be called after the matrix is assembled.
2269: The contents of the sizes[] array should not be changed.
2270: PETSC_NULL may be passed for information not requested.
2272: .keywords: matrix, seqaij, get, inode
2274: .seealso: MatGetInfo()
2275: @*/
2276: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2277: {
2278: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2281: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2282: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
2283: if (f) {
2284: (*f)(A,node_count,sizes,limit);
2285: }
2286: return(0);
2287: }
2292: PetscErrorCode MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2293: {
2294: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2297: if (node_count) *node_count = a->inode.node_count;
2298: if (sizes) *sizes = a->inode.size;
2299: if (limit) *limit = a->inode.limit;
2300: return(0);
2301: }