Actual source code: aijnode.c
1: /*$Id: aijnode.c,v 1.125 2001/04/10 19:35:19 bsmith Exp $*/
2: /*
3: This file provides high performance routines for the AIJ (compressed row)
4: format by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include src/mat/impls/aij/seq/aij.h
7: #include src/vec/vecimpl.h
9: EXTERN int Mat_AIJ_CheckInode(Mat);
10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);
13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
22: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
23: {
24: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
25: int ierr,i,count,m,n,min_mn,*ns_row,*ns_col;
28: n = A->n;
29: m = A->m;
30: ns_row = a->inode.size;
31:
32: min_mn = (m < n) ? m : n;
33: if (!ns) {
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
35: for(; count+1 < n; count++,i++);
36: if (count < n) {
37: i++;
38: }
39: *size = i;
40: return(0);
41: }
42: PetscMalloc((n+1)*sizeof(int),&ns_col);
43:
44: /* Use the same row structure wherever feasible. */
45: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
46: ns_col[i] = ns_row[i];
47: }
49: /* if m < n; pad up the remainder with inode_limit */
50: for(; count+1 < n; count++,i++) {
51: ns_col[i] = 1;
52: }
53: /* The last node is the odd ball. padd it up with the remaining rows; */
54: if (count < n) {
55: ns_col[i] = n - count;
56: i++;
57: } else if (count > n) {
58: /* Adjust for the over estimation */
59: ns_col[i-1] += n - count;
60: }
61: *size = i;
62: *ns = ns_col;
63: return(0);
64: }
67: /*
68: This builds symmetric version of nonzero structure,
69: */
70: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
71: {
72: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
73: int *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
74: int *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
77: nslim_row = a->inode.node_count;
78: m = A->m;
79: n = A->n;
80: if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
81:
82: /* Use the row_inode as column_inode */
83: nslim_col = nslim_row;
84: ns_col = ns_row;
86: /* allocate space for reformated inode structure */
87: PetscMalloc((nslim_col+1)*sizeof(int),&tns);
88: PetscMalloc((n+1)*sizeof(int),&tvc);
89: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
91: for (i1=0,col=0; i1<nslim_col; ++i1){
92: nsz = ns_col[i1];
93: for (i2=0; i2<nsz; ++i2,++col)
94: tvc[col] = i1;
95: }
96: /* allocate space for row pointers */
97: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
98: *iia = ia;
99: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
100: PetscMalloc((nslim_row+1)*sizeof(int),&work);
102: /* determine the number of columns in each row */
103: ia[0] = oshift;
104: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
106: j = aj + ai[row] + ishift;
107: jmax = aj + ai[row+1] + ishift;
108: i2 = 0;
109: col = *j++ + ishift;
110: i2 = tvc[col];
111: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
112: ia[i1+1]++;
113: ia[i2+1]++;
114: i2++; /* Start col of next node */
115: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
116: i2 = tvc[col];
117: }
118: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
119: }
121: /* shift ia[i] to point to next row */
122: for (i1=1; i1<nslim_row+1; i1++) {
123: row = ia[i1-1];
124: ia[i1] += row;
125: work[i1-1] = row - oshift;
126: }
128: /* allocate space for column pointers */
129: nz = ia[nslim_row] + (!ishift);
130: PetscMalloc(nz*sizeof(int),&ja);
131: *jja = ja;
133: /* loop over lower triangular part putting into ja */
134: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
135: j = aj + ai[row] + ishift;
136: jmax = aj + ai[row+1] + ishift;
137: i2 = 0; /* Col inode index */
138: col = *j++ + ishift;
139: i2 = tvc[col];
140: while (i2<i1 && j<jmax) {
141: ja[work[i2]++] = i1 + oshift;
142: ja[work[i1]++] = i2 + oshift;
143: ++i2;
144: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
145: i2 = tvc[col];
146: }
147: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
149: }
150: PetscFree(work);
151: PetscFree(tns);
152: PetscFree(tvc);
153: return(0);
154: }
156: /*
157: This builds nonsymmetric version of nonzero structure,
158: */
159: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
160: {
161: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
162: int *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
163: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
166: nslim_row = a->inode.node_count;
167: n = A->n;
169: /* Create The column_inode for this matrix */
170: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
171:
172: /* allocate space for reformated column_inode structure */
173: PetscMalloc((nslim_col +1)*sizeof(int),&tns);
174: PetscMalloc((n +1)*sizeof(int),&tvc);
175: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
177: for (i1=0,col=0; i1<nslim_col; ++i1){
178: nsz = ns_col[i1];
179: for (i2=0; i2<nsz; ++i2,++col)
180: tvc[col] = i1;
181: }
182: /* allocate space for row pointers */
183: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
184: *iia = ia;
185: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
186: PetscMalloc((nslim_row+1)*sizeof(int),&work);
188: /* determine the number of columns in each row */
189: ia[0] = oshift;
190: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
191: j = aj + ai[row] + ishift;
192: col = *j++ + ishift;
193: i2 = tvc[col];
194: nz = ai[row+1] - ai[row];
195: while (nz-- > 0) { /* off-diagonal elemets */
196: ia[i1+1]++;
197: i2++; /* Start col of next node */
198: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
199: i2 = tvc[col];
200: }
201: }
203: /* shift ia[i] to point to next row */
204: for (i1=1; i1<nslim_row+1; i1++) {
205: row = ia[i1-1];
206: ia[i1] += row;
207: work[i1-1] = row - oshift;
208: }
210: /* allocate space for column pointers */
211: nz = ia[nslim_row] + (!ishift);
212: PetscMalloc(nz*sizeof(int),&ja);
213: *jja = ja;
215: /* loop over matrix putting into ja */
216: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
217: j = aj + ai[row] + ishift;
218: i2 = 0; /* Col inode index */
219: col = *j++ + ishift;
220: i2 = tvc[col];
221: nz = ai[row+1] - ai[row];
222: while (nz-- > 0) {
223: ja[work[i1]++] = i2 + oshift;
224: ++i2;
225: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
226: i2 = tvc[col];
227: }
228: }
229: PetscFree(ns_col);
230: PetscFree(work);
231: PetscFree(tns);
232: PetscFree(tvc);
233: return(0);
234: }
236: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
237: {
238: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
239: int ierr,ishift;
242: *n = a->inode.node_count;
243: if (!ia) return(0);
245: ishift = a->indexshift;
246: if (symmetric) {
247: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
248: } else {
249: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
250: }
251: return(0);
252: }
254: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
255: {
259: if (!ia) return(0);
260: PetscFree(*ia);
261: PetscFree(*ja);
262: return(0);
263: }
265: /* ----------------------------------------------------------- */
267: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
268: {
269: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
270: int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
271: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274: nslim_row = a->inode.node_count;
275: n = A->n;
277: /* Create The column_inode for this matrix */
278: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
279:
280: /* allocate space for reformated column_inode structure */
281: PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
282: PetscMalloc((n + 1)*sizeof(int),&tvc);
283: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
285: for (i1=0,col=0; i1<nslim_col; ++i1){
286: nsz = ns_col[i1];
287: for (i2=0; i2<nsz; ++i2,++col)
288: tvc[col] = i1;
289: }
290: /* allocate space for column pointers */
291: PetscMalloc((nslim_col+1)*sizeof(int),&ia);
292: *iia = ia;
293: PetscMemzero(ia,(nslim_col+1)*sizeof(int));
294: PetscMalloc((nslim_col+1)*sizeof(int),&work);
296: /* determine the number of columns in each row */
297: ia[0] = oshift;
298: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
299: j = aj + ai[row] + ishift;
300: col = *j++ + ishift;
301: i2 = tvc[col];
302: nz = ai[row+1] - ai[row];
303: while (nz-- > 0) { /* off-diagonal elemets */
304: /* ia[i1+1]++; */
305: ia[i2+1]++;
306: i2++;
307: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
308: i2 = tvc[col];
309: }
310: }
312: /* shift ia[i] to point to next col */
313: for (i1=1; i1<nslim_col+1; i1++) {
314: col = ia[i1-1];
315: ia[i1] += col;
316: work[i1-1] = col - oshift;
317: }
319: /* allocate space for column pointers */
320: nz = ia[nslim_col] + (!ishift);
321: PetscMalloc(nz*sizeof(int),&ja);
322: *jja = ja;
324: /* loop over matrix putting into ja */
325: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
326: j = aj + ai[row] + ishift;
327: i2 = 0; /* Col inode index */
328: col = *j++ + ishift;
329: i2 = tvc[col];
330: nz = ai[row+1] - ai[row];
331: while (nz-- > 0) {
332: /* ja[work[i1]++] = i2 + oshift; */
333: ja[work[i2]++] = i1 + oshift;
334: i2++;
335: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
336: i2 = tvc[col];
337: }
338: }
339: PetscFree(ns_col);
340: PetscFree(work);
341: PetscFree(tns);
342: PetscFree(tvc);
343: return(0);
344: }
346: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
347: {
348: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
349: int ierr,ishift;
352: Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
353: if (!ia) return(0);
355: ishift = a->indexshift;
356: if (symmetric) {
357: /* Since the indices are symmetric it does'nt matter */
358: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
359: } else {
360: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
361: }
362: return(0);
363: }
365: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
366: {
370: if (!ia) return(0);
371: PetscFree(*ia);
372: PetscFree(*ja);
373: return(0);
374: }
376: /* ----------------------------------------------------------- */
378: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
379: {
380: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
381: Scalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
382: Scalar *v1,*v2,*v3,*v4,*v5,*x,*y;
383: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
384: int shift = a->indexshift;
385:
386: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
387: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
388: #endif
391: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
392: node_max = a->inode.node_count;
393: ns = a->inode.size; /* Node Size array */
394: VecGetArray(xx,&x);
395: VecGetArray(yy,&y);
396: x = x + shift; /* shift for Fortran start by 1 indexing */
397: idx = a->j;
398: v1 = a->a;
399: ii = a->i;
401: for (i = 0,row = 0; i< node_max; ++i){
402: nsz = ns[i];
403: n = ii[1] - ii[0];
404: ii += nsz;
405: sz = n; /*No of non zeros in this row */
406: /* Switch on the size of Node */
407: switch (nsz){ /* Each loop in 'case' is unrolled */
408: case 1 :
409: sum1 = 0;
410:
411: for(n = 0; n< sz-1; n+=2) {
412: i1 = idx[0]; /* The instructions are ordered to */
413: i2 = idx[1]; /* make the compiler's job easy */
414: idx += 2;
415: tmp0 = x[i1];
416: tmp1 = x[i2];
417: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
418: }
419:
420: if (n == sz-1){ /* Take care of the last nonzero */
421: tmp0 = x[*idx++];
422: sum1 += *v1++ * tmp0;
423: }
424: y[row++]=sum1;
425: break;
426: case 2:
427: sum1 = 0;
428: sum2 = 0;
429: v2 = v1 + n;
430:
431: for (n = 0; n< sz-1; n+=2) {
432: i1 = idx[0];
433: i2 = idx[1];
434: idx += 2;
435: tmp0 = x[i1];
436: tmp1 = x[i2];
437: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
439: }
440: if (n == sz-1){
441: tmp0 = x[*idx++];
442: sum1 += *v1++ * tmp0;
443: sum2 += *v2++ * tmp0;
444: }
445: y[row++]=sum1;
446: y[row++]=sum2;
447: v1 =v2; /* Since the next block to be processed starts there*/
448: idx +=sz;
449: break;
450: case 3:
451: sum1 = 0;
452: sum2 = 0;
453: sum3 = 0;
454: v2 = v1 + n;
455: v3 = v2 + n;
456:
457: for (n = 0; n< sz-1; n+=2) {
458: i1 = idx[0];
459: i2 = idx[1];
460: idx += 2;
461: tmp0 = x[i1];
462: tmp1 = x[i2];
463: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
464: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
465: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
466: }
467: if (n == sz-1){
468: tmp0 = x[*idx++];
469: sum1 += *v1++ * tmp0;
470: sum2 += *v2++ * tmp0;
471: sum3 += *v3++ * tmp0;
472: }
473: y[row++]=sum1;
474: y[row++]=sum2;
475: y[row++]=sum3;
476: v1 =v3; /* Since the next block to be processed starts there*/
477: idx +=2*sz;
478: break;
479: case 4:
480: sum1 = 0;
481: sum2 = 0;
482: sum3 = 0;
483: sum4 = 0;
484: v2 = v1 + n;
485: v3 = v2 + n;
486: v4 = v3 + n;
487:
488: for (n = 0; n< sz-1; n+=2) {
489: i1 = idx[0];
490: i2 = idx[1];
491: idx += 2;
492: tmp0 = x[i1];
493: tmp1 = x[i2];
494: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
495: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
496: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
497: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
498: }
499: if (n == sz-1){
500: tmp0 = x[*idx++];
501: sum1 += *v1++ * tmp0;
502: sum2 += *v2++ * tmp0;
503: sum3 += *v3++ * tmp0;
504: sum4 += *v4++ * tmp0;
505: }
506: y[row++]=sum1;
507: y[row++]=sum2;
508: y[row++]=sum3;
509: y[row++]=sum4;
510: v1 =v4; /* Since the next block to be processed starts there*/
511: idx +=3*sz;
512: break;
513: case 5:
514: sum1 = 0;
515: sum2 = 0;
516: sum3 = 0;
517: sum4 = 0;
518: sum5 = 0;
519: v2 = v1 + n;
520: v3 = v2 + n;
521: v4 = v3 + n;
522: v5 = v4 + n;
523:
524: for (n = 0; n<sz-1; n+=2) {
525: i1 = idx[0];
526: i2 = idx[1];
527: idx += 2;
528: tmp0 = x[i1];
529: tmp1 = x[i2];
530: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
531: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
532: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
533: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
534: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
535: }
536: if (n == sz-1){
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++]=sum1;
545: y[row++]=sum2;
546: y[row++]=sum3;
547: y[row++]=sum4;
548: y[row++]=sum5;
549: v1 =v5; /* Since the next block to be processed starts there */
550: idx +=4*sz;
551: break;
552: default :
553: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
554: }
555: }
556: VecRestoreArray(xx,&x);
557: VecRestoreArray(yy,&y);
558: PetscLogFlops(2*a->nz - A->m);
559: return(0);
560: }
561: /* ----------------------------------------------------------- */
562: /* Almost same code as the MatMult_SeqAij_Inode() */
563: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
566: Scalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
567: Scalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z;
568: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
569: int shift = a->indexshift;
570:
572: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
573: node_max = a->inode.node_count;
574: ns = a->inode.size; /* Node Size array */
575: VecGetArray(xx,&x);
576: VecGetArray(yy,&y);
577: if (zz != yy) {
578: VecGetArray(zz,&z);
579: } else {
580: z = y;
581: }
583: x = x + shift; /* shift for Fortran start by 1 indexing */
584: idx = a->j;
585: v1 = a->a;
586: ii = a->i;
588: for (i = 0,row = 0; i< node_max; ++i){
589: nsz = ns[i];
590: n = ii[1] - ii[0];
591: ii += nsz;
592: sz = n; /*No of non zeros in this row */
593: /* Switch on the size of Node */
594: switch (nsz){ /* Each loop in 'case' is unrolled */
595: case 1 :
596: sum1 = *z++;
597:
598: for(n = 0; n< sz-1; n+=2) {
599: i1 = idx[0]; /* The instructions are ordered to */
600: i2 = idx[1]; /* make the compiler's job easy */
601: idx += 2;
602: tmp0 = x[i1];
603: tmp1 = x[i2];
604: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
605: }
606:
607: if(n == sz-1){ /* Take care of the last nonzero */
608: tmp0 = x[*idx++];
609: sum1 += *v1++ * tmp0;
610: }
611: y[row++]=sum1;
612: break;
613: case 2:
614: sum1 = *z++;
615: sum2 = *z++;
616: v2 = v1 + n;
617:
618: for(n = 0; n< sz-1; n+=2) {
619: i1 = idx[0];
620: i2 = idx[1];
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
626: }
627: if(n == sz-1){
628: tmp0 = x[*idx++];
629: sum1 += *v1++ * tmp0;
630: sum2 += *v2++ * tmp0;
631: }
632: y[row++]=sum1;
633: y[row++]=sum2;
634: v1 =v2; /* Since the next block to be processed starts there*/
635: idx +=sz;
636: break;
637: case 3:
638: sum1 = *z++;
639: sum2 = *z++;
640: sum3 = *z++;
641: v2 = v1 + n;
642: v3 = v2 + n;
643:
644: for (n = 0; n< sz-1; n+=2) {
645: i1 = idx[0];
646: i2 = idx[1];
647: idx += 2;
648: tmp0 = x[i1];
649: tmp1 = x[i2];
650: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
653: }
654: if (n == sz-1){
655: tmp0 = x[*idx++];
656: sum1 += *v1++ * tmp0;
657: sum2 += *v2++ * tmp0;
658: sum3 += *v3++ * tmp0;
659: }
660: y[row++]=sum1;
661: y[row++]=sum2;
662: y[row++]=sum3;
663: v1 =v3; /* Since the next block to be processed starts there*/
664: idx +=2*sz;
665: break;
666: case 4:
667: sum1 = *z++;
668: sum2 = *z++;
669: sum3 = *z++;
670: sum4 = *z++;
671: v2 = v1 + n;
672: v3 = v2 + n;
673: v4 = v3 + n;
674:
675: for (n = 0; n< sz-1; n+=2) {
676: i1 = idx[0];
677: i2 = idx[1];
678: idx += 2;
679: tmp0 = x[i1];
680: tmp1 = x[i2];
681: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
682: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
683: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
684: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
685: }
686: if (n == sz-1){
687: tmp0 = x[*idx++];
688: sum1 += *v1++ * tmp0;
689: sum2 += *v2++ * tmp0;
690: sum3 += *v3++ * tmp0;
691: sum4 += *v4++ * tmp0;
692: }
693: y[row++]=sum1;
694: y[row++]=sum2;
695: y[row++]=sum3;
696: y[row++]=sum4;
697: v1 =v4; /* Since the next block to be processed starts there*/
698: idx +=3*sz;
699: break;
700: case 5:
701: sum1 = *z++;
702: sum2 = *z++;
703: sum3 = *z++;
704: sum4 = *z++;
705: sum5 = *z++;
706: v2 = v1 + n;
707: v3 = v2 + n;
708: v4 = v3 + n;
709: v5 = v4 + n;
710:
711: for (n = 0; n<sz-1; n+=2) {
712: i1 = idx[0];
713: i2 = idx[1];
714: idx += 2;
715: tmp0 = x[i1];
716: tmp1 = x[i2];
717: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
718: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
719: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
720: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
721: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
722: }
723: if(n == sz-1){
724: tmp0 = x[*idx++];
725: sum1 += *v1++ * tmp0;
726: sum2 += *v2++ * tmp0;
727: sum3 += *v3++ * tmp0;
728: sum4 += *v4++ * tmp0;
729: sum5 += *v5++ * tmp0;
730: }
731: y[row++]=sum1;
732: y[row++]=sum2;
733: y[row++]=sum3;
734: y[row++]=sum4;
735: y[row++]=sum5;
736: v1 =v5; /* Since the next block to be processed starts there */
737: idx +=4*sz;
738: break;
739: default :
740: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
741: }
742: }
743: VecRestoreArray(xx,&x);
744: VecRestoreArray(yy,&y);
745: if (zz != yy) {
746: VecRestoreArray(zz,&z);
747: }
748: PetscLogFlops(2*a->nz);
749: return(0);
750: }
751: /* ----------------------------------------------------------- */
752: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,int *,ISColoring *);
754: int Mat_AIJ_CheckInode(Mat A)
755: {
756: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
757: int ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
758: PetscTruth flag,flg;
761: /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
762: if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODESn"); return(0);}
763: PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
764: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inoden");return(0);}
765: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
766: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unrolln");return(0);}
767: PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
768: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
769: m = A->m;
770: if (a->inode.size) {ns = a->inode.size;}
771: else {PetscMalloc((m+1)*sizeof(int),&ns);}
773: i = 0;
774: node_count = 0;
775: idx = a->j;
776: ii = a->i;
777: while (i < m){ /* For each row */
778: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
779: /* Limits the number of elements in a node to 'a->inode.limit' */
780: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
781: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
782: if (nzy != nzx) break;
783: idy += nzx; /* Same nonzero pattern */
784: PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
785: if (flag == PETSC_FALSE) break;
786: }
787: ns[node_count++] = blk_size;
788: idx += blk_size*nzx;
789: i = j;
790: }
791: /* If not enough inodes found,, do not use inode version of the routines */
792: if (!a->inode.size && m && node_count > 0.9*m) {
793: PetscFree(ns);
794: A->ops->mult = MatMult_SeqAIJ;
795: A->ops->multadd = MatMultAdd_SeqAIJ;
796: A->ops->solve = MatSolve_SeqAIJ;
797: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
798: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
799: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
800: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
801: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
802: A->ops->coloringpatch = 0;
803: a->inode.node_count = 0;
804: a->inode.size = 0;
805: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routinesn",node_count,m);
806: } else {
807: A->ops->mult = MatMult_SeqAIJ_Inode;
808: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
809: A->ops->solve = MatSolve_SeqAIJ_Inode;
810: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
811: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
812: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
813: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
814: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
815: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
816: a->inode.node_count = node_count;
817: a->inode.size = ns;
818: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routinesn",node_count,m,a->inode.limit);
819: }
820: return(0);
821: }
823: /* ----------------------------------------------------------- */
824: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
825: {
826: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
827: IS iscol = a->col,isrow = a->row;
828: int *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,shift = a->indexshift,*a_j = a->j;
829: int node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
830: Scalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
831: Scalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
834: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
835: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
836: node_max = a->inode.node_count;
837: ns = a->inode.size; /* Node Size array */
839: VecGetArray(bb,&b);
840: VecGetArray(xx,&x);
841: tmp = a->solve_work;
842:
843: ISGetIndices(isrow,&rout); r = rout;
844: ISGetIndices(iscol,&cout); c = cout + (n-1);
845:
846: /* forward solve the lower triangular */
847: tmps = tmp + shift;
848: aa = a_a +shift;
849: aj = a_j + shift;
850: ad = a->diag;
852: for (i = 0,row = 0; i< node_max; ++i){
853: nsz = ns[i];
854: aii = ai[row];
855: v1 = aa + aii;
856: vi = aj + aii;
857: nz = ad[row]- aii;
858:
859: switch (nsz){ /* Each loop in 'case' is unrolled */
860: case 1 :
861: sum1 = b[*r++];
862: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
863: for(j=0; j<nz-1; j+=2){
864: i0 = vi[0];
865: i1 = vi[1];
866: vi +=2;
867: tmp0 = tmps[i0];
868: tmp1 = tmps[i1];
869: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
870: }
871: if(j == nz-1){
872: tmp0 = tmps[*vi++];
873: sum1 -= *v1++ *tmp0;
874: }
875: tmp[row ++]=sum1;
876: break;
877: case 2:
878: sum1 = b[*r++];
879: sum2 = b[*r++];
880: v2 = aa + ai[row+1];
882: for(j=0; j<nz-1; j+=2){
883: i0 = vi[0];
884: i1 = vi[1];
885: vi +=2;
886: tmp0 = tmps[i0];
887: tmp1 = tmps[i1];
888: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
889: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
890: }
891: if(j == nz-1){
892: tmp0 = tmps[*vi++];
893: sum1 -= *v1++ *tmp0;
894: sum2 -= *v2++ *tmp0;
895: }
896: sum2 -= *v2++ * sum1;
897: tmp[row ++]=sum1;
898: tmp[row ++]=sum2;
899: break;
900: case 3:
901: sum1 = b[*r++];
902: sum2 = b[*r++];
903: sum3 = b[*r++];
904: v2 = aa + ai[row+1];
905: v3 = aa + ai[row+2];
906:
907: for (j=0; j<nz-1; j+=2){
908: i0 = vi[0];
909: i1 = vi[1];
910: vi +=2;
911: tmp0 = tmps[i0];
912: tmp1 = tmps[i1];
913: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916: }
917: if (j == nz-1){
918: tmp0 = tmps[*vi++];
919: sum1 -= *v1++ *tmp0;
920: sum2 -= *v2++ *tmp0;
921: sum3 -= *v3++ *tmp0;
922: }
923: sum2 -= *v2++ * sum1;
924: sum3 -= *v3++ * sum1;
925: sum3 -= *v3++ * sum2;
926: tmp[row ++]=sum1;
927: tmp[row ++]=sum2;
928: tmp[row ++]=sum3;
929: break;
930:
931: case 4:
932: sum1 = b[*r++];
933: sum2 = b[*r++];
934: sum3 = b[*r++];
935: sum4 = b[*r++];
936: v2 = aa + ai[row+1];
937: v3 = aa + ai[row+2];
938: v4 = aa + ai[row+3];
939:
940: for (j=0; j<nz-1; j+=2){
941: i0 = vi[0];
942: i1 = vi[1];
943: vi +=2;
944: tmp0 = tmps[i0];
945: tmp1 = tmps[i1];
946: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
947: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
948: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
949: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
950: }
951: if (j == nz-1){
952: tmp0 = tmps[*vi++];
953: sum1 -= *v1++ *tmp0;
954: sum2 -= *v2++ *tmp0;
955: sum3 -= *v3++ *tmp0;
956: sum4 -= *v4++ *tmp0;
957: }
958: sum2 -= *v2++ * sum1;
959: sum3 -= *v3++ * sum1;
960: sum4 -= *v4++ * sum1;
961: sum3 -= *v3++ * sum2;
962: sum4 -= *v4++ * sum2;
963: sum4 -= *v4++ * sum3;
964:
965: tmp[row ++]=sum1;
966: tmp[row ++]=sum2;
967: tmp[row ++]=sum3;
968: tmp[row ++]=sum4;
969: break;
970: case 5:
971: sum1 = b[*r++];
972: sum2 = b[*r++];
973: sum3 = b[*r++];
974: sum4 = b[*r++];
975: sum5 = b[*r++];
976: v2 = aa + ai[row+1];
977: v3 = aa + ai[row+2];
978: v4 = aa + ai[row+3];
979: v5 = aa + ai[row+4];
980:
981: for (j=0; j<nz-1; j+=2){
982: i0 = vi[0];
983: i1 = vi[1];
984: vi +=2;
985: tmp0 = tmps[i0];
986: tmp1 = tmps[i1];
987: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
988: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
989: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
990: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
991: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
992: }
993: if (j == nz-1){
994: tmp0 = tmps[*vi++];
995: sum1 -= *v1++ *tmp0;
996: sum2 -= *v2++ *tmp0;
997: sum3 -= *v3++ *tmp0;
998: sum4 -= *v4++ *tmp0;
999: sum5 -= *v5++ *tmp0;
1000: }
1002: sum2 -= *v2++ * sum1;
1003: sum3 -= *v3++ * sum1;
1004: sum4 -= *v4++ * sum1;
1005: sum5 -= *v5++ * sum1;
1006: sum3 -= *v3++ * sum2;
1007: sum4 -= *v4++ * sum2;
1008: sum5 -= *v5++ * sum2;
1009: sum4 -= *v4++ * sum3;
1010: sum5 -= *v5++ * sum3;
1011: sum5 -= *v5++ * sum4;
1012:
1013: tmp[row ++]=sum1;
1014: tmp[row ++]=sum2;
1015: tmp[row ++]=sum3;
1016: tmp[row ++]=sum4;
1017: tmp[row ++]=sum5;
1018: break;
1019: default:
1020: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1021: }
1022: }
1023: /* backward solve the upper triangular */
1024: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1025: nsz = ns[i];
1026: aii = ai[row+1] -1;
1027: v1 = aa + aii;
1028: vi = aj + aii;
1029: nz = aii- ad[row];
1030: switch (nsz){ /* Each loop in 'case' is unrolled */
1031: case 1 :
1032: sum1 = tmp[row];
1034: for(j=nz ; j>1; j-=2){
1035: i0 = vi[0];
1036: i1 = vi[-1];
1037: vi -=2;
1038: tmp0 = tmps[i0];
1039: tmp1 = tmps[i1];
1040: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1041: }
1042: if (j==1){
1043: tmp0 = tmps[*vi--];
1044: sum1 -= *v1-- * tmp0;
1045: }
1046: x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1047: break;
1048: case 2 :
1049: sum1 = tmp[row];
1050: sum2 = tmp[row -1];
1051: v2 = aa + ai[row]-1;
1052: for (j=nz ; j>1; j-=2){
1053: i0 = vi[0];
1054: i1 = vi[-1];
1055: vi -=2;
1056: tmp0 = tmps[i0];
1057: tmp1 = tmps[i1];
1058: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1059: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1060: }
1061: if (j==1){
1062: tmp0 = tmps[*vi--];
1063: sum1 -= *v1-- * tmp0;
1064: sum2 -= *v2-- * tmp0;
1065: }
1066:
1067: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1068: sum2 -= *v2-- * tmp0;
1069: x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1070: break;
1071: case 3 :
1072: sum1 = tmp[row];
1073: sum2 = tmp[row -1];
1074: sum3 = tmp[row -2];
1075: v2 = aa + ai[row]-1;
1076: v3 = aa + ai[row -1]-1;
1077: for (j=nz ; j>1; j-=2){
1078: i0 = vi[0];
1079: i1 = vi[-1];
1080: vi -=2;
1081: tmp0 = tmps[i0];
1082: tmp1 = tmps[i1];
1083: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1084: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1085: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1086: }
1087: if (j==1){
1088: tmp0 = tmps[*vi--];
1089: sum1 -= *v1-- * tmp0;
1090: sum2 -= *v2-- * tmp0;
1091: sum3 -= *v3-- * tmp0;
1092: }
1093: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1094: sum2 -= *v2-- * tmp0;
1095: sum3 -= *v3-- * tmp0;
1096: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1097: sum3 -= *v3-- * tmp0;
1098: x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1099:
1100: break;
1101: case 4 :
1102: sum1 = tmp[row];
1103: sum2 = tmp[row -1];
1104: sum3 = tmp[row -2];
1105: sum4 = tmp[row -3];
1106: v2 = aa + ai[row]-1;
1107: v3 = aa + ai[row -1]-1;
1108: v4 = aa + ai[row -2]-1;
1110: for (j=nz ; j>1; j-=2){
1111: i0 = vi[0];
1112: i1 = vi[-1];
1113: vi -=2;
1114: tmp0 = tmps[i0];
1115: tmp1 = tmps[i1];
1116: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1117: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1118: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1119: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1120: }
1121: if (j==1){
1122: tmp0 = tmps[*vi--];
1123: sum1 -= *v1-- * tmp0;
1124: sum2 -= *v2-- * tmp0;
1125: sum3 -= *v3-- * tmp0;
1126: sum4 -= *v4-- * tmp0;
1127: }
1129: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1130: sum2 -= *v2-- * tmp0;
1131: sum3 -= *v3-- * tmp0;
1132: sum4 -= *v4-- * tmp0;
1133: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1134: sum3 -= *v3-- * tmp0;
1135: sum4 -= *v4-- * tmp0;
1136: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1137: sum4 -= *v4-- * tmp0;
1138: x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1139: break;
1140: case 5 :
1141: sum1 = tmp[row];
1142: sum2 = tmp[row -1];
1143: sum3 = tmp[row -2];
1144: sum4 = tmp[row -3];
1145: sum5 = tmp[row -4];
1146: v2 = aa + ai[row]-1;
1147: v3 = aa + ai[row -1]-1;
1148: v4 = aa + ai[row -2]-1;
1149: v5 = aa + ai[row -3]-1;
1150: for (j=nz ; j>1; j-=2){
1151: i0 = vi[0];
1152: i1 = vi[-1];
1153: vi -=2;
1154: tmp0 = tmps[i0];
1155: tmp1 = tmps[i1];
1156: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1157: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1158: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1159: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1160: sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1161: }
1162: if (j==1){
1163: tmp0 = tmps[*vi--];
1164: sum1 -= *v1-- * tmp0;
1165: sum2 -= *v2-- * tmp0;
1166: sum3 -= *v3-- * tmp0;
1167: sum4 -= *v4-- * tmp0;
1168: sum5 -= *v5-- * tmp0;
1169: }
1171: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1172: sum2 -= *v2-- * tmp0;
1173: sum3 -= *v3-- * tmp0;
1174: sum4 -= *v4-- * tmp0;
1175: sum5 -= *v5-- * tmp0;
1176: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1177: sum3 -= *v3-- * tmp0;
1178: sum4 -= *v4-- * tmp0;
1179: sum5 -= *v5-- * tmp0;
1180: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1181: sum4 -= *v4-- * tmp0;
1182: sum5 -= *v5-- * tmp0;
1183: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1184: sum5 -= *v5-- * tmp0;
1185: x[*c--] = tmp[row] = sum5*a_a[ad[row]+shift]; row--;
1186: break;
1187: default:
1188: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1189: }
1190: }
1191: ISRestoreIndices(isrow,&rout);
1192: ISRestoreIndices(iscol,&cout);
1193: VecRestoreArray(bb,&b);
1194: VecRestoreArray(xx,&x);
1195: PetscLogFlops(2*a->nz - A->n);
1196: return(0);
1197: }
1200: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1201: {
1202: Mat C = *B;
1203: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1204: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1205: int shift = a->indexshift,*r,*ic,*c,ierr,n = A->m,*bi = b->i;
1206: int *bj = b->j+shift,*nbj=b->j +(!shift),*ajtmp,*bjtmp,nz,row,prow;
1207: int *ics,i,j,idx,*ai = a->i,*aj = a->j+shift,*bd = b->diag,node_max,nsz;
1208: int *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1209: Scalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1210: Scalar tmp,*ba = b->a+shift,*aa = a->a+shift,*pv,*rtmps1,*rtmps2,*rtmps3;
1211: PetscTruth damp;
1212: PetscReal damping = b->lu_damping;
1215: ierr = ISGetIndices(isrow,&r);
1216: ierr = ISGetIndices(iscol,&c);
1217: ierr = ISGetIndices(isicol,&ic);
1218: PetscMalloc((3*n+1)*sizeof(Scalar),&rtmp1);
1219: ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(Scalar));
1220: ics = ic + shift; rtmps1 = rtmp1 + shift;
1221: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 + shift;
1222: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 + shift;
1223:
1224: node_max = a->inode.node_count;
1225: ns = a->inode.size ;
1226: if (!ns){
1227: SETERRQ(1,"Matrix without inode information");
1228: }
1230: /* If max inode size > 3, split it into two inodes.*/
1231: /* also map the inode sizes according to the ordering */
1232: PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1233: for (i=0,j=0; i<node_max; ++i,++j){
1234: if (ns[i]>3) {
1235: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1236: ++j;
1237: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1238: } else {
1239: tmp_vec1[j] = ns[i];
1240: }
1241: }
1242: /* Use the correct node_max */
1243: node_max = j;
1245: /* Now reorder the inode info based on mat re-ordering info */
1246: /* First create a row -> inode_size_array_index map */
1247: PetscMalloc(n*sizeof(int)+1,&nsmap);
1248: PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1249: for (i=0,row=0; i<node_max; i++) {
1250: nsz = tmp_vec1[i];
1251: for (j=0; j<nsz; j++,row++) {
1252: nsmap[row] = i;
1253: }
1254: }
1255: /* Using nsmap, create a reordered ns structure */
1256: for (i=0,j=0; i< node_max; i++) {
1257: nsz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1258: tmp_vec2[i] = nsz;
1259: j += nsz;
1260: }
1261: PetscFree(nsmap);
1262: PetscFree(tmp_vec1);
1263: /* Now use the correct ns */
1264: ns = tmp_vec2;
1267: do {
1268: damp = PETSC_FALSE;
1269: /* Now loop over each block-row, and do the factorization */
1270: for (i=0,row=0; i<node_max; i++) {
1271: nsz = ns[i];
1272: nz = bi[row+1] - bi[row];
1273: bjtmp = bj + bi[row];
1274:
1275: switch (nsz){
1276: case 1:
1277: for (j=0; j<nz; j++){
1278: idx = bjtmp[j];
1279: rtmps1[idx] = 0.0;
1280: }
1281:
1282: /* load in initial (unfactored row) */
1283: idx = r[row];
1284: nz = ai[idx+1] - ai[idx];
1285: ajtmp = aj + ai[idx];
1286: v1 = aa + ai[idx];
1288: for (j=0; j<nz; j++) {
1289: idx = ics[ajtmp[j]];
1290: rtmp1[idx] = v1[j];
1291: if (ajtmp[j] == r[row]) {
1292: rtmp1[idx] += damping;
1293: }
1294: }
1295: prow = *bjtmp++ + shift;
1296: while (prow < row) {
1297: pc1 = rtmp1 + prow;
1298: if (*pc1 != 0.0){
1299: pv = ba + bd[prow];
1300: pj = nbj + bd[prow];
1301: mul1 = *pc1 * *pv++;
1302: *pc1 = mul1;
1303: nz = bi[prow+1] - bd[prow] - 1;
1304: PetscLogFlops(2*nz);
1305: for (j=0; j<nz; j++) {
1306: tmp = pv[j];
1307: idx = pj[j];
1308: rtmps1[idx] -= mul1 * tmp;
1309: }
1310: }
1311: prow = *bjtmp++ + shift;
1312: }
1313: nz = bi[row+1] - bi[row];
1314: pj = bj + bi[row];
1315: pc1 = ba + bi[row];
1316: if (PetscAbsScalar(rtmp1[row]) < 1.e-12) {
1317: if (b->lu_damping) {
1318: damp = PETSC_TRUE;
1319: if (damping) damping *= 2.0;
1320: else damping = 1.e-12;
1321: ndamp++;
1322: goto endofwhile;
1323: } else {
1324: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1325: }
1326: }
1327: rtmp1[row] = 1.0/rtmp1[row];
1328: for (j=0; j<nz; j++) {
1329: idx = pj[j];
1330: pc1[j] = rtmps1[idx];
1331: }
1332: break;
1333:
1334: case 2:
1335: for (j=0; j<nz; j++) {
1336: idx = bjtmp[j];
1337: rtmps1[idx] = 0.0;
1338: rtmps2[idx] = 0.0;
1339: }
1340:
1341: /* load in initial (unfactored row) */
1342: idx = r[row];
1343: nz = ai[idx+1] - ai[idx];
1344: ajtmp = aj + ai[idx];
1345: v1 = aa + ai[idx];
1346: v2 = aa + ai[idx+1];
1347:
1348: for (j=0; j<nz; j++) {
1349: idx = ics[ajtmp[j]];
1350: rtmp1[idx] = v1[j];
1351: rtmp2[idx] = v2[j];
1352: if (ajtmp[j] == r[row]) {
1353: rtmp1[idx] += damping;
1354: }
1355: if (ajtmp[j] == r[row+1]) {
1356: rtmp2[idx] += damping;
1357: }
1358: }
1359: prow = *bjtmp++ + shift;
1360: while (prow < row) {
1361: pc1 = rtmp1 + prow;
1362: pc2 = rtmp2 + prow;
1363: if (*pc1 != 0.0 || *pc2 != 0.0){
1364: pv = ba + bd[prow];
1365: pj = nbj + bd[prow];
1366: mul1 = *pc1 * *pv;
1367: mul2 = *pc2 * *pv;
1368: ++pv;
1369: *pc1 = mul1;
1370: *pc2 = mul2;
1371:
1372: nz = bi[prow+1] - bd[prow] - 1;
1373: PetscLogFlops(2*2*nz);
1374: for (j=0; j<nz; j++) {
1375: tmp = pv[j];
1376: idx = pj[j];
1377: rtmps1[idx] -= mul1 * tmp;
1378: rtmps2[idx] -= mul2 * tmp;
1379: }
1380: }
1381: prow = *bjtmp++ + shift;
1382: }
1383: /* Now take care of the odd element*/
1384: pc1 = rtmp1 + prow;
1385: pc2 = rtmp2 + prow;
1386: if (*pc2 != 0.0){
1387: pj = nbj + bd[prow];
1388: if (PetscAbsScalar(*pc1) < 1.e-12) {
1389: if (b->lu_damping) {
1390: damp = PETSC_TRUE;
1391: if (damping) damping *= 2.0;
1392: else damping = 1.e-12;
1393: ndamp++;
1394: goto endofwhile;
1395: } else {
1396: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1397: }
1398: }
1399: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1400: *pc2 = mul2;
1401: nz = bi[prow+1] - bd[prow] - 1;
1402: PetscLogFlops(2*nz);
1403: for (j=0; j<nz; j++) {
1404: idx = pj[j] + shift;
1405: tmp = rtmp1[idx];
1406: rtmp2[idx] -= mul2 * tmp;
1407: }
1408: }
1409:
1410: nz = bi[row+1] - bi[row];
1411: pj = bj + bi[row];
1412: pc1 = ba + bi[row];
1413: pc2 = ba + bi[row+1];
1414: if (PetscAbsScalar(rtmp1[row]) < 1.e-12 || PetscAbsScalar(rtmp2[row+1]) < 1.e-12) {
1415: if (b->lu_damping) {
1416: damp = PETSC_TRUE;
1417: if (damping) damping *= 2.0;
1418: else damping = 1.e-12;
1419: ndamp++;
1420: goto endofwhile;
1421: } else {
1422: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1423: }
1424: }
1425: rtmp1[row] = 1.0/rtmp1[row];
1426: rtmp2[row+1] = 1.0/rtmp2[row+1];
1427: for (j=0; j<nz; j++) {
1428: idx = pj[j];
1429: pc1[j] = rtmps1[idx];
1430: pc2[j] = rtmps2[idx];
1431: }
1432: break;
1434: case 3:
1435: for (j=0; j<nz; j++) {
1436: idx = bjtmp[j];
1437: rtmps1[idx] = 0.0;
1438: rtmps2[idx] = 0.0;
1439: rtmps3[idx] = 0.0;
1440: }
1441: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1442: idx = r[row];
1443: nz = ai[idx+1] - ai[idx];
1444: ajtmp = aj + ai[idx];
1445: v1 = aa + ai[idx];
1446: v2 = aa + ai[idx+1];
1447: v3 = aa + ai[idx+2];
1448: for (j=0; j<nz; j++) {
1449: idx = ics[ajtmp[j]];
1450: rtmp1[idx] = v1[j];
1451: rtmp2[idx] = v2[j];
1452: rtmp3[idx] = v3[j];
1453: if (ajtmp[j] == r[row]) {
1454: rtmp1[idx] += damping;
1455: }
1456: if (ajtmp[j] == r[row+1]) {
1457: rtmp2[idx] += damping;
1458: }
1459: if (ajtmp[j] == r[row+2]) {
1460: rtmp3[idx] += damping;
1461: }
1462: }
1463: /* loop over all pivot row blocks above this row block */
1464: prow = *bjtmp++ + shift;
1465: while (prow < row) {
1466: pc1 = rtmp1 + prow;
1467: pc2 = rtmp2 + prow;
1468: pc3 = rtmp3 + prow;
1469: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1470: pv = ba + bd[prow];
1471: pj = nbj + bd[prow];
1472: mul1 = *pc1 * *pv;
1473: mul2 = *pc2 * *pv;
1474: mul3 = *pc3 * *pv;
1475: ++pv;
1476: *pc1 = mul1;
1477: *pc2 = mul2;
1478: *pc3 = mul3;
1479:
1480: nz = bi[prow+1] - bd[prow] - 1;
1481: PetscLogFlops(3*2*nz);
1482: /* update this row based on pivot row */
1483: for (j=0; j<nz; j++) {
1484: tmp = pv[j];
1485: idx = pj[j];
1486: rtmps1[idx] -= mul1 * tmp;
1487: rtmps2[idx] -= mul2 * tmp;
1488: rtmps3[idx] -= mul3 * tmp;
1489: }
1490: }
1491: prow = *bjtmp++ + shift;
1492: }
1493: /* Now take care of diagonal block in this set of rows */
1494: pc1 = rtmp1 + prow;
1495: pc2 = rtmp2 + prow;
1496: pc3 = rtmp3 + prow;
1497: if (*pc2 != 0.0 || *pc3 != 0.0){
1498: pj = nbj + bd[prow];
1499: if (PetscAbsScalar(*pc1) < 1.e-12) {
1500: if (b->lu_damping) {
1501: damp = PETSC_TRUE;
1502: if (damping) damping *= 2.0;
1503: else damping = 1.e-12;
1504: ndamp++;
1505: goto endofwhile;
1506: } else {
1507: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1508: }
1509: }
1510: mul2 = (*pc2)/(*pc1);
1511: mul3 = (*pc3)/(*pc1);
1512: *pc2 = mul2;
1513: *pc3 = mul3;
1514: nz = bi[prow+1] - bd[prow] - 1;
1515: PetscLogFlops(2*2*nz);
1516: for (j=0; j<nz; j++) {
1517: idx = pj[j] + shift;
1518: tmp = rtmp1[idx];
1519: rtmp2[idx] -= mul2 * tmp;
1520: rtmp3[idx] -= mul3 * tmp;
1521: }
1522: }
1523: ++prow;
1524: pc2 = rtmp2 + prow;
1525: pc3 = rtmp3 + prow;
1526: if (*pc3 != 0.0){
1527: pj = nbj + bd[prow];
1528: pj = nbj + bd[prow];
1529: if (PetscAbsScalar(*pc2) < 1.e-12) {
1530: if (b->lu_damping) {
1531: damp = PETSC_TRUE;
1532: if (damping) damping *= 2.0;
1533: else damping = 1.e-12;
1534: ndamp++;
1535: goto endofwhile;
1536: } else {
1537: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1538: }
1539: }
1540: mul3 = (*pc3)/(*pc2);
1541: *pc3 = mul3;
1542: nz = bi[prow+1] - bd[prow] - 1;
1543: PetscLogFlops(2*2*nz);
1544: for (j=0; j<nz; j++) {
1545: idx = pj[j] + shift;
1546: tmp = rtmp2[idx];
1547: rtmp3[idx] -= mul3 * tmp;
1548: }
1549: }
1550: nz = bi[row+1] - bi[row];
1551: pj = bj + bi[row];
1552: pc1 = ba + bi[row];
1553: pc2 = ba + bi[row+1];
1554: pc3 = ba + bi[row+2];
1555: if (PetscAbsScalar(rtmp1[row]) < 1.e-12 || PetscAbsScalar(rtmp2[row+1]) < 1.e-12 || PetscAbsScalar(rtmp3[row+2]) < 1.e-12) {
1556: if (b->lu_damping) {
1557: damp = PETSC_TRUE;
1558: if (damping) damping *= 2.0;
1559: else damping = 1.e-12;
1560: ndamp++;
1561: goto endofwhile;
1562: } else {
1563: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1564: }
1565: }
1566: rtmp1[row] = 1.0/rtmp1[row];
1567: rtmp2[row+1] = 1.0/rtmp2[row+1];
1568: rtmp3[row+2] = 1.0/rtmp3[row+2];
1569: /* copy row entries from dense representation to sparse */
1570: for (j=0; j<nz; j++) {
1571: idx = pj[j];
1572: pc1[j] = rtmps1[idx];
1573: pc2[j] = rtmps2[idx];
1574: pc3[j] = rtmps3[idx];
1575: }
1576: break;
1578: default:
1579: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1580: }
1581: row += nsz; /* Update the row */
1582: }
1583: endofwhile:;
1584: } while (damp);
1585: PetscFree(rtmp1);
1586: PetscFree(tmp_vec2);
1587: ISRestoreIndices(isicol,&ic);
1588: ISRestoreIndices(isrow,&r);
1589: ISRestoreIndices(iscol,&c);
1590: C->factor = FACTOR_LU;
1591: C->assembled = PETSC_TRUE;
1592: Mat_AIJ_CheckInode(C);
1593: if (ndamp || b->lu_damping) {
1594: PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %gn",ndamp,damping);
1595: }
1596: PetscLogFlops(C->n);
1597: return(0);
1598: }
1600: /*
1601: This is really ugly. if inodes are used this replaces the
1602: permutations with ones that correspond to rows/cols of the matrix
1603: rather then inode blocks
1604: */
1605: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1606: {
1607: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1608: int ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1609: int row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1610: int nslim_col,*ns_col;
1611: IS ris = *rperm,cis = *cperm;
1612: PetscTruth flg;
1615: PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1616: if (!flg) return(0);
1618: if (!a->inode.size) return(0); /* no inodes so return */
1619: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1621: ierr = Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1622: ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1623: ierr = PetscMalloc((m+n+1)*sizeof(int),&permr);
1624: permc = permr + m;
1626: ierr = ISGetIndices(ris,&ridx);
1627: ierr = ISGetIndices(cis,&cidx);
1629: /* Form the inode structure for the rows of permuted matric using inv perm*/
1630: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1632: /* Construct the permutations for rows*/
1633: for (i=0,row = 0; i<nslim_row; ++i){
1634: indx = ridx[i];
1635: start_val = tns[indx];
1636: end_val = tns[indx + 1];
1637: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1638: }
1640: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1641: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1643: /*Construct permutations for columns*/
1644: for (i=0,col=0; i<nslim_col; ++i){
1645: indx = cidx[i];
1646: start_val = tns[indx];
1647: end_val = tns[indx + 1];
1648: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1649: }
1651: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1652: ISSetPermutation(*rperm);
1653: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1654: ISSetPermutation(*cperm);
1655:
1656: ierr = ISRestoreIndices(ris,&ridx);
1657: ierr = ISRestoreIndices(cis,&cidx);
1659: PetscFree(ns_col);
1660: PetscFree(permr);
1661: ISDestroy(cis);
1662: ISDestroy(ris);
1663: PetscFree(tns);
1664: return(0);
1665: }
1669: /*@C
1670: MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.
1672: Collective on Mat
1674: Input Parameter:
1675: . A - the SeqAIJ matrix.
1677: Output Parameter:
1678: + node_count - no of inodes present in the matrix.
1679: . sizes - an array of size node_count,with sizes of each inode.
1680: - limit - the max size used to generate the inodes.
1682: Level: advanced
1684: Notes: This routine returns some internal storage information
1685: of the matrix, it is intended to be used by advanced users.
1686: It should be called after the matrix is assembled.
1687: The contents of the sizes[] array should not be changed.
1689: .keywords: matrix, seqaij, get, inode
1691: .seealso: MatGetInfo()
1692: @*/
1693: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1694: {
1695: Mat_SeqAIJ *a;
1696: PetscTruth flg;
1697: int ierr;
1701: PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1702: if (!flg) SETERRQ(PETSC_ERR_ARG_WRONG,"MatSeqAIJ only");
1703: a = (Mat_SeqAIJ*)A->data;
1704: *node_count = a->inode.node_count;
1705: *sizes = a->inode.size;
1706: *limit = a->inode.limit;
1707: return(0);
1708: }