Actual source code: aijnode.c
1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
2: /*
3: This file provides high performance routines for the AIJ (compressed row)
4: format by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include src/mat/impls/aij/seq/aij.h
7: #include src/vec/vecimpl.h
9: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);
13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
24: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
25: {
26: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
27: int ierr,i,count,m,n,min_mn,*ns_row,*ns_col;
30: n = A->n;
31: m = A->m;
32: ns_row = a->inode.size;
33:
34: min_mn = (m < n) ? m : n;
35: if (!ns) {
36: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
37: for(; count+1 < n; count++,i++);
38: if (count < n) {
39: i++;
40: }
41: *size = i;
42: return(0);
43: }
44: PetscMalloc((n+1)*sizeof(int),&ns_col);
45:
46: /* Use the same row structure wherever feasible. */
47: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
48: ns_col[i] = ns_row[i];
49: }
51: /* if m < n; pad up the remainder with inode_limit */
52: for(; count+1 < n; count++,i++) {
53: ns_col[i] = 1;
54: }
55: /* The last node is the odd ball. padd it up with the remaining rows; */
56: if (count < n) {
57: ns_col[i] = n - count;
58: i++;
59: } else if (count > n) {
60: /* Adjust for the over estimation */
61: ns_col[i-1] += n - count;
62: }
63: *size = i;
64: *ns = ns_col;
65: return(0);
66: }
69: /*
70: This builds symmetric version of nonzero structure,
71: */
74: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
75: {
76: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
77: int *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
78: int *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
81: nslim_row = a->inode.node_count;
82: m = A->m;
83: n = A->n;
84: if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
85:
86: /* Use the row_inode as column_inode */
87: nslim_col = nslim_row;
88: ns_col = ns_row;
90: /* allocate space for reformated inode structure */
91: PetscMalloc((nslim_col+1)*sizeof(int),&tns);
92: PetscMalloc((n+1)*sizeof(int),&tvc);
93: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
95: for (i1=0,col=0; i1<nslim_col; ++i1){
96: nsz = ns_col[i1];
97: for (i2=0; i2<nsz; ++i2,++col)
98: tvc[col] = i1;
99: }
100: /* allocate space for row pointers */
101: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
102: *iia = ia;
103: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
104: PetscMalloc((nslim_row+1)*sizeof(int),&work);
106: /* determine the number of columns in each row */
107: ia[0] = oshift;
108: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
110: j = aj + ai[row] + ishift;
111: jmax = aj + ai[row+1] + ishift;
112: i2 = 0;
113: col = *j++ + ishift;
114: i2 = tvc[col];
115: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
116: ia[i1+1]++;
117: ia[i2+1]++;
118: i2++; /* Start col of next node */
119: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
120: i2 = tvc[col];
121: }
122: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
123: }
125: /* shift ia[i] to point to next row */
126: for (i1=1; i1<nslim_row+1; i1++) {
127: row = ia[i1-1];
128: ia[i1] += row;
129: work[i1-1] = row - oshift;
130: }
132: /* allocate space for column pointers */
133: nz = ia[nslim_row] + (!ishift);
134: PetscMalloc(nz*sizeof(int),&ja);
135: *jja = ja;
137: /* loop over lower triangular part putting into ja */
138: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
139: j = aj + ai[row] + ishift;
140: jmax = aj + ai[row+1] + ishift;
141: i2 = 0; /* Col inode index */
142: col = *j++ + ishift;
143: i2 = tvc[col];
144: while (i2<i1 && j<jmax) {
145: ja[work[i2]++] = i1 + oshift;
146: ja[work[i1]++] = i2 + oshift;
147: ++i2;
148: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
149: i2 = tvc[col];
150: }
151: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
153: }
154: PetscFree(work);
155: PetscFree(tns);
156: PetscFree(tvc);
157: return(0);
158: }
160: /*
161: This builds nonsymmetric version of nonzero structure,
162: */
165: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
166: {
167: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
168: int *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
169: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
172: nslim_row = a->inode.node_count;
173: n = A->n;
175: /* Create The column_inode for this matrix */
176: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
177:
178: /* allocate space for reformated column_inode structure */
179: PetscMalloc((nslim_col +1)*sizeof(int),&tns);
180: PetscMalloc((n +1)*sizeof(int),&tvc);
181: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
183: for (i1=0,col=0; i1<nslim_col; ++i1){
184: nsz = ns_col[i1];
185: for (i2=0; i2<nsz; ++i2,++col)
186: tvc[col] = i1;
187: }
188: /* allocate space for row pointers */
189: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
190: *iia = ia;
191: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
192: PetscMalloc((nslim_row+1)*sizeof(int),&work);
194: /* determine the number of columns in each row */
195: ia[0] = oshift;
196: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
197: j = aj + ai[row] + ishift;
198: col = *j++ + ishift;
199: i2 = tvc[col];
200: nz = ai[row+1] - ai[row];
201: while (nz-- > 0) { /* off-diagonal elemets */
202: ia[i1+1]++;
203: i2++; /* Start col of next node */
204: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
205: i2 = tvc[col];
206: }
207: }
209: /* shift ia[i] to point to next row */
210: for (i1=1; i1<nslim_row+1; i1++) {
211: row = ia[i1-1];
212: ia[i1] += row;
213: work[i1-1] = row - oshift;
214: }
216: /* allocate space for column pointers */
217: nz = ia[nslim_row] + (!ishift);
218: PetscMalloc(nz*sizeof(int),&ja);
219: *jja = ja;
221: /* loop over matrix putting into ja */
222: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
223: j = aj + ai[row] + ishift;
224: i2 = 0; /* Col inode index */
225: col = *j++ + ishift;
226: i2 = tvc[col];
227: nz = ai[row+1] - ai[row];
228: while (nz-- > 0) {
229: ja[work[i1]++] = i2 + oshift;
230: ++i2;
231: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
232: i2 = tvc[col];
233: }
234: }
235: PetscFree(ns_col);
236: PetscFree(work);
237: PetscFree(tns);
238: PetscFree(tvc);
239: return(0);
240: }
244: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
245: {
246: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
247: int ierr;
250: *n = a->inode.node_count;
251: if (!ia) return(0);
253: if (symmetric) {
254: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
255: } else {
256: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
257: }
258: return(0);
259: }
263: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
264: {
268: if (!ia) return(0);
269: PetscFree(*ia);
270: PetscFree(*ja);
271: return(0);
272: }
274: /* ----------------------------------------------------------- */
278: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
279: {
280: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
281: int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
282: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
285: nslim_row = a->inode.node_count;
286: n = A->n;
288: /* Create The column_inode for this matrix */
289: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
290:
291: /* allocate space for reformated column_inode structure */
292: PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
293: PetscMalloc((n + 1)*sizeof(int),&tvc);
294: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
296: for (i1=0,col=0; i1<nslim_col; ++i1){
297: nsz = ns_col[i1];
298: for (i2=0; i2<nsz; ++i2,++col)
299: tvc[col] = i1;
300: }
301: /* allocate space for column pointers */
302: PetscMalloc((nslim_col+1)*sizeof(int),&ia);
303: *iia = ia;
304: PetscMemzero(ia,(nslim_col+1)*sizeof(int));
305: PetscMalloc((nslim_col+1)*sizeof(int),&work);
307: /* determine the number of columns in each row */
308: ia[0] = oshift;
309: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
310: j = aj + ai[row] + ishift;
311: col = *j++ + ishift;
312: i2 = tvc[col];
313: nz = ai[row+1] - ai[row];
314: while (nz-- > 0) { /* off-diagonal elemets */
315: /* ia[i1+1]++; */
316: ia[i2+1]++;
317: i2++;
318: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
319: i2 = tvc[col];
320: }
321: }
323: /* shift ia[i] to point to next col */
324: for (i1=1; i1<nslim_col+1; i1++) {
325: col = ia[i1-1];
326: ia[i1] += col;
327: work[i1-1] = col - oshift;
328: }
330: /* allocate space for column pointers */
331: nz = ia[nslim_col] + (!ishift);
332: PetscMalloc(nz*sizeof(int),&ja);
333: *jja = ja;
335: /* loop over matrix putting into ja */
336: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
337: j = aj + ai[row] + ishift;
338: i2 = 0; /* Col inode index */
339: col = *j++ + ishift;
340: i2 = tvc[col];
341: nz = ai[row+1] - ai[row];
342: while (nz-- > 0) {
343: /* ja[work[i1]++] = i2 + oshift; */
344: ja[work[i2]++] = i1 + oshift;
345: i2++;
346: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
347: i2 = tvc[col];
348: }
349: }
350: PetscFree(ns_col);
351: PetscFree(work);
352: PetscFree(tns);
353: PetscFree(tvc);
354: return(0);
355: }
359: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
360: {
364: Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
365: if (!ia) return(0);
367: if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: PetscFree(*ia);
385: PetscFree(*ja);
386: return(0);
387: }
389: /* ----------------------------------------------------------- */
393: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
394: {
395: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
396: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
397: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
398: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
399:
400: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
401: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
402: #endif
405: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
406: node_max = a->inode.node_count;
407: ns = a->inode.size; /* Node Size array */
408: VecGetArray(xx,&x);
409: VecGetArray(yy,&y);
410: idx = a->j;
411: v1 = a->a;
412: ii = a->i;
414: for (i = 0,row = 0; i< node_max; ++i){
415: nsz = ns[i];
416: n = ii[1] - ii[0];
417: ii += nsz;
418: sz = n; /* No of non zeros in this row */
419: /* Switch on the size of Node */
420: switch (nsz){ /* Each loop in 'case' is unrolled */
421: case 1 :
422: sum1 = 0;
423:
424: for(n = 0; n< sz-1; n+=2) {
425: i1 = idx[0]; /* The instructions are ordered to */
426: i2 = idx[1]; /* make the compiler's job easy */
427: idx += 2;
428: tmp0 = x[i1];
429: tmp1 = x[i2];
430: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
431: }
432:
433: if (n == sz-1){ /* Take care of the last nonzero */
434: tmp0 = x[*idx++];
435: sum1 += *v1++ * tmp0;
436: }
437: y[row++]=sum1;
438: break;
439: case 2:
440: sum1 = 0;
441: sum2 = 0;
442: v2 = v1 + n;
443:
444: for (n = 0; n< sz-1; n+=2) {
445: i1 = idx[0];
446: i2 = idx[1];
447: idx += 2;
448: tmp0 = x[i1];
449: tmp1 = x[i2];
450: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
451: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
452: }
453: if (n == sz-1){
454: tmp0 = x[*idx++];
455: sum1 += *v1++ * tmp0;
456: sum2 += *v2++ * tmp0;
457: }
458: y[row++]=sum1;
459: y[row++]=sum2;
460: v1 =v2; /* Since the next block to be processed starts there*/
461: idx +=sz;
462: break;
463: case 3:
464: sum1 = 0;
465: sum2 = 0;
466: sum3 = 0;
467: v2 = v1 + n;
468: v3 = v2 + n;
469:
470: for (n = 0; n< sz-1; n+=2) {
471: i1 = idx[0];
472: i2 = idx[1];
473: idx += 2;
474: tmp0 = x[i1];
475: tmp1 = x[i2];
476: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
477: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
478: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
479: }
480: if (n == sz-1){
481: tmp0 = x[*idx++];
482: sum1 += *v1++ * tmp0;
483: sum2 += *v2++ * tmp0;
484: sum3 += *v3++ * tmp0;
485: }
486: y[row++]=sum1;
487: y[row++]=sum2;
488: y[row++]=sum3;
489: v1 =v3; /* Since the next block to be processed starts there*/
490: idx +=2*sz;
491: break;
492: case 4:
493: sum1 = 0;
494: sum2 = 0;
495: sum3 = 0;
496: sum4 = 0;
497: v2 = v1 + n;
498: v3 = v2 + n;
499: v4 = v3 + n;
500:
501: for (n = 0; n< sz-1; n+=2) {
502: i1 = idx[0];
503: i2 = idx[1];
504: idx += 2;
505: tmp0 = x[i1];
506: tmp1 = x[i2];
507: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
508: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
509: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
510: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
511: }
512: if (n == sz-1){
513: tmp0 = x[*idx++];
514: sum1 += *v1++ * tmp0;
515: sum2 += *v2++ * tmp0;
516: sum3 += *v3++ * tmp0;
517: sum4 += *v4++ * tmp0;
518: }
519: y[row++]=sum1;
520: y[row++]=sum2;
521: y[row++]=sum3;
522: y[row++]=sum4;
523: v1 =v4; /* Since the next block to be processed starts there*/
524: idx +=3*sz;
525: break;
526: case 5:
527: sum1 = 0;
528: sum2 = 0;
529: sum3 = 0;
530: sum4 = 0;
531: sum5 = 0;
532: v2 = v1 + n;
533: v3 = v2 + n;
534: v4 = v3 + n;
535: v5 = v4 + n;
536:
537: for (n = 0; n<sz-1; n+=2) {
538: i1 = idx[0];
539: i2 = idx[1];
540: idx += 2;
541: tmp0 = x[i1];
542: tmp1 = x[i2];
543: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
544: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
545: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
546: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
547: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
548: }
549: if (n == sz-1){
550: tmp0 = x[*idx++];
551: sum1 += *v1++ * tmp0;
552: sum2 += *v2++ * tmp0;
553: sum3 += *v3++ * tmp0;
554: sum4 += *v4++ * tmp0;
555: sum5 += *v5++ * tmp0;
556: }
557: y[row++]=sum1;
558: y[row++]=sum2;
559: y[row++]=sum3;
560: y[row++]=sum4;
561: y[row++]=sum5;
562: v1 =v5; /* Since the next block to be processed starts there */
563: idx +=4*sz;
564: break;
565: default :
566: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
567: }
568: }
569: VecRestoreArray(xx,&x);
570: VecRestoreArray(yy,&y);
571: PetscLogFlops(2*a->nz - A->m);
572: return(0);
573: }
574: /* ----------------------------------------------------------- */
575: /* Almost same code as the MatMult_SeqAij_Inode() */
578: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
579: {
580: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
581: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
582: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
583: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
584:
586: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
587: node_max = a->inode.node_count;
588: ns = a->inode.size; /* Node Size array */
589: VecGetArray(xx,&x);
590: VecGetArray(yy,&y);
591: if (zz != yy) {
592: VecGetArray(zz,&z);
593: } else {
594: z = y;
595: }
596: zt = z;
598: idx = a->j;
599: v1 = a->a;
600: ii = a->i;
602: for (i = 0,row = 0; i< node_max; ++i){
603: nsz = ns[i];
604: n = ii[1] - ii[0];
605: ii += nsz;
606: sz = n; /* No of non zeros in this row */
607: /* Switch on the size of Node */
608: switch (nsz){ /* Each loop in 'case' is unrolled */
609: case 1 :
610: sum1 = *zt++;
611:
612: for(n = 0; n< sz-1; n+=2) {
613: i1 = idx[0]; /* The instructions are ordered to */
614: i2 = idx[1]; /* make the compiler's job easy */
615: idx += 2;
616: tmp0 = x[i1];
617: tmp1 = x[i2];
618: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
619: }
620:
621: if(n == sz-1){ /* Take care of the last nonzero */
622: tmp0 = x[*idx++];
623: sum1 += *v1++ * tmp0;
624: }
625: y[row++]=sum1;
626: break;
627: case 2:
628: sum1 = *zt++;
629: sum2 = *zt++;
630: v2 = v1 + n;
631:
632: for(n = 0; n< sz-1; n+=2) {
633: i1 = idx[0];
634: i2 = idx[1];
635: idx += 2;
636: tmp0 = x[i1];
637: tmp1 = x[i2];
638: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
639: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
640: }
641: if(n == sz-1){
642: tmp0 = x[*idx++];
643: sum1 += *v1++ * tmp0;
644: sum2 += *v2++ * tmp0;
645: }
646: y[row++]=sum1;
647: y[row++]=sum2;
648: v1 =v2; /* Since the next block to be processed starts there*/
649: idx +=sz;
650: break;
651: case 3:
652: sum1 = *zt++;
653: sum2 = *zt++;
654: sum3 = *zt++;
655: v2 = v1 + n;
656: v3 = v2 + n;
657:
658: for (n = 0; n< sz-1; n+=2) {
659: i1 = idx[0];
660: i2 = idx[1];
661: idx += 2;
662: tmp0 = x[i1];
663: tmp1 = x[i2];
664: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
665: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
666: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
667: }
668: if (n == sz-1){
669: tmp0 = x[*idx++];
670: sum1 += *v1++ * tmp0;
671: sum2 += *v2++ * tmp0;
672: sum3 += *v3++ * tmp0;
673: }
674: y[row++]=sum1;
675: y[row++]=sum2;
676: y[row++]=sum3;
677: v1 =v3; /* Since the next block to be processed starts there*/
678: idx +=2*sz;
679: break;
680: case 4:
681: sum1 = *zt++;
682: sum2 = *zt++;
683: sum3 = *zt++;
684: sum4 = *zt++;
685: v2 = v1 + n;
686: v3 = v2 + n;
687: v4 = v3 + n;
688:
689: for (n = 0; n< sz-1; n+=2) {
690: i1 = idx[0];
691: i2 = idx[1];
692: idx += 2;
693: tmp0 = x[i1];
694: tmp1 = x[i2];
695: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
696: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
697: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
698: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
699: }
700: if (n == sz-1){
701: tmp0 = x[*idx++];
702: sum1 += *v1++ * tmp0;
703: sum2 += *v2++ * tmp0;
704: sum3 += *v3++ * tmp0;
705: sum4 += *v4++ * tmp0;
706: }
707: y[row++]=sum1;
708: y[row++]=sum2;
709: y[row++]=sum3;
710: y[row++]=sum4;
711: v1 =v4; /* Since the next block to be processed starts there*/
712: idx +=3*sz;
713: break;
714: case 5:
715: sum1 = *zt++;
716: sum2 = *zt++;
717: sum3 = *zt++;
718: sum4 = *zt++;
719: sum5 = *zt++;
720: v2 = v1 + n;
721: v3 = v2 + n;
722: v4 = v3 + n;
723: v5 = v4 + n;
724:
725: for (n = 0; n<sz-1; n+=2) {
726: i1 = idx[0];
727: i2 = idx[1];
728: idx += 2;
729: tmp0 = x[i1];
730: tmp1 = x[i2];
731: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
732: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
733: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
734: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
735: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
736: }
737: if(n == sz-1){
738: tmp0 = x[*idx++];
739: sum1 += *v1++ * tmp0;
740: sum2 += *v2++ * tmp0;
741: sum3 += *v3++ * tmp0;
742: sum4 += *v4++ * tmp0;
743: sum5 += *v5++ * tmp0;
744: }
745: y[row++]=sum1;
746: y[row++]=sum2;
747: y[row++]=sum3;
748: y[row++]=sum4;
749: y[row++]=sum5;
750: v1 =v5; /* Since the next block to be processed starts there */
751: idx +=4*sz;
752: break;
753: default :
754: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
755: }
756: }
757: VecRestoreArray(xx,&x);
758: VecRestoreArray(yy,&y);
759: if (zz != yy) {
760: VecRestoreArray(zz,&z);
761: }
762: PetscLogFlops(2*a->nz);
763: return(0);
764: }
765: /* ----------------------------------------------------------- */
766: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,const ISColoringValue[],ISColoring *);
768: /*
769: samestructure indicates that the matrix has not changed its nonzero structure so we
770: do not need to recompute the inodes
771: */
774: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
775: {
776: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
777: int ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
778: PetscTruth flag,flg;
781: if (samestructure && a->inode.checked) return(0);
783: a->inode.checked = PETSC_TRUE;
785: /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
786: if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
787: PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
788: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inode\n");return(0);}
789: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
790: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unroll\n");return(0);}
791: PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
792: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
793: m = A->m;
794: if (a->inode.size) {ns = a->inode.size;}
795: else {PetscMalloc((m+1)*sizeof(int),&ns);}
797: i = 0;
798: node_count = 0;
799: idx = a->j;
800: ii = a->i;
801: while (i < m){ /* For each row */
802: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
803: /* Limits the number of elements in a node to 'a->inode.limit' */
804: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
805: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
806: if (nzy != nzx) break;
807: idy += nzx; /* Same nonzero pattern */
808: PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
809: if (flag == PETSC_FALSE) break;
810: }
811: ns[node_count++] = blk_size;
812: idx += blk_size*nzx;
813: i = j;
814: }
815: /* If not enough inodes found,, do not use inode version of the routines */
816: if (!a->inode.size && m && node_count > 0.9*m) {
817: PetscFree(ns);
818: A->ops->mult = MatMult_SeqAIJ;
819: A->ops->multadd = MatMultAdd_SeqAIJ;
820: A->ops->solve = MatSolve_SeqAIJ;
821: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
822: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
823: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
824: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
825: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
826: A->ops->coloringpatch = 0;
827: a->inode.node_count = 0;
828: a->inode.size = 0;
829: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routines\n",node_count,m);
830: } else {
831: A->ops->mult = MatMult_SeqAIJ_Inode;
832: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
833: A->ops->solve = MatSolve_SeqAIJ_Inode;
834: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
835: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
836: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
837: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
838: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
839: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
840: a->inode.node_count = node_count;
841: a->inode.size = ns;
842: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routines\n",node_count,m,a->inode.limit);
843: }
844: return(0);
845: }
847: /* ----------------------------------------------------------- */
850: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
851: {
852: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
853: IS iscol = a->col,isrow = a->row;
854: int *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
855: int node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
856: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
857: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
860: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
861: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
862: node_max = a->inode.node_count;
863: ns = a->inode.size; /* Node Size array */
865: VecGetArray(bb,&b);
866: VecGetArray(xx,&x);
867: tmp = a->solve_work;
868:
869: ISGetIndices(isrow,&rout); r = rout;
870: ISGetIndices(iscol,&cout); c = cout + (n-1);
871:
872: /* forward solve the lower triangular */
873: tmps = tmp ;
874: aa = a_a ;
875: aj = a_j ;
876: ad = a->diag;
878: for (i = 0,row = 0; i< node_max; ++i){
879: nsz = ns[i];
880: aii = ai[row];
881: v1 = aa + aii;
882: vi = aj + aii;
883: nz = ad[row]- aii;
884:
885: switch (nsz){ /* Each loop in 'case' is unrolled */
886: case 1 :
887: sum1 = b[*r++];
888: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
889: for(j=0; j<nz-1; j+=2){
890: i0 = vi[0];
891: i1 = vi[1];
892: vi +=2;
893: tmp0 = tmps[i0];
894: tmp1 = tmps[i1];
895: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
896: }
897: if(j == nz-1){
898: tmp0 = tmps[*vi++];
899: sum1 -= *v1++ *tmp0;
900: }
901: tmp[row ++]=sum1;
902: break;
903: case 2:
904: sum1 = b[*r++];
905: sum2 = b[*r++];
906: v2 = aa + ai[row+1];
908: for(j=0; j<nz-1; j+=2){
909: i0 = vi[0];
910: i1 = vi[1];
911: vi +=2;
912: tmp0 = tmps[i0];
913: tmp1 = tmps[i1];
914: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
915: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
916: }
917: if(j == nz-1){
918: tmp0 = tmps[*vi++];
919: sum1 -= *v1++ *tmp0;
920: sum2 -= *v2++ *tmp0;
921: }
922: sum2 -= *v2++ * sum1;
923: tmp[row ++]=sum1;
924: tmp[row ++]=sum2;
925: break;
926: case 3:
927: sum1 = b[*r++];
928: sum2 = b[*r++];
929: sum3 = b[*r++];
930: v2 = aa + ai[row+1];
931: v3 = aa + ai[row+2];
932:
933: for (j=0; j<nz-1; j+=2){
934: i0 = vi[0];
935: i1 = vi[1];
936: vi +=2;
937: tmp0 = tmps[i0];
938: tmp1 = tmps[i1];
939: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942: }
943: if (j == nz-1){
944: tmp0 = tmps[*vi++];
945: sum1 -= *v1++ *tmp0;
946: sum2 -= *v2++ *tmp0;
947: sum3 -= *v3++ *tmp0;
948: }
949: sum2 -= *v2++ * sum1;
950: sum3 -= *v3++ * sum1;
951: sum3 -= *v3++ * sum2;
952: tmp[row ++]=sum1;
953: tmp[row ++]=sum2;
954: tmp[row ++]=sum3;
955: break;
956:
957: case 4:
958: sum1 = b[*r++];
959: sum2 = b[*r++];
960: sum3 = b[*r++];
961: sum4 = b[*r++];
962: v2 = aa + ai[row+1];
963: v3 = aa + ai[row+2];
964: v4 = aa + ai[row+3];
965:
966: for (j=0; j<nz-1; j+=2){
967: i0 = vi[0];
968: i1 = vi[1];
969: vi +=2;
970: tmp0 = tmps[i0];
971: tmp1 = tmps[i1];
972: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
973: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
974: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
975: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
976: }
977: if (j == nz-1){
978: tmp0 = tmps[*vi++];
979: sum1 -= *v1++ *tmp0;
980: sum2 -= *v2++ *tmp0;
981: sum3 -= *v3++ *tmp0;
982: sum4 -= *v4++ *tmp0;
983: }
984: sum2 -= *v2++ * sum1;
985: sum3 -= *v3++ * sum1;
986: sum4 -= *v4++ * sum1;
987: sum3 -= *v3++ * sum2;
988: sum4 -= *v4++ * sum2;
989: sum4 -= *v4++ * sum3;
990:
991: tmp[row ++]=sum1;
992: tmp[row ++]=sum2;
993: tmp[row ++]=sum3;
994: tmp[row ++]=sum4;
995: break;
996: case 5:
997: sum1 = b[*r++];
998: sum2 = b[*r++];
999: sum3 = b[*r++];
1000: sum4 = b[*r++];
1001: sum5 = b[*r++];
1002: v2 = aa + ai[row+1];
1003: v3 = aa + ai[row+2];
1004: v4 = aa + ai[row+3];
1005: v5 = aa + ai[row+4];
1006:
1007: for (j=0; j<nz-1; j+=2){
1008: i0 = vi[0];
1009: i1 = vi[1];
1010: vi +=2;
1011: tmp0 = tmps[i0];
1012: tmp1 = tmps[i1];
1013: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1014: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1015: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1016: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1017: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1018: }
1019: if (j == nz-1){
1020: tmp0 = tmps[*vi++];
1021: sum1 -= *v1++ *tmp0;
1022: sum2 -= *v2++ *tmp0;
1023: sum3 -= *v3++ *tmp0;
1024: sum4 -= *v4++ *tmp0;
1025: sum5 -= *v5++ *tmp0;
1026: }
1028: sum2 -= *v2++ * sum1;
1029: sum3 -= *v3++ * sum1;
1030: sum4 -= *v4++ * sum1;
1031: sum5 -= *v5++ * sum1;
1032: sum3 -= *v3++ * sum2;
1033: sum4 -= *v4++ * sum2;
1034: sum5 -= *v5++ * sum2;
1035: sum4 -= *v4++ * sum3;
1036: sum5 -= *v5++ * sum3;
1037: sum5 -= *v5++ * sum4;
1038:
1039: tmp[row ++]=sum1;
1040: tmp[row ++]=sum2;
1041: tmp[row ++]=sum3;
1042: tmp[row ++]=sum4;
1043: tmp[row ++]=sum5;
1044: break;
1045: default:
1046: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1047: }
1048: }
1049: /* backward solve the upper triangular */
1050: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1051: nsz = ns[i];
1052: aii = ai[row+1] -1;
1053: v1 = aa + aii;
1054: vi = aj + aii;
1055: nz = aii- ad[row];
1056: switch (nsz){ /* Each loop in 'case' is unrolled */
1057: case 1 :
1058: sum1 = tmp[row];
1060: for(j=nz ; j>1; j-=2){
1061: i0 = vi[0];
1062: i1 = vi[-1];
1063: vi -=2;
1064: tmp0 = tmps[i0];
1065: tmp1 = tmps[i1];
1066: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1067: }
1068: if (j==1){
1069: tmp0 = tmps[*vi--];
1070: sum1 -= *v1-- * tmp0;
1071: }
1072: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1073: break;
1074: case 2 :
1075: sum1 = tmp[row];
1076: sum2 = tmp[row -1];
1077: v2 = aa + ai[row]-1;
1078: for (j=nz ; j>1; j-=2){
1079: i0 = vi[0];
1080: i1 = vi[-1];
1081: vi -=2;
1082: tmp0 = tmps[i0];
1083: tmp1 = tmps[i1];
1084: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1085: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1086: }
1087: if (j==1){
1088: tmp0 = tmps[*vi--];
1089: sum1 -= *v1-- * tmp0;
1090: sum2 -= *v2-- * tmp0;
1091: }
1092:
1093: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1094: sum2 -= *v2-- * tmp0;
1095: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096: break;
1097: case 3 :
1098: sum1 = tmp[row];
1099: sum2 = tmp[row -1];
1100: sum3 = tmp[row -2];
1101: v2 = aa + ai[row]-1;
1102: v3 = aa + ai[row -1]-1;
1103: for (j=nz ; j>1; j-=2){
1104: i0 = vi[0];
1105: i1 = vi[-1];
1106: vi -=2;
1107: tmp0 = tmps[i0];
1108: tmp1 = tmps[i1];
1109: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1110: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1111: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1112: }
1113: if (j==1){
1114: tmp0 = tmps[*vi--];
1115: sum1 -= *v1-- * tmp0;
1116: sum2 -= *v2-- * tmp0;
1117: sum3 -= *v3-- * tmp0;
1118: }
1119: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1120: sum2 -= *v2-- * tmp0;
1121: sum3 -= *v3-- * tmp0;
1122: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123: sum3 -= *v3-- * tmp0;
1124: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1125:
1126: break;
1127: case 4 :
1128: sum1 = tmp[row];
1129: sum2 = tmp[row -1];
1130: sum3 = tmp[row -2];
1131: sum4 = tmp[row -3];
1132: v2 = aa + ai[row]-1;
1133: v3 = aa + ai[row -1]-1;
1134: v4 = aa + ai[row -2]-1;
1136: for (j=nz ; j>1; j-=2){
1137: i0 = vi[0];
1138: i1 = vi[-1];
1139: vi -=2;
1140: tmp0 = tmps[i0];
1141: tmp1 = tmps[i1];
1142: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1143: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1144: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1145: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1146: }
1147: if (j==1){
1148: tmp0 = tmps[*vi--];
1149: sum1 -= *v1-- * tmp0;
1150: sum2 -= *v2-- * tmp0;
1151: sum3 -= *v3-- * tmp0;
1152: sum4 -= *v4-- * tmp0;
1153: }
1155: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1156: sum2 -= *v2-- * tmp0;
1157: sum3 -= *v3-- * tmp0;
1158: sum4 -= *v4-- * tmp0;
1159: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163: sum4 -= *v4-- * tmp0;
1164: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165: break;
1166: case 5 :
1167: sum1 = tmp[row];
1168: sum2 = tmp[row -1];
1169: sum3 = tmp[row -2];
1170: sum4 = tmp[row -3];
1171: sum5 = tmp[row -4];
1172: v2 = aa + ai[row]-1;
1173: v3 = aa + ai[row -1]-1;
1174: v4 = aa + ai[row -2]-1;
1175: v5 = aa + ai[row -3]-1;
1176: for (j=nz ; j>1; j-=2){
1177: i0 = vi[0];
1178: i1 = vi[-1];
1179: vi -=2;
1180: tmp0 = tmps[i0];
1181: tmp1 = tmps[i1];
1182: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1183: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1184: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1185: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1186: sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1187: }
1188: if (j==1){
1189: tmp0 = tmps[*vi--];
1190: sum1 -= *v1-- * tmp0;
1191: sum2 -= *v2-- * tmp0;
1192: sum3 -= *v3-- * tmp0;
1193: sum4 -= *v4-- * tmp0;
1194: sum5 -= *v5-- * tmp0;
1195: }
1197: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1198: sum2 -= *v2-- * tmp0;
1199: sum3 -= *v3-- * tmp0;
1200: sum4 -= *v4-- * tmp0;
1201: sum5 -= *v5-- * tmp0;
1202: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1203: sum3 -= *v3-- * tmp0;
1204: sum4 -= *v4-- * tmp0;
1205: sum5 -= *v5-- * tmp0;
1206: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1207: sum4 -= *v4-- * tmp0;
1208: sum5 -= *v5-- * tmp0;
1209: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1210: sum5 -= *v5-- * tmp0;
1211: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1212: break;
1213: default:
1214: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1215: }
1216: }
1217: ISRestoreIndices(isrow,&rout);
1218: ISRestoreIndices(iscol,&cout);
1219: VecRestoreArray(bb,&b);
1220: VecRestoreArray(xx,&x);
1221: PetscLogFlops(2*a->nz - A->n);
1222: return(0);
1223: }
1228: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1229: {
1230: Mat C = *B;
1231: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1232: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1233: int *r,*ic,*c,ierr,n = A->m,*bi = b->i;
1234: int *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1235: int *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nsz;
1236: int *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1237: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1238: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1239: PetscReal damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1240: PetscTruth damp;
1243: ISGetIndices(isrow,&r);
1244: ISGetIndices(iscol,&c);
1245: ISGetIndices(isicol,&ic);
1246: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1247: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1248: ics = ic ; rtmps1 = rtmp1 ;
1249: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 ;
1250: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 ;
1251:
1252: node_max = a->inode.node_count;
1253: ns = a->inode.size ;
1254: if (!ns){
1255: SETERRQ(1,"Matrix without inode information");
1256: }
1258: /* If max inode size > 3, split it into two inodes.*/
1259: /* also map the inode sizes according to the ordering */
1260: PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1261: for (i=0,j=0; i<node_max; ++i,++j){
1262: if (ns[i]>3) {
1263: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1264: ++j;
1265: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1266: } else {
1267: tmp_vec1[j] = ns[i];
1268: }
1269: }
1270: /* Use the correct node_max */
1271: node_max = j;
1273: /* Now reorder the inode info based on mat re-ordering info */
1274: /* First create a row -> inode_size_array_index map */
1275: PetscMalloc(n*sizeof(int)+1,&nsmap);
1276: PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1277: for (i=0,row=0; i<node_max; i++) {
1278: nsz = tmp_vec1[i];
1279: for (j=0; j<nsz; j++,row++) {
1280: nsmap[row] = i;
1281: }
1282: }
1283: /* Using nsmap, create a reordered ns structure */
1284: for (i=0,j=0; i< node_max; i++) {
1285: nsz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1286: tmp_vec2[i] = nsz;
1287: j += nsz;
1288: }
1289: PetscFree(nsmap);
1290: PetscFree(tmp_vec1);
1291: /* Now use the correct ns */
1292: ns = tmp_vec2;
1295: do {
1296: damp = PETSC_FALSE;
1297: /* Now loop over each block-row, and do the factorization */
1298: for (i=0,row=0; i<node_max; i++) {
1299: nsz = ns[i];
1300: nz = bi[row+1] - bi[row];
1301: bjtmp = bj + bi[row];
1302:
1303: switch (nsz){
1304: case 1:
1305: for (j=0; j<nz; j++){
1306: idx = bjtmp[j];
1307: rtmps1[idx] = 0.0;
1308: }
1309:
1310: /* load in initial (unfactored row) */
1311: idx = r[row];
1312: nz = ai[idx+1] - ai[idx];
1313: ajtmp = aj + ai[idx];
1314: v1 = aa + ai[idx];
1316: for (j=0; j<nz; j++) {
1317: idx = ics[ajtmp[j]];
1318: rtmp1[idx] = v1[j];
1319: if (ndamp && ajtmp[j] == r[row]) {
1320: rtmp1[idx] += damping;
1321: }
1322: }
1323: prow = *bjtmp++ ;
1324: while (prow < row) {
1325: pc1 = rtmp1 + prow;
1326: if (*pc1 != 0.0){
1327: pv = ba + bd[prow];
1328: pj = nbj + bd[prow];
1329: mul1 = *pc1 * *pv++;
1330: *pc1 = mul1;
1331: nz = bi[prow+1] - bd[prow] - 1;
1332: PetscLogFlops(2*nz);
1333: for (j=0; j<nz; j++) {
1334: tmp = pv[j];
1335: idx = pj[j];
1336: rtmps1[idx] -= mul1 * tmp;
1337: }
1338: }
1339: prow = *bjtmp++ ;
1340: }
1341: nz = bi[row+1] - bi[row];
1342: pj = bj + bi[row];
1343: pc1 = ba + bi[row];
1344: if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1345: if (damping) {
1346: if (ndamp) damping *= 2.0;
1347: damp = PETSC_TRUE;
1348: ndamp++;
1349: goto endofwhile;
1350: } else {
1351: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1352: }
1353: }
1354: rtmp1[row] = 1.0/rtmp1[row];
1355: for (j=0; j<nz; j++) {
1356: idx = pj[j];
1357: pc1[j] = rtmps1[idx];
1358: }
1359: break;
1360:
1361: case 2:
1362: for (j=0; j<nz; j++) {
1363: idx = bjtmp[j];
1364: rtmps1[idx] = 0.0;
1365: rtmps2[idx] = 0.0;
1366: }
1367:
1368: /* load in initial (unfactored row) */
1369: idx = r[row];
1370: nz = ai[idx+1] - ai[idx];
1371: ajtmp = aj + ai[idx];
1372: v1 = aa + ai[idx];
1373: v2 = aa + ai[idx+1];
1374:
1375: for (j=0; j<nz; j++) {
1376: idx = ics[ajtmp[j]];
1377: rtmp1[idx] = v1[j];
1378: rtmp2[idx] = v2[j];
1379: if (ndamp && ajtmp[j] == r[row]) {
1380: rtmp1[idx] += damping;
1381: }
1382: if (ndamp && ajtmp[j] == r[row+1]) {
1383: rtmp2[idx] += damping;
1384: }
1385: }
1386: prow = *bjtmp++ ;
1387: while (prow < row) {
1388: pc1 = rtmp1 + prow;
1389: pc2 = rtmp2 + prow;
1390: if (*pc1 != 0.0 || *pc2 != 0.0){
1391: pv = ba + bd[prow];
1392: pj = nbj + bd[prow];
1393: mul1 = *pc1 * *pv;
1394: mul2 = *pc2 * *pv;
1395: ++pv;
1396: *pc1 = mul1;
1397: *pc2 = mul2;
1398:
1399: nz = bi[prow+1] - bd[prow] - 1;
1400: PetscLogFlops(2*2*nz);
1401: for (j=0; j<nz; j++) {
1402: tmp = pv[j];
1403: idx = pj[j];
1404: rtmps1[idx] -= mul1 * tmp;
1405: rtmps2[idx] -= mul2 * tmp;
1406: }
1407: }
1408: prow = *bjtmp++ ;
1409: }
1410: /* Now take care of the odd element*/
1411: pc1 = rtmp1 + prow;
1412: pc2 = rtmp2 + prow;
1413: if (*pc2 != 0.0){
1414: pj = nbj + bd[prow];
1415: if (PetscAbsScalar(*pc1) < zeropivot) {
1416: if (damping) {
1417: if (ndamp) damping *= 2.0;
1418: damp = PETSC_TRUE;
1419: ndamp++;
1420: goto endofwhile;
1421: } else {
1422: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1423: }
1424: }
1425: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1426: *pc2 = mul2;
1427: nz = bi[prow+1] - bd[prow] - 1;
1428: PetscLogFlops(2*nz);
1429: for (j=0; j<nz; j++) {
1430: idx = pj[j] ;
1431: tmp = rtmp1[idx];
1432: rtmp2[idx] -= mul2 * tmp;
1433: }
1434: }
1435:
1436: nz = bi[row+1] - bi[row];
1437: pj = bj + bi[row];
1438: pc1 = ba + bi[row];
1439: pc2 = ba + bi[row+1];
1440: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1441: if (damping) {
1442: if (ndamp) damping *= 2.0;
1443: damp = PETSC_TRUE;
1444: ndamp++;
1445: goto endofwhile;
1446: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1447: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1448: } else {
1449: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1450: }
1451: }
1452: rtmp1[row] = 1.0/rtmp1[row];
1453: rtmp2[row+1] = 1.0/rtmp2[row+1];
1454: for (j=0; j<nz; j++) {
1455: idx = pj[j];
1456: pc1[j] = rtmps1[idx];
1457: pc2[j] = rtmps2[idx];
1458: }
1459: break;
1461: case 3:
1462: for (j=0; j<nz; j++) {
1463: idx = bjtmp[j];
1464: rtmps1[idx] = 0.0;
1465: rtmps2[idx] = 0.0;
1466: rtmps3[idx] = 0.0;
1467: }
1468: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1469: idx = r[row];
1470: nz = ai[idx+1] - ai[idx];
1471: ajtmp = aj + ai[idx];
1472: v1 = aa + ai[idx];
1473: v2 = aa + ai[idx+1];
1474: v3 = aa + ai[idx+2];
1475: for (j=0; j<nz; j++) {
1476: idx = ics[ajtmp[j]];
1477: rtmp1[idx] = v1[j];
1478: rtmp2[idx] = v2[j];
1479: rtmp3[idx] = v3[j];
1480: if (ndamp && ajtmp[j] == r[row]) {
1481: rtmp1[idx] += damping;
1482: }
1483: if (ndamp && ajtmp[j] == r[row+1]) {
1484: rtmp2[idx] += damping;
1485: }
1486: if (ndamp && ajtmp[j] == r[row+2]) {
1487: rtmp3[idx] += damping;
1488: }
1489: }
1490: /* loop over all pivot row blocks above this row block */
1491: prow = *bjtmp++ ;
1492: while (prow < row) {
1493: pc1 = rtmp1 + prow;
1494: pc2 = rtmp2 + prow;
1495: pc3 = rtmp3 + prow;
1496: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1497: pv = ba + bd[prow];
1498: pj = nbj + bd[prow];
1499: mul1 = *pc1 * *pv;
1500: mul2 = *pc2 * *pv;
1501: mul3 = *pc3 * *pv;
1502: ++pv;
1503: *pc1 = mul1;
1504: *pc2 = mul2;
1505: *pc3 = mul3;
1506:
1507: nz = bi[prow+1] - bd[prow] - 1;
1508: PetscLogFlops(3*2*nz);
1509: /* update this row based on pivot row */
1510: for (j=0; j<nz; j++) {
1511: tmp = pv[j];
1512: idx = pj[j];
1513: rtmps1[idx] -= mul1 * tmp;
1514: rtmps2[idx] -= mul2 * tmp;
1515: rtmps3[idx] -= mul3 * tmp;
1516: }
1517: }
1518: prow = *bjtmp++ ;
1519: }
1520: /* Now take care of diagonal block in this set of rows */
1521: pc1 = rtmp1 + prow;
1522: pc2 = rtmp2 + prow;
1523: pc3 = rtmp3 + prow;
1524: if (*pc2 != 0.0 || *pc3 != 0.0){
1525: pj = nbj + bd[prow];
1526: if (PetscAbsScalar(*pc1) < zeropivot) {
1527: if (damping) {
1528: if (ndamp) damping *= 2.0;
1529: damp = PETSC_TRUE;
1530: ndamp++;
1531: goto endofwhile;
1532: } else {
1533: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1534: }
1535: }
1536: mul2 = (*pc2)/(*pc1);
1537: mul3 = (*pc3)/(*pc1);
1538: *pc2 = mul2;
1539: *pc3 = mul3;
1540: nz = bi[prow+1] - bd[prow] - 1;
1541: PetscLogFlops(2*2*nz);
1542: for (j=0; j<nz; j++) {
1543: idx = pj[j] ;
1544: tmp = rtmp1[idx];
1545: rtmp2[idx] -= mul2 * tmp;
1546: rtmp3[idx] -= mul3 * tmp;
1547: }
1548: }
1549: ++prow;
1550: pc2 = rtmp2 + prow;
1551: pc3 = rtmp3 + prow;
1552: if (*pc3 != 0.0){
1553: pj = nbj + bd[prow];
1554: pj = nbj + bd[prow];
1555: if (PetscAbsScalar(*pc2) < zeropivot) {
1556: if (damping) {
1557: if (ndamp) damping *= 2.0;
1558: damp = PETSC_TRUE;
1559: ndamp++;
1560: goto endofwhile;
1561: } else {
1562: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1563: }
1564: }
1565: mul3 = (*pc3)/(*pc2);
1566: *pc3 = mul3;
1567: nz = bi[prow+1] - bd[prow] - 1;
1568: PetscLogFlops(2*2*nz);
1569: for (j=0; j<nz; j++) {
1570: idx = pj[j] ;
1571: tmp = rtmp2[idx];
1572: rtmp3[idx] -= mul3 * tmp;
1573: }
1574: }
1575: nz = bi[row+1] - bi[row];
1576: pj = bj + bi[row];
1577: pc1 = ba + bi[row];
1578: pc2 = ba + bi[row+1];
1579: pc3 = ba + bi[row+2];
1580: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1581: if (damping) {
1582: if (ndamp) damping *= 2.0;
1583: damp = PETSC_TRUE;
1584: ndamp++;
1585: goto endofwhile;
1586: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1587: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1588: } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1589: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1590: } else {
1591: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1592: }
1593: }
1594: rtmp1[row] = 1.0/rtmp1[row];
1595: rtmp2[row+1] = 1.0/rtmp2[row+1];
1596: rtmp3[row+2] = 1.0/rtmp3[row+2];
1597: /* copy row entries from dense representation to sparse */
1598: for (j=0; j<nz; j++) {
1599: idx = pj[j];
1600: pc1[j] = rtmps1[idx];
1601: pc2[j] = rtmps2[idx];
1602: pc3[j] = rtmps3[idx];
1603: }
1604: break;
1606: default:
1607: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1608: }
1609: row += nsz; /* Update the row */
1610: }
1611: endofwhile:;
1612: } while (damp);
1613: PetscFree(rtmp1);
1614: PetscFree(tmp_vec2);
1615: ISRestoreIndices(isicol,&ic);
1616: ISRestoreIndices(isrow,&r);
1617: ISRestoreIndices(iscol,&c);
1618: C->factor = FACTOR_LU;
1619: C->assembled = PETSC_TRUE;
1620: if (ndamp || b->lu_damping) {
1621: PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %g\n",ndamp,damping);
1622: }
1623: PetscLogFlops(C->n);
1624: return(0);
1625: }
1627: /*
1628: This is really ugly. if inodes are used this replaces the
1629: permutations with ones that correspond to rows/cols of the matrix
1630: rather then inode blocks
1631: */
1634: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1635: {
1636: int ierr,(*f)(Mat,IS*,IS*);
1639: PetscObjectQueryFunction((PetscObject)A,"MatAdjustForInodes_C",(void (**)(void))&f);
1640: if (f) {
1641: (*f)(A,rperm,cperm);
1642: }
1643: return(0);
1644: }
1646: EXTERN_C_BEGIN
1649: int MatAdjustForInodes_SeqAIJ(Mat A,IS *rperm,IS *cperm)
1650: {
1651: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1652: int ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1653: int row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1654: int nslim_col,*ns_col;
1655: IS ris = *rperm,cis = *cperm;
1658: if (!a->inode.size) return(0); /* no inodes so return */
1659: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1661: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1662: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1663: PetscMalloc((m+n+1)*sizeof(int),&permr);
1664: permc = permr + m;
1666: ISGetIndices(ris,&ridx);
1667: ISGetIndices(cis,&cidx);
1669: /* Form the inode structure for the rows of permuted matric using inv perm*/
1670: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1672: /* Construct the permutations for rows*/
1673: for (i=0,row = 0; i<nslim_row; ++i){
1674: indx = ridx[i];
1675: start_val = tns[indx];
1676: end_val = tns[indx + 1];
1677: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1678: }
1680: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1681: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1683: /* Construct permutations for columns */
1684: for (i=0,col=0; i<nslim_col; ++i){
1685: indx = cidx[i];
1686: start_val = tns[indx];
1687: end_val = tns[indx + 1];
1688: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1689: }
1691: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1692: ISSetPermutation(*rperm);
1693: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1694: ISSetPermutation(*cperm);
1695:
1696: ISRestoreIndices(ris,&ridx);
1697: ISRestoreIndices(cis,&cidx);
1699: PetscFree(ns_col);
1700: PetscFree(permr);
1701: ISDestroy(cis);
1702: ISDestroy(ris);
1703: PetscFree(tns);
1704: return(0);
1705: }
1706: EXTERN_C_END
1710: /*@C
1711: MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.
1713: Collective on Mat
1715: Input Parameter:
1716: . A - the SeqAIJ matrix.
1718: Output Parameter:
1719: + node_count - no of inodes present in the matrix.
1720: . sizes - an array of size node_count,with sizes of each inode.
1721: - limit - the max size used to generate the inodes.
1723: Level: advanced
1725: Notes: This routine returns some internal storage information
1726: of the matrix, it is intended to be used by advanced users.
1727: It should be called after the matrix is assembled.
1728: The contents of the sizes[] array should not be changed.
1730: .keywords: matrix, seqaij, get, inode
1732: .seealso: MatGetInfo()
1733: @*/
1734: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1735: {
1736: int ierr,(*f)(Mat,int*,int*[],int*);
1739: PetscObjectQueryFunction((PetscObject)A,"MatSeqAIJGetInodeSizes_C",(void (**)(void))&f);
1740: if (f) {
1741: (*f)(A,node_count,sizes,limit);
1742: }
1743: return(0);
1744: }
1746: EXTERN_C_BEGIN
1749: int MatSeqAIJGetInodeSizes_SeqAIJ(Mat A,int *node_count,int *sizes[],int *limit)
1750: {
1751: Mat_SeqAIJ *a;
1755: a = (Mat_SeqAIJ*)A->data;
1756: *node_count = a->inode.node_count;
1757: *sizes = a->inode.size;
1758: *limit = a->inode.limit;
1759: return(0);
1760: }
1761: EXTERN_C_END