Actual source code: aijnode.c

  1: /*
  2:   This file provides high performance routines for the AIJ (compressed row)
  3:   format by taking advantage of rows with identical nonzero structure (I-nodes).
  4: */
 5:  #include src/mat/impls/aij/seq/aij.h

  7: EXTERN PetscErrorCode Mat_AIJ_CheckInode(Mat,PetscTruth);
  8: EXTERN PetscErrorCode MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
  9: EXTERN PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);

 11: EXTERN PetscErrorCode MatMult_SeqAIJ(Mat,Vec,Vec);
 12: EXTERN PetscErrorCode MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
 13: EXTERN PetscErrorCode MatSolve_SeqAIJ(Mat,Vec,Vec);
 14: EXTERN PetscErrorCode MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
 15: EXTERN PetscErrorCode MatGetRowIJ_SeqAIJ(Mat,PetscInt,PetscTruth,PetscInt*,PetscInt*[],PetscInt*[],PetscTruth*);
 16: EXTERN PetscErrorCode MatRestoreRowIJ_SeqAIJ(Mat,PetscInt,PetscTruth,PetscInt*,PetscInt*[],PetscInt*[],PetscTruth*);
 17: EXTERN PetscErrorCode MatGetColumnIJ_SeqAIJ(Mat,PetscInt,PetscTruth,PetscInt*,PetscInt*[],PetscInt*[],PetscTruth*);
 18: EXTERN PetscErrorCode MatRestoreColumnIJ_SeqAIJ(Mat,PetscInt,PetscTruth,PetscInt*,PetscInt*[],PetscInt*[],PetscTruth*);

 22: static PetscErrorCode Mat_AIJ_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 23: {
 24:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 26:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 29:   n      = A->n;
 30:   m      = A->m;
 31:   ns_row = a->inode.size;
 32: 
 33:   min_mn = (m < n) ? m : n;
 34:   if (!ns) {
 35:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 36:     for(; count+1 < n; count++,i++);
 37:     if (count < n)  {
 38:       i++;
 39:     }
 40:     *size = i;
 41:     return(0);
 42:   }
 43:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 44: 
 45:   /* Use the same row structure wherever feasible. */
 46:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 47:     ns_col[i] = ns_row[i];
 48:   }

 50:   /* if m < n; pad up the remainder with inode_limit */
 51:   for(; count+1 < n; count++,i++) {
 52:     ns_col[i] = 1;
 53:   }
 54:   /* The last node is the odd ball. padd it up with the remaining rows; */
 55:   if (count < n)  {
 56:     ns_col[i] = n - count;
 57:     i++;
 58:   } else if (count > n) {
 59:     /* Adjust for the over estimation */
 60:     ns_col[i-1] += n - count;
 61:   }
 62:   *size = i;
 63:   *ns   = ns_col;
 64:   return(0);
 65: }


 68: /*
 69:       This builds symmetric version of nonzero structure,
 70: */
 73: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 74: {
 75:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 77:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 78:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 81:   nslim_row = a->inode.node_count;
 82:   m         = A->m;
 83:   n         = A->n;
 84:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
 85: 
 86:   /* Use the row_inode as column_inode */
 87:   nslim_col = nslim_row;
 88:   ns_col    = ns_row;

 90:   /* allocate space for reformated inode structure */
 91:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 92:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 93:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 95:   for (i1=0,col=0; i1<nslim_col; ++i1){
 96:     nsz = ns_col[i1];
 97:     for (i2=0; i2<nsz; ++i2,++col)
 98:       tvc[col] = i1;
 99:   }
100:   /* allocate space for row pointers */
101:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
102:   *iia = ia;
103:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
104:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

106:   /* determine the number of columns in each row */
107:   ia[0] = oshift;
108:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

110:     j    = aj + ai[row] + ishift;
111:     jmax = aj + ai[row+1] + ishift;
112:     i2   = 0;
113:     col  = *j++ + ishift;
114:     i2   = tvc[col];
115:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
116:       ia[i1+1]++;
117:       ia[i2+1]++;
118:       i2++;                     /* Start col of next node */
119:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
120:       i2 = tvc[col];
121:     }
122:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
123:   }

125:   /* shift ia[i] to point to next row */
126:   for (i1=1; i1<nslim_row+1; i1++) {
127:     row        = ia[i1-1];
128:     ia[i1]    += row;
129:     work[i1-1] = row - oshift;
130:   }

132:   /* allocate space for column pointers */
133:   nz   = ia[nslim_row] + (!ishift);
134:   PetscMalloc(nz*sizeof(PetscInt),&ja);
135:   *jja = ja;

137:  /* loop over lower triangular part putting into ja */
138:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
139:     j    = aj + ai[row] + ishift;
140:     jmax = aj + ai[row+1] + ishift;
141:     i2   = 0;                     /* Col inode index */
142:     col  = *j++ + ishift;
143:     i2   = tvc[col];
144:     while (i2<i1 && j<jmax) {
145:       ja[work[i2]++] = i1 + oshift;
146:       ja[work[i1]++] = i2 + oshift;
147:       ++i2;
148:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
149:       i2 = tvc[col];
150:     }
151:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

153:   }
154:   PetscFree(work);
155:   PetscFree(tns);
156:   PetscFree(tvc);
157:   return(0);
158: }

160: /*
161:       This builds nonsymmetric version of nonzero structure,
162: */
165: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
166: {
167:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
169:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
170:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

173:   nslim_row = a->inode.node_count;
174:   n         = A->n;

176:   /* Create The column_inode for this matrix */
177:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
178: 
179:   /* allocate space for reformated column_inode structure */
180:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
181:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
182:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

184:   for (i1=0,col=0; i1<nslim_col; ++i1){
185:     nsz = ns_col[i1];
186:     for (i2=0; i2<nsz; ++i2,++col)
187:       tvc[col] = i1;
188:   }
189:   /* allocate space for row pointers */
190:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
191:   *iia = ia;
192:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
193:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

195:   /* determine the number of columns in each row */
196:   ia[0] = oshift;
197:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
198:     j   = aj + ai[row] + ishift;
199:     col = *j++ + ishift;
200:     i2  = tvc[col];
201:     nz  = ai[row+1] - ai[row];
202:     while (nz-- > 0) {           /* off-diagonal elemets */
203:       ia[i1+1]++;
204:       i2++;                     /* Start col of next node */
205:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
206:       i2 = tvc[col];
207:     }
208:   }

210:   /* shift ia[i] to point to next row */
211:   for (i1=1; i1<nslim_row+1; i1++) {
212:     row        = ia[i1-1];
213:     ia[i1]    += row;
214:     work[i1-1] = row - oshift;
215:   }

217:   /* allocate space for column pointers */
218:   nz   = ia[nslim_row] + (!ishift);
219:   PetscMalloc(nz*sizeof(PetscInt),&ja);
220:   *jja = ja;

222:  /* loop over matrix putting into ja */
223:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
224:     j   = aj + ai[row] + ishift;
225:     i2  = 0;                     /* Col inode index */
226:     col = *j++ + ishift;
227:     i2  = tvc[col];
228:     nz  = ai[row+1] - ai[row];
229:     while (nz-- > 0) {
230:       ja[work[i1]++] = i2 + oshift;
231:       ++i2;
232:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
233:       i2 = tvc[col];
234:     }
235:   }
236:   PetscFree(ns_col);
237:   PetscFree(work);
238:   PetscFree(tns);
239:   PetscFree(tvc);
240:   return(0);
241: }

245: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
246: {
247:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

251:   *n     = a->inode.node_count;
252:   if (!ia) return(0);

254:   if (symmetric) {
255:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
256:   } else {
257:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
258:   }
259:   return(0);
260: }

264: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
265: {

269:   if (!ia) return(0);
270:   PetscFree(*ia);
271:   PetscFree(*ja);
272:   return(0);
273: }

275: /* ----------------------------------------------------------- */

279: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
280: {
281:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
283:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
284:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

287:   nslim_row = a->inode.node_count;
288:   n         = A->n;

290:   /* Create The column_inode for this matrix */
291:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
292: 
293:   /* allocate space for reformated column_inode structure */
294:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
295:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
296:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

298:   for (i1=0,col=0; i1<nslim_col; ++i1){
299:     nsz = ns_col[i1];
300:     for (i2=0; i2<nsz; ++i2,++col)
301:       tvc[col] = i1;
302:   }
303:   /* allocate space for column pointers */
304:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
305:   *iia = ia;
306:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
307:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

309:   /* determine the number of columns in each row */
310:   ia[0] = oshift;
311:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
312:     j   = aj + ai[row] + ishift;
313:     col = *j++ + ishift;
314:     i2  = tvc[col];
315:     nz  = ai[row+1] - ai[row];
316:     while (nz-- > 0) {           /* off-diagonal elemets */
317:       /* ia[i1+1]++; */
318:       ia[i2+1]++;
319:       i2++;
320:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
321:       i2 = tvc[col];
322:     }
323:   }

325:   /* shift ia[i] to point to next col */
326:   for (i1=1; i1<nslim_col+1; i1++) {
327:     col        = ia[i1-1];
328:     ia[i1]    += col;
329:     work[i1-1] = col - oshift;
330:   }

332:   /* allocate space for column pointers */
333:   nz   = ia[nslim_col] + (!ishift);
334:   PetscMalloc(nz*sizeof(PetscInt),&ja);
335:   *jja = ja;

337:  /* loop over matrix putting into ja */
338:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
339:     j   = aj + ai[row] + ishift;
340:     i2  = 0;                     /* Col inode index */
341:     col = *j++ + ishift;
342:     i2  = tvc[col];
343:     nz  = ai[row+1] - ai[row];
344:     while (nz-- > 0) {
345:       /* ja[work[i1]++] = i2 + oshift; */
346:       ja[work[i2]++] = i1 + oshift;
347:       i2++;
348:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
349:       i2 = tvc[col];
350:     }
351:   }
352:   PetscFree(ns_col);
353:   PetscFree(work);
354:   PetscFree(tns);
355:   PetscFree(tvc);
356:   return(0);
357: }

361: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
362: {

366:   Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
367:   if (!ia) return(0);

369:   if (symmetric) {
370:     /* Since the indices are symmetric it does'nt matter */
371:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
372:   } else {
373:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
374:   }
375:   return(0);
376: }

380: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
381: {

385:   if (!ia) return(0);
386:   PetscFree(*ia);
387:   PetscFree(*ja);
388:   return(0);
389: }

391: /* ----------------------------------------------------------- */

395: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
396: {
397:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
398:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
399:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
401:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
402: 
403: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
404: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
405: #endif

408:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
409:   node_max = a->inode.node_count;
410:   ns       = a->inode.size;     /* Node Size array */
411:   VecGetArray(xx,&x);
412:   VecGetArray(yy,&y);
413:   idx  = a->j;
414:   v1   = a->a;
415:   ii   = a->i;

417:   for (i = 0,row = 0; i< node_max; ++i){
418:     nsz  = ns[i];
419:     n    = ii[1] - ii[0];
420:     ii  += nsz;
421:     sz   = n;                   /* No of non zeros in this row */
422:                                 /* Switch on the size of Node */
423:     switch (nsz){               /* Each loop in 'case' is unrolled */
424:     case 1 :
425:       sum1  = 0;
426: 
427:       for(n = 0; n< sz-1; n+=2) {
428:         i1   = idx[0];          /* The instructions are ordered to */
429:         i2   = idx[1];          /* make the compiler's job easy */
430:         idx += 2;
431:         tmp0 = x[i1];
432:         tmp1 = x[i2];
433:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
434:        }
435: 
436:       if (n == sz-1){          /* Take care of the last nonzero  */
437:         tmp0  = x[*idx++];
438:         sum1 += *v1++ * tmp0;
439:       }
440:       y[row++]=sum1;
441:       break;
442:     case 2:
443:       sum1  = 0;
444:       sum2  = 0;
445:       v2    = v1 + n;
446: 
447:       for (n = 0; n< sz-1; n+=2) {
448:         i1   = idx[0];
449:         i2   = idx[1];
450:         idx += 2;
451:         tmp0 = x[i1];
452:         tmp1 = x[i2];
453:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
454:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
455:       }
456:       if (n == sz-1){
457:         tmp0  = x[*idx++];
458:         sum1 += *v1++ * tmp0;
459:         sum2 += *v2++ * tmp0;
460:       }
461:       y[row++]=sum1;
462:       y[row++]=sum2;
463:       v1      =v2;              /* Since the next block to be processed starts there*/
464:       idx    +=sz;
465:       break;
466:     case 3:
467:       sum1  = 0;
468:       sum2  = 0;
469:       sum3  = 0;
470:       v2    = v1 + n;
471:       v3    = v2 + n;
472: 
473:       for (n = 0; n< sz-1; n+=2) {
474:         i1   = idx[0];
475:         i2   = idx[1];
476:         idx += 2;
477:         tmp0 = x[i1];
478:         tmp1 = x[i2];
479:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
480:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
481:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
482:       }
483:       if (n == sz-1){
484:         tmp0  = x[*idx++];
485:         sum1 += *v1++ * tmp0;
486:         sum2 += *v2++ * tmp0;
487:         sum3 += *v3++ * tmp0;
488:       }
489:       y[row++]=sum1;
490:       y[row++]=sum2;
491:       y[row++]=sum3;
492:       v1       =v3;             /* Since the next block to be processed starts there*/
493:       idx     +=2*sz;
494:       break;
495:     case 4:
496:       sum1  = 0;
497:       sum2  = 0;
498:       sum3  = 0;
499:       sum4  = 0;
500:       v2    = v1 + n;
501:       v3    = v2 + n;
502:       v4    = v3 + n;
503: 
504:       for (n = 0; n< sz-1; n+=2) {
505:         i1   = idx[0];
506:         i2   = idx[1];
507:         idx += 2;
508:         tmp0 = x[i1];
509:         tmp1 = x[i2];
510:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
511:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
512:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
513:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
514:       }
515:       if (n == sz-1){
516:         tmp0  = x[*idx++];
517:         sum1 += *v1++ * tmp0;
518:         sum2 += *v2++ * tmp0;
519:         sum3 += *v3++ * tmp0;
520:         sum4 += *v4++ * tmp0;
521:       }
522:       y[row++]=sum1;
523:       y[row++]=sum2;
524:       y[row++]=sum3;
525:       y[row++]=sum4;
526:       v1      =v4;              /* Since the next block to be processed starts there*/
527:       idx    +=3*sz;
528:       break;
529:     case 5:
530:       sum1  = 0;
531:       sum2  = 0;
532:       sum3  = 0;
533:       sum4  = 0;
534:       sum5  = 0;
535:       v2    = v1 + n;
536:       v3    = v2 + n;
537:       v4    = v3 + n;
538:       v5    = v4 + n;
539: 
540:       for (n = 0; n<sz-1; n+=2) {
541:         i1   = idx[0];
542:         i2   = idx[1];
543:         idx += 2;
544:         tmp0 = x[i1];
545:         tmp1 = x[i2];
546:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
547:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
548:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
549:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
550:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
551:       }
552:       if (n == sz-1){
553:         tmp0  = x[*idx++];
554:         sum1 += *v1++ * tmp0;
555:         sum2 += *v2++ * tmp0;
556:         sum3 += *v3++ * tmp0;
557:         sum4 += *v4++ * tmp0;
558:         sum5 += *v5++ * tmp0;
559:       }
560:       y[row++]=sum1;
561:       y[row++]=sum2;
562:       y[row++]=sum3;
563:       y[row++]=sum4;
564:       y[row++]=sum5;
565:       v1      =v5;       /* Since the next block to be processed starts there */
566:       idx    +=4*sz;
567:       break;
568:     default :
569:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
570:     }
571:   }
572:   VecRestoreArray(xx,&x);
573:   VecRestoreArray(yy,&y);
574:   PetscLogFlops(2*a->nz - A->m);
575:   return(0);
576: }
577: /* ----------------------------------------------------------- */
578: /* Almost same code as the MatMult_SeqAij_Inode() */
581: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
582: {
583:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
584:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
585:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
587:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
588: 
590:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
591:   node_max = a->inode.node_count;
592:   ns       = a->inode.size;     /* Node Size array */
593:   VecGetArray(xx,&x);
594:   VecGetArray(yy,&y);
595:   if (zz != yy) {
596:     VecGetArray(zz,&z);
597:   } else {
598:     z = y;
599:   }
600:   zt = z;

602:   idx  = a->j;
603:   v1   = a->a;
604:   ii   = a->i;

606:   for (i = 0,row = 0; i< node_max; ++i){
607:     nsz  = ns[i];
608:     n    = ii[1] - ii[0];
609:     ii  += nsz;
610:     sz   = n;                   /* No of non zeros in this row */
611:                                 /* Switch on the size of Node */
612:     switch (nsz){               /* Each loop in 'case' is unrolled */
613:     case 1 :
614:       sum1  = *zt++;
615: 
616:       for(n = 0; n< sz-1; n+=2) {
617:         i1   = idx[0];          /* The instructions are ordered to */
618:         i2   = idx[1];          /* make the compiler's job easy */
619:         idx += 2;
620:         tmp0 = x[i1];
621:         tmp1 = x[i2];
622:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623:        }
624: 
625:       if(n   == sz-1){          /* Take care of the last nonzero  */
626:         tmp0  = x[*idx++];
627:         sum1 += *v1++ * tmp0;
628:       }
629:       y[row++]=sum1;
630:       break;
631:     case 2:
632:       sum1  = *zt++;
633:       sum2  = *zt++;
634:       v2    = v1 + n;
635: 
636:       for(n = 0; n< sz-1; n+=2) {
637:         i1   = idx[0];
638:         i2   = idx[1];
639:         idx += 2;
640:         tmp0 = x[i1];
641:         tmp1 = x[i2];
642:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644:       }
645:       if(n   == sz-1){
646:         tmp0  = x[*idx++];
647:         sum1 += *v1++ * tmp0;
648:         sum2 += *v2++ * tmp0;
649:       }
650:       y[row++]=sum1;
651:       y[row++]=sum2;
652:       v1      =v2;              /* Since the next block to be processed starts there*/
653:       idx    +=sz;
654:       break;
655:     case 3:
656:       sum1  = *zt++;
657:       sum2  = *zt++;
658:       sum3  = *zt++;
659:       v2    = v1 + n;
660:       v3    = v2 + n;
661: 
662:       for (n = 0; n< sz-1; n+=2) {
663:         i1   = idx[0];
664:         i2   = idx[1];
665:         idx += 2;
666:         tmp0 = x[i1];
667:         tmp1 = x[i2];
668:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671:       }
672:       if (n == sz-1){
673:         tmp0  = x[*idx++];
674:         sum1 += *v1++ * tmp0;
675:         sum2 += *v2++ * tmp0;
676:         sum3 += *v3++ * tmp0;
677:       }
678:       y[row++]=sum1;
679:       y[row++]=sum2;
680:       y[row++]=sum3;
681:       v1       =v3;             /* Since the next block to be processed starts there*/
682:       idx     +=2*sz;
683:       break;
684:     case 4:
685:       sum1  = *zt++;
686:       sum2  = *zt++;
687:       sum3  = *zt++;
688:       sum4  = *zt++;
689:       v2    = v1 + n;
690:       v3    = v2 + n;
691:       v4    = v3 + n;
692: 
693:       for (n = 0; n< sz-1; n+=2) {
694:         i1   = idx[0];
695:         i2   = idx[1];
696:         idx += 2;
697:         tmp0 = x[i1];
698:         tmp1 = x[i2];
699:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703:       }
704:       if (n == sz-1){
705:         tmp0  = x[*idx++];
706:         sum1 += *v1++ * tmp0;
707:         sum2 += *v2++ * tmp0;
708:         sum3 += *v3++ * tmp0;
709:         sum4 += *v4++ * tmp0;
710:       }
711:       y[row++]=sum1;
712:       y[row++]=sum2;
713:       y[row++]=sum3;
714:       y[row++]=sum4;
715:       v1      =v4;              /* Since the next block to be processed starts there*/
716:       idx    +=3*sz;
717:       break;
718:     case 5:
719:       sum1  = *zt++;
720:       sum2  = *zt++;
721:       sum3  = *zt++;
722:       sum4  = *zt++;
723:       sum5  = *zt++;
724:       v2    = v1 + n;
725:       v3    = v2 + n;
726:       v4    = v3 + n;
727:       v5    = v4 + n;
728: 
729:       for (n = 0; n<sz-1; n+=2) {
730:         i1   = idx[0];
731:         i2   = idx[1];
732:         idx += 2;
733:         tmp0 = x[i1];
734:         tmp1 = x[i2];
735:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740:       }
741:       if(n   == sz-1){
742:         tmp0  = x[*idx++];
743:         sum1 += *v1++ * tmp0;
744:         sum2 += *v2++ * tmp0;
745:         sum3 += *v3++ * tmp0;
746:         sum4 += *v4++ * tmp0;
747:         sum5 += *v5++ * tmp0;
748:       }
749:       y[row++]=sum1;
750:       y[row++]=sum2;
751:       y[row++]=sum3;
752:       y[row++]=sum4;
753:       y[row++]=sum5;
754:       v1      =v5;       /* Since the next block to be processed starts there */
755:       idx    +=4*sz;
756:       break;
757:     default :
758:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
759:     }
760:   }
761:   VecRestoreArray(xx,&x);
762:   VecRestoreArray(yy,&y);
763:   if (zz != yy) {
764:     VecRestoreArray(zz,&z);
765:   }
766:   PetscLogFlops(2*a->nz);
767:   return(0);
768: }
769: /* ----------------------------------------------------------- */
770: EXTERN PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat,PetscInt,PetscInt,ISColoringValue[],ISColoring *);

772: /*
773:     samestructure indicates that the matrix has not changed its nonzero structure so we 
774:     do not need to recompute the inodes 
775: */
778: PetscErrorCode Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
779: {
780:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
782:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
783:   PetscTruth     flag,flg;

786:   if (samestructure && a->inode.checked) return(0);

788:   a->inode.checked = PETSC_TRUE;

790:   /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
791:   if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
792:   PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
793:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inode\n");return(0);}
794:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
795:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unroll\n");return(0);}
796:   PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
797:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
798:   m = A->m;
799:   if (a->inode.size) {ns = a->inode.size;}
800:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

802:   i          = 0;
803:   node_count = 0;
804:   idx        = a->j;
805:   ii         = a->i;
806:   while (i < m){                /* For each row */
807:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
808:     /* Limits the number of elements in a node to 'a->inode.limit' */
809:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
810:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
811:       if (nzy != nzx) break;
812:       idy  += nzx;             /* Same nonzero pattern */
813:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
814:       if (flag == PETSC_FALSE) break;
815:     }
816:     ns[node_count++] = blk_size;
817:     idx += blk_size*nzx;
818:     i    = j;
819:   }
820:   /* If not enough inodes found,, do not use inode version of the routines */
821:   if (!a->inode.size && m && node_count > 0.9*m) {
822:     PetscFree(ns);
823:     A->ops->mult            = MatMult_SeqAIJ;
824:     A->ops->multadd         = MatMultAdd_SeqAIJ;
825:     A->ops->solve           = MatSolve_SeqAIJ;
826:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
827:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ;
828:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ;
829:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ;
830:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
831:     A->ops->coloringpatch   = 0;
832:     a->inode.node_count     = 0;
833:     a->inode.size           = 0;
834:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
835:   } else {
836:     A->ops->mult            = MatMult_SeqAIJ_Inode;
837:     A->ops->multadd         = MatMultAdd_SeqAIJ_Inode;
838:     A->ops->solve           = MatSolve_SeqAIJ_Inode;
839:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
840:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
841:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
842:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
843:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
844:     A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
845:     a->inode.node_count     = node_count;
846:     a->inode.size           = ns;
847:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
848:   }
849:   return(0);
850: }

852: /* ----------------------------------------------------------- */
855: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
856: {
857:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
858:   IS             iscol = a->col,isrow = a->row;
860:   PetscInt       *r,*c,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
861:   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
862:   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
863:   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

866:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
867:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
868:   node_max = a->inode.node_count;
869:   ns       = a->inode.size;     /* Node Size array */

871:   VecGetArray(bb,&b);
872:   VecGetArray(xx,&x);
873:   tmp  = a->solve_work;
874: 
875:   ISGetIndices(isrow,&rout); r = rout;
876:   ISGetIndices(iscol,&cout); c = cout + (n-1);
877: 
878:   /* forward solve the lower triangular */
879:   tmps = tmp ;
880:   aa   = a_a ;
881:   aj   = a_j ;
882:   ad   = a->diag;

884:   for (i = 0,row = 0; i< node_max; ++i){
885:     nsz = ns[i];
886:     aii = ai[row];
887:     v1  = aa + aii;
888:     vi  = aj + aii;
889:     nz  = ad[row]- aii;
890: 
891:     switch (nsz){               /* Each loop in 'case' is unrolled */
892:     case 1 :
893:       sum1 = b[*r++];
894:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
895:       for(j=0; j<nz-1; j+=2){
896:         i0   = vi[0];
897:         i1   = vi[1];
898:         vi  +=2;
899:         tmp0 = tmps[i0];
900:         tmp1 = tmps[i1];
901:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
902:       }
903:       if(j == nz-1){
904:         tmp0 = tmps[*vi++];
905:         sum1 -= *v1++ *tmp0;
906:       }
907:       tmp[row ++]=sum1;
908:       break;
909:     case 2:
910:       sum1 = b[*r++];
911:       sum2 = b[*r++];
912:       v2   = aa + ai[row+1];

914:       for(j=0; j<nz-1; j+=2){
915:         i0   = vi[0];
916:         i1   = vi[1];
917:         vi  +=2;
918:         tmp0 = tmps[i0];
919:         tmp1 = tmps[i1];
920:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
921:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
922:       }
923:       if(j == nz-1){
924:         tmp0 = tmps[*vi++];
925:         sum1 -= *v1++ *tmp0;
926:         sum2 -= *v2++ *tmp0;
927:       }
928:       sum2 -= *v2++ * sum1;
929:       tmp[row ++]=sum1;
930:       tmp[row ++]=sum2;
931:       break;
932:     case 3:
933:       sum1 = b[*r++];
934:       sum2 = b[*r++];
935:       sum3 = b[*r++];
936:       v2   = aa + ai[row+1];
937:       v3   = aa + ai[row+2];
938: 
939:       for (j=0; j<nz-1; j+=2){
940:         i0   = vi[0];
941:         i1   = vi[1];
942:         vi  +=2;
943:         tmp0 = tmps[i0];
944:         tmp1 = tmps[i1];
945:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948:       }
949:       if (j == nz-1){
950:         tmp0 = tmps[*vi++];
951:         sum1 -= *v1++ *tmp0;
952:         sum2 -= *v2++ *tmp0;
953:         sum3 -= *v3++ *tmp0;
954:       }
955:       sum2 -= *v2++ * sum1;
956:       sum3 -= *v3++ * sum1;
957:       sum3 -= *v3++ * sum2;
958:       tmp[row ++]=sum1;
959:       tmp[row ++]=sum2;
960:       tmp[row ++]=sum3;
961:       break;
962: 
963:     case 4:
964:       sum1 = b[*r++];
965:       sum2 = b[*r++];
966:       sum3 = b[*r++];
967:       sum4 = b[*r++];
968:       v2   = aa + ai[row+1];
969:       v3   = aa + ai[row+2];
970:       v4   = aa + ai[row+3];
971: 
972:       for (j=0; j<nz-1; j+=2){
973:         i0   = vi[0];
974:         i1   = vi[1];
975:         vi  +=2;
976:         tmp0 = tmps[i0];
977:         tmp1 = tmps[i1];
978:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
979:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
980:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
981:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
982:       }
983:       if (j == nz-1){
984:         tmp0 = tmps[*vi++];
985:         sum1 -= *v1++ *tmp0;
986:         sum2 -= *v2++ *tmp0;
987:         sum3 -= *v3++ *tmp0;
988:         sum4 -= *v4++ *tmp0;
989:       }
990:       sum2 -= *v2++ * sum1;
991:       sum3 -= *v3++ * sum1;
992:       sum4 -= *v4++ * sum1;
993:       sum3 -= *v3++ * sum2;
994:       sum4 -= *v4++ * sum2;
995:       sum4 -= *v4++ * sum3;
996: 
997:       tmp[row ++]=sum1;
998:       tmp[row ++]=sum2;
999:       tmp[row ++]=sum3;
1000:       tmp[row ++]=sum4;
1001:       break;
1002:     case 5:
1003:       sum1 = b[*r++];
1004:       sum2 = b[*r++];
1005:       sum3 = b[*r++];
1006:       sum4 = b[*r++];
1007:       sum5 = b[*r++];
1008:       v2   = aa + ai[row+1];
1009:       v3   = aa + ai[row+2];
1010:       v4   = aa + ai[row+3];
1011:       v5   = aa + ai[row+4];
1012: 
1013:       for (j=0; j<nz-1; j+=2){
1014:         i0   = vi[0];
1015:         i1   = vi[1];
1016:         vi  +=2;
1017:         tmp0 = tmps[i0];
1018:         tmp1 = tmps[i1];
1019:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1020:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1021:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1022:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1023:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1024:       }
1025:       if (j == nz-1){
1026:         tmp0 = tmps[*vi++];
1027:         sum1 -= *v1++ *tmp0;
1028:         sum2 -= *v2++ *tmp0;
1029:         sum3 -= *v3++ *tmp0;
1030:         sum4 -= *v4++ *tmp0;
1031:         sum5 -= *v5++ *tmp0;
1032:       }

1034:       sum2 -= *v2++ * sum1;
1035:       sum3 -= *v3++ * sum1;
1036:       sum4 -= *v4++ * sum1;
1037:       sum5 -= *v5++ * sum1;
1038:       sum3 -= *v3++ * sum2;
1039:       sum4 -= *v4++ * sum2;
1040:       sum5 -= *v5++ * sum2;
1041:       sum4 -= *v4++ * sum3;
1042:       sum5 -= *v5++ * sum3;
1043:       sum5 -= *v5++ * sum4;
1044: 
1045:       tmp[row ++]=sum1;
1046:       tmp[row ++]=sum2;
1047:       tmp[row ++]=sum3;
1048:       tmp[row ++]=sum4;
1049:       tmp[row ++]=sum5;
1050:       break;
1051:     default:
1052:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1053:     }
1054:   }
1055:   /* backward solve the upper triangular */
1056:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1057:     nsz = ns[i];
1058:     aii = ai[row+1] -1;
1059:     v1  = aa + aii;
1060:     vi  = aj + aii;
1061:     nz  = aii- ad[row];
1062:     switch (nsz){               /* Each loop in 'case' is unrolled */
1063:     case 1 :
1064:       sum1 = tmp[row];

1066:       for(j=nz ; j>1; j-=2){
1067:         i0   = vi[0];
1068:         i1   = vi[-1];
1069:         vi  -=2;
1070:         tmp0 = tmps[i0];
1071:         tmp1 = tmps[i1];
1072:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1073:       }
1074:       if (j==1){
1075:         tmp0  = tmps[*vi--];
1076:         sum1 -= *v1-- * tmp0;
1077:       }
1078:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079:       break;
1080:     case 2 :
1081:       sum1 = tmp[row];
1082:       sum2 = tmp[row -1];
1083:       v2   = aa + ai[row]-1;
1084:       for (j=nz ; j>1; j-=2){
1085:         i0   = vi[0];
1086:         i1   = vi[-1];
1087:         vi  -=2;
1088:         tmp0 = tmps[i0];
1089:         tmp1 = tmps[i1];
1090:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1091:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1092:       }
1093:       if (j==1){
1094:         tmp0  = tmps[*vi--];
1095:         sum1 -= *v1-- * tmp0;
1096:         sum2 -= *v2-- * tmp0;
1097:       }
1098: 
1099:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100:       sum2   -= *v2-- * tmp0;
1101:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102:       break;
1103:     case 3 :
1104:       sum1 = tmp[row];
1105:       sum2 = tmp[row -1];
1106:       sum3 = tmp[row -2];
1107:       v2   = aa + ai[row]-1;
1108:       v3   = aa + ai[row -1]-1;
1109:       for (j=nz ; j>1; j-=2){
1110:         i0   = vi[0];
1111:         i1   = vi[-1];
1112:         vi  -=2;
1113:         tmp0 = tmps[i0];
1114:         tmp1 = tmps[i1];
1115:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1116:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1117:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1118:       }
1119:       if (j==1){
1120:         tmp0  = tmps[*vi--];
1121:         sum1 -= *v1-- * tmp0;
1122:         sum2 -= *v2-- * tmp0;
1123:         sum3 -= *v3-- * tmp0;
1124:       }
1125:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126:       sum2   -= *v2-- * tmp0;
1127:       sum3   -= *v3-- * tmp0;
1128:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1129:       sum3   -= *v3-- * tmp0;
1130:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1131: 
1132:       break;
1133:     case 4 :
1134:       sum1 = tmp[row];
1135:       sum2 = tmp[row -1];
1136:       sum3 = tmp[row -2];
1137:       sum4 = tmp[row -3];
1138:       v2   = aa + ai[row]-1;
1139:       v3   = aa + ai[row -1]-1;
1140:       v4   = aa + ai[row -2]-1;

1142:       for (j=nz ; j>1; j-=2){
1143:         i0   = vi[0];
1144:         i1   = vi[-1];
1145:         vi  -=2;
1146:         tmp0 = tmps[i0];
1147:         tmp1 = tmps[i1];
1148:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1149:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1150:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1151:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1152:       }
1153:       if (j==1){
1154:         tmp0  = tmps[*vi--];
1155:         sum1 -= *v1-- * tmp0;
1156:         sum2 -= *v2-- * tmp0;
1157:         sum3 -= *v3-- * tmp0;
1158:         sum4 -= *v4-- * tmp0;
1159:       }

1161:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1162:       sum2   -= *v2-- * tmp0;
1163:       sum3   -= *v3-- * tmp0;
1164:       sum4   -= *v4-- * tmp0;
1165:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1166:       sum3   -= *v3-- * tmp0;
1167:       sum4   -= *v4-- * tmp0;
1168:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1169:       sum4   -= *v4-- * tmp0;
1170:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1171:       break;
1172:     case 5 :
1173:       sum1 = tmp[row];
1174:       sum2 = tmp[row -1];
1175:       sum3 = tmp[row -2];
1176:       sum4 = tmp[row -3];
1177:       sum5 = tmp[row -4];
1178:       v2   = aa + ai[row]-1;
1179:       v3   = aa + ai[row -1]-1;
1180:       v4   = aa + ai[row -2]-1;
1181:       v5   = aa + ai[row -3]-1;
1182:       for (j=nz ; j>1; j-=2){
1183:         i0   = vi[0];
1184:         i1   = vi[-1];
1185:         vi  -=2;
1186:         tmp0 = tmps[i0];
1187:         tmp1 = tmps[i1];
1188:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1189:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1190:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1191:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1192:         sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1193:       }
1194:       if (j==1){
1195:         tmp0  = tmps[*vi--];
1196:         sum1 -= *v1-- * tmp0;
1197:         sum2 -= *v2-- * tmp0;
1198:         sum3 -= *v3-- * tmp0;
1199:         sum4 -= *v4-- * tmp0;
1200:         sum5 -= *v5-- * tmp0;
1201:       }

1203:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1204:       sum2   -= *v2-- * tmp0;
1205:       sum3   -= *v3-- * tmp0;
1206:       sum4   -= *v4-- * tmp0;
1207:       sum5   -= *v5-- * tmp0;
1208:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1209:       sum3   -= *v3-- * tmp0;
1210:       sum4   -= *v4-- * tmp0;
1211:       sum5   -= *v5-- * tmp0;
1212:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1213:       sum4   -= *v4-- * tmp0;
1214:       sum5   -= *v5-- * tmp0;
1215:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1216:       sum5   -= *v5-- * tmp0;
1217:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1218:       break;
1219:     default:
1220:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1221:     }
1222:   }
1223:   ISRestoreIndices(isrow,&rout);
1224:   ISRestoreIndices(iscol,&cout);
1225:   VecRestoreArray(bb,&b);
1226:   VecRestoreArray(xx,&x);
1227:   PetscLogFlops(2*a->nz - A->n);
1228:   return(0);
1229: }


1234: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1235: {
1236:   Mat            C = *B;
1237:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1238:   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1240:   PetscInt       *r,*ic,*c,n = A->m,*bi = b->i;
1241:   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1242:   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nsz;
1243:   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1244:   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1245:   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1246:   PetscReal      damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1247:   PetscTruth     damp;

1250:   ISGetIndices(isrow,&r);
1251:   ISGetIndices(iscol,&c);
1252:   ISGetIndices(isicol,&ic);
1253:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1254:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1255:   ics    = ic ; rtmps1 = rtmp1 ;
1256:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 ;
1257:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 ;
1258: 
1259:   node_max = a->inode.node_count;
1260:   ns       = a->inode.size ;
1261:   if (!ns){
1262:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1263:   }

1265:   /* If max inode size > 3, split it into two inodes.*/
1266:   /* also map the inode sizes according to the ordering */
1267:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1268:   for (i=0,j=0; i<node_max; ++i,++j){
1269:     if (ns[i]>3) {
1270:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1271:       ++j;
1272:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1273:     } else {
1274:       tmp_vec1[j] = ns[i];
1275:     }
1276:   }
1277:   /* Use the correct node_max */
1278:   node_max = j;

1280:   /* Now reorder the inode info based on mat re-ordering info */
1281:   /* First create a row -> inode_size_array_index map */
1282:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1283:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1284:   for (i=0,row=0; i<node_max; i++) {
1285:     nsz = tmp_vec1[i];
1286:     for (j=0; j<nsz; j++,row++) {
1287:       nsmap[row] = i;
1288:     }
1289:   }
1290:   /* Using nsmap, create a reordered ns structure */
1291:   for (i=0,j=0; i< node_max; i++) {
1292:     nsz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1293:     tmp_vec2[i] = nsz;
1294:     j        += nsz;
1295:   }
1296:   PetscFree(nsmap);
1297:   PetscFree(tmp_vec1);
1298:   /* Now use the correct ns */
1299:   ns = tmp_vec2;


1302:   do {
1303:     damp = PETSC_FALSE;
1304:     /* Now loop over each block-row, and do the factorization */
1305:     for (i=0,row=0; i<node_max; i++) {
1306:       nsz   = ns[i];
1307:       nz    = bi[row+1] - bi[row];
1308:       bjtmp = bj + bi[row];
1309: 
1310:       switch (nsz){
1311:       case 1:
1312:         for  (j=0; j<nz; j++){
1313:           idx         = bjtmp[j];
1314:           rtmps1[idx] = 0.0;
1315:         }
1316: 
1317:         /* load in initial (unfactored row) */
1318:         idx   = r[row];
1319:         nz    = ai[idx+1] - ai[idx];
1320:         ajtmp = aj + ai[idx];
1321:         v1    = aa + ai[idx];

1323:         for (j=0; j<nz; j++) {
1324:           idx        = ics[ajtmp[j]];
1325:           rtmp1[idx] = v1[j];
1326:           if (ndamp && ajtmp[j] == r[row]) {
1327:             rtmp1[idx] += damping;
1328:           }
1329:         }
1330:         prow = *bjtmp++ ;
1331:         while (prow < row) {
1332:           pc1 = rtmp1 + prow;
1333:           if (*pc1 != 0.0){
1334:             pv   = ba + bd[prow];
1335:             pj   = nbj + bd[prow];
1336:             mul1 = *pc1 * *pv++;
1337:             *pc1 = mul1;
1338:             nz   = bi[prow+1] - bd[prow] - 1;
1339:             PetscLogFlops(2*nz);
1340:             for (j=0; j<nz; j++) {
1341:               tmp = pv[j];
1342:               idx = pj[j];
1343:               rtmps1[idx] -= mul1 * tmp;
1344:             }
1345:           }
1346:           prow = *bjtmp++ ;
1347:         }
1348:         nz  = bi[row+1] - bi[row];
1349:         pj  = bj + bi[row];
1350:         pc1 = ba + bi[row];
1351:         if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1352:           if (damping) {
1353:             if (ndamp) damping *= 2.0;
1354:             damp = PETSC_TRUE;
1355:             ndamp++;
1356:             goto endofwhile;
1357:           } else {
1358:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1359:           }
1360:         }
1361:         rtmp1[row] = 1.0/rtmp1[row];
1362:         for (j=0; j<nz; j++) {
1363:           idx    = pj[j];
1364:           pc1[j] = rtmps1[idx];
1365:         }
1366:         break;
1367: 
1368:       case 2:
1369:         for  (j=0; j<nz; j++) {
1370:           idx         = bjtmp[j];
1371:           rtmps1[idx] = 0.0;
1372:           rtmps2[idx] = 0.0;
1373:         }
1374: 
1375:         /* load in initial (unfactored row) */
1376:         idx   = r[row];
1377:         nz    = ai[idx+1] - ai[idx];
1378:         ajtmp = aj + ai[idx];
1379:         v1    = aa + ai[idx];
1380:         v2    = aa + ai[idx+1];
1381: 
1382:         for (j=0; j<nz; j++) {
1383:           idx        = ics[ajtmp[j]];
1384:           rtmp1[idx] = v1[j];
1385:           rtmp2[idx] = v2[j];
1386:           if (ndamp && ajtmp[j] == r[row]) {
1387:             rtmp1[idx] += damping;
1388:           }
1389:           if (ndamp && ajtmp[j] == r[row+1]) {
1390:             rtmp2[idx] += damping;
1391:           }
1392:         }
1393:         prow = *bjtmp++ ;
1394:         while (prow < row) {
1395:           pc1 = rtmp1 + prow;
1396:           pc2 = rtmp2 + prow;
1397:           if (*pc1 != 0.0 || *pc2 != 0.0){
1398:             pv   = ba + bd[prow];
1399:             pj   = nbj + bd[prow];
1400:             mul1 = *pc1 * *pv;
1401:             mul2 = *pc2 * *pv;
1402:             ++pv;
1403:             *pc1 = mul1;
1404:             *pc2 = mul2;
1405: 
1406:             nz   = bi[prow+1] - bd[prow] - 1;
1407:             PetscLogFlops(2*2*nz);
1408:             for (j=0; j<nz; j++) {
1409:               tmp = pv[j];
1410:               idx = pj[j];
1411:               rtmps1[idx] -= mul1 * tmp;
1412:               rtmps2[idx] -= mul2 * tmp;
1413:             }
1414:           }
1415:           prow = *bjtmp++ ;
1416:         }
1417:         /* Now take care of the odd element*/
1418:         pc1 = rtmp1 + prow;
1419:         pc2 = rtmp2 + prow;
1420:         if (*pc2 != 0.0){
1421:           pj   = nbj + bd[prow];
1422:           if (PetscAbsScalar(*pc1) < zeropivot) {
1423:             if (damping) {
1424:               if (ndamp) damping *= 2.0;
1425:               damp = PETSC_TRUE;
1426:               ndamp++;
1427:               goto endofwhile;
1428:             } else {
1429:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1430:             }
1431:           }
1432:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1433:           *pc2 = mul2;
1434:           nz   = bi[prow+1] - bd[prow] - 1;
1435:           PetscLogFlops(2*nz);
1436:           for (j=0; j<nz; j++) {
1437:             idx = pj[j] ;
1438:             tmp = rtmp1[idx];
1439:             rtmp2[idx] -= mul2 * tmp;
1440:           }
1441:         }
1442: 
1443:         nz  = bi[row+1] - bi[row];
1444:         pj  = bj + bi[row];
1445:         pc1 = ba + bi[row];
1446:         pc2 = ba + bi[row+1];
1447:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1448:           if (damping) {
1449:             if (ndamp) damping *= 2.0;
1450:             damp = PETSC_TRUE;
1451:             ndamp++;
1452:             goto endofwhile;
1453:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1454:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1455:           } else {
1456:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1457:           }
1458:         }
1459:         rtmp1[row]   = 1.0/rtmp1[row];
1460:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1461:         for (j=0; j<nz; j++) {
1462:           idx    = pj[j];
1463:           pc1[j] = rtmps1[idx];
1464:           pc2[j] = rtmps2[idx];
1465:         }
1466:         break;

1468:       case 3:
1469:         for  (j=0; j<nz; j++) {
1470:           idx         = bjtmp[j];
1471:           rtmps1[idx] = 0.0;
1472:           rtmps2[idx] = 0.0;
1473:           rtmps3[idx] = 0.0;
1474:         }
1475:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1476:         idx   = r[row];
1477:         nz    = ai[idx+1] - ai[idx];
1478:         ajtmp = aj + ai[idx];
1479:         v1    = aa + ai[idx];
1480:         v2    = aa + ai[idx+1];
1481:         v3    = aa + ai[idx+2];
1482:         for (j=0; j<nz; j++) {
1483:           idx        = ics[ajtmp[j]];
1484:           rtmp1[idx] = v1[j];
1485:           rtmp2[idx] = v2[j];
1486:           rtmp3[idx] = v3[j];
1487:           if (ndamp && ajtmp[j] == r[row]) {
1488:             rtmp1[idx] += damping;
1489:           }
1490:           if (ndamp && ajtmp[j] == r[row+1]) {
1491:             rtmp2[idx] += damping;
1492:           }
1493:           if (ndamp && ajtmp[j] == r[row+2]) {
1494:             rtmp3[idx] += damping;
1495:           }
1496:         }
1497:         /* loop over all pivot row blocks above this row block */
1498:         prow = *bjtmp++ ;
1499:         while (prow < row) {
1500:           pc1 = rtmp1 + prow;
1501:           pc2 = rtmp2 + prow;
1502:           pc3 = rtmp3 + prow;
1503:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1504:             pv   = ba  + bd[prow];
1505:             pj   = nbj + bd[prow];
1506:             mul1 = *pc1 * *pv;
1507:             mul2 = *pc2 * *pv;
1508:             mul3 = *pc3 * *pv;
1509:             ++pv;
1510:             *pc1 = mul1;
1511:             *pc2 = mul2;
1512:             *pc3 = mul3;
1513: 
1514:             nz   = bi[prow+1] - bd[prow] - 1;
1515:             PetscLogFlops(3*2*nz);
1516:             /* update this row based on pivot row */
1517:             for (j=0; j<nz; j++) {
1518:               tmp = pv[j];
1519:               idx = pj[j];
1520:               rtmps1[idx] -= mul1 * tmp;
1521:               rtmps2[idx] -= mul2 * tmp;
1522:               rtmps3[idx] -= mul3 * tmp;
1523:             }
1524:           }
1525:           prow = *bjtmp++ ;
1526:         }
1527:         /* Now take care of diagonal block in this set of rows */
1528:         pc1 = rtmp1 + prow;
1529:         pc2 = rtmp2 + prow;
1530:         pc3 = rtmp3 + prow;
1531:         if (*pc2 != 0.0 || *pc3 != 0.0){
1532:           pj   = nbj + bd[prow];
1533:           if (PetscAbsScalar(*pc1) < zeropivot) {
1534:             if (damping) {
1535:               if (ndamp) damping *= 2.0;
1536:               damp = PETSC_TRUE;
1537:               ndamp++;
1538:               goto endofwhile;
1539:             } else {
1540:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1541:             }
1542:           }
1543:           mul2 = (*pc2)/(*pc1);
1544:           mul3 = (*pc3)/(*pc1);
1545:           *pc2 = mul2;
1546:           *pc3 = mul3;
1547:           nz   = bi[prow+1] - bd[prow] - 1;
1548:           PetscLogFlops(2*2*nz);
1549:           for (j=0; j<nz; j++) {
1550:             idx = pj[j] ;
1551:             tmp = rtmp1[idx];
1552:             rtmp2[idx] -= mul2 * tmp;
1553:             rtmp3[idx] -= mul3 * tmp;
1554:           }
1555:         }
1556:         ++prow;
1557:         pc2 = rtmp2 + prow;
1558:         pc3 = rtmp3 + prow;
1559:         if (*pc3 != 0.0){
1560:           pj   = nbj + bd[prow];
1561:           pj   = nbj + bd[prow];
1562:           if (PetscAbsScalar(*pc2) < zeropivot) {
1563:             if (damping) {
1564:               if (ndamp) damping *= 2.0;
1565:               damp = PETSC_TRUE;
1566:               ndamp++;
1567:               goto endofwhile;
1568:             } else {
1569:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1570:             }
1571:           }
1572:           mul3 = (*pc3)/(*pc2);
1573:           *pc3 = mul3;
1574:           nz   = bi[prow+1] - bd[prow] - 1;
1575:           PetscLogFlops(2*2*nz);
1576:           for (j=0; j<nz; j++) {
1577:             idx = pj[j] ;
1578:             tmp = rtmp2[idx];
1579:             rtmp3[idx] -= mul3 * tmp;
1580:           }
1581:         }
1582:         nz  = bi[row+1] - bi[row];
1583:         pj  = bj + bi[row];
1584:         pc1 = ba + bi[row];
1585:         pc2 = ba + bi[row+1];
1586:         pc3 = ba + bi[row+2];
1587:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1588:           if (damping) {
1589:             if (ndamp) damping *= 2.0;
1590:             damp = PETSC_TRUE;
1591:             ndamp++;
1592:             goto endofwhile;
1593:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1594:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1595:           } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1596:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1597:         } else {
1598:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1599:           }
1600:         }
1601:         rtmp1[row]   = 1.0/rtmp1[row];
1602:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1603:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1604:         /* copy row entries from dense representation to sparse */
1605:         for (j=0; j<nz; j++) {
1606:           idx    = pj[j];
1607:           pc1[j] = rtmps1[idx];
1608:           pc2[j] = rtmps2[idx];
1609:           pc3[j] = rtmps3[idx];
1610:         }
1611:         break;

1613:       default:
1614:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1615:       }
1616:       row += nsz;                 /* Update the row */
1617:     }
1618:     endofwhile:;
1619:   } while (damp);
1620:   PetscFree(rtmp1);
1621:   PetscFree(tmp_vec2);
1622:   ISRestoreIndices(isicol,&ic);
1623:   ISRestoreIndices(isrow,&r);
1624:   ISRestoreIndices(iscol,&c);
1625:   C->factor      = FACTOR_LU;
1626:   C->assembled   = PETSC_TRUE;
1627:   if (ndamp || b->lu_damping) {
1628:     PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %D damping value %g\n",ndamp,damping);
1629:   }
1630:   PetscLogFlops(C->n);
1631:   return(0);
1632: }

1634: /*
1635:      This is really ugly. if inodes are used this replaces the 
1636:   permutations with ones that correspond to rows/cols of the matrix
1637:   rather then inode blocks
1638: */
1641: PetscErrorCode MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1642: {
1643:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

1646:   PetscObjectQueryFunction((PetscObject)A,"MatAdjustForInodes_C",(void (**)(void))&f);
1647:   if (f) {
1648:     (*f)(A,rperm,cperm);
1649:   }
1650:   return(0);
1651: }

1656: PetscErrorCode MatAdjustForInodes_SeqAIJ(Mat A,IS *rperm,IS *cperm)
1657: {
1658:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1660:   PetscInt       m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1661:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1662:   PetscInt       nslim_col,*ns_col;
1663:   IS             ris = *rperm,cis = *cperm;

1666:   if (!a->inode.size) return(0); /* no inodes so return */
1667:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1669:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1670:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1671:   PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1672:   permc = permr + m;

1674:   ISGetIndices(ris,&ridx);
1675:   ISGetIndices(cis,&cidx);

1677:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1678:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1680:   /* Construct the permutations for rows*/
1681:   for (i=0,row = 0; i<nslim_row; ++i){
1682:     indx      = ridx[i];
1683:     start_val = tns[indx];
1684:     end_val   = tns[indx + 1];
1685:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1686:   }

1688:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1689:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1691:  /* Construct permutations for columns */
1692:   for (i=0,col=0; i<nslim_col; ++i){
1693:     indx      = cidx[i];
1694:     start_val = tns[indx];
1695:     end_val   = tns[indx + 1];
1696:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1697:   }

1699:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1700:   ISSetPermutation(*rperm);
1701:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1702:   ISSetPermutation(*cperm);
1703: 
1704:   ISRestoreIndices(ris,&ridx);
1705:   ISRestoreIndices(cis,&cidx);

1707:   PetscFree(ns_col);
1708:   PetscFree(permr);
1709:   ISDestroy(cis);
1710:   ISDestroy(ris);
1711:   PetscFree(tns);
1712:   return(0);
1713: }

1718: /*@C
1719:    MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.

1721:    Collective on Mat

1723:    Input Parameter:
1724: .  A - the SeqAIJ matrix.

1726:    Output Parameter:
1727: +  node_count - no of inodes present in the matrix.
1728: .  sizes      - an array of size node_count,with sizes of each inode.
1729: -  limit      - the max size used to generate the inodes.

1731:    Level: advanced

1733:    Notes: This routine returns some internal storage information
1734:    of the matrix, it is intended to be used by advanced users.
1735:    It should be called after the matrix is assembled.
1736:    The contents of the sizes[] array should not be changed.

1738: .keywords: matrix, seqaij, get, inode

1740: .seealso: MatGetInfo()
1741: @*/
1742: PetscErrorCode MatSeqAIJGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1743: {
1744:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

1747:   PetscObjectQueryFunction((PetscObject)A,"MatSeqAIJGetInodeSizes_C",(void (**)(void))&f);
1748:   if (f) {
1749:     (*f)(A,node_count,sizes,limit);
1750:   }
1751:   return(0);
1752: }

1757: PetscErrorCode MatSeqAIJGetInodeSizes_SeqAIJ(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1758: {
1759:   Mat_SeqAIJ *a;

1762:   a           = (Mat_SeqAIJ*)A->data;
1763:   *node_count = a->inode.node_count;
1764:   *sizes      = a->inode.size;
1765:   *limit      = a->inode.limit;
1766:   return(0);
1767: }