Actual source code: aijnode.c

  1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
  2: /*
  3:   This file provides high performance routines for the AIJ (compressed row)
  4:   format by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
 6:  #include src/mat/impls/aij/seq/aij.h
 7:  #include src/vec/vecimpl.h

  9: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
 10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
 11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);

 13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
 14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
 15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
 16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
 17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);
 20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int*[],int*[],PetscTruth*);

 24: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
 25: {
 26:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 27:   int        ierr,i,count,m,n,min_mn,*ns_row,*ns_col;

 30:   n      = A->n;
 31:   m      = A->m;
 32:   ns_row = a->inode.size;
 33: 
 34:   min_mn = (m < n) ? m : n;
 35:   if (!ns) {
 36:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 37:     for(; count+1 < n; count++,i++);
 38:     if (count < n)  {
 39:       i++;
 40:     }
 41:     *size = i;
 42:     return(0);
 43:   }
 44:   PetscMalloc((n+1)*sizeof(int),&ns_col);
 45: 
 46:   /* Use the same row structure wherever feasible. */
 47:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 48:     ns_col[i] = ns_row[i];
 49:   }

 51:   /* if m < n; pad up the remainder with inode_limit */
 52:   for(; count+1 < n; count++,i++) {
 53:     ns_col[i] = 1;
 54:   }
 55:   /* The last node is the odd ball. padd it up with the remaining rows; */
 56:   if (count < n)  {
 57:     ns_col[i] = n - count;
 58:     i++;
 59:   } else if (count > n) {
 60:     /* Adjust for the over estimation */
 61:     ns_col[i-1] += n - count;
 62:   }
 63:   *size = i;
 64:   *ns   = ns_col;
 65:   return(0);
 66: }


 69: /*
 70:       This builds symmetric version of nonzero structure,
 71: */
 74: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
 75: {
 76:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 77:   int        *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
 78:   int        *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 81:   nslim_row = a->inode.node_count;
 82:   m         = A->m;
 83:   n         = A->n;
 84:   if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
 85: 
 86:   /* Use the row_inode as column_inode */
 87:   nslim_col = nslim_row;
 88:   ns_col    = ns_row;

 90:   /* allocate space for reformated inode structure */
 91:   PetscMalloc((nslim_col+1)*sizeof(int),&tns);
 92:   PetscMalloc((n+1)*sizeof(int),&tvc);
 93:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 95:   for (i1=0,col=0; i1<nslim_col; ++i1){
 96:     nsz = ns_col[i1];
 97:     for (i2=0; i2<nsz; ++i2,++col)
 98:       tvc[col] = i1;
 99:   }
100:   /* allocate space for row pointers */
101:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
102:   *iia = ia;
103:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
104:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

106:   /* determine the number of columns in each row */
107:   ia[0] = oshift;
108:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

110:     j    = aj + ai[row] + ishift;
111:     jmax = aj + ai[row+1] + ishift;
112:     i2   = 0;
113:     col  = *j++ + ishift;
114:     i2   = tvc[col];
115:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
116:       ia[i1+1]++;
117:       ia[i2+1]++;
118:       i2++;                     /* Start col of next node */
119:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
120:       i2 = tvc[col];
121:     }
122:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
123:   }

125:   /* shift ia[i] to point to next row */
126:   for (i1=1; i1<nslim_row+1; i1++) {
127:     row        = ia[i1-1];
128:     ia[i1]    += row;
129:     work[i1-1] = row - oshift;
130:   }

132:   /* allocate space for column pointers */
133:   nz   = ia[nslim_row] + (!ishift);
134:   PetscMalloc(nz*sizeof(int),&ja);
135:   *jja = ja;

137:  /* loop over lower triangular part putting into ja */
138:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
139:     j    = aj + ai[row] + ishift;
140:     jmax = aj + ai[row+1] + ishift;
141:     i2   = 0;                     /* Col inode index */
142:     col  = *j++ + ishift;
143:     i2   = tvc[col];
144:     while (i2<i1 && j<jmax) {
145:       ja[work[i2]++] = i1 + oshift;
146:       ja[work[i1]++] = i2 + oshift;
147:       ++i2;
148:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
149:       i2 = tvc[col];
150:     }
151:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

153:   }
154:   PetscFree(work);
155:   PetscFree(tns);
156:   PetscFree(tvc);
157:   return(0);
158: }

160: /*
161:       This builds nonsymmetric version of nonzero structure,
162: */
165: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
166: {
167:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
168:   int        *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
169:   int        *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

172:   nslim_row = a->inode.node_count;
173:   n         = A->n;

175:   /* Create The column_inode for this matrix */
176:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
177: 
178:   /* allocate space for reformated column_inode structure */
179:   PetscMalloc((nslim_col +1)*sizeof(int),&tns);
180:   PetscMalloc((n +1)*sizeof(int),&tvc);
181:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

183:   for (i1=0,col=0; i1<nslim_col; ++i1){
184:     nsz = ns_col[i1];
185:     for (i2=0; i2<nsz; ++i2,++col)
186:       tvc[col] = i1;
187:   }
188:   /* allocate space for row pointers */
189:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
190:   *iia = ia;
191:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
192:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

194:   /* determine the number of columns in each row */
195:   ia[0] = oshift;
196:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
197:     j   = aj + ai[row] + ishift;
198:     col = *j++ + ishift;
199:     i2  = tvc[col];
200:     nz  = ai[row+1] - ai[row];
201:     while (nz-- > 0) {           /* off-diagonal elemets */
202:       ia[i1+1]++;
203:       i2++;                     /* Start col of next node */
204:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
205:       i2 = tvc[col];
206:     }
207:   }

209:   /* shift ia[i] to point to next row */
210:   for (i1=1; i1<nslim_row+1; i1++) {
211:     row        = ia[i1-1];
212:     ia[i1]    += row;
213:     work[i1-1] = row - oshift;
214:   }

216:   /* allocate space for column pointers */
217:   nz   = ia[nslim_row] + (!ishift);
218:   PetscMalloc(nz*sizeof(int),&ja);
219:   *jja = ja;

221:  /* loop over matrix putting into ja */
222:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
223:     j   = aj + ai[row] + ishift;
224:     i2  = 0;                     /* Col inode index */
225:     col = *j++ + ishift;
226:     i2  = tvc[col];
227:     nz  = ai[row+1] - ai[row];
228:     while (nz-- > 0) {
229:       ja[work[i1]++] = i2 + oshift;
230:       ++i2;
231:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
232:       i2 = tvc[col];
233:     }
234:   }
235:   PetscFree(ns_col);
236:   PetscFree(work);
237:   PetscFree(tns);
238:   PetscFree(tvc);
239:   return(0);
240: }

244: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
245: {
246:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
247:   int        ierr;

250:   *n     = a->inode.node_count;
251:   if (!ia) return(0);

253:   if (symmetric) {
254:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
255:   } else {
256:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
257:   }
258:   return(0);
259: }

263: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
264: {

268:   if (!ia) return(0);
269:   PetscFree(*ia);
270:   PetscFree(*ja);
271:   return(0);
272: }

274: /* ----------------------------------------------------------- */

278: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int *iia[],int *jja[],int ishift,int oshift)
279: {
280:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
281:   int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
282:   int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

285:   nslim_row = a->inode.node_count;
286:   n         = A->n;

288:   /* Create The column_inode for this matrix */
289:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
290: 
291:   /* allocate space for reformated column_inode structure */
292:   PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
293:   PetscMalloc((n + 1)*sizeof(int),&tvc);
294:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

296:   for (i1=0,col=0; i1<nslim_col; ++i1){
297:     nsz = ns_col[i1];
298:     for (i2=0; i2<nsz; ++i2,++col)
299:       tvc[col] = i1;
300:   }
301:   /* allocate space for column pointers */
302:   PetscMalloc((nslim_col+1)*sizeof(int),&ia);
303:   *iia = ia;
304:   PetscMemzero(ia,(nslim_col+1)*sizeof(int));
305:   PetscMalloc((nslim_col+1)*sizeof(int),&work);

307:   /* determine the number of columns in each row */
308:   ia[0] = oshift;
309:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
310:     j   = aj + ai[row] + ishift;
311:     col = *j++ + ishift;
312:     i2  = tvc[col];
313:     nz  = ai[row+1] - ai[row];
314:     while (nz-- > 0) {           /* off-diagonal elemets */
315:       /* ia[i1+1]++; */
316:       ia[i2+1]++;
317:       i2++;
318:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
319:       i2 = tvc[col];
320:     }
321:   }

323:   /* shift ia[i] to point to next col */
324:   for (i1=1; i1<nslim_col+1; i1++) {
325:     col        = ia[i1-1];
326:     ia[i1]    += col;
327:     work[i1-1] = col - oshift;
328:   }

330:   /* allocate space for column pointers */
331:   nz   = ia[nslim_col] + (!ishift);
332:   PetscMalloc(nz*sizeof(int),&ja);
333:   *jja = ja;

335:  /* loop over matrix putting into ja */
336:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
337:     j   = aj + ai[row] + ishift;
338:     i2  = 0;                     /* Col inode index */
339:     col = *j++ + ishift;
340:     i2  = tvc[col];
341:     nz  = ai[row+1] - ai[row];
342:     while (nz-- > 0) {
343:       /* ja[work[i1]++] = i2 + oshift; */
344:       ja[work[i2]++] = i1 + oshift;
345:       i2++;
346:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
347:       i2 = tvc[col];
348:     }
349:   }
350:   PetscFree(ns_col);
351:   PetscFree(work);
352:   PetscFree(tns);
353:   PetscFree(tvc);
354:   return(0);
355: }

359: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
360: {

364:   Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
365:   if (!ia) return(0);

367:   if (symmetric) {
368:     /* Since the indices are symmetric it does'nt matter */
369:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370:   } else {
371:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372:   }
373:   return(0);
374: }

378: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int *ia[],int *ja[],PetscTruth *done)
379: {

383:   if (!ia) return(0);
384:   PetscFree(*ia);
385:   PetscFree(*ja);
386:   return(0);
387: }

389: /* ----------------------------------------------------------- */

393: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
394: {
395:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
396:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
397:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y;
398:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
399: 
400: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
401: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
402: #endif

405:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
406:   node_max = a->inode.node_count;
407:   ns       = a->inode.size;     /* Node Size array */
408:   VecGetArray(xx,&x);
409:   VecGetArray(yy,&y);
410:   idx  = a->j;
411:   v1   = a->a;
412:   ii   = a->i;

414:   for (i = 0,row = 0; i< node_max; ++i){
415:     nsz  = ns[i];
416:     n    = ii[1] - ii[0];
417:     ii  += nsz;
418:     sz   = n;                   /* No of non zeros in this row */
419:                                 /* Switch on the size of Node */
420:     switch (nsz){               /* Each loop in 'case' is unrolled */
421:     case 1 :
422:       sum1  = 0;
423: 
424:       for(n = 0; n< sz-1; n+=2) {
425:         i1   = idx[0];          /* The instructions are ordered to */
426:         i2   = idx[1];          /* make the compiler's job easy */
427:         idx += 2;
428:         tmp0 = x[i1];
429:         tmp1 = x[i2];
430:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
431:        }
432: 
433:       if (n == sz-1){          /* Take care of the last nonzero  */
434:         tmp0  = x[*idx++];
435:         sum1 += *v1++ * tmp0;
436:       }
437:       y[row++]=sum1;
438:       break;
439:     case 2:
440:       sum1  = 0;
441:       sum2  = 0;
442:       v2    = v1 + n;
443: 
444:       for (n = 0; n< sz-1; n+=2) {
445:         i1   = idx[0];
446:         i2   = idx[1];
447:         idx += 2;
448:         tmp0 = x[i1];
449:         tmp1 = x[i2];
450:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
451:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
452:       }
453:       if (n == sz-1){
454:         tmp0  = x[*idx++];
455:         sum1 += *v1++ * tmp0;
456:         sum2 += *v2++ * tmp0;
457:       }
458:       y[row++]=sum1;
459:       y[row++]=sum2;
460:       v1      =v2;              /* Since the next block to be processed starts there*/
461:       idx    +=sz;
462:       break;
463:     case 3:
464:       sum1  = 0;
465:       sum2  = 0;
466:       sum3  = 0;
467:       v2    = v1 + n;
468:       v3    = v2 + n;
469: 
470:       for (n = 0; n< sz-1; n+=2) {
471:         i1   = idx[0];
472:         i2   = idx[1];
473:         idx += 2;
474:         tmp0 = x[i1];
475:         tmp1 = x[i2];
476:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
477:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
478:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
479:       }
480:       if (n == sz-1){
481:         tmp0  = x[*idx++];
482:         sum1 += *v1++ * tmp0;
483:         sum2 += *v2++ * tmp0;
484:         sum3 += *v3++ * tmp0;
485:       }
486:       y[row++]=sum1;
487:       y[row++]=sum2;
488:       y[row++]=sum3;
489:       v1       =v3;             /* Since the next block to be processed starts there*/
490:       idx     +=2*sz;
491:       break;
492:     case 4:
493:       sum1  = 0;
494:       sum2  = 0;
495:       sum3  = 0;
496:       sum4  = 0;
497:       v2    = v1 + n;
498:       v3    = v2 + n;
499:       v4    = v3 + n;
500: 
501:       for (n = 0; n< sz-1; n+=2) {
502:         i1   = idx[0];
503:         i2   = idx[1];
504:         idx += 2;
505:         tmp0 = x[i1];
506:         tmp1 = x[i2];
507:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
508:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
509:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
510:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
511:       }
512:       if (n == sz-1){
513:         tmp0  = x[*idx++];
514:         sum1 += *v1++ * tmp0;
515:         sum2 += *v2++ * tmp0;
516:         sum3 += *v3++ * tmp0;
517:         sum4 += *v4++ * tmp0;
518:       }
519:       y[row++]=sum1;
520:       y[row++]=sum2;
521:       y[row++]=sum3;
522:       y[row++]=sum4;
523:       v1      =v4;              /* Since the next block to be processed starts there*/
524:       idx    +=3*sz;
525:       break;
526:     case 5:
527:       sum1  = 0;
528:       sum2  = 0;
529:       sum3  = 0;
530:       sum4  = 0;
531:       sum5  = 0;
532:       v2    = v1 + n;
533:       v3    = v2 + n;
534:       v4    = v3 + n;
535:       v5    = v4 + n;
536: 
537:       for (n = 0; n<sz-1; n+=2) {
538:         i1   = idx[0];
539:         i2   = idx[1];
540:         idx += 2;
541:         tmp0 = x[i1];
542:         tmp1 = x[i2];
543:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
544:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
545:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
546:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
547:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
548:       }
549:       if (n == sz-1){
550:         tmp0  = x[*idx++];
551:         sum1 += *v1++ * tmp0;
552:         sum2 += *v2++ * tmp0;
553:         sum3 += *v3++ * tmp0;
554:         sum4 += *v4++ * tmp0;
555:         sum5 += *v5++ * tmp0;
556:       }
557:       y[row++]=sum1;
558:       y[row++]=sum2;
559:       y[row++]=sum3;
560:       y[row++]=sum4;
561:       y[row++]=sum5;
562:       v1      =v5;       /* Since the next block to be processed starts there */
563:       idx    +=4*sz;
564:       break;
565:     default :
566:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
567:     }
568:   }
569:   VecRestoreArray(xx,&x);
570:   VecRestoreArray(yy,&y);
571:   PetscLogFlops(2*a->nz - A->m);
572:   return(0);
573: }
574: /* ----------------------------------------------------------- */
575: /* Almost same code as the MatMult_SeqAij_Inode() */
578: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
579: {
580:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
581:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
582:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
583:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
584: 
586:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
587:   node_max = a->inode.node_count;
588:   ns       = a->inode.size;     /* Node Size array */
589:   VecGetArray(xx,&x);
590:   VecGetArray(yy,&y);
591:   if (zz != yy) {
592:     VecGetArray(zz,&z);
593:   } else {
594:     z = y;
595:   }
596:   zt = z;

598:   idx  = a->j;
599:   v1   = a->a;
600:   ii   = a->i;

602:   for (i = 0,row = 0; i< node_max; ++i){
603:     nsz  = ns[i];
604:     n    = ii[1] - ii[0];
605:     ii  += nsz;
606:     sz   = n;                   /* No of non zeros in this row */
607:                                 /* Switch on the size of Node */
608:     switch (nsz){               /* Each loop in 'case' is unrolled */
609:     case 1 :
610:       sum1  = *zt++;
611: 
612:       for(n = 0; n< sz-1; n+=2) {
613:         i1   = idx[0];          /* The instructions are ordered to */
614:         i2   = idx[1];          /* make the compiler's job easy */
615:         idx += 2;
616:         tmp0 = x[i1];
617:         tmp1 = x[i2];
618:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
619:        }
620: 
621:       if(n   == sz-1){          /* Take care of the last nonzero  */
622:         tmp0  = x[*idx++];
623:         sum1 += *v1++ * tmp0;
624:       }
625:       y[row++]=sum1;
626:       break;
627:     case 2:
628:       sum1  = *zt++;
629:       sum2  = *zt++;
630:       v2    = v1 + n;
631: 
632:       for(n = 0; n< sz-1; n+=2) {
633:         i1   = idx[0];
634:         i2   = idx[1];
635:         idx += 2;
636:         tmp0 = x[i1];
637:         tmp1 = x[i2];
638:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
639:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
640:       }
641:       if(n   == sz-1){
642:         tmp0  = x[*idx++];
643:         sum1 += *v1++ * tmp0;
644:         sum2 += *v2++ * tmp0;
645:       }
646:       y[row++]=sum1;
647:       y[row++]=sum2;
648:       v1      =v2;              /* Since the next block to be processed starts there*/
649:       idx    +=sz;
650:       break;
651:     case 3:
652:       sum1  = *zt++;
653:       sum2  = *zt++;
654:       sum3  = *zt++;
655:       v2    = v1 + n;
656:       v3    = v2 + n;
657: 
658:       for (n = 0; n< sz-1; n+=2) {
659:         i1   = idx[0];
660:         i2   = idx[1];
661:         idx += 2;
662:         tmp0 = x[i1];
663:         tmp1 = x[i2];
664:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
665:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
666:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
667:       }
668:       if (n == sz-1){
669:         tmp0  = x[*idx++];
670:         sum1 += *v1++ * tmp0;
671:         sum2 += *v2++ * tmp0;
672:         sum3 += *v3++ * tmp0;
673:       }
674:       y[row++]=sum1;
675:       y[row++]=sum2;
676:       y[row++]=sum3;
677:       v1       =v3;             /* Since the next block to be processed starts there*/
678:       idx     +=2*sz;
679:       break;
680:     case 4:
681:       sum1  = *zt++;
682:       sum2  = *zt++;
683:       sum3  = *zt++;
684:       sum4  = *zt++;
685:       v2    = v1 + n;
686:       v3    = v2 + n;
687:       v4    = v3 + n;
688: 
689:       for (n = 0; n< sz-1; n+=2) {
690:         i1   = idx[0];
691:         i2   = idx[1];
692:         idx += 2;
693:         tmp0 = x[i1];
694:         tmp1 = x[i2];
695:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
696:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
697:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
698:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
699:       }
700:       if (n == sz-1){
701:         tmp0  = x[*idx++];
702:         sum1 += *v1++ * tmp0;
703:         sum2 += *v2++ * tmp0;
704:         sum3 += *v3++ * tmp0;
705:         sum4 += *v4++ * tmp0;
706:       }
707:       y[row++]=sum1;
708:       y[row++]=sum2;
709:       y[row++]=sum3;
710:       y[row++]=sum4;
711:       v1      =v4;              /* Since the next block to be processed starts there*/
712:       idx    +=3*sz;
713:       break;
714:     case 5:
715:       sum1  = *zt++;
716:       sum2  = *zt++;
717:       sum3  = *zt++;
718:       sum4  = *zt++;
719:       sum5  = *zt++;
720:       v2    = v1 + n;
721:       v3    = v2 + n;
722:       v4    = v3 + n;
723:       v5    = v4 + n;
724: 
725:       for (n = 0; n<sz-1; n+=2) {
726:         i1   = idx[0];
727:         i2   = idx[1];
728:         idx += 2;
729:         tmp0 = x[i1];
730:         tmp1 = x[i2];
731:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
732:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
733:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
734:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
735:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
736:       }
737:       if(n   == sz-1){
738:         tmp0  = x[*idx++];
739:         sum1 += *v1++ * tmp0;
740:         sum2 += *v2++ * tmp0;
741:         sum3 += *v3++ * tmp0;
742:         sum4 += *v4++ * tmp0;
743:         sum5 += *v5++ * tmp0;
744:       }
745:       y[row++]=sum1;
746:       y[row++]=sum2;
747:       y[row++]=sum3;
748:       y[row++]=sum4;
749:       y[row++]=sum5;
750:       v1      =v5;       /* Since the next block to be processed starts there */
751:       idx    +=4*sz;
752:       break;
753:     default :
754:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
755:     }
756:   }
757:   VecRestoreArray(xx,&x);
758:   VecRestoreArray(yy,&y);
759:   if (zz != yy) {
760:     VecRestoreArray(zz,&z);
761:   }
762:   PetscLogFlops(2*a->nz);
763:   return(0);
764: }
765: /* ----------------------------------------------------------- */
766: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,const ISColoringValue[],ISColoring *);

768: /*
769:     samestructure indicates that the matrix has not changed its nonzero structure so we 
770:     do not need to recompute the inodes 
771: */
774: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
775: {
776:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
777:   int        ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
778:   PetscTruth flag,flg;

781:   if (samestructure && a->inode.checked) return(0);

783:   a->inode.checked = PETSC_TRUE;

785:   /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
786:   if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
787:   PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
788:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inode\n");return(0);}
789:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
790:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unroll\n");return(0);}
791:   PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
792:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
793:   m = A->m;
794:   if (a->inode.size) {ns = a->inode.size;}
795:   else {PetscMalloc((m+1)*sizeof(int),&ns);}

797:   i          = 0;
798:   node_count = 0;
799:   idx        = a->j;
800:   ii         = a->i;
801:   while (i < m){                /* For each row */
802:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
803:     /* Limits the number of elements in a node to 'a->inode.limit' */
804:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
805:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
806:       if (nzy != nzx) break;
807:       idy  += nzx;             /* Same nonzero pattern */
808:       PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
809:       if (flag == PETSC_FALSE) break;
810:     }
811:     ns[node_count++] = blk_size;
812:     idx += blk_size*nzx;
813:     i    = j;
814:   }
815:   /* If not enough inodes found,, do not use inode version of the routines */
816:   if (!a->inode.size && m && node_count > 0.9*m) {
817:     PetscFree(ns);
818:     A->ops->mult            = MatMult_SeqAIJ;
819:     A->ops->multadd         = MatMultAdd_SeqAIJ;
820:     A->ops->solve           = MatSolve_SeqAIJ;
821:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
822:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ;
823:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ;
824:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ;
825:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
826:     A->ops->coloringpatch   = 0;
827:     a->inode.node_count     = 0;
828:     a->inode.size           = 0;
829:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routines\n",node_count,m);
830:   } else {
831:     A->ops->mult            = MatMult_SeqAIJ_Inode;
832:     A->ops->multadd         = MatMultAdd_SeqAIJ_Inode;
833:     A->ops->solve           = MatSolve_SeqAIJ_Inode;
834:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
835:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
836:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
837:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
838:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
839:     A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
840:     a->inode.node_count     = node_count;
841:     a->inode.size           = ns;
842:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routines\n",node_count,m,a->inode.limit);
843:   }
844:   return(0);
845: }

847: /* ----------------------------------------------------------- */
850: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
851: {
852:   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
853:   IS          iscol = a->col,isrow = a->row;
854:   int         *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
855:   int         node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
856:   PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
857:   PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

860:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
861:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
862:   node_max = a->inode.node_count;
863:   ns       = a->inode.size;     /* Node Size array */

865:   VecGetArray(bb,&b);
866:   VecGetArray(xx,&x);
867:   tmp  = a->solve_work;
868: 
869:   ISGetIndices(isrow,&rout); r = rout;
870:   ISGetIndices(iscol,&cout); c = cout + (n-1);
871: 
872:   /* forward solve the lower triangular */
873:   tmps = tmp ;
874:   aa   = a_a ;
875:   aj   = a_j ;
876:   ad   = a->diag;

878:   for (i = 0,row = 0; i< node_max; ++i){
879:     nsz = ns[i];
880:     aii = ai[row];
881:     v1  = aa + aii;
882:     vi  = aj + aii;
883:     nz  = ad[row]- aii;
884: 
885:     switch (nsz){               /* Each loop in 'case' is unrolled */
886:     case 1 :
887:       sum1 = b[*r++];
888:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
889:       for(j=0; j<nz-1; j+=2){
890:         i0   = vi[0];
891:         i1   = vi[1];
892:         vi  +=2;
893:         tmp0 = tmps[i0];
894:         tmp1 = tmps[i1];
895:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
896:       }
897:       if(j == nz-1){
898:         tmp0 = tmps[*vi++];
899:         sum1 -= *v1++ *tmp0;
900:       }
901:       tmp[row ++]=sum1;
902:       break;
903:     case 2:
904:       sum1 = b[*r++];
905:       sum2 = b[*r++];
906:       v2   = aa + ai[row+1];

908:       for(j=0; j<nz-1; j+=2){
909:         i0   = vi[0];
910:         i1   = vi[1];
911:         vi  +=2;
912:         tmp0 = tmps[i0];
913:         tmp1 = tmps[i1];
914:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
915:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
916:       }
917:       if(j == nz-1){
918:         tmp0 = tmps[*vi++];
919:         sum1 -= *v1++ *tmp0;
920:         sum2 -= *v2++ *tmp0;
921:       }
922:       sum2 -= *v2++ * sum1;
923:       tmp[row ++]=sum1;
924:       tmp[row ++]=sum2;
925:       break;
926:     case 3:
927:       sum1 = b[*r++];
928:       sum2 = b[*r++];
929:       sum3 = b[*r++];
930:       v2   = aa + ai[row+1];
931:       v3   = aa + ai[row+2];
932: 
933:       for (j=0; j<nz-1; j+=2){
934:         i0   = vi[0];
935:         i1   = vi[1];
936:         vi  +=2;
937:         tmp0 = tmps[i0];
938:         tmp1 = tmps[i1];
939:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942:       }
943:       if (j == nz-1){
944:         tmp0 = tmps[*vi++];
945:         sum1 -= *v1++ *tmp0;
946:         sum2 -= *v2++ *tmp0;
947:         sum3 -= *v3++ *tmp0;
948:       }
949:       sum2 -= *v2++ * sum1;
950:       sum3 -= *v3++ * sum1;
951:       sum3 -= *v3++ * sum2;
952:       tmp[row ++]=sum1;
953:       tmp[row ++]=sum2;
954:       tmp[row ++]=sum3;
955:       break;
956: 
957:     case 4:
958:       sum1 = b[*r++];
959:       sum2 = b[*r++];
960:       sum3 = b[*r++];
961:       sum4 = b[*r++];
962:       v2   = aa + ai[row+1];
963:       v3   = aa + ai[row+2];
964:       v4   = aa + ai[row+3];
965: 
966:       for (j=0; j<nz-1; j+=2){
967:         i0   = vi[0];
968:         i1   = vi[1];
969:         vi  +=2;
970:         tmp0 = tmps[i0];
971:         tmp1 = tmps[i1];
972:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
973:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
974:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
975:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
976:       }
977:       if (j == nz-1){
978:         tmp0 = tmps[*vi++];
979:         sum1 -= *v1++ *tmp0;
980:         sum2 -= *v2++ *tmp0;
981:         sum3 -= *v3++ *tmp0;
982:         sum4 -= *v4++ *tmp0;
983:       }
984:       sum2 -= *v2++ * sum1;
985:       sum3 -= *v3++ * sum1;
986:       sum4 -= *v4++ * sum1;
987:       sum3 -= *v3++ * sum2;
988:       sum4 -= *v4++ * sum2;
989:       sum4 -= *v4++ * sum3;
990: 
991:       tmp[row ++]=sum1;
992:       tmp[row ++]=sum2;
993:       tmp[row ++]=sum3;
994:       tmp[row ++]=sum4;
995:       break;
996:     case 5:
997:       sum1 = b[*r++];
998:       sum2 = b[*r++];
999:       sum3 = b[*r++];
1000:       sum4 = b[*r++];
1001:       sum5 = b[*r++];
1002:       v2   = aa + ai[row+1];
1003:       v3   = aa + ai[row+2];
1004:       v4   = aa + ai[row+3];
1005:       v5   = aa + ai[row+4];
1006: 
1007:       for (j=0; j<nz-1; j+=2){
1008:         i0   = vi[0];
1009:         i1   = vi[1];
1010:         vi  +=2;
1011:         tmp0 = tmps[i0];
1012:         tmp1 = tmps[i1];
1013:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1014:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1015:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1016:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1017:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1018:       }
1019:       if (j == nz-1){
1020:         tmp0 = tmps[*vi++];
1021:         sum1 -= *v1++ *tmp0;
1022:         sum2 -= *v2++ *tmp0;
1023:         sum3 -= *v3++ *tmp0;
1024:         sum4 -= *v4++ *tmp0;
1025:         sum5 -= *v5++ *tmp0;
1026:       }

1028:       sum2 -= *v2++ * sum1;
1029:       sum3 -= *v3++ * sum1;
1030:       sum4 -= *v4++ * sum1;
1031:       sum5 -= *v5++ * sum1;
1032:       sum3 -= *v3++ * sum2;
1033:       sum4 -= *v4++ * sum2;
1034:       sum5 -= *v5++ * sum2;
1035:       sum4 -= *v4++ * sum3;
1036:       sum5 -= *v5++ * sum3;
1037:       sum5 -= *v5++ * sum4;
1038: 
1039:       tmp[row ++]=sum1;
1040:       tmp[row ++]=sum2;
1041:       tmp[row ++]=sum3;
1042:       tmp[row ++]=sum4;
1043:       tmp[row ++]=sum5;
1044:       break;
1045:     default:
1046:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1047:     }
1048:   }
1049:   /* backward solve the upper triangular */
1050:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1051:     nsz = ns[i];
1052:     aii = ai[row+1] -1;
1053:     v1  = aa + aii;
1054:     vi  = aj + aii;
1055:     nz  = aii- ad[row];
1056:     switch (nsz){               /* Each loop in 'case' is unrolled */
1057:     case 1 :
1058:       sum1 = tmp[row];

1060:       for(j=nz ; j>1; j-=2){
1061:         i0   = vi[0];
1062:         i1   = vi[-1];
1063:         vi  -=2;
1064:         tmp0 = tmps[i0];
1065:         tmp1 = tmps[i1];
1066:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1067:       }
1068:       if (j==1){
1069:         tmp0  = tmps[*vi--];
1070:         sum1 -= *v1-- * tmp0;
1071:       }
1072:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1073:       break;
1074:     case 2 :
1075:       sum1 = tmp[row];
1076:       sum2 = tmp[row -1];
1077:       v2   = aa + ai[row]-1;
1078:       for (j=nz ; j>1; j-=2){
1079:         i0   = vi[0];
1080:         i1   = vi[-1];
1081:         vi  -=2;
1082:         tmp0 = tmps[i0];
1083:         tmp1 = tmps[i1];
1084:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1085:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1086:       }
1087:       if (j==1){
1088:         tmp0  = tmps[*vi--];
1089:         sum1 -= *v1-- * tmp0;
1090:         sum2 -= *v2-- * tmp0;
1091:       }
1092: 
1093:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1094:       sum2   -= *v2-- * tmp0;
1095:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096:       break;
1097:     case 3 :
1098:       sum1 = tmp[row];
1099:       sum2 = tmp[row -1];
1100:       sum3 = tmp[row -2];
1101:       v2   = aa + ai[row]-1;
1102:       v3   = aa + ai[row -1]-1;
1103:       for (j=nz ; j>1; j-=2){
1104:         i0   = vi[0];
1105:         i1   = vi[-1];
1106:         vi  -=2;
1107:         tmp0 = tmps[i0];
1108:         tmp1 = tmps[i1];
1109:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1110:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1111:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1112:       }
1113:       if (j==1){
1114:         tmp0  = tmps[*vi--];
1115:         sum1 -= *v1-- * tmp0;
1116:         sum2 -= *v2-- * tmp0;
1117:         sum3 -= *v3-- * tmp0;
1118:       }
1119:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1120:       sum2   -= *v2-- * tmp0;
1121:       sum3   -= *v3-- * tmp0;
1122:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123:       sum3   -= *v3-- * tmp0;
1124:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1125: 
1126:       break;
1127:     case 4 :
1128:       sum1 = tmp[row];
1129:       sum2 = tmp[row -1];
1130:       sum3 = tmp[row -2];
1131:       sum4 = tmp[row -3];
1132:       v2   = aa + ai[row]-1;
1133:       v3   = aa + ai[row -1]-1;
1134:       v4   = aa + ai[row -2]-1;

1136:       for (j=nz ; j>1; j-=2){
1137:         i0   = vi[0];
1138:         i1   = vi[-1];
1139:         vi  -=2;
1140:         tmp0 = tmps[i0];
1141:         tmp1 = tmps[i1];
1142:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1143:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1144:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1145:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1146:       }
1147:       if (j==1){
1148:         tmp0  = tmps[*vi--];
1149:         sum1 -= *v1-- * tmp0;
1150:         sum2 -= *v2-- * tmp0;
1151:         sum3 -= *v3-- * tmp0;
1152:         sum4 -= *v4-- * tmp0;
1153:       }

1155:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1156:       sum2   -= *v2-- * tmp0;
1157:       sum3   -= *v3-- * tmp0;
1158:       sum4   -= *v4-- * tmp0;
1159:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1160:       sum3   -= *v3-- * tmp0;
1161:       sum4   -= *v4-- * tmp0;
1162:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163:       sum4   -= *v4-- * tmp0;
1164:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165:       break;
1166:     case 5 :
1167:       sum1 = tmp[row];
1168:       sum2 = tmp[row -1];
1169:       sum3 = tmp[row -2];
1170:       sum4 = tmp[row -3];
1171:       sum5 = tmp[row -4];
1172:       v2   = aa + ai[row]-1;
1173:       v3   = aa + ai[row -1]-1;
1174:       v4   = aa + ai[row -2]-1;
1175:       v5   = aa + ai[row -3]-1;
1176:       for (j=nz ; j>1; j-=2){
1177:         i0   = vi[0];
1178:         i1   = vi[-1];
1179:         vi  -=2;
1180:         tmp0 = tmps[i0];
1181:         tmp1 = tmps[i1];
1182:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1183:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1184:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1185:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1186:         sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1187:       }
1188:       if (j==1){
1189:         tmp0  = tmps[*vi--];
1190:         sum1 -= *v1-- * tmp0;
1191:         sum2 -= *v2-- * tmp0;
1192:         sum3 -= *v3-- * tmp0;
1193:         sum4 -= *v4-- * tmp0;
1194:         sum5 -= *v5-- * tmp0;
1195:       }

1197:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1198:       sum2   -= *v2-- * tmp0;
1199:       sum3   -= *v3-- * tmp0;
1200:       sum4   -= *v4-- * tmp0;
1201:       sum5   -= *v5-- * tmp0;
1202:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1203:       sum3   -= *v3-- * tmp0;
1204:       sum4   -= *v4-- * tmp0;
1205:       sum5   -= *v5-- * tmp0;
1206:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1207:       sum4   -= *v4-- * tmp0;
1208:       sum5   -= *v5-- * tmp0;
1209:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1210:       sum5   -= *v5-- * tmp0;
1211:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1212:       break;
1213:     default:
1214:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1215:     }
1216:   }
1217:   ISRestoreIndices(isrow,&rout);
1218:   ISRestoreIndices(iscol,&cout);
1219:   VecRestoreArray(bb,&b);
1220:   VecRestoreArray(xx,&x);
1221:   PetscLogFlops(2*a->nz - A->n);
1222:   return(0);
1223: }


1228: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1229: {
1230:   Mat          C = *B;
1231:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1232:   IS           iscol = b->col,isrow = b->row,isicol = b->icol;
1233:   int          *r,*ic,*c,ierr,n = A->m,*bi = b->i;
1234:   int          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1235:   int          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nsz;
1236:   int          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1237:   PetscScalar  *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1238:   PetscScalar  tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1239:   PetscReal    damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1240:   PetscTruth   damp;

1243:   ISGetIndices(isrow,&r);
1244:   ISGetIndices(iscol,&c);
1245:   ISGetIndices(isicol,&ic);
1246:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1247:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1248:   ics    = ic ; rtmps1 = rtmp1 ;
1249:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 ;
1250:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 ;
1251: 
1252:   node_max = a->inode.node_count;
1253:   ns       = a->inode.size ;
1254:   if (!ns){
1255:     SETERRQ(1,"Matrix without inode information");
1256:   }

1258:   /* If max inode size > 3, split it into two inodes.*/
1259:   /* also map the inode sizes according to the ordering */
1260:   PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1261:   for (i=0,j=0; i<node_max; ++i,++j){
1262:     if (ns[i]>3) {
1263:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1264:       ++j;
1265:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1266:     } else {
1267:       tmp_vec1[j] = ns[i];
1268:     }
1269:   }
1270:   /* Use the correct node_max */
1271:   node_max = j;

1273:   /* Now reorder the inode info based on mat re-ordering info */
1274:   /* First create a row -> inode_size_array_index map */
1275:   PetscMalloc(n*sizeof(int)+1,&nsmap);
1276:   PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1277:   for (i=0,row=0; i<node_max; i++) {
1278:     nsz = tmp_vec1[i];
1279:     for (j=0; j<nsz; j++,row++) {
1280:       nsmap[row] = i;
1281:     }
1282:   }
1283:   /* Using nsmap, create a reordered ns structure */
1284:   for (i=0,j=0; i< node_max; i++) {
1285:     nsz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1286:     tmp_vec2[i] = nsz;
1287:     j        += nsz;
1288:   }
1289:   PetscFree(nsmap);
1290:   PetscFree(tmp_vec1);
1291:   /* Now use the correct ns */
1292:   ns = tmp_vec2;


1295:   do {
1296:     damp = PETSC_FALSE;
1297:     /* Now loop over each block-row, and do the factorization */
1298:     for (i=0,row=0; i<node_max; i++) {
1299:       nsz   = ns[i];
1300:       nz    = bi[row+1] - bi[row];
1301:       bjtmp = bj + bi[row];
1302: 
1303:       switch (nsz){
1304:       case 1:
1305:         for  (j=0; j<nz; j++){
1306:           idx         = bjtmp[j];
1307:           rtmps1[idx] = 0.0;
1308:         }
1309: 
1310:         /* load in initial (unfactored row) */
1311:         idx   = r[row];
1312:         nz    = ai[idx+1] - ai[idx];
1313:         ajtmp = aj + ai[idx];
1314:         v1    = aa + ai[idx];

1316:         for (j=0; j<nz; j++) {
1317:           idx        = ics[ajtmp[j]];
1318:           rtmp1[idx] = v1[j];
1319:           if (ndamp && ajtmp[j] == r[row]) {
1320:             rtmp1[idx] += damping;
1321:           }
1322:         }
1323:         prow = *bjtmp++ ;
1324:         while (prow < row) {
1325:           pc1 = rtmp1 + prow;
1326:           if (*pc1 != 0.0){
1327:             pv   = ba + bd[prow];
1328:             pj   = nbj + bd[prow];
1329:             mul1 = *pc1 * *pv++;
1330:             *pc1 = mul1;
1331:             nz   = bi[prow+1] - bd[prow] - 1;
1332:             PetscLogFlops(2*nz);
1333:             for (j=0; j<nz; j++) {
1334:               tmp = pv[j];
1335:               idx = pj[j];
1336:               rtmps1[idx] -= mul1 * tmp;
1337:             }
1338:           }
1339:           prow = *bjtmp++ ;
1340:         }
1341:         nz  = bi[row+1] - bi[row];
1342:         pj  = bj + bi[row];
1343:         pc1 = ba + bi[row];
1344:         if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1345:           if (damping) {
1346:             if (ndamp) damping *= 2.0;
1347:             damp = PETSC_TRUE;
1348:             ndamp++;
1349:             goto endofwhile;
1350:           } else {
1351:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1352:           }
1353:         }
1354:         rtmp1[row] = 1.0/rtmp1[row];
1355:         for (j=0; j<nz; j++) {
1356:           idx    = pj[j];
1357:           pc1[j] = rtmps1[idx];
1358:         }
1359:         break;
1360: 
1361:       case 2:
1362:         for  (j=0; j<nz; j++) {
1363:           idx         = bjtmp[j];
1364:           rtmps1[idx] = 0.0;
1365:           rtmps2[idx] = 0.0;
1366:         }
1367: 
1368:         /* load in initial (unfactored row) */
1369:         idx   = r[row];
1370:         nz    = ai[idx+1] - ai[idx];
1371:         ajtmp = aj + ai[idx];
1372:         v1    = aa + ai[idx];
1373:         v2    = aa + ai[idx+1];
1374: 
1375:         for (j=0; j<nz; j++) {
1376:           idx        = ics[ajtmp[j]];
1377:           rtmp1[idx] = v1[j];
1378:           rtmp2[idx] = v2[j];
1379:           if (ndamp && ajtmp[j] == r[row]) {
1380:             rtmp1[idx] += damping;
1381:           }
1382:           if (ndamp && ajtmp[j] == r[row+1]) {
1383:             rtmp2[idx] += damping;
1384:           }
1385:         }
1386:         prow = *bjtmp++ ;
1387:         while (prow < row) {
1388:           pc1 = rtmp1 + prow;
1389:           pc2 = rtmp2 + prow;
1390:           if (*pc1 != 0.0 || *pc2 != 0.0){
1391:             pv   = ba + bd[prow];
1392:             pj   = nbj + bd[prow];
1393:             mul1 = *pc1 * *pv;
1394:             mul2 = *pc2 * *pv;
1395:             ++pv;
1396:             *pc1 = mul1;
1397:             *pc2 = mul2;
1398: 
1399:             nz   = bi[prow+1] - bd[prow] - 1;
1400:             PetscLogFlops(2*2*nz);
1401:             for (j=0; j<nz; j++) {
1402:               tmp = pv[j];
1403:               idx = pj[j];
1404:               rtmps1[idx] -= mul1 * tmp;
1405:               rtmps2[idx] -= mul2 * tmp;
1406:             }
1407:           }
1408:           prow = *bjtmp++ ;
1409:         }
1410:         /* Now take care of the odd element*/
1411:         pc1 = rtmp1 + prow;
1412:         pc2 = rtmp2 + prow;
1413:         if (*pc2 != 0.0){
1414:           pj   = nbj + bd[prow];
1415:           if (PetscAbsScalar(*pc1) < zeropivot) {
1416:             if (damping) {
1417:               if (ndamp) damping *= 2.0;
1418:               damp = PETSC_TRUE;
1419:               ndamp++;
1420:               goto endofwhile;
1421:             } else {
1422:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1423:             }
1424:           }
1425:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1426:           *pc2 = mul2;
1427:           nz   = bi[prow+1] - bd[prow] - 1;
1428:           PetscLogFlops(2*nz);
1429:           for (j=0; j<nz; j++) {
1430:             idx = pj[j] ;
1431:             tmp = rtmp1[idx];
1432:             rtmp2[idx] -= mul2 * tmp;
1433:           }
1434:         }
1435: 
1436:         nz  = bi[row+1] - bi[row];
1437:         pj  = bj + bi[row];
1438:         pc1 = ba + bi[row];
1439:         pc2 = ba + bi[row+1];
1440:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1441:           if (damping) {
1442:             if (ndamp) damping *= 2.0;
1443:             damp = PETSC_TRUE;
1444:             ndamp++;
1445:             goto endofwhile;
1446:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1447:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1448:           } else {
1449:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1450:           }
1451:         }
1452:         rtmp1[row]   = 1.0/rtmp1[row];
1453:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1454:         for (j=0; j<nz; j++) {
1455:           idx    = pj[j];
1456:           pc1[j] = rtmps1[idx];
1457:           pc2[j] = rtmps2[idx];
1458:         }
1459:         break;

1461:       case 3:
1462:         for  (j=0; j<nz; j++) {
1463:           idx         = bjtmp[j];
1464:           rtmps1[idx] = 0.0;
1465:           rtmps2[idx] = 0.0;
1466:           rtmps3[idx] = 0.0;
1467:         }
1468:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1469:         idx   = r[row];
1470:         nz    = ai[idx+1] - ai[idx];
1471:         ajtmp = aj + ai[idx];
1472:         v1    = aa + ai[idx];
1473:         v2    = aa + ai[idx+1];
1474:         v3    = aa + ai[idx+2];
1475:         for (j=0; j<nz; j++) {
1476:           idx        = ics[ajtmp[j]];
1477:           rtmp1[idx] = v1[j];
1478:           rtmp2[idx] = v2[j];
1479:           rtmp3[idx] = v3[j];
1480:           if (ndamp && ajtmp[j] == r[row]) {
1481:             rtmp1[idx] += damping;
1482:           }
1483:           if (ndamp && ajtmp[j] == r[row+1]) {
1484:             rtmp2[idx] += damping;
1485:           }
1486:           if (ndamp && ajtmp[j] == r[row+2]) {
1487:             rtmp3[idx] += damping;
1488:           }
1489:         }
1490:         /* loop over all pivot row blocks above this row block */
1491:         prow = *bjtmp++ ;
1492:         while (prow < row) {
1493:           pc1 = rtmp1 + prow;
1494:           pc2 = rtmp2 + prow;
1495:           pc3 = rtmp3 + prow;
1496:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1497:             pv   = ba  + bd[prow];
1498:             pj   = nbj + bd[prow];
1499:             mul1 = *pc1 * *pv;
1500:             mul2 = *pc2 * *pv;
1501:             mul3 = *pc3 * *pv;
1502:             ++pv;
1503:             *pc1 = mul1;
1504:             *pc2 = mul2;
1505:             *pc3 = mul3;
1506: 
1507:             nz   = bi[prow+1] - bd[prow] - 1;
1508:             PetscLogFlops(3*2*nz);
1509:             /* update this row based on pivot row */
1510:             for (j=0; j<nz; j++) {
1511:               tmp = pv[j];
1512:               idx = pj[j];
1513:               rtmps1[idx] -= mul1 * tmp;
1514:               rtmps2[idx] -= mul2 * tmp;
1515:               rtmps3[idx] -= mul3 * tmp;
1516:             }
1517:           }
1518:           prow = *bjtmp++ ;
1519:         }
1520:         /* Now take care of diagonal block in this set of rows */
1521:         pc1 = rtmp1 + prow;
1522:         pc2 = rtmp2 + prow;
1523:         pc3 = rtmp3 + prow;
1524:         if (*pc2 != 0.0 || *pc3 != 0.0){
1525:           pj   = nbj + bd[prow];
1526:           if (PetscAbsScalar(*pc1) < zeropivot) {
1527:             if (damping) {
1528:               if (ndamp) damping *= 2.0;
1529:               damp = PETSC_TRUE;
1530:               ndamp++;
1531:               goto endofwhile;
1532:             } else {
1533:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1534:             }
1535:           }
1536:           mul2 = (*pc2)/(*pc1);
1537:           mul3 = (*pc3)/(*pc1);
1538:           *pc2 = mul2;
1539:           *pc3 = mul3;
1540:           nz   = bi[prow+1] - bd[prow] - 1;
1541:           PetscLogFlops(2*2*nz);
1542:           for (j=0; j<nz; j++) {
1543:             idx = pj[j] ;
1544:             tmp = rtmp1[idx];
1545:             rtmp2[idx] -= mul2 * tmp;
1546:             rtmp3[idx] -= mul3 * tmp;
1547:           }
1548:         }
1549:         ++prow;
1550:         pc2 = rtmp2 + prow;
1551:         pc3 = rtmp3 + prow;
1552:         if (*pc3 != 0.0){
1553:           pj   = nbj + bd[prow];
1554:           pj   = nbj + bd[prow];
1555:           if (PetscAbsScalar(*pc2) < zeropivot) {
1556:             if (damping) {
1557:               if (ndamp) damping *= 2.0;
1558:               damp = PETSC_TRUE;
1559:               ndamp++;
1560:               goto endofwhile;
1561:             } else {
1562:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1563:             }
1564:           }
1565:           mul3 = (*pc3)/(*pc2);
1566:           *pc3 = mul3;
1567:           nz   = bi[prow+1] - bd[prow] - 1;
1568:           PetscLogFlops(2*2*nz);
1569:           for (j=0; j<nz; j++) {
1570:             idx = pj[j] ;
1571:             tmp = rtmp2[idx];
1572:             rtmp3[idx] -= mul3 * tmp;
1573:           }
1574:         }
1575:         nz  = bi[row+1] - bi[row];
1576:         pj  = bj + bi[row];
1577:         pc1 = ba + bi[row];
1578:         pc2 = ba + bi[row+1];
1579:         pc3 = ba + bi[row+2];
1580:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1581:           if (damping) {
1582:             if (ndamp) damping *= 2.0;
1583:             damp = PETSC_TRUE;
1584:             ndamp++;
1585:             goto endofwhile;
1586:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1587:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1588:           } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1589:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1590:         } else {
1591:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1592:           }
1593:         }
1594:         rtmp1[row]   = 1.0/rtmp1[row];
1595:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1596:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1597:         /* copy row entries from dense representation to sparse */
1598:         for (j=0; j<nz; j++) {
1599:           idx    = pj[j];
1600:           pc1[j] = rtmps1[idx];
1601:           pc2[j] = rtmps2[idx];
1602:           pc3[j] = rtmps3[idx];
1603:         }
1604:         break;

1606:       default:
1607:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1608:       }
1609:       row += nsz;                 /* Update the row */
1610:     }
1611:     endofwhile:;
1612:   } while (damp);
1613:   PetscFree(rtmp1);
1614:   PetscFree(tmp_vec2);
1615:   ISRestoreIndices(isicol,&ic);
1616:   ISRestoreIndices(isrow,&r);
1617:   ISRestoreIndices(iscol,&c);
1618:   C->factor      = FACTOR_LU;
1619:   C->assembled   = PETSC_TRUE;
1620:   if (ndamp || b->lu_damping) {
1621:     PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %g\n",ndamp,damping);
1622:   }
1623:   PetscLogFlops(C->n);
1624:   return(0);
1625: }

1627: /*
1628:      This is really ugly. if inodes are used this replaces the 
1629:   permutations with ones that correspond to rows/cols of the matrix
1630:   rather then inode blocks
1631: */
1634: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1635: {
1636:   int ierr,(*f)(Mat,IS*,IS*);

1639:   PetscObjectQueryFunction((PetscObject)A,"MatAdjustForInodes_C",(void (**)(void))&f);
1640:   if (f) {
1641:     (*f)(A,rperm,cperm);
1642:   }
1643:   return(0);
1644: }

1646: EXTERN_C_BEGIN
1649: int MatAdjustForInodes_SeqAIJ(Mat A,IS *rperm,IS *cperm)
1650: {
1651:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1652:   int        ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1653:   int        row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1654:   int        nslim_col,*ns_col;
1655:   IS         ris = *rperm,cis = *cperm;

1658:   if (!a->inode.size) return(0); /* no inodes so return */
1659:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1661:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1662:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1663:   PetscMalloc((m+n+1)*sizeof(int),&permr);
1664:   permc = permr + m;

1666:   ISGetIndices(ris,&ridx);
1667:   ISGetIndices(cis,&cidx);

1669:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1670:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1672:   /* Construct the permutations for rows*/
1673:   for (i=0,row = 0; i<nslim_row; ++i){
1674:     indx      = ridx[i];
1675:     start_val = tns[indx];
1676:     end_val   = tns[indx + 1];
1677:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1678:   }

1680:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1681:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1683:  /* Construct permutations for columns */
1684:   for (i=0,col=0; i<nslim_col; ++i){
1685:     indx      = cidx[i];
1686:     start_val = tns[indx];
1687:     end_val   = tns[indx + 1];
1688:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1689:   }

1691:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1692:   ISSetPermutation(*rperm);
1693:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1694:   ISSetPermutation(*cperm);
1695: 
1696:   ISRestoreIndices(ris,&ridx);
1697:   ISRestoreIndices(cis,&cidx);

1699:   PetscFree(ns_col);
1700:   PetscFree(permr);
1701:   ISDestroy(cis);
1702:   ISDestroy(ris);
1703:   PetscFree(tns);
1704:   return(0);
1705: }
1706: EXTERN_C_END

1710: /*@C
1711:    MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.

1713:    Collective on Mat

1715:    Input Parameter:
1716: .  A - the SeqAIJ matrix.

1718:    Output Parameter:
1719: +  node_count - no of inodes present in the matrix.
1720: .  sizes      - an array of size node_count,with sizes of each inode.
1721: -  limit      - the max size used to generate the inodes.

1723:    Level: advanced

1725:    Notes: This routine returns some internal storage information
1726:    of the matrix, it is intended to be used by advanced users.
1727:    It should be called after the matrix is assembled.
1728:    The contents of the sizes[] array should not be changed.

1730: .keywords: matrix, seqaij, get, inode

1732: .seealso: MatGetInfo()
1733: @*/
1734: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1735: {
1736:   int ierr,(*f)(Mat,int*,int*[],int*);

1739:   PetscObjectQueryFunction((PetscObject)A,"MatSeqAIJGetInodeSizes_C",(void (**)(void))&f);
1740:   if (f) {
1741:     (*f)(A,node_count,sizes,limit);
1742:   }
1743:   return(0);
1744: }

1746: EXTERN_C_BEGIN
1749: int MatSeqAIJGetInodeSizes_SeqAIJ(Mat A,int *node_count,int *sizes[],int *limit)
1750: {
1751:   Mat_SeqAIJ *a;

1755:   a           = (Mat_SeqAIJ*)A->data;
1756:   *node_count = a->inode.node_count;
1757:   *sizes      = a->inode.size;
1758:   *limit      = a->inode.limit;
1759:   return(0);
1760: }
1761: EXTERN_C_END