Actual source code: inode.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   This file provides high performance routines for the Inode format (compressed sparse row)
  5:   by taking advantage of rows with identical nonzero structure (I-nodes).
  6: */
 7:  #include src/mat/impls/aij/seq/aij.h

 11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 12: {
 13:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 15:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 18:   n      = A->cmap.n;
 19:   m      = A->rmap.n;
 20:   ns_row = a->inode.size;
 21: 
 22:   min_mn = (m < n) ? m : n;
 23:   if (!ns) {
 24:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 25:     for(; count+1 < n; count++,i++);
 26:     if (count < n)  {
 27:       i++;
 28:     }
 29:     *size = i;
 30:     return(0);
 31:   }
 32:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 33: 
 34:   /* Use the same row structure wherever feasible. */
 35:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 36:     ns_col[i] = ns_row[i];
 37:   }

 39:   /* if m < n; pad up the remainder with inode_limit */
 40:   for(; count+1 < n; count++,i++) {
 41:     ns_col[i] = 1;
 42:   }
 43:   /* The last node is the odd ball. padd it up with the remaining rows; */
 44:   if (count < n)  {
 45:     ns_col[i] = n - count;
 46:     i++;
 47:   } else if (count > n) {
 48:     /* Adjust for the over estimation */
 49:     ns_col[i-1] += n - count;
 50:   }
 51:   *size = i;
 52:   *ns   = ns_col;
 53:   return(0);
 54: }


 57: /*
 58:       This builds symmetric version of nonzero structure,
 59: */
 62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 63: {
 64:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 66:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 67:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap.n;
 72:   n         = A->cmap.n;
 73:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
 74: 
 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 81:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1){
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col)
 87:       tvc[col] = i1;
 88:   }
 89:   /* allocate space for row pointers */
 90:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 91:   *iia = ia;
 92:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 93:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 95:   /* determine the number of columns in each row */
 96:   ia[0] = oshift;
 97:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

 99:     j    = aj + ai[row] + ishift;
100:     jmax = aj + ai[row+1] + ishift;
101:     i2   = 0;
102:     col  = *j++ + ishift;
103:     i2   = tvc[col];
104:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105:       ia[i1+1]++;
106:       ia[i2+1]++;
107:       i2++;                     /* Start col of next node */
108:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109:       i2 = tvc[col];
110:     }
111:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112:   }

114:   /* shift ia[i] to point to next row */
115:   for (i1=1; i1<nslim_row+1; i1++) {
116:     row        = ia[i1-1];
117:     ia[i1]    += row;
118:     work[i1-1] = row - oshift;
119:   }

121:   /* allocate space for column pointers */
122:   nz   = ia[nslim_row] + (!ishift);
123:   PetscMalloc(nz*sizeof(PetscInt),&ja);
124:   *jja = ja;

126:  /* loop over lower triangular part putting into ja */
127:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128:     j    = aj + ai[row] + ishift;
129:     jmax = aj + ai[row+1] + ishift;
130:     i2   = 0;                     /* Col inode index */
131:     col  = *j++ + ishift;
132:     i2   = tvc[col];
133:     while (i2<i1 && j<jmax) {
134:       ja[work[i2]++] = i1 + oshift;
135:       ja[work[i1]++] = i2 + oshift;
136:       ++i2;
137:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138:       i2 = tvc[col];
139:     }
140:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

142:   }
143:   PetscFree(work);
144:   PetscFree(tns);
145:   PetscFree(tvc);
146:   return(0);
147: }

149: /*
150:       This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
158:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap.n;

165:   /* Create The column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);
167: 
168:   /* allocate space for reformated column_inode structure */
169:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

173:   for (i1=0,col=0; i1<nslim_col; ++i1){
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col)
176:       tvc[col] = i1;
177:   }
178:   /* allocate space for row pointers */
179:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180:   *iia = ia;
181:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

184:   /* determine the number of columns in each row */
185:   ia[0] = oshift;
186:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187:     j   = aj + ai[row] + ishift;
188:     col = *j++ + ishift;
189:     i2  = tvc[col];
190:     nz  = ai[row+1] - ai[row];
191:     while (nz-- > 0) {           /* off-diagonal elemets */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
194:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for column pointers */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc(nz*sizeof(PetscInt),&ja);
209:   *jja = ja;

211:  /* loop over matrix putting into ja */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     i2  = 0;                     /* Col inode index */
215:     col = *j++ + ishift;
216:     i2  = tvc[col];
217:     nz  = ai[row+1] - ai[row];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;

240:   *n     = a->inode.node_count;
241:   if (!ia) return(0);
242:   if (!blockcompressed) {
243:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244:   } else if (symmetric) {
245:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246:   } else {
247:     MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248:   }
249:   return(0);
250: }

254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {

259:   if (!ia) return(0);

261:   if (!blockcompressed) {
262:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263:   } else {
264:     PetscFree(*ia);
265:     PetscFree(*ja);
266:   }

268:   return(0);
269: }

271: /* ----------------------------------------------------------- */

275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
279:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

283:   nslim_row = a->inode.node_count;
284:   n         = A->cmap.n;

286:   /* Create The column_inode for this matrix */
287:   Mat_CreateColInode(A,&nslim_col,&ns_col);
288: 
289:   /* allocate space for reformated column_inode structure */
290:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

294:   for (i1=0,col=0; i1<nslim_col; ++i1){
295:     nsz = ns_col[i1];
296:     for (i2=0; i2<nsz; ++i2,++col)
297:       tvc[col] = i1;
298:   }
299:   /* allocate space for column pointers */
300:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301:   *iia = ia;
302:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

305:   /* determine the number of columns in each row */
306:   ia[0] = oshift;
307:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308:     j   = aj + ai[row] + ishift;
309:     col = *j++ + ishift;
310:     i2  = tvc[col];
311:     nz  = ai[row+1] - ai[row];
312:     while (nz-- > 0) {           /* off-diagonal elemets */
313:       /* ia[i1+1]++; */
314:       ia[i2+1]++;
315:       i2++;
316:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317:       if (nz > 0) i2 = tvc[col];
318:     }
319:   }

321:   /* shift ia[i] to point to next col */
322:   for (i1=1; i1<nslim_col+1; i1++) {
323:     col        = ia[i1-1];
324:     ia[i1]    += col;
325:     work[i1-1] = col - oshift;
326:   }

328:   /* allocate space for column pointers */
329:   nz   = ia[nslim_col] + (!ishift);
330:   PetscMalloc(nz*sizeof(PetscInt),&ja);
331:   *jja = ja;

333:  /* loop over matrix putting into ja */
334:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335:     j   = aj + ai[row] + ishift;
336:     i2  = 0;                     /* Col inode index */
337:     col = *j++ + ishift;
338:     i2  = tvc[col];
339:     nz  = ai[row+1] - ai[row];
340:     while (nz-- > 0) {
341:       /* ja[work[i1]++] = i2 + oshift; */
342:       ja[work[i2]++] = i1 + oshift;
343:       i2++;
344:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345:       if (nz > 0) i2 = tvc[col];
346:     }
347:   }
348:   PetscFree(ns_col);
349:   PetscFree(work);
350:   PetscFree(tns);
351:   PetscFree(tvc);
352:   return(0);
353: }

357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {

362:   Mat_CreateColInode(A,n,PETSC_NULL);
363:   if (!ia) return(0);

365:   if (!blockcompressed) {
366:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367:   } else if (symmetric) {
368:     /* Since the indices are symmetric it does'nt matter */
369:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370:   } else {
371:     MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372:   }
373:   return(0);
374: }

378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {

383:   if (!ia) return(0);
384:   if (!blockcompressed) {
385:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386:   } else {
387:     PetscFree(*ia);
388:     PetscFree(*ja);
389:   }
390:   return(0);
391: }

393: /* ----------------------------------------------------------- */

397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401:   PetscScalar       *y;
402:   const PetscScalar *x,*v1,*v2,*v3,*v4,*v5;
403:   PetscErrorCode    ierr;
404:   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405: 
406: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408: #endif

411:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
412:   node_max = a->inode.node_count;
413:   ns       = a->inode.size;     /* Node Size array */
414:   VecGetArray(xx,(PetscScalar**)&x);
415:   VecGetArray(yy,&y);
416:   idx  = a->j;
417:   v1   = a->a;
418:   ii   = a->i;

420:   for (i = 0,row = 0; i< node_max; ++i){
421:     nsz  = ns[i];
422:     n    = ii[1] - ii[0];
423:     nonzerorow += (n>0)*nsz;
424:     ii  += nsz;
425:     sz   = n;                   /* No of non zeros in this row */
426:                                 /* Switch on the size of Node */
427:     switch (nsz){               /* Each loop in 'case' is unrolled */
428:     case 1 :
429:       sum1  = 0;
430: 
431:       for(n = 0; n< sz-1; n+=2) {
432:         i1   = idx[0];          /* The instructions are ordered to */
433:         i2   = idx[1];          /* make the compiler's job easy */
434:         idx += 2;
435:         tmp0 = x[i1];
436:         tmp1 = x[i2];
437:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438:        }
439: 
440:       if (n == sz-1){          /* Take care of the last nonzero  */
441:         tmp0  = x[*idx++];
442:         sum1 += *v1++ * tmp0;
443:       }
444:       y[row++]=sum1;
445:       break;
446:     case 2:
447:       sum1  = 0;
448:       sum2  = 0;
449:       v2    = v1 + n;
450: 
451:       for (n = 0; n< sz-1; n+=2) {
452:         i1   = idx[0];
453:         i2   = idx[1];
454:         idx += 2;
455:         tmp0 = x[i1];
456:         tmp1 = x[i2];
457:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459:       }
460:       if (n == sz-1){
461:         tmp0  = x[*idx++];
462:         sum1 += *v1++ * tmp0;
463:         sum2 += *v2++ * tmp0;
464:       }
465:       y[row++]=sum1;
466:       y[row++]=sum2;
467:       v1      =v2;              /* Since the next block to be processed starts there*/
468:       idx    +=sz;
469:       break;
470:     case 3:
471:       sum1  = 0;
472:       sum2  = 0;
473:       sum3  = 0;
474:       v2    = v1 + n;
475:       v3    = v2 + n;
476: 
477:       for (n = 0; n< sz-1; n+=2) {
478:         i1   = idx[0];
479:         i2   = idx[1];
480:         idx += 2;
481:         tmp0 = x[i1];
482:         tmp1 = x[i2];
483:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
484:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
485:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
486:       }
487:       if (n == sz-1){
488:         tmp0  = x[*idx++];
489:         sum1 += *v1++ * tmp0;
490:         sum2 += *v2++ * tmp0;
491:         sum3 += *v3++ * tmp0;
492:       }
493:       y[row++]=sum1;
494:       y[row++]=sum2;
495:       y[row++]=sum3;
496:       v1       =v3;             /* Since the next block to be processed starts there*/
497:       idx     +=2*sz;
498:       break;
499:     case 4:
500:       sum1  = 0;
501:       sum2  = 0;
502:       sum3  = 0;
503:       sum4  = 0;
504:       v2    = v1 + n;
505:       v3    = v2 + n;
506:       v4    = v3 + n;
507: 
508:       for (n = 0; n< sz-1; n+=2) {
509:         i1   = idx[0];
510:         i2   = idx[1];
511:         idx += 2;
512:         tmp0 = x[i1];
513:         tmp1 = x[i2];
514:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
515:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
516:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
517:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
518:       }
519:       if (n == sz-1){
520:         tmp0  = x[*idx++];
521:         sum1 += *v1++ * tmp0;
522:         sum2 += *v2++ * tmp0;
523:         sum3 += *v3++ * tmp0;
524:         sum4 += *v4++ * tmp0;
525:       }
526:       y[row++]=sum1;
527:       y[row++]=sum2;
528:       y[row++]=sum3;
529:       y[row++]=sum4;
530:       v1      =v4;              /* Since the next block to be processed starts there*/
531:       idx    +=3*sz;
532:       break;
533:     case 5:
534:       sum1  = 0;
535:       sum2  = 0;
536:       sum3  = 0;
537:       sum4  = 0;
538:       sum5  = 0;
539:       v2    = v1 + n;
540:       v3    = v2 + n;
541:       v4    = v3 + n;
542:       v5    = v4 + n;
543: 
544:       for (n = 0; n<sz-1; n+=2) {
545:         i1   = idx[0];
546:         i2   = idx[1];
547:         idx += 2;
548:         tmp0 = x[i1];
549:         tmp1 = x[i2];
550:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
551:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
552:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
553:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
554:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
555:       }
556:       if (n == sz-1){
557:         tmp0  = x[*idx++];
558:         sum1 += *v1++ * tmp0;
559:         sum2 += *v2++ * tmp0;
560:         sum3 += *v3++ * tmp0;
561:         sum4 += *v4++ * tmp0;
562:         sum5 += *v5++ * tmp0;
563:       }
564:       y[row++]=sum1;
565:       y[row++]=sum2;
566:       y[row++]=sum3;
567:       y[row++]=sum4;
568:       y[row++]=sum5;
569:       v1      =v5;       /* Since the next block to be processed starts there */
570:       idx    +=4*sz;
571:       break;
572:     default :
573:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
574:     }
575:   }
576:   VecRestoreArray(xx,(PetscScalar**)&x);
577:   VecRestoreArray(yy,&y);
578:   PetscLogFlops(2*a->nz - nonzerorow);
579:   return(0);
580: }
581: /* ----------------------------------------------------------- */
582: /* Almost same code as the MatMult_Inode() */
585: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
586: {
587:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
588:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
589:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
591:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
592: 
594:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
595:   node_max = a->inode.node_count;
596:   ns       = a->inode.size;     /* Node Size array */
597:   VecGetArray(xx,&x);
598:   VecGetArray(yy,&y);
599:   if (zz != yy) {
600:     VecGetArray(zz,&z);
601:   } else {
602:     z = y;
603:   }
604:   zt = z;

606:   idx  = a->j;
607:   v1   = a->a;
608:   ii   = a->i;

610:   for (i = 0,row = 0; i< node_max; ++i){
611:     nsz  = ns[i];
612:     n    = ii[1] - ii[0];
613:     ii  += nsz;
614:     sz   = n;                   /* No of non zeros in this row */
615:                                 /* Switch on the size of Node */
616:     switch (nsz){               /* Each loop in 'case' is unrolled */
617:     case 1 :
618:       sum1  = *zt++;
619: 
620:       for(n = 0; n< sz-1; n+=2) {
621:         i1   = idx[0];          /* The instructions are ordered to */
622:         i2   = idx[1];          /* make the compiler's job easy */
623:         idx += 2;
624:         tmp0 = x[i1];
625:         tmp1 = x[i2];
626:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627:        }
628: 
629:       if(n   == sz-1){          /* Take care of the last nonzero  */
630:         tmp0  = x[*idx++];
631:         sum1 += *v1++ * tmp0;
632:       }
633:       y[row++]=sum1;
634:       break;
635:     case 2:
636:       sum1  = *zt++;
637:       sum2  = *zt++;
638:       v2    = v1 + n;
639: 
640:       for(n = 0; n< sz-1; n+=2) {
641:         i1   = idx[0];
642:         i2   = idx[1];
643:         idx += 2;
644:         tmp0 = x[i1];
645:         tmp1 = x[i2];
646:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648:       }
649:       if(n   == sz-1){
650:         tmp0  = x[*idx++];
651:         sum1 += *v1++ * tmp0;
652:         sum2 += *v2++ * tmp0;
653:       }
654:       y[row++]=sum1;
655:       y[row++]=sum2;
656:       v1      =v2;              /* Since the next block to be processed starts there*/
657:       idx    +=sz;
658:       break;
659:     case 3:
660:       sum1  = *zt++;
661:       sum2  = *zt++;
662:       sum3  = *zt++;
663:       v2    = v1 + n;
664:       v3    = v2 + n;
665: 
666:       for (n = 0; n< sz-1; n+=2) {
667:         i1   = idx[0];
668:         i2   = idx[1];
669:         idx += 2;
670:         tmp0 = x[i1];
671:         tmp1 = x[i2];
672:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675:       }
676:       if (n == sz-1){
677:         tmp0  = x[*idx++];
678:         sum1 += *v1++ * tmp0;
679:         sum2 += *v2++ * tmp0;
680:         sum3 += *v3++ * tmp0;
681:       }
682:       y[row++]=sum1;
683:       y[row++]=sum2;
684:       y[row++]=sum3;
685:       v1       =v3;             /* Since the next block to be processed starts there*/
686:       idx     +=2*sz;
687:       break;
688:     case 4:
689:       sum1  = *zt++;
690:       sum2  = *zt++;
691:       sum3  = *zt++;
692:       sum4  = *zt++;
693:       v2    = v1 + n;
694:       v3    = v2 + n;
695:       v4    = v3 + n;
696: 
697:       for (n = 0; n< sz-1; n+=2) {
698:         i1   = idx[0];
699:         i2   = idx[1];
700:         idx += 2;
701:         tmp0 = x[i1];
702:         tmp1 = x[i2];
703:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707:       }
708:       if (n == sz-1){
709:         tmp0  = x[*idx++];
710:         sum1 += *v1++ * tmp0;
711:         sum2 += *v2++ * tmp0;
712:         sum3 += *v3++ * tmp0;
713:         sum4 += *v4++ * tmp0;
714:       }
715:       y[row++]=sum1;
716:       y[row++]=sum2;
717:       y[row++]=sum3;
718:       y[row++]=sum4;
719:       v1      =v4;              /* Since the next block to be processed starts there*/
720:       idx    +=3*sz;
721:       break;
722:     case 5:
723:       sum1  = *zt++;
724:       sum2  = *zt++;
725:       sum3  = *zt++;
726:       sum4  = *zt++;
727:       sum5  = *zt++;
728:       v2    = v1 + n;
729:       v3    = v2 + n;
730:       v4    = v3 + n;
731:       v5    = v4 + n;
732: 
733:       for (n = 0; n<sz-1; n+=2) {
734:         i1   = idx[0];
735:         i2   = idx[1];
736:         idx += 2;
737:         tmp0 = x[i1];
738:         tmp1 = x[i2];
739:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744:       }
745:       if(n   == sz-1){
746:         tmp0  = x[*idx++];
747:         sum1 += *v1++ * tmp0;
748:         sum2 += *v2++ * tmp0;
749:         sum3 += *v3++ * tmp0;
750:         sum4 += *v4++ * tmp0;
751:         sum5 += *v5++ * tmp0;
752:       }
753:       y[row++]=sum1;
754:       y[row++]=sum2;
755:       y[row++]=sum3;
756:       y[row++]=sum4;
757:       y[row++]=sum5;
758:       v1      =v5;       /* Since the next block to be processed starts there */
759:       idx    +=4*sz;
760:       break;
761:     default :
762:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
763:     }
764:   }
765:   VecRestoreArray(xx,&x);
766:   VecRestoreArray(yy,&y);
767:   if (zz != yy) {
768:     VecRestoreArray(zz,&z);
769:   }
770:   PetscLogFlops(2*a->nz);
771:   return(0);
772: }

774: /* ----------------------------------------------------------- */
777: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
778: {
779:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
780:   IS                iscol = a->col,isrow = a->row;
781:   PetscErrorCode    ierr;
782:   PetscInt          *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
783:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
784:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
785:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
786:   const PetscScalar *v1,*v2,*v3,*v4,*v5,*b,*a_a = a->a,*aa;

789:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
790:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
791:   node_max = a->inode.node_count;
792:   ns       = a->inode.size;     /* Node Size array */

794:   VecGetArray(bb,(PetscScalar**)&b);
795:   VecGetArray(xx,&x);
796:   tmp  = a->solve_work;
797: 
798:   ISGetIndices(isrow,&rout); r = rout;
799:   ISGetIndices(iscol,&cout); c = cout + (n-1);
800: 
801:   /* forward solve the lower triangular */
802:   tmps = tmp ;
803:   aa   = a_a ;
804:   aj   = a_j ;
805:   ad   = a->diag;

807:   for (i = 0,row = 0; i< node_max; ++i){
808:     nsz = ns[i];
809:     aii = ai[row];
810:     v1  = aa + aii;
811:     vi  = aj + aii;
812:     nz  = ad[row]- aii;
813: 
814:     switch (nsz){               /* Each loop in 'case' is unrolled */
815:     case 1 :
816:       sum1 = b[*r++];
817:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
818:       for(j=0; j<nz-1; j+=2){
819:         i0   = vi[0];
820:         i1   = vi[1];
821:         vi  +=2;
822:         tmp0 = tmps[i0];
823:         tmp1 = tmps[i1];
824:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
825:       }
826:       if(j == nz-1){
827:         tmp0 = tmps[*vi++];
828:         sum1 -= *v1++ *tmp0;
829:       }
830:       tmp[row ++]=sum1;
831:       break;
832:     case 2:
833:       sum1 = b[*r++];
834:       sum2 = b[*r++];
835:       v2   = aa + ai[row+1];

837:       for(j=0; j<nz-1; j+=2){
838:         i0   = vi[0];
839:         i1   = vi[1];
840:         vi  +=2;
841:         tmp0 = tmps[i0];
842:         tmp1 = tmps[i1];
843:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845:       }
846:       if(j == nz-1){
847:         tmp0 = tmps[*vi++];
848:         sum1 -= *v1++ *tmp0;
849:         sum2 -= *v2++ *tmp0;
850:       }
851:       sum2 -= *v2++ * sum1;
852:       tmp[row ++]=sum1;
853:       tmp[row ++]=sum2;
854:       break;
855:     case 3:
856:       sum1 = b[*r++];
857:       sum2 = b[*r++];
858:       sum3 = b[*r++];
859:       v2   = aa + ai[row+1];
860:       v3   = aa + ai[row+2];
861: 
862:       for (j=0; j<nz-1; j+=2){
863:         i0   = vi[0];
864:         i1   = vi[1];
865:         vi  +=2;
866:         tmp0 = tmps[i0];
867:         tmp1 = tmps[i1];
868:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
869:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
870:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
871:       }
872:       if (j == nz-1){
873:         tmp0 = tmps[*vi++];
874:         sum1 -= *v1++ *tmp0;
875:         sum2 -= *v2++ *tmp0;
876:         sum3 -= *v3++ *tmp0;
877:       }
878:       sum2 -= *v2++ * sum1;
879:       sum3 -= *v3++ * sum1;
880:       sum3 -= *v3++ * sum2;
881:       tmp[row ++]=sum1;
882:       tmp[row ++]=sum2;
883:       tmp[row ++]=sum3;
884:       break;
885: 
886:     case 4:
887:       sum1 = b[*r++];
888:       sum2 = b[*r++];
889:       sum3 = b[*r++];
890:       sum4 = b[*r++];
891:       v2   = aa + ai[row+1];
892:       v3   = aa + ai[row+2];
893:       v4   = aa + ai[row+3];
894: 
895:       for (j=0; j<nz-1; j+=2){
896:         i0   = vi[0];
897:         i1   = vi[1];
898:         vi  +=2;
899:         tmp0 = tmps[i0];
900:         tmp1 = tmps[i1];
901:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
902:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
903:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
904:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
905:       }
906:       if (j == nz-1){
907:         tmp0 = tmps[*vi++];
908:         sum1 -= *v1++ *tmp0;
909:         sum2 -= *v2++ *tmp0;
910:         sum3 -= *v3++ *tmp0;
911:         sum4 -= *v4++ *tmp0;
912:       }
913:       sum2 -= *v2++ * sum1;
914:       sum3 -= *v3++ * sum1;
915:       sum4 -= *v4++ * sum1;
916:       sum3 -= *v3++ * sum2;
917:       sum4 -= *v4++ * sum2;
918:       sum4 -= *v4++ * sum3;
919: 
920:       tmp[row ++]=sum1;
921:       tmp[row ++]=sum2;
922:       tmp[row ++]=sum3;
923:       tmp[row ++]=sum4;
924:       break;
925:     case 5:
926:       sum1 = b[*r++];
927:       sum2 = b[*r++];
928:       sum3 = b[*r++];
929:       sum4 = b[*r++];
930:       sum5 = b[*r++];
931:       v2   = aa + ai[row+1];
932:       v3   = aa + ai[row+2];
933:       v4   = aa + ai[row+3];
934:       v5   = aa + ai[row+4];
935: 
936:       for (j=0; j<nz-1; j+=2){
937:         i0   = vi[0];
938:         i1   = vi[1];
939:         vi  +=2;
940:         tmp0 = tmps[i0];
941:         tmp1 = tmps[i1];
942:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
943:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
944:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
945:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
946:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
947:       }
948:       if (j == nz-1){
949:         tmp0 = tmps[*vi++];
950:         sum1 -= *v1++ *tmp0;
951:         sum2 -= *v2++ *tmp0;
952:         sum3 -= *v3++ *tmp0;
953:         sum4 -= *v4++ *tmp0;
954:         sum5 -= *v5++ *tmp0;
955:       }

957:       sum2 -= *v2++ * sum1;
958:       sum3 -= *v3++ * sum1;
959:       sum4 -= *v4++ * sum1;
960:       sum5 -= *v5++ * sum1;
961:       sum3 -= *v3++ * sum2;
962:       sum4 -= *v4++ * sum2;
963:       sum5 -= *v5++ * sum2;
964:       sum4 -= *v4++ * sum3;
965:       sum5 -= *v5++ * sum3;
966:       sum5 -= *v5++ * sum4;
967: 
968:       tmp[row ++]=sum1;
969:       tmp[row ++]=sum2;
970:       tmp[row ++]=sum3;
971:       tmp[row ++]=sum4;
972:       tmp[row ++]=sum5;
973:       break;
974:     default:
975:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
976:     }
977:   }
978:   /* backward solve the upper triangular */
979:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
980:     nsz = ns[i];
981:     aii = ai[row+1] -1;
982:     v1  = aa + aii;
983:     vi  = aj + aii;
984:     nz  = aii- ad[row];
985:     switch (nsz){               /* Each loop in 'case' is unrolled */
986:     case 1 :
987:       sum1 = tmp[row];

989:       for(j=nz ; j>1; j-=2){
990:         vi  -=2;
991:         i0   = vi[2];
992:         i1   = vi[1];
993:         tmp0 = tmps[i0];
994:         tmp1 = tmps[i1];
995:         v1   -= 2;
996:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997:       }
998:       if (j==1){
999:         tmp0  = tmps[*vi--];
1000:         sum1 -= *v1-- * tmp0;
1001:       }
1002:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1003:       break;
1004:     case 2 :
1005:       sum1 = tmp[row];
1006:       sum2 = tmp[row -1];
1007:       v2   = aa + ai[row]-1;
1008:       for (j=nz ; j>1; j-=2){
1009:         vi  -=2;
1010:         i0   = vi[2];
1011:         i1   = vi[1];
1012:         tmp0 = tmps[i0];
1013:         tmp1 = tmps[i1];
1014:         v1   -= 2;
1015:         v2   -= 2;
1016:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1017:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1018:       }
1019:       if (j==1){
1020:         tmp0  = tmps[*vi--];
1021:         sum1 -= *v1-- * tmp0;
1022:         sum2 -= *v2-- * tmp0;
1023:       }
1024: 
1025:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1026:       sum2   -= *v2-- * tmp0;
1027:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1028:       break;
1029:     case 3 :
1030:       sum1 = tmp[row];
1031:       sum2 = tmp[row -1];
1032:       sum3 = tmp[row -2];
1033:       v2   = aa + ai[row]-1;
1034:       v3   = aa + ai[row -1]-1;
1035:       for (j=nz ; j>1; j-=2){
1036:         vi  -=2;
1037:         i0   = vi[2];
1038:         i1   = vi[1];
1039:         tmp0 = tmps[i0];
1040:         tmp1 = tmps[i1];
1041:         v1   -= 2;
1042:         v2   -= 2;
1043:         v3   -= 2;
1044:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1045:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1046:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1047:       }
1048:       if (j==1){
1049:         tmp0  = tmps[*vi--];
1050:         sum1 -= *v1-- * tmp0;
1051:         sum2 -= *v2-- * tmp0;
1052:         sum3 -= *v3-- * tmp0;
1053:       }
1054:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1055:       sum2   -= *v2-- * tmp0;
1056:       sum3   -= *v3-- * tmp0;
1057:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1058:       sum3   -= *v3-- * tmp0;
1059:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1060: 
1061:       break;
1062:     case 4 :
1063:       sum1 = tmp[row];
1064:       sum2 = tmp[row -1];
1065:       sum3 = tmp[row -2];
1066:       sum4 = tmp[row -3];
1067:       v2   = aa + ai[row]-1;
1068:       v3   = aa + ai[row -1]-1;
1069:       v4   = aa + ai[row -2]-1;

1071:       for (j=nz ; j>1; j-=2){
1072:         vi  -=2;
1073:         i0   = vi[2];
1074:         i1   = vi[1];
1075:         tmp0 = tmps[i0];
1076:         tmp1 = tmps[i1];
1077:         v1  -= 2;
1078:         v2  -= 2;
1079:         v3  -= 2;
1080:         v4  -= 2;
1081:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1082:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1083:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1084:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1085:       }
1086:       if (j==1){
1087:         tmp0  = tmps[*vi--];
1088:         sum1 -= *v1-- * tmp0;
1089:         sum2 -= *v2-- * tmp0;
1090:         sum3 -= *v3-- * tmp0;
1091:         sum4 -= *v4-- * tmp0;
1092:       }

1094:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1095:       sum2   -= *v2-- * tmp0;
1096:       sum3   -= *v3-- * tmp0;
1097:       sum4   -= *v4-- * tmp0;
1098:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1099:       sum3   -= *v3-- * tmp0;
1100:       sum4   -= *v4-- * tmp0;
1101:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1102:       sum4   -= *v4-- * tmp0;
1103:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1104:       break;
1105:     case 5 :
1106:       sum1 = tmp[row];
1107:       sum2 = tmp[row -1];
1108:       sum3 = tmp[row -2];
1109:       sum4 = tmp[row -3];
1110:       sum5 = tmp[row -4];
1111:       v2   = aa + ai[row]-1;
1112:       v3   = aa + ai[row -1]-1;
1113:       v4   = aa + ai[row -2]-1;
1114:       v5   = aa + ai[row -3]-1;
1115:       for (j=nz ; j>1; j-=2){
1116:         vi  -= 2;
1117:         i0   = vi[2];
1118:         i1   = vi[1];
1119:         tmp0 = tmps[i0];
1120:         tmp1 = tmps[i1];
1121:         v1   -= 2;
1122:         v2   -= 2;
1123:         v3   -= 2;
1124:         v4   -= 2;
1125:         v5   -= 2;
1126:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1127:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1128:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1129:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1130:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1131:       }
1132:       if (j==1){
1133:         tmp0  = tmps[*vi--];
1134:         sum1 -= *v1-- * tmp0;
1135:         sum2 -= *v2-- * tmp0;
1136:         sum3 -= *v3-- * tmp0;
1137:         sum4 -= *v4-- * tmp0;
1138:         sum5 -= *v5-- * tmp0;
1139:       }

1141:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1142:       sum2   -= *v2-- * tmp0;
1143:       sum3   -= *v3-- * tmp0;
1144:       sum4   -= *v4-- * tmp0;
1145:       sum5   -= *v5-- * tmp0;
1146:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1147:       sum3   -= *v3-- * tmp0;
1148:       sum4   -= *v4-- * tmp0;
1149:       sum5   -= *v5-- * tmp0;
1150:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1151:       sum4   -= *v4-- * tmp0;
1152:       sum5   -= *v5-- * tmp0;
1153:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1154:       sum5   -= *v5-- * tmp0;
1155:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1156:       break;
1157:     default:
1158:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1159:     }
1160:   }
1161:   ISRestoreIndices(isrow,&rout);
1162:   ISRestoreIndices(iscol,&cout);
1163:   VecRestoreArray(bb,(PetscScalar**)&b);
1164:   VecRestoreArray(xx,&x);
1165:   PetscLogFlops(2*a->nz - A->cmap.n);
1166:   return(0);
1167: }

1171: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1172: {
1173:   Mat               C = *B;
1174:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1175:   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1176:   PetscErrorCode    ierr;
1177:   PetscInt          *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1178:   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1179:   PetscInt          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1180:   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1181:   PetscScalar       *pc1,*pc2,*pc3,mul1,mul2,mul3;
1182:   PetscScalar       tmp,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1183:   const PetscScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1184:   PetscReal         rs=0.0;
1185:   LUShift_Ctx       sctx;
1186:   PetscInt          newshift;

1189:   sctx.shift_top  = 0;
1190:   sctx.nshift_max = 0;
1191:   sctx.shift_lo   = 0;
1192:   sctx.shift_hi   = 0;

1194:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1195:   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1196:   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1197:     sctx.shift_top = 0;
1198:     for (i=0; i<n; i++) {
1199:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1200:       rs    = 0.0;
1201:       ajtmp = aj + ai[i];
1202:       rtmp1 = aa + ai[i];
1203:       nz = ai[i+1] - ai[i];
1204:       for (j=0; j<nz; j++){
1205:         if (*ajtmp != i){
1206:           rs += PetscAbsScalar(*rtmp1++);
1207:         } else {
1208:           rs -= PetscRealPart(*rtmp1++);
1209:         }
1210:         ajtmp++;
1211:       }
1212:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213:     }
1214:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1215:     sctx.shift_top *= 1.1;
1216:     sctx.nshift_max = 5;
1217:     sctx.shift_lo   = 0.;
1218:     sctx.shift_hi   = 1.;
1219:   }
1220:   sctx.shift_amount = 0;
1221:   sctx.nshift       = 0;

1223:   ISGetIndices(isrow,&r);
1224:   ISGetIndices(iscol,&c);
1225:   ISGetIndices(isicol,&ic);
1226:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1227:   PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1228:   ics   = ic ;
1229:   rtmp22 = rtmp11 + n;
1230:   rtmp33 = rtmp22 + n;
1231: 
1232:   node_max = a->inode.node_count;
1233:   ns       = a->inode.size ;
1234:   if (!ns){
1235:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1236:   }

1238:   /* If max inode size > 3, split it into two inodes.*/
1239:   /* also map the inode sizes according to the ordering */
1240:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1241:   for (i=0,j=0; i<node_max; ++i,++j){
1242:     if (ns[i]>3) {
1243:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1244:       ++j;
1245:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246:     } else {
1247:       tmp_vec1[j] = ns[i];
1248:     }
1249:   }
1250:   /* Use the correct node_max */
1251:   node_max = j;

1253:   /* Now reorder the inode info based on mat re-ordering info */
1254:   /* First create a row -> inode_size_array_index map */
1255:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1256:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1257:   for (i=0,row=0; i<node_max; i++) {
1258:     nodesz = tmp_vec1[i];
1259:     for (j=0; j<nodesz; j++,row++) {
1260:       nsmap[row] = i;
1261:     }
1262:   }
1263:   /* Using nsmap, create a reordered ns structure */
1264:   for (i=0,j=0; i< node_max; i++) {
1265:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1266:     tmp_vec2[i]  = nodesz;
1267:     j           += nodesz;
1268:   }
1269:   PetscFree(nsmap);
1270:   PetscFree(tmp_vec1);
1271:   /* Now use the correct ns */
1272:   ns = tmp_vec2;

1274:   do {
1275:     sctx.lushift = PETSC_FALSE;
1276:     /* Now loop over each block-row, and do the factorization */
1277:     for (i=0,row=0; i<node_max; i++) {
1278:       nodesz = ns[i];
1279:       nz     = bi[row+1] - bi[row];
1280:       bjtmp  = bj + bi[row];

1282:       switch (nodesz){
1283:       case 1:
1284:         for  (j=0; j<nz; j++){
1285:           idx        = bjtmp[j];
1286:           rtmp11[idx] = 0.0;
1287:         }
1288: 
1289:         /* load in initial (unfactored row) */
1290:         idx    = r[row];
1291:         nz_tmp = ai[idx+1] - ai[idx];
1292:         ajtmp  = aj + ai[idx];
1293:         v1     = aa + ai[idx];

1295:         for (j=0; j<nz_tmp; j++) {
1296:           idx        = ics[ajtmp[j]];
1297:           rtmp11[idx] = v1[j];
1298:         }
1299:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1301:         prow = *bjtmp++ ;
1302:         while (prow < row) {
1303:           pc1 = rtmp11 + prow;
1304:           if (*pc1 != 0.0){
1305:             pv   = ba + bd[prow];
1306:             pj   = nbj + bd[prow];
1307:             mul1 = *pc1 * *pv++;
1308:             *pc1 = mul1;
1309:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1310:             PetscLogFlops(2*nz_tmp);
1311:             for (j=0; j<nz_tmp; j++) {
1312:               tmp = pv[j];
1313:               idx = pj[j];
1314:               rtmp11[idx] -= mul1 * tmp;
1315:             }
1316:           }
1317:           prow = *bjtmp++ ;
1318:         }
1319:         pj  = bj + bi[row];
1320:         pc1 = ba + bi[row];

1322:         sctx.pv    = rtmp11[row];
1323:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1324:         rs         = 0.0;
1325:         for (j=0; j<nz; j++) {
1326:           idx    = pj[j];
1327:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1328:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1329:         }
1330:         sctx.rs  = rs;
1331:         MatLUCheckShift_inline(info,sctx,row,newshift);
1332:         if (newshift == 1) goto endofwhile;
1333:         break;
1334: 
1335:       case 2:
1336:         for (j=0; j<nz; j++) {
1337:           idx        = bjtmp[j];
1338:           rtmp11[idx] = 0.0;
1339:           rtmp22[idx] = 0.0;
1340:         }
1341: 
1342:         /* load in initial (unfactored row) */
1343:         idx    = r[row];
1344:         nz_tmp = ai[idx+1] - ai[idx];
1345:         ajtmp  = aj + ai[idx];
1346:         v1     = aa + ai[idx];
1347:         v2     = aa + ai[idx+1];
1348:         for (j=0; j<nz_tmp; j++) {
1349:           idx        = ics[ajtmp[j]];
1350:           rtmp11[idx] = v1[j];
1351:           rtmp22[idx] = v2[j];
1352:         }
1353:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1354:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

1356:         prow = *bjtmp++ ;
1357:         while (prow < row) {
1358:           pc1 = rtmp11 + prow;
1359:           pc2 = rtmp22 + prow;
1360:           if (*pc1 != 0.0 || *pc2 != 0.0){
1361:             pv   = ba + bd[prow];
1362:             pj   = nbj + bd[prow];
1363:             mul1 = *pc1 * *pv;
1364:             mul2 = *pc2 * *pv;
1365:             ++pv;
1366:             *pc1 = mul1;
1367:             *pc2 = mul2;
1368: 
1369:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1370:             for (j=0; j<nz_tmp; j++) {
1371:               tmp = pv[j];
1372:               idx = pj[j];
1373:               rtmp11[idx] -= mul1 * tmp;
1374:               rtmp22[idx] -= mul2 * tmp;
1375:             }
1376:             PetscLogFlops(4*nz_tmp);
1377:           }
1378:           prow = *bjtmp++ ;
1379:         }

1381:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1382:         pc1 = rtmp11 + prow;
1383:         pc2 = rtmp22 + prow;

1385:         sctx.pv = *pc1;
1386:         pj      = bj + bi[prow];
1387:         rs      = 0.0;
1388:         for (j=0; j<nz; j++){
1389:           idx = pj[j];
1390:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1391:         }
1392:         sctx.rs = rs;
1393:         MatLUCheckShift_inline(info,sctx,row,newshift);
1394:         if (newshift == 1) goto endofwhile;

1396:         if (*pc2 != 0.0){
1397:           pj     = nbj + bd[prow];
1398:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1399:           *pc2   = mul2;
1400:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1401:           for (j=0; j<nz_tmp; j++) {
1402:             idx = pj[j] ;
1403:             tmp = rtmp11[idx];
1404:             rtmp22[idx] -= mul2 * tmp;
1405:           }
1406:           PetscLogFlops(2*nz_tmp);
1407:         }
1408: 
1409:         pj  = bj + bi[row];
1410:         pc1 = ba + bi[row];
1411:         pc2 = ba + bi[row+1];

1413:         sctx.pv = rtmp22[row+1];
1414:         rs = 0.0;
1415:         rtmp11[row]   = 1.0/rtmp11[row];
1416:         rtmp22[row+1] = 1.0/rtmp22[row+1];
1417:         /* copy row entries from dense representation to sparse */
1418:         for (j=0; j<nz; j++) {
1419:           idx    = pj[j];
1420:           pc1[j] = rtmp11[idx];
1421:           pc2[j] = rtmp22[idx];
1422:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1423:         }
1424:         sctx.rs = rs;
1425:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1426:         if (newshift == 1) goto endofwhile;
1427:         break;

1429:       case 3:
1430:         for  (j=0; j<nz; j++) {
1431:           idx        = bjtmp[j];
1432:           rtmp11[idx] = 0.0;
1433:           rtmp22[idx] = 0.0;
1434:           rtmp33[idx] = 0.0;
1435:         }
1436:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1437:         idx    = r[row];
1438:         nz_tmp = ai[idx+1] - ai[idx];
1439:         ajtmp = aj + ai[idx];
1440:         v1    = aa + ai[idx];
1441:         v2    = aa + ai[idx+1];
1442:         v3    = aa + ai[idx+2];
1443:         for (j=0; j<nz_tmp; j++) {
1444:           idx        = ics[ajtmp[j]];
1445:           rtmp11[idx] = v1[j];
1446:           rtmp22[idx] = v2[j];
1447:           rtmp33[idx] = v3[j];
1448:         }
1449:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1450:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1451:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

1453:         /* loop over all pivot row blocks above this row block */
1454:         prow = *bjtmp++ ;
1455:         while (prow < row) {
1456:           pc1 = rtmp11 + prow;
1457:           pc2 = rtmp22 + prow;
1458:           pc3 = rtmp33 + prow;
1459:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1460:             pv   = ba  + bd[prow];
1461:             pj   = nbj + bd[prow];
1462:             mul1 = *pc1 * *pv;
1463:             mul2 = *pc2 * *pv;
1464:             mul3 = *pc3 * *pv;
1465:             ++pv;
1466:             *pc1 = mul1;
1467:             *pc2 = mul2;
1468:             *pc3 = mul3;
1469: 
1470:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1471:             /* update this row based on pivot row */
1472:             for (j=0; j<nz_tmp; j++) {
1473:               tmp = pv[j];
1474:               idx = pj[j];
1475:               rtmp11[idx] -= mul1 * tmp;
1476:               rtmp22[idx] -= mul2 * tmp;
1477:               rtmp33[idx] -= mul3 * tmp;
1478:             }
1479:             PetscLogFlops(6*nz_tmp);
1480:           }
1481:           prow = *bjtmp++ ;
1482:         }

1484:         /* Now take care of diagonal 3x3 block in this set of rows */
1485:         /* note: prow = row here */
1486:         pc1 = rtmp11 + prow;
1487:         pc2 = rtmp22 + prow;
1488:         pc3 = rtmp33 + prow;

1490:         sctx.pv = *pc1;
1491:         pj      = bj + bi[prow];
1492:         rs      = 0.0;
1493:         for (j=0; j<nz; j++){
1494:           idx = pj[j];
1495:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1496:         }
1497:         sctx.rs = rs;
1498:         MatLUCheckShift_inline(info,sctx,row,newshift);
1499:         if (newshift == 1) goto endofwhile;

1501:         if (*pc2 != 0.0 || *pc3 != 0.0){
1502:           mul2 = (*pc2)/(*pc1);
1503:           mul3 = (*pc3)/(*pc1);
1504:           *pc2 = mul2;
1505:           *pc3 = mul3;
1506:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1507:           pj     = nbj + bd[prow];
1508:           for (j=0; j<nz_tmp; j++) {
1509:             idx = pj[j] ;
1510:             tmp = rtmp11[idx];
1511:             rtmp22[idx] -= mul2 * tmp;
1512:             rtmp33[idx] -= mul3 * tmp;
1513:           }
1514:           PetscLogFlops(4*nz_tmp);
1515:         }
1516:         ++prow;

1518:         pc2 = rtmp22 + prow;
1519:         pc3 = rtmp33 + prow;
1520:         sctx.pv = *pc2;
1521:         pj      = bj + bi[prow];
1522:         rs      = 0.0;
1523:         for (j=0; j<nz; j++){
1524:           idx = pj[j];
1525:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1526:         }
1527:         sctx.rs = rs;
1528:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1529:         if (newshift == 1) goto endofwhile;

1531:         if (*pc3 != 0.0){
1532:           mul3   = (*pc3)/(*pc2);
1533:           *pc3   = mul3;
1534:           pj     = nbj + bd[prow];
1535:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1536:           for (j=0; j<nz_tmp; j++) {
1537:             idx = pj[j] ;
1538:             tmp = rtmp22[idx];
1539:             rtmp33[idx] -= mul3 * tmp;
1540:           }
1541:           PetscLogFlops(4*nz_tmp);
1542:         }

1544:         pj  = bj + bi[row];
1545:         pc1 = ba + bi[row];
1546:         pc2 = ba + bi[row+1];
1547:         pc3 = ba + bi[row+2];

1549:         sctx.pv = rtmp33[row+2];
1550:         rs = 0.0;
1551:         rtmp11[row]   = 1.0/rtmp11[row];
1552:         rtmp22[row+1] = 1.0/rtmp22[row+1];
1553:         rtmp33[row+2] = 1.0/rtmp33[row+2];
1554:         /* copy row entries from dense representation to sparse */
1555:         for (j=0; j<nz; j++) {
1556:           idx    = pj[j];
1557:           pc1[j] = rtmp11[idx];
1558:           pc2[j] = rtmp22[idx];
1559:           pc3[j] = rtmp33[idx];
1560:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1561:         }

1563:         sctx.rs = rs;
1564:         MatLUCheckShift_inline(info,sctx,row+2,newshift);
1565:         if (newshift == 1) goto endofwhile;
1566:         break;

1568:       default:
1569:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1570:       }
1571:       row += nodesz;                 /* Update the row */
1572:     }
1573:     endofwhile:;
1574:   } while (sctx.lushift);
1575:   PetscFree(rtmp11);
1576:   PetscFree(tmp_vec2);
1577:   ISRestoreIndices(isicol,&ic);
1578:   ISRestoreIndices(isrow,&r);
1579:   ISRestoreIndices(iscol,&c);
1580:   C->factor      = FACTOR_LU;
1581:   C->assembled   = PETSC_TRUE;
1582:   if (sctx.nshift) {
1583:     if (info->shiftnz) {
1584:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1585:     } else if (info->shiftpd) {
1586:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1587:     }
1588:   }
1589:   PetscLogFlops(C->cmap.n);
1590:   return(0);
1591: }

1593: /*
1594:      Makes a longer coloring[] array and calls the usual code with that
1595: */
1598: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1599: {
1600:   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1601:   PetscErrorCode  ierr;
1602:   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1603:   PetscInt        *colorused,i;
1604:   ISColoringValue *newcolor;

1607:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1608:   /* loop over inodes, marking a color for each column*/
1609:   row = 0;
1610:   for (i=0; i<m; i++){
1611:     for (j=0; j<ns[i]; j++) {
1612:       newcolor[row++] = coloring[i] + j*ncolors;
1613:     }
1614:   }

1616:   /* eliminate unneeded colors */
1617:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1618:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1619:   for (i=0; i<n; i++) {
1620:     colorused[newcolor[i]] = 1;
1621:   }

1623:   for (i=1; i<5*ncolors; i++) {
1624:     colorused[i] += colorused[i-1];
1625:   }
1626:   ncolors = colorused[5*ncolors-1];
1627:   for (i=0; i<n; i++) {
1628:     newcolor[i] = colorused[newcolor[i]]-1;
1629:   }
1630:   PetscFree(colorused);
1631:   ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
1632:   PetscFree(coloring);
1633:   return(0);
1634: }

1636:  #include src/inline/ilu.h

1640: PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1641: {
1642:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1643:   PetscScalar        *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1644:   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1645:   const PetscScalar  *v = a->a,*v1,*v2,*v3,*v4,*v5;
1646:   PetscReal          zeropivot = 1.0e-15;
1647:   PetscErrorCode     ierr;
1648:   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1649:   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;

1652:   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1653:   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1654:   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");

1656:   if (!a->inode.ibdiagvalid) {
1657:     if (!a->inode.ibdiag) {
1658:       /* calculate space needed for diagonal blocks */
1659:       for (i=0; i<m; i++) {
1660:         cnt += sizes[i]*sizes[i];
1661:       }
1662:       a->inode.bdiagsize = cnt;
1663:       PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);
1664:     }

1666:     /* copy over the diagonal blocks and invert them */
1667:     ibdiag = a->inode.ibdiag;
1668:     bdiag  = a->inode.bdiag;
1669:     cnt = 0;
1670:     for (i=0, row = 0; i<m; i++) {
1671:       for (j=0; j<sizes[i]; j++) {
1672:         for (k=0; k<sizes[i]; k++) {
1673:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1674:         }
1675:       }
1676:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));
1677: 
1678:       switch(sizes[i]) {
1679:         case 1:
1680:           /* Create matrix data structure */
1681:           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1682:           ibdiag[cnt] = 1.0/ibdiag[cnt];
1683:           break;
1684:         case 2:
1685:           Kernel_A_gets_inverse_A_2(ibdiag+cnt);
1686:           break;
1687:         case 3:
1688:           Kernel_A_gets_inverse_A_3(ibdiag+cnt);
1689:           break;
1690:         case 4:
1691:           Kernel_A_gets_inverse_A_4(ibdiag+cnt);
1692:           break;
1693:         case 5:
1694:           Kernel_A_gets_inverse_A_5(ibdiag+cnt);
1695:           break;
1696:        default:
1697:          SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1698:       }
1699:       cnt += sizes[i]*sizes[i];
1700:       row += sizes[i];
1701:     }
1702:     a->inode.ibdiagvalid = PETSC_TRUE;
1703:   }
1704:   ibdiag = a->inode.ibdiag;
1705:   bdiag  = a->inode.bdiag;

1707:   VecGetArray(xx,&x);
1708:   if (xx != bb) {
1709:     VecGetArray(bb,(PetscScalar**)&b);
1710:   } else {
1711:     b = x;
1712:   }

1714:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1715:   xs   = x;
1716:   if (flag & SOR_ZERO_INITIAL_GUESS) {
1717:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){

1719:       for (i=0, row=0; i<m; i++) {
1720:         sz  = diag[row] - ii[row];
1721:         v1  = a->a + ii[row];
1722:         idx = a->j + ii[row];

1724:         /* see comments for MatMult_Inode() for how this is coded */
1725:         switch (sizes[i]){
1726:           case 1:
1727: 
1728:             sum1  = b[row];
1729:             for(n = 0; n<sz-1; n+=2) {
1730:               i1   = idx[0];
1731:               i2   = idx[1];
1732:               idx += 2;
1733:               tmp0 = x[i1];
1734:               tmp1 = x[i2];
1735:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1736:             }
1737: 
1738:             if (n == sz-1){
1739:               tmp0  = x[*idx];
1740:               sum1 -= *v1 * tmp0;
1741:             }
1742:             x[row++] = sum1*(*ibdiag++);
1743:             break;
1744:           case 2:
1745:             v2    = a->a + ii[row+1];
1746:             sum1  = b[row];
1747:             sum2  = b[row+1];
1748:             for(n = 0; n<sz-1; n+=2) {
1749:               i1   = idx[0];
1750:               i2   = idx[1];
1751:               idx += 2;
1752:               tmp0 = x[i1];
1753:               tmp1 = x[i2];
1754:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1755:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1756:             }
1757: 
1758:             if (n == sz-1){
1759:               tmp0  = x[*idx];
1760:               sum1 -= v1[0] * tmp0;
1761:               sum2 -= v2[0] * tmp0;
1762:             }
1763:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1764:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1765:             ibdiag  += 4;
1766:             break;
1767:           case 3:
1768:             v2    = a->a + ii[row+1];
1769:             v3    = a->a + ii[row+2];
1770:             sum1  = b[row];
1771:             sum2  = b[row+1];
1772:             sum3  = b[row+2];
1773:             for(n = 0; n<sz-1; n+=2) {
1774:               i1   = idx[0];
1775:               i2   = idx[1];
1776:               idx += 2;
1777:               tmp0 = x[i1];
1778:               tmp1 = x[i2];
1779:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1780:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1781:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1782:             }
1783: 
1784:             if (n == sz-1){
1785:               tmp0  = x[*idx];
1786:               sum1 -= v1[0] * tmp0;
1787:               sum2 -= v2[0] * tmp0;
1788:               sum3 -= v3[0] * tmp0;
1789:             }
1790:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1791:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1792:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1793:             ibdiag  += 9;
1794:             break;
1795:           case 4:
1796:             v2    = a->a + ii[row+1];
1797:             v3    = a->a + ii[row+2];
1798:             v4    = a->a + ii[row+3];
1799:             sum1  = b[row];
1800:             sum2  = b[row+1];
1801:             sum3  = b[row+2];
1802:             sum4  = b[row+3];
1803:             for(n = 0; n<sz-1; n+=2) {
1804:               i1   = idx[0];
1805:               i2   = idx[1];
1806:               idx += 2;
1807:               tmp0 = x[i1];
1808:               tmp1 = x[i2];
1809:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1810:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1811:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1812:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1813:             }
1814: 
1815:             if (n == sz-1){
1816:               tmp0  = x[*idx];
1817:               sum1 -= v1[0] * tmp0;
1818:               sum2 -= v2[0] * tmp0;
1819:               sum3 -= v3[0] * tmp0;
1820:               sum4 -= v4[0] * tmp0;
1821:             }
1822:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1823:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1824:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1825:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1826:             ibdiag  += 16;
1827:             break;
1828:           case 5:
1829:             v2    = a->a + ii[row+1];
1830:             v3    = a->a + ii[row+2];
1831:             v4    = a->a + ii[row+3];
1832:             v5    = a->a + ii[row+4];
1833:             sum1  = b[row];
1834:             sum2  = b[row+1];
1835:             sum3  = b[row+2];
1836:             sum4  = b[row+3];
1837:             sum5  = b[row+4];
1838:             for(n = 0; n<sz-1; n+=2) {
1839:               i1   = idx[0];
1840:               i2   = idx[1];
1841:               idx += 2;
1842:               tmp0 = x[i1];
1843:               tmp1 = x[i2];
1844:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1845:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1846:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1847:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1848:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1849:             }
1850: 
1851:             if (n == sz-1){
1852:               tmp0  = x[*idx];
1853:               sum1 -= v1[0] * tmp0;
1854:               sum2 -= v2[0] * tmp0;
1855:               sum3 -= v3[0] * tmp0;
1856:               sum4 -= v4[0] * tmp0;
1857:               sum5 -= v5[0] * tmp0;
1858:             }
1859:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1860:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1861:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1862:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1863:             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1864:             ibdiag  += 25;
1865:             break;
1866:           default:
1867:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1868:         }
1869:       }

1871:       xb = x;
1872:       PetscLogFlops(a->nz);
1873:     } else xb = b;
1874:     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1875:         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1876:       cnt = 0;
1877:       for (i=0, row=0; i<m; i++) {

1879:         switch (sizes[i]){
1880:           case 1:
1881:             x[row++] *= bdiag[cnt++];
1882:             break;
1883:           case 2:
1884:             x1   = x[row]; x2 = x[row+1];
1885:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1886:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1887:             x[row++] = tmp1;
1888:             x[row++] = tmp2;
1889:             cnt += 4;
1890:             break;
1891:           case 3:
1892:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1893:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1894:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1895:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1896:             x[row++] = tmp1;
1897:             x[row++] = tmp2;
1898:             x[row++] = tmp3;
1899:             cnt += 9;
1900:             break;
1901:           case 4:
1902:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1903:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1904:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1905:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1906:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1907:             x[row++] = tmp1;
1908:             x[row++] = tmp2;
1909:             x[row++] = tmp3;
1910:             x[row++] = tmp4;
1911:             cnt += 16;
1912:             break;
1913:           case 5:
1914:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1915:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1916:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1917:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1918:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1919:             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1920:             x[row++] = tmp1;
1921:             x[row++] = tmp2;
1922:             x[row++] = tmp3;
1923:             x[row++] = tmp4;
1924:             x[row++] = tmp5;
1925:             cnt += 25;
1926:             break;
1927:           default:
1928:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1929:         }
1930:       }
1931:       PetscLogFlops(m);
1932:     }
1933:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){

1935:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1936:       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1937:         ibdiag -= sizes[i]*sizes[i];
1938:         sz      = ii[row+1] - diag[row] - 1;
1939:         v1      = a->a + diag[row] + 1;
1940:         idx     = a->j + diag[row] + 1;

1942:         /* see comments for MatMult_Inode() for how this is coded */
1943:         switch (sizes[i]){
1944:           case 1:
1945: 
1946:             sum1  = xb[row];
1947:             for(n = 0; n<sz-1; n+=2) {
1948:               i1   = idx[0];
1949:               i2   = idx[1];
1950:               idx += 2;
1951:               tmp0 = x[i1];
1952:               tmp1 = x[i2];
1953:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1954:             }
1955: 
1956:             if (n == sz-1){
1957:               tmp0  = x[*idx];
1958:               sum1 -= *v1*tmp0;
1959:             }
1960:             x[row--] = sum1*(*ibdiag);
1961:             break;

1963:           case 2:
1964: 
1965:             sum1  = xb[row];
1966:             sum2  = xb[row-1];
1967:             /* note that sum1 is associated with the second of the two rows */
1968:             v2    = a->a + diag[row-1] + 2;
1969:             for(n = 0; n<sz-1; n+=2) {
1970:               i1   = idx[0];
1971:               i2   = idx[1];
1972:               idx += 2;
1973:               tmp0 = x[i1];
1974:               tmp1 = x[i2];
1975:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1976:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1977:             }
1978: 
1979:             if (n == sz-1){
1980:               tmp0  = x[*idx];
1981:               sum1 -= *v1*tmp0;
1982:               sum2 -= *v2*tmp0;
1983:             }
1984:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1985:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1986:             break;
1987:           case 3:
1988: 
1989:             sum1  = xb[row];
1990:             sum2  = xb[row-1];
1991:             sum3  = xb[row-2];
1992:             v2    = a->a + diag[row-1] + 2;
1993:             v3    = a->a + diag[row-2] + 3;
1994:             for(n = 0; n<sz-1; n+=2) {
1995:               i1   = idx[0];
1996:               i2   = idx[1];
1997:               idx += 2;
1998:               tmp0 = x[i1];
1999:               tmp1 = x[i2];
2000:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2001:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2002:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2003:             }
2004: 
2005:             if (n == sz-1){
2006:               tmp0  = x[*idx];
2007:               sum1 -= *v1*tmp0;
2008:               sum2 -= *v2*tmp0;
2009:               sum3 -= *v3*tmp0;
2010:             }
2011:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2012:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2013:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2014:             break;
2015:           case 4:
2016: 
2017:             sum1  = xb[row];
2018:             sum2  = xb[row-1];
2019:             sum3  = xb[row-2];
2020:             sum4  = xb[row-3];
2021:             v2    = a->a + diag[row-1] + 2;
2022:             v3    = a->a + diag[row-2] + 3;
2023:             v4    = a->a + diag[row-3] + 4;
2024:             for(n = 0; n<sz-1; n+=2) {
2025:               i1   = idx[0];
2026:               i2   = idx[1];
2027:               idx += 2;
2028:               tmp0 = x[i1];
2029:               tmp1 = x[i2];
2030:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2031:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2032:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2033:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2034:             }
2035: 
2036:             if (n == sz-1){
2037:               tmp0  = x[*idx];
2038:               sum1 -= *v1*tmp0;
2039:               sum2 -= *v2*tmp0;
2040:               sum3 -= *v3*tmp0;
2041:               sum4 -= *v4*tmp0;
2042:             }
2043:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2044:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2045:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2046:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2047:             break;
2048:           case 5:
2049: 
2050:             sum1  = xb[row];
2051:             sum2  = xb[row-1];
2052:             sum3  = xb[row-2];
2053:             sum4  = xb[row-3];
2054:             sum5  = xb[row-4];
2055:             v2    = a->a + diag[row-1] + 2;
2056:             v3    = a->a + diag[row-2] + 3;
2057:             v4    = a->a + diag[row-3] + 4;
2058:             v5    = a->a + diag[row-4] + 5;
2059:             for(n = 0; n<sz-1; n+=2) {
2060:               i1   = idx[0];
2061:               i2   = idx[1];
2062:               idx += 2;
2063:               tmp0 = x[i1];
2064:               tmp1 = x[i2];
2065:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2066:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2067:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2068:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2069:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2070:             }
2071: 
2072:             if (n == sz-1){
2073:               tmp0  = x[*idx];
2074:               sum1 -= *v1*tmp0;
2075:               sum2 -= *v2*tmp0;
2076:               sum3 -= *v3*tmp0;
2077:               sum4 -= *v4*tmp0;
2078:               sum5 -= *v5*tmp0;
2079:             }
2080:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2081:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2082:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2083:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2084:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2085:             break;
2086:           default:
2087:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2088:         }
2089:       }

2091:       PetscLogFlops(a->nz);
2092:     }
2093:     its--;
2094:   }
2095:   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2096:   VecRestoreArray(xx,&x);
2097:   if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
2098:   return(0);
2099: }


2102: /*
2103:     samestructure indicates that the matrix has not changed its nonzero structure so we 
2104:     do not need to recompute the inodes 
2105: */
2108: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2109: {
2110:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2112:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2113:   PetscTruth     flag;

2116:   if (!a->inode.use)                     return(0);
2117:   if (a->inode.checked && samestructure) return(0);


2120:   m = A->rmap.n;
2121:   if (a->inode.size) {ns = a->inode.size;}
2122:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

2124:   i          = 0;
2125:   node_count = 0;
2126:   idx        = a->j;
2127:   ii         = a->i;
2128:   while (i < m){                /* For each row */
2129:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2130:     /* Limits the number of elements in a node to 'a->inode.limit' */
2131:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2132:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2133:       if (nzy != nzx) break;
2134:       idy  += nzx;             /* Same nonzero pattern */
2135:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
2136:       if (!flag) break;
2137:     }
2138:     ns[node_count++] = blk_size;
2139:     idx += blk_size*nzx;
2140:     i    = j;
2141:   }
2142:   /* If not enough inodes found,, do not use inode version of the routines */
2143:   if (!a->inode.size && m && node_count > .9*m) {
2144:     PetscFree(ns);
2145:     a->inode.node_count     = 0;
2146:     a->inode.size           = PETSC_NULL;
2147:     a->inode.use            = PETSC_FALSE;
2148:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
2149:   } else {
2150:     A->ops->mult            = MatMult_Inode;
2151:     A->ops->relax           = MatRelax_Inode;
2152:     A->ops->multadd         = MatMultAdd_Inode;
2153:     A->ops->solve           = MatSolve_Inode;
2154:     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2155:     A->ops->getrowij        = MatGetRowIJ_Inode;
2156:     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2157:     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2158:     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2159:     A->ops->coloringpatch   = MatColoringPatch_Inode;
2160:     a->inode.node_count     = node_count;
2161:     a->inode.size           = ns;
2162:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
2163:   }
2164:   return(0);
2165: }

2167: /*
2168:      This is really ugly. if inodes are used this replaces the 
2169:   permutations with ones that correspond to rows/cols of the matrix
2170:   rather then inode blocks
2171: */
2174: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2175: {
2176:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

2179:   PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
2180:   if (f) {
2181:     (*f)(A,rperm,cperm);
2182:   }
2183:   return(0);
2184: }

2189: PetscErrorCode  MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2190: {
2191:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2193:   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2194:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2195:   PetscInt       nslim_col,*ns_col;
2196:   IS             ris = *rperm,cis = *cperm;

2199:   if (!a->inode.size) return(0); /* no inodes so return */
2200:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

2202:   Mat_CreateColInode(A,&nslim_col,&ns_col);
2203:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
2204:   PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
2205:   permc = permr + m;

2207:   ISGetIndices(ris,&ridx);
2208:   ISGetIndices(cis,&cidx);

2210:   /* Form the inode structure for the rows of permuted matric using inv perm*/
2211:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

2213:   /* Construct the permutations for rows*/
2214:   for (i=0,row = 0; i<nslim_row; ++i){
2215:     indx      = ridx[i];
2216:     start_val = tns[indx];
2217:     end_val   = tns[indx + 1];
2218:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2219:   }

2221:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2222:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

2224:  /* Construct permutations for columns */
2225:   for (i=0,col=0; i<nslim_col; ++i){
2226:     indx      = cidx[i];
2227:     start_val = tns[indx];
2228:     end_val   = tns[indx + 1];
2229:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2230:   }

2232:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
2233:   ISSetPermutation(*rperm);
2234:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
2235:   ISSetPermutation(*cperm);
2236: 
2237:   ISRestoreIndices(ris,&ridx);
2238:   ISRestoreIndices(cis,&cidx);

2240:   PetscFree(ns_col);
2241:   PetscFree(permr);
2242:   ISDestroy(cis);
2243:   ISDestroy(ris);
2244:   PetscFree(tns);
2245:   return(0);
2246: }

2251: /*@C
2252:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

2254:    Collective on Mat

2256:    Input Parameter:
2257: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

2259:    Output Parameter:
2260: +  node_count - no of inodes present in the matrix.
2261: .  sizes      - an array of size node_count,with sizes of each inode.
2262: -  limit      - the max size used to generate the inodes.

2264:    Level: advanced

2266:    Notes: This routine returns some internal storage information
2267:    of the matrix, it is intended to be used by advanced users.
2268:    It should be called after the matrix is assembled.
2269:    The contents of the sizes[] array should not be changed.
2270:    PETSC_NULL may be passed for information not requested.

2272: .keywords: matrix, seqaij, get, inode

2274: .seealso: MatGetInfo()
2275: @*/
2276: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2277: {
2278:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

2281:   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2282:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
2283:   if (f) {
2284:     (*f)(A,node_count,sizes,limit);
2285:   }
2286:   return(0);
2287: }

2292: PetscErrorCode  MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2293: {
2294:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

2297:   if (node_count) *node_count = a->inode.node_count;
2298:   if (sizes)      *sizes      = a->inode.size;
2299:   if (limit)      *limit      = a->inode.limit;
2300:   return(0);
2301: }