Actual source code: inode.c

petsc-3.6.0 2015-06-09
Report Typos and Errors
  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

 10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 14:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 17:   n      = A->cmap->n;
 18:   m      = A->rmap->n;
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 24:     for (; count+1 < n; count++,i++) ;
 25:     if (count < n)  {
 26:       i++;
 27:     }
 28:     *size = i;
 29:     return(0);
 30:   }
 31:   PetscMalloc1(n+1,&ns_col);

 33:   /* Use the same row structure wherever feasible. */
 34:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 35:     ns_col[i] = ns_row[i];
 36:   }

 38:   /* if m < n; pad up the remainder with inode_limit */
 39:   for (; count+1 < n; count++,i++) {
 40:     ns_col[i] = 1;
 41:   }
 42:   /* The last node is the odd ball. padd it up with the remaining rows; */
 43:   if (count < n) {
 44:     ns_col[i] = n - count;
 45:     i++;
 46:   } else if (count > n) {
 47:     /* Adjust for the over estimation */
 48:     ns_col[i-1] += n - count;
 49:   }
 50:   *size = i;
 51:   *ns   = ns_col;
 52:   return(0);
 53: }


 56: /*
 57:       This builds symmetric version of nonzero structure,
 58: */
 61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 62: {
 63:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 65:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 66:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 67:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap->n;
 72:   n         = A->cmap->n;
 73:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc1(nslim_col+1,&tns);
 81:   PetscMalloc1(n+1,&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 87:   }
 88:   /* allocate space for row pointers */
 89:   PetscMalloc1(nslim_row+1,&ia);
 90:   *iia = ia;
 91:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 92:   PetscMalloc1(nslim_row+1,&work);

 94:   /* determine the number of columns in each row */
 95:   ia[0] = oshift;
 96:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 98:     j    = aj + ai[row] + ishift;
 99:     jmax = aj + ai[row+1] + ishift;
100:     col  = *j++ + ishift;
101:     i2   = tvc[col];
102:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103:       ia[i1+1]++;
104:       ia[i2+1]++;
105:       i2++;                     /* Start col of next node */
106:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107:       i2 = tvc[col];
108:     }
109:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110:   }

112:   /* shift ia[i] to point to next row */
113:   for (i1=1; i1<nslim_row+1; i1++) {
114:     row        = ia[i1-1];
115:     ia[i1]    += row;
116:     work[i1-1] = row - oshift;
117:   }

119:   /* allocate space for column pointers */
120:   nz   = ia[nslim_row] + (!ishift);
121:   PetscMalloc1(nz,&ja);
122:   *jja = ja;

124:   /* loop over lower triangular part putting into ja */
125:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126:     j    = aj + ai[row] + ishift;
127:     jmax = aj + ai[row+1] + ishift;
128:     col  = *j++ + ishift;
129:     i2   = tvc[col];
130:     while (i2<i1 && j<jmax) {
131:       ja[work[i2]++] = i1 + oshift;
132:       ja[work[i1]++] = i2 + oshift;
133:       ++i2;
134:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135:       i2 = tvc[col];
136:     }
137:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

139:   }
140:   PetscFree(work);
141:   PetscFree(tns);
142:   PetscFree(tvc);
143:   return(0);
144: }

146: /*
147:       This builds nonsymmetric version of nonzero structure,
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
155:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156:   PetscInt       *tns,*tvc,nsz,i1,i2;
157:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

160:   nslim_row = a->inode.node_count;
161:   n         = A->cmap->n;

163:   /* Create The column_inode for this matrix */
164:   Mat_CreateColInode(A,&nslim_col,&ns_col);

166:   /* allocate space for reformated column_inode structure */
167:   PetscMalloc1(nslim_col +1,&tns);
168:   PetscMalloc1(n + 1,&tvc);
169:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

171:   for (i1=0,col=0; i1<nslim_col; ++i1) {
172:     nsz = ns_col[i1];
173:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174:   }
175:   /* allocate space for row pointers */
176:   PetscMalloc1(nslim_row+1,&ia);
177:   *iia = ia;
178:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179:   PetscMalloc1(nslim_row+1,&work);

181:   /* determine the number of columns in each row */
182:   ia[0] = oshift;
183:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184:     j   = aj + ai[row] + ishift;
185:     col = *j++ + ishift;
186:     i2  = tvc[col];
187:     nz  = ai[row+1] - ai[row];
188:     while (nz-- > 0) {           /* off-diagonal elemets */
189:       ia[i1+1]++;
190:       i2++;                     /* Start col of next node */
191:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192:       if (nz > 0) i2 = tvc[col];
193:     }
194:   }

196:   /* shift ia[i] to point to next row */
197:   for (i1=1; i1<nslim_row+1; i1++) {
198:     row        = ia[i1-1];
199:     ia[i1]    += row;
200:     work[i1-1] = row - oshift;
201:   }

203:   /* allocate space for column pointers */
204:   nz   = ia[nslim_row] + (!ishift);
205:   PetscMalloc1(nz,&ja);
206:   *jja = ja;

208:   /* loop over matrix putting into ja */
209:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210:     j   = aj + ai[row] + ishift;
211:     col = *j++ + ishift;
212:     i2  = tvc[col];
213:     nz  = ai[row+1] - ai[row];
214:     while (nz-- > 0) {
215:       ja[work[i1]++] = i2 + oshift;
216:       ++i2;
217:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218:       if (nz > 0) i2 = tvc[col];
219:     }
220:   }
221:   PetscFree(ns_col);
222:   PetscFree(work);
223:   PetscFree(tns);
224:   PetscFree(tvc);
225:   return(0);
226: }

230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231: {
232:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

236:   *n = a->inode.node_count;
237:   if (!ia) return(0);
238:   if (!blockcompressed) {
239:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240:   } else if (symmetric) {
241:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242:   } else {
243:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244:   }
245:   return(0);
246: }

250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251: {

255:   if (!ia) return(0);

257:   if (!blockcompressed) {
258:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259:   } else {
260:     PetscFree(*ia);
261:     PetscFree(*ja);
262:   }
263:   return(0);
264: }

266: /* ----------------------------------------------------------- */

270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
274:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

278:   nslim_row = a->inode.node_count;
279:   n         = A->cmap->n;

281:   /* Create The column_inode for this matrix */
282:   Mat_CreateColInode(A,&nslim_col,&ns_col);

284:   /* allocate space for reformated column_inode structure */
285:   PetscMalloc1(nslim_col + 1,&tns);
286:   PetscMalloc1(n + 1,&tvc);
287:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

289:   for (i1=0,col=0; i1<nslim_col; ++i1) {
290:     nsz = ns_col[i1];
291:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292:   }
293:   /* allocate space for column pointers */
294:   PetscMalloc1(nslim_col+1,&ia);
295:   *iia = ia;
296:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297:   PetscMalloc1(nslim_col+1,&work);

299:   /* determine the number of columns in each row */
300:   ia[0] = oshift;
301:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302:     j   = aj + ai[row] + ishift;
303:     col = *j++ + ishift;
304:     i2  = tvc[col];
305:     nz  = ai[row+1] - ai[row];
306:     while (nz-- > 0) {           /* off-diagonal elemets */
307:       /* ia[i1+1]++; */
308:       ia[i2+1]++;
309:       i2++;
310:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311:       if (nz > 0) i2 = tvc[col];
312:     }
313:   }

315:   /* shift ia[i] to point to next col */
316:   for (i1=1; i1<nslim_col+1; i1++) {
317:     col        = ia[i1-1];
318:     ia[i1]    += col;
319:     work[i1-1] = col - oshift;
320:   }

322:   /* allocate space for column pointers */
323:   nz   = ia[nslim_col] + (!ishift);
324:   PetscMalloc1(nz,&ja);
325:   *jja = ja;

327:   /* loop over matrix putting into ja */
328:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329:     j   = aj + ai[row] + ishift;
330:     col = *j++ + ishift;
331:     i2  = tvc[col];
332:     nz  = ai[row+1] - ai[row];
333:     while (nz-- > 0) {
334:       /* ja[work[i1]++] = i2 + oshift; */
335:       ja[work[i2]++] = i1 + oshift;
336:       i2++;
337:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338:       if (nz > 0) i2 = tvc[col];
339:     }
340:   }
341:   PetscFree(ns_col);
342:   PetscFree(work);
343:   PetscFree(tns);
344:   PetscFree(tvc);
345:   return(0);
346: }

350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351: {

355:   Mat_CreateColInode(A,n,NULL);
356:   if (!ia) return(0);

358:   if (!blockcompressed) {
359:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360:   } else if (symmetric) {
361:     /* Since the indices are symmetric it does'nt matter */
362:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363:   } else {
364:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365:   }
366:   return(0);
367: }

371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372: {

376:   if (!ia) return(0);
377:   if (!blockcompressed) {
378:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379:   } else {
380:     PetscFree(*ia);
381:     PetscFree(*ja);
382:   }
383:   return(0);
384: }

386: /* ----------------------------------------------------------- */

390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394:   PetscScalar       *y;
395:   const PetscScalar *x;
396:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397:   PetscErrorCode    ierr;
398:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399:   const PetscInt    *idx,*ns,*ii;

401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif

406:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407:   node_max = a->inode.node_count;
408:   ns       = a->inode.size;     /* Node Size array */
409:   VecGetArrayRead(xx,&x);
410:   VecGetArray(yy,&y);
411:   idx      = a->j;
412:   v1       = a->a;
413:   ii       = a->i;

415:   for (i = 0,row = 0; i< node_max; ++i) {
416:     nsz         = ns[i];
417:     n           = ii[1] - ii[0];
418:     nonzerorow += (n>0)*nsz;
419:     ii         += nsz;
420:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422:     sz = n;                     /* No of non zeros in this row */
423:                                 /* Switch on the size of Node */
424:     switch (nsz) {               /* Each loop in 'case' is unrolled */
425:     case 1:
426:       sum1 = 0.;

428:       for (n = 0; n< sz-1; n+=2) {
429:         i1    = idx[0];         /* The instructions are ordered to */
430:         i2    = idx[1];         /* make the compiler's job easy */
431:         idx  += 2;
432:         tmp0  = x[i1];
433:         tmp1  = x[i2];
434:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435:       }

437:       if (n == sz-1) {          /* Take care of the last nonzero  */
438:         tmp0  = x[*idx++];
439:         sum1 += *v1++ *tmp0;
440:       }
441:       y[row++]=sum1;
442:       break;
443:     case 2:
444:       sum1 = 0.;
445:       sum2 = 0.;
446:       v2   = v1 + n;

448:       for (n = 0; n< sz-1; n+=2) {
449:         i1    = idx[0];
450:         i2    = idx[1];
451:         idx  += 2;
452:         tmp0  = x[i1];
453:         tmp1  = x[i2];
454:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456:       }
457:       if (n == sz-1) {
458:         tmp0  = x[*idx++];
459:         sum1 += *v1++ * tmp0;
460:         sum2 += *v2++ * tmp0;
461:       }
462:       y[row++]=sum1;
463:       y[row++]=sum2;
464:       v1      =v2;              /* Since the next block to be processed starts there*/
465:       idx    +=sz;
466:       break;
467:     case 3:
468:       sum1 = 0.;
469:       sum2 = 0.;
470:       sum3 = 0.;
471:       v2   = v1 + n;
472:       v3   = v2 + n;

474:       for (n = 0; n< sz-1; n+=2) {
475:         i1    = idx[0];
476:         i2    = idx[1];
477:         idx  += 2;
478:         tmp0  = x[i1];
479:         tmp1  = x[i2];
480:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483:       }
484:       if (n == sz-1) {
485:         tmp0  = x[*idx++];
486:         sum1 += *v1++ * tmp0;
487:         sum2 += *v2++ * tmp0;
488:         sum3 += *v3++ * tmp0;
489:       }
490:       y[row++]=sum1;
491:       y[row++]=sum2;
492:       y[row++]=sum3;
493:       v1      =v3;              /* Since the next block to be processed starts there*/
494:       idx    +=2*sz;
495:       break;
496:     case 4:
497:       sum1 = 0.;
498:       sum2 = 0.;
499:       sum3 = 0.;
500:       sum4 = 0.;
501:       v2   = v1 + n;
502:       v3   = v2 + n;
503:       v4   = v3 + n;

505:       for (n = 0; n< sz-1; n+=2) {
506:         i1    = idx[0];
507:         i2    = idx[1];
508:         idx  += 2;
509:         tmp0  = x[i1];
510:         tmp1  = x[i2];
511:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515:       }
516:       if (n == sz-1) {
517:         tmp0  = x[*idx++];
518:         sum1 += *v1++ * tmp0;
519:         sum2 += *v2++ * tmp0;
520:         sum3 += *v3++ * tmp0;
521:         sum4 += *v4++ * tmp0;
522:       }
523:       y[row++]=sum1;
524:       y[row++]=sum2;
525:       y[row++]=sum3;
526:       y[row++]=sum4;
527:       v1      =v4;              /* Since the next block to be processed starts there*/
528:       idx    +=3*sz;
529:       break;
530:     case 5:
531:       sum1 = 0.;
532:       sum2 = 0.;
533:       sum3 = 0.;
534:       sum4 = 0.;
535:       sum5 = 0.;
536:       v2   = v1 + n;
537:       v3   = v2 + n;
538:       v4   = v3 + n;
539:       v5   = v4 + n;

541:       for (n = 0; n<sz-1; n+=2) {
542:         i1    = idx[0];
543:         i2    = idx[1];
544:         idx  += 2;
545:         tmp0  = x[i1];
546:         tmp1  = x[i2];
547:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552:       }
553:       if (n == sz-1) {
554:         tmp0  = x[*idx++];
555:         sum1 += *v1++ * tmp0;
556:         sum2 += *v2++ * tmp0;
557:         sum3 += *v3++ * tmp0;
558:         sum4 += *v4++ * tmp0;
559:         sum5 += *v5++ * tmp0;
560:       }
561:       y[row++]=sum1;
562:       y[row++]=sum2;
563:       y[row++]=sum3;
564:       y[row++]=sum4;
565:       y[row++]=sum5;
566:       v1      =v5;       /* Since the next block to be processed starts there */
567:       idx    +=4*sz;
568:       break;
569:     default:
570:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571:     }
572:   }
573:   VecRestoreArrayRead(xx,&x);
574:   VecRestoreArray(yy,&y);
575:   PetscLogFlops(2.0*a->nz - nonzerorow);
576:   return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587:   const PetscScalar *x;
588:   PetscScalar       *y,*z,*zt;
589:   PetscErrorCode    ierr;
590:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591:   const PetscInt    *idx,*ns,*ii;

594:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595:   node_max = a->inode.node_count;
596:   ns       = a->inode.size;     /* Node Size array */

598:   VecGetArrayRead(xx,&x);
599:   VecGetArray(yy,&y);
600:   if (zz != yy) {
601:     VecGetArray(zz,&z);
602:   } else {
603:     z = y;
604:   }
605:   zt = z;

607:   idx = a->j;
608:   v1  = a->a;
609:   ii  = a->i;

611:   for (i = 0,row = 0; i< node_max; ++i) {
612:     nsz = ns[i];
613:     n   = ii[1] - ii[0];
614:     ii += nsz;
615:     sz  = n;                    /* No of non zeros in this row */
616:                                 /* Switch on the size of Node */
617:     switch (nsz) {               /* Each loop in 'case' is unrolled */
618:     case 1:
619:       sum1 = *zt++;

621:       for (n = 0; n< sz-1; n+=2) {
622:         i1    = idx[0];         /* The instructions are ordered to */
623:         i2    = idx[1];         /* make the compiler's job easy */
624:         idx  += 2;
625:         tmp0  = x[i1];
626:         tmp1  = x[i2];
627:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628:       }

630:       if (n   == sz-1) {          /* Take care of the last nonzero  */
631:         tmp0  = x[*idx++];
632:         sum1 += *v1++ * tmp0;
633:       }
634:       y[row++]=sum1;
635:       break;
636:     case 2:
637:       sum1 = *zt++;
638:       sum2 = *zt++;
639:       v2   = v1 + n;

641:       for (n = 0; n< sz-1; n+=2) {
642:         i1    = idx[0];
643:         i2    = idx[1];
644:         idx  += 2;
645:         tmp0  = x[i1];
646:         tmp1  = x[i2];
647:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649:       }
650:       if (n   == sz-1) {
651:         tmp0  = x[*idx++];
652:         sum1 += *v1++ * tmp0;
653:         sum2 += *v2++ * tmp0;
654:       }
655:       y[row++]=sum1;
656:       y[row++]=sum2;
657:       v1      =v2;              /* Since the next block to be processed starts there*/
658:       idx    +=sz;
659:       break;
660:     case 3:
661:       sum1 = *zt++;
662:       sum2 = *zt++;
663:       sum3 = *zt++;
664:       v2   = v1 + n;
665:       v3   = v2 + n;

667:       for (n = 0; n< sz-1; n+=2) {
668:         i1    = idx[0];
669:         i2    = idx[1];
670:         idx  += 2;
671:         tmp0  = x[i1];
672:         tmp1  = x[i2];
673:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676:       }
677:       if (n == sz-1) {
678:         tmp0  = x[*idx++];
679:         sum1 += *v1++ * tmp0;
680:         sum2 += *v2++ * tmp0;
681:         sum3 += *v3++ * tmp0;
682:       }
683:       y[row++]=sum1;
684:       y[row++]=sum2;
685:       y[row++]=sum3;
686:       v1      =v3;              /* Since the next block to be processed starts there*/
687:       idx    +=2*sz;
688:       break;
689:     case 4:
690:       sum1 = *zt++;
691:       sum2 = *zt++;
692:       sum3 = *zt++;
693:       sum4 = *zt++;
694:       v2   = v1 + n;
695:       v3   = v2 + n;
696:       v4   = v3 + n;

698:       for (n = 0; n< sz-1; n+=2) {
699:         i1    = idx[0];
700:         i2    = idx[1];
701:         idx  += 2;
702:         tmp0  = x[i1];
703:         tmp1  = x[i2];
704:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708:       }
709:       if (n == sz-1) {
710:         tmp0  = x[*idx++];
711:         sum1 += *v1++ * tmp0;
712:         sum2 += *v2++ * tmp0;
713:         sum3 += *v3++ * tmp0;
714:         sum4 += *v4++ * tmp0;
715:       }
716:       y[row++]=sum1;
717:       y[row++]=sum2;
718:       y[row++]=sum3;
719:       y[row++]=sum4;
720:       v1      =v4;              /* Since the next block to be processed starts there*/
721:       idx    +=3*sz;
722:       break;
723:     case 5:
724:       sum1 = *zt++;
725:       sum2 = *zt++;
726:       sum3 = *zt++;
727:       sum4 = *zt++;
728:       sum5 = *zt++;
729:       v2   = v1 + n;
730:       v3   = v2 + n;
731:       v4   = v3 + n;
732:       v5   = v4 + n;

734:       for (n = 0; n<sz-1; n+=2) {
735:         i1    = idx[0];
736:         i2    = idx[1];
737:         idx  += 2;
738:         tmp0  = x[i1];
739:         tmp1  = x[i2];
740:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745:       }
746:       if (n   == sz-1) {
747:         tmp0  = x[*idx++];
748:         sum1 += *v1++ * tmp0;
749:         sum2 += *v2++ * tmp0;
750:         sum3 += *v3++ * tmp0;
751:         sum4 += *v4++ * tmp0;
752:         sum5 += *v5++ * tmp0;
753:       }
754:       y[row++]=sum1;
755:       y[row++]=sum2;
756:       y[row++]=sum3;
757:       y[row++]=sum4;
758:       y[row++]=sum5;
759:       v1      =v5;       /* Since the next block to be processed starts there */
760:       idx    +=4*sz;
761:       break;
762:     default:
763:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764:     }
765:   }
766:   VecRestoreArrayRead(xx,&x);
767:   VecRestoreArray(yy,&y);
768:   if (zz != yy) {
769:     VecRestoreArray(zz,&z);
770:   }
771:   PetscLogFlops(2.0*a->nz);
772:   return(0);
773: }

775: /* ----------------------------------------------------------- */
778: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779: {
780:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781:   IS                iscol = a->col,isrow = a->row;
782:   PetscErrorCode    ierr;
783:   const PetscInt    *r,*c,*rout,*cout;
784:   PetscInt          i,j,n = A->rmap->n,nz;
785:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790:   const PetscScalar *b;

793:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794:   node_max = a->inode.node_count;
795:   ns       = a->inode.size;     /* Node Size array */

797:   VecGetArrayRead(bb,&b);
798:   VecGetArray(xx,&x);
799:   tmp  = a->solve_work;

801:   ISGetIndices(isrow,&rout); r = rout;
802:   ISGetIndices(iscol,&cout); c = cout + (n-1);

804:   /* forward solve the lower triangular */
805:   tmps = tmp;
806:   aa   = a_a;
807:   aj   = a_j;
808:   ad   = a->diag;

810:   for (i = 0,row = 0; i< node_max; ++i) {
811:     nsz = ns[i];
812:     aii = ai[row];
813:     v1  = aa + aii;
814:     vi  = aj + aii;
815:     nz  = ad[row]- aii;
816:     if (i < node_max-1) {
817:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818:       * but our indexing to determine it's size could. */
819:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823:     }

825:     switch (nsz) {               /* Each loop in 'case' is unrolled */
826:     case 1:
827:       sum1 = b[*r++];
828:       for (j=0; j<nz-1; j+=2) {
829:         i0    = vi[0];
830:         i1    = vi[1];
831:         vi   +=2;
832:         tmp0  = tmps[i0];
833:         tmp1  = tmps[i1];
834:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835:       }
836:       if (j == nz-1) {
837:         tmp0  = tmps[*vi++];
838:         sum1 -= *v1++ *tmp0;
839:       }
840:       tmp[row++]=sum1;
841:       break;
842:     case 2:
843:       sum1 = b[*r++];
844:       sum2 = b[*r++];
845:       v2   = aa + ai[row+1];

847:       for (j=0; j<nz-1; j+=2) {
848:         i0    = vi[0];
849:         i1    = vi[1];
850:         vi   +=2;
851:         tmp0  = tmps[i0];
852:         tmp1  = tmps[i1];
853:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855:       }
856:       if (j == nz-1) {
857:         tmp0  = tmps[*vi++];
858:         sum1 -= *v1++ *tmp0;
859:         sum2 -= *v2++ *tmp0;
860:       }
861:       sum2     -= *v2++ *sum1;
862:       tmp[row++]=sum1;
863:       tmp[row++]=sum2;
864:       break;
865:     case 3:
866:       sum1 = b[*r++];
867:       sum2 = b[*r++];
868:       sum3 = b[*r++];
869:       v2   = aa + ai[row+1];
870:       v3   = aa + ai[row+2];

872:       for (j=0; j<nz-1; j+=2) {
873:         i0    = vi[0];
874:         i1    = vi[1];
875:         vi   +=2;
876:         tmp0  = tmps[i0];
877:         tmp1  = tmps[i1];
878:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881:       }
882:       if (j == nz-1) {
883:         tmp0  = tmps[*vi++];
884:         sum1 -= *v1++ *tmp0;
885:         sum2 -= *v2++ *tmp0;
886:         sum3 -= *v3++ *tmp0;
887:       }
888:       sum2 -= *v2++ * sum1;
889:       sum3 -= *v3++ * sum1;
890:       sum3 -= *v3++ * sum2;

892:       tmp[row++]=sum1;
893:       tmp[row++]=sum2;
894:       tmp[row++]=sum3;
895:       break;

897:     case 4:
898:       sum1 = b[*r++];
899:       sum2 = b[*r++];
900:       sum3 = b[*r++];
901:       sum4 = b[*r++];
902:       v2   = aa + ai[row+1];
903:       v3   = aa + ai[row+2];
904:       v4   = aa + ai[row+3];

906:       for (j=0; j<nz-1; j+=2) {
907:         i0    = vi[0];
908:         i1    = vi[1];
909:         vi   +=2;
910:         tmp0  = tmps[i0];
911:         tmp1  = tmps[i1];
912:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916:       }
917:       if (j == nz-1) {
918:         tmp0  = tmps[*vi++];
919:         sum1 -= *v1++ *tmp0;
920:         sum2 -= *v2++ *tmp0;
921:         sum3 -= *v3++ *tmp0;
922:         sum4 -= *v4++ *tmp0;
923:       }
924:       sum2 -= *v2++ * sum1;
925:       sum3 -= *v3++ * sum1;
926:       sum4 -= *v4++ * sum1;
927:       sum3 -= *v3++ * sum2;
928:       sum4 -= *v4++ * sum2;
929:       sum4 -= *v4++ * sum3;

931:       tmp[row++]=sum1;
932:       tmp[row++]=sum2;
933:       tmp[row++]=sum3;
934:       tmp[row++]=sum4;
935:       break;
936:     case 5:
937:       sum1 = b[*r++];
938:       sum2 = b[*r++];
939:       sum3 = b[*r++];
940:       sum4 = b[*r++];
941:       sum5 = b[*r++];
942:       v2   = aa + ai[row+1];
943:       v3   = aa + ai[row+2];
944:       v4   = aa + ai[row+3];
945:       v5   = aa + ai[row+4];

947:       for (j=0; j<nz-1; j+=2) {
948:         i0    = vi[0];
949:         i1    = vi[1];
950:         vi   +=2;
951:         tmp0  = tmps[i0];
952:         tmp1  = tmps[i1];
953:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958:       }
959:       if (j == nz-1) {
960:         tmp0  = tmps[*vi++];
961:         sum1 -= *v1++ *tmp0;
962:         sum2 -= *v2++ *tmp0;
963:         sum3 -= *v3++ *tmp0;
964:         sum4 -= *v4++ *tmp0;
965:         sum5 -= *v5++ *tmp0;
966:       }

968:       sum2 -= *v2++ * sum1;
969:       sum3 -= *v3++ * sum1;
970:       sum4 -= *v4++ * sum1;
971:       sum5 -= *v5++ * sum1;
972:       sum3 -= *v3++ * sum2;
973:       sum4 -= *v4++ * sum2;
974:       sum5 -= *v5++ * sum2;
975:       sum4 -= *v4++ * sum3;
976:       sum5 -= *v5++ * sum3;
977:       sum5 -= *v5++ * sum4;

979:       tmp[row++]=sum1;
980:       tmp[row++]=sum2;
981:       tmp[row++]=sum3;
982:       tmp[row++]=sum4;
983:       tmp[row++]=sum5;
984:       break;
985:     default:
986:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987:     }
988:   }
989:   /* backward solve the upper triangular */
990:   for (i=node_max -1,row = n-1; i>=0; i--) {
991:     nsz = ns[i];
992:     aii = ai[row+1] -1;
993:     v1  = aa + aii;
994:     vi  = aj + aii;
995:     nz  = aii- ad[row];
996:     switch (nsz) {               /* Each loop in 'case' is unrolled */
997:     case 1:
998:       sum1 = tmp[row];

1000:       for (j=nz; j>1; j-=2) {
1001:         vi   -=2;
1002:         i0    = vi[2];
1003:         i1    = vi[1];
1004:         tmp0  = tmps[i0];
1005:         tmp1  = tmps[i1];
1006:         v1   -= 2;
1007:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008:       }
1009:       if (j==1) {
1010:         tmp0  = tmps[*vi--];
1011:         sum1 -= *v1-- * tmp0;
1012:       }
1013:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014:       break;
1015:     case 2:
1016:       sum1 = tmp[row];
1017:       sum2 = tmp[row -1];
1018:       v2   = aa + ai[row]-1;
1019:       for (j=nz; j>1; j-=2) {
1020:         vi   -=2;
1021:         i0    = vi[2];
1022:         i1    = vi[1];
1023:         tmp0  = tmps[i0];
1024:         tmp1  = tmps[i1];
1025:         v1   -= 2;
1026:         v2   -= 2;
1027:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029:       }
1030:       if (j==1) {
1031:         tmp0  = tmps[*vi--];
1032:         sum1 -= *v1-- * tmp0;
1033:         sum2 -= *v2-- * tmp0;
1034:       }

1036:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037:       sum2   -= *v2-- * tmp0;
1038:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039:       break;
1040:     case 3:
1041:       sum1 = tmp[row];
1042:       sum2 = tmp[row -1];
1043:       sum3 = tmp[row -2];
1044:       v2   = aa + ai[row]-1;
1045:       v3   = aa + ai[row -1]-1;
1046:       for (j=nz; j>1; j-=2) {
1047:         vi   -=2;
1048:         i0    = vi[2];
1049:         i1    = vi[1];
1050:         tmp0  = tmps[i0];
1051:         tmp1  = tmps[i1];
1052:         v1   -= 2;
1053:         v2   -= 2;
1054:         v3   -= 2;
1055:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058:       }
1059:       if (j==1) {
1060:         tmp0  = tmps[*vi--];
1061:         sum1 -= *v1-- * tmp0;
1062:         sum2 -= *v2-- * tmp0;
1063:         sum3 -= *v3-- * tmp0;
1064:       }
1065:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066:       sum2   -= *v2-- * tmp0;
1067:       sum3   -= *v3-- * tmp0;
1068:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069:       sum3   -= *v3-- * tmp0;
1070:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1072:       break;
1073:     case 4:
1074:       sum1 = tmp[row];
1075:       sum2 = tmp[row -1];
1076:       sum3 = tmp[row -2];
1077:       sum4 = tmp[row -3];
1078:       v2   = aa + ai[row]-1;
1079:       v3   = aa + ai[row -1]-1;
1080:       v4   = aa + ai[row -2]-1;

1082:       for (j=nz; j>1; j-=2) {
1083:         vi   -=2;
1084:         i0    = vi[2];
1085:         i1    = vi[1];
1086:         tmp0  = tmps[i0];
1087:         tmp1  = tmps[i1];
1088:         v1   -= 2;
1089:         v2   -= 2;
1090:         v3   -= 2;
1091:         v4   -= 2;
1092:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096:       }
1097:       if (j==1) {
1098:         tmp0  = tmps[*vi--];
1099:         sum1 -= *v1-- * tmp0;
1100:         sum2 -= *v2-- * tmp0;
1101:         sum3 -= *v3-- * tmp0;
1102:         sum4 -= *v4-- * tmp0;
1103:       }

1105:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106:       sum2   -= *v2-- * tmp0;
1107:       sum3   -= *v3-- * tmp0;
1108:       sum4   -= *v4-- * tmp0;
1109:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110:       sum3   -= *v3-- * tmp0;
1111:       sum4   -= *v4-- * tmp0;
1112:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113:       sum4   -= *v4-- * tmp0;
1114:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115:       break;
1116:     case 5:
1117:       sum1 = tmp[row];
1118:       sum2 = tmp[row -1];
1119:       sum3 = tmp[row -2];
1120:       sum4 = tmp[row -3];
1121:       sum5 = tmp[row -4];
1122:       v2   = aa + ai[row]-1;
1123:       v3   = aa + ai[row -1]-1;
1124:       v4   = aa + ai[row -2]-1;
1125:       v5   = aa + ai[row -3]-1;
1126:       for (j=nz; j>1; j-=2) {
1127:         vi   -= 2;
1128:         i0    = vi[2];
1129:         i1    = vi[1];
1130:         tmp0  = tmps[i0];
1131:         tmp1  = tmps[i1];
1132:         v1   -= 2;
1133:         v2   -= 2;
1134:         v3   -= 2;
1135:         v4   -= 2;
1136:         v5   -= 2;
1137:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142:       }
1143:       if (j==1) {
1144:         tmp0  = tmps[*vi--];
1145:         sum1 -= *v1-- * tmp0;
1146:         sum2 -= *v2-- * tmp0;
1147:         sum3 -= *v3-- * tmp0;
1148:         sum4 -= *v4-- * tmp0;
1149:         sum5 -= *v5-- * tmp0;
1150:       }

1152:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153:       sum2   -= *v2-- * tmp0;
1154:       sum3   -= *v3-- * tmp0;
1155:       sum4   -= *v4-- * tmp0;
1156:       sum5   -= *v5-- * tmp0;
1157:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158:       sum3   -= *v3-- * tmp0;
1159:       sum4   -= *v4-- * tmp0;
1160:       sum5   -= *v5-- * tmp0;
1161:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162:       sum4   -= *v4-- * tmp0;
1163:       sum5   -= *v5-- * tmp0;
1164:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165:       sum5   -= *v5-- * tmp0;
1166:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167:       break;
1168:     default:
1169:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170:     }
1171:   }
1172:   ISRestoreIndices(isrow,&rout);
1173:   ISRestoreIndices(iscol,&cout);
1174:   VecRestoreArrayRead(bb,&b);
1175:   VecRestoreArray(xx,&x);
1176:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1177:   return(0);
1178: }

1182: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183: {
1184:   Mat             C     =B;
1185:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186:   IS              isrow = b->row,isicol = b->icol;
1187:   PetscErrorCode  ierr;
1188:   const PetscInt  *r,*ic,*ics;
1189:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190:   PetscInt        i,j,k,nz,nzL,row,*pj;
1191:   const PetscInt  *ajtmp,*bjtmp;
1192:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194:   FactorShiftCtx  sctx;
1195:   const PetscInt  *ddiag;
1196:   PetscReal       rs;
1197:   MatScalar       d;
1198:   PetscInt        inod,nodesz,node_max,col;
1199:   const PetscInt  *ns;
1200:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1203:   /* MatPivotSetUp(): initialize shift context sctx */
1204:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1206:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207:     ddiag          = a->diag;
1208:     sctx.shift_top = info->zeropivot;
1209:     for (i=0; i<n; i++) {
1210:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211:       d  = (aa)[ddiag[i]];
1212:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213:       v  = aa+ai[i];
1214:       nz = ai[i+1] - ai[i];
1215:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217:     }
1218:     sctx.shift_top *= 1.1;
1219:     sctx.nshift_max = 5;
1220:     sctx.shift_lo   = 0.;
1221:     sctx.shift_hi   = 1.;
1222:   }

1224:   ISGetIndices(isrow,&r);
1225:   ISGetIndices(isicol,&ic);

1227:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1228:   ics   = ic;

1230:   node_max = a->inode.node_count;
1231:   ns       = a->inode.size;
1232:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1234:   /* If max inode size > 4, split it into two inodes.*/
1235:   /* also map the inode sizes according to the ordering */
1236:   PetscMalloc1(n+1,&tmp_vec1);
1237:   for (i=0,j=0; i<node_max; ++i,++j) {
1238:     if (ns[i] > 4) {
1239:       tmp_vec1[j] = 4;
1240:       ++j;
1241:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242:     } else {
1243:       tmp_vec1[j] = ns[i];
1244:     }
1245:   }
1246:   /* Use the correct node_max */
1247:   node_max = j;

1249:   /* Now reorder the inode info based on mat re-ordering info */
1250:   /* First create a row -> inode_size_array_index map */
1251:   PetscMalloc1(n+1,&nsmap);
1252:   PetscMalloc1(node_max+1,&tmp_vec2);
1253:   for (i=0,row=0; i<node_max; i++) {
1254:     nodesz = tmp_vec1[i];
1255:     for (j=0; j<nodesz; j++,row++) {
1256:       nsmap[row] = i;
1257:     }
1258:   }
1259:   /* Using nsmap, create a reordered ns structure */
1260:   for (i=0,j=0; i< node_max; i++) {
1261:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1262:     tmp_vec2[i] = nodesz;
1263:     j          += nodesz;
1264:   }
1265:   PetscFree(nsmap);
1266:   PetscFree(tmp_vec1);

1268:   /* Now use the correct ns */
1269:   ns = tmp_vec2;

1271:   do {
1272:     sctx.newshift = PETSC_FALSE;
1273:     /* Now loop over each block-row, and do the factorization */
1274:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275:       nodesz = ns[inod];

1277:       switch (nodesz) {
1278:       case 1:
1279:         /*----------*/
1280:         /* zero rtmp1 */
1281:         /* L part */
1282:         nz    = bi[i+1] - bi[i];
1283:         bjtmp = bj + bi[i];
1284:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1286:         /* U part */
1287:         nz    = bdiag[i]-bdiag[i+1];
1288:         bjtmp = bj + bdiag[i+1]+1;
1289:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1291:         /* load in initial (unfactored row) */
1292:         nz    = ai[r[i]+1] - ai[r[i]];
1293:         ajtmp = aj + ai[r[i]];
1294:         v     = aa + ai[r[i]];
1295:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1297:         /* ZeropivotApply() */
1298:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1300:         /* elimination */
1301:         bjtmp = bj + bi[i];
1302:         row   = *bjtmp++;
1303:         nzL   = bi[i+1] - bi[i];
1304:         for (k=0; k < nzL; k++) {
1305:           pc = rtmp1 + row;
1306:           if (*pc != 0.0) {
1307:             pv   = b->a + bdiag[row];
1308:             mul1 = *pc * (*pv);
1309:             *pc  = mul1;
1310:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1311:             pv   = b->a + bdiag[row+1]+1;
1312:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1313:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1314:             PetscLogFlops(1+2*nz);
1315:           }
1316:           row = *bjtmp++;
1317:         }

1319:         /* finished row so stick it into b->a */
1320:         rs = 0.0;
1321:         /* L part */
1322:         pv = b->a + bi[i];
1323:         pj = b->j + bi[i];
1324:         nz = bi[i+1] - bi[i];
1325:         for (j=0; j<nz; j++) {
1326:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327:         }

1329:         /* U part */
1330:         pv = b->a + bdiag[i+1]+1;
1331:         pj = b->j + bdiag[i+1]+1;
1332:         nz = bdiag[i] - bdiag[i+1]-1;
1333:         for (j=0; j<nz; j++) {
1334:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335:         }

1337:         /* Check zero pivot */
1338:         sctx.rs = rs;
1339:         sctx.pv = rtmp1[i];
1340:         MatPivotCheck(A,info,&sctx,i);
1341:         if (sctx.newshift) break;

1343:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1344:         pv  = b->a + bdiag[i];
1345:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1346:         break;

1348:       case 2:
1349:         /*----------*/
1350:         /* zero rtmp1 and rtmp2 */
1351:         /* L part */
1352:         nz    = bi[i+1] - bi[i];
1353:         bjtmp = bj + bi[i];
1354:         for  (j=0; j<nz; j++) {
1355:           col        = bjtmp[j];
1356:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357:         }

1359:         /* U part */
1360:         nz    = bdiag[i]-bdiag[i+1];
1361:         bjtmp = bj + bdiag[i+1]+1;
1362:         for  (j=0; j<nz; j++) {
1363:           col        = bjtmp[j];
1364:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365:         }

1367:         /* load in initial (unfactored row) */
1368:         nz    = ai[r[i]+1] - ai[r[i]];
1369:         ajtmp = aj + ai[r[i]];
1370:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1371:         for (j=0; j<nz; j++) {
1372:           col        = ics[ajtmp[j]];
1373:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1374:         }
1375:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1376:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1378:         /* elimination */
1379:         bjtmp = bj + bi[i];
1380:         row   = *bjtmp++; /* pivot row */
1381:         nzL   = bi[i+1] - bi[i];
1382:         for (k=0; k < nzL; k++) {
1383:           pc1 = rtmp1 + row;
1384:           pc2 = rtmp2 + row;
1385:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1386:             pv   = b->a + bdiag[row];
1387:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1388:             *pc1 = mul1;       *pc2 = mul2;

1390:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1391:             pv = b->a + bdiag[row+1]+1;
1392:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1393:             for (j=0; j<nz; j++) {
1394:               col         = pj[j];
1395:               rtmp1[col] -= mul1 * pv[j];
1396:               rtmp2[col] -= mul2 * pv[j];
1397:             }
1398:             PetscLogFlops(2+4*nz);
1399:           }
1400:           row = *bjtmp++;
1401:         }

1403:         /* finished row i; check zero pivot, then stick row i into b->a */
1404:         rs = 0.0;
1405:         /* L part */
1406:         pc1 = b->a + bi[i];
1407:         pj  = b->j + bi[i];
1408:         nz  = bi[i+1] - bi[i];
1409:         for (j=0; j<nz; j++) {
1410:           col    = pj[j];
1411:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412:         }
1413:         /* U part */
1414:         pc1 = b->a + bdiag[i+1]+1;
1415:         pj  = b->j + bdiag[i+1]+1;
1416:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1417:         for (j=0; j<nz; j++) {
1418:           col    = pj[j];
1419:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420:         }

1422:         sctx.rs = rs;
1423:         sctx.pv = rtmp1[i];
1424:         MatPivotCheck(A,info,&sctx,i);
1425:         if (sctx.newshift) break;
1426:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1427:         *pc1 = 1.0/sctx.pv;

1429:         /* Now take care of diagonal 2x2 block. */
1430:         pc2 = rtmp2 + i;
1431:         if (*pc2 != 0.0) {
1432:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1433:           *pc2 = mul1;          /* insert L entry */
1434:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1435:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1436:           for (j=0; j<nz; j++) {
1437:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1438:           }
1439:           PetscLogFlops(1+2*nz);
1440:         }

1442:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1443:         rs = 0.0;
1444:         /* L part */
1445:         pc2 = b->a + bi[i+1];
1446:         pj  = b->j + bi[i+1];
1447:         nz  = bi[i+2] - bi[i+1];
1448:         for (j=0; j<nz; j++) {
1449:           col    = pj[j];
1450:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451:         }
1452:         /* U part */
1453:         pc2 = b->a + bdiag[i+2]+1;
1454:         pj  = b->j + bdiag[i+2]+1;
1455:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1456:         for (j=0; j<nz; j++) {
1457:           col    = pj[j];
1458:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459:         }

1461:         sctx.rs = rs;
1462:         sctx.pv = rtmp2[i+1];
1463:         MatPivotCheck(A,info,&sctx,i+1);
1464:         if (sctx.newshift) break;
1465:         pc2  = b->a + bdiag[i+1];
1466:         *pc2 = 1.0/sctx.pv;
1467:         break;

1469:       case 3:
1470:         /*----------*/
1471:         /* zero rtmp */
1472:         /* L part */
1473:         nz    = bi[i+1] - bi[i];
1474:         bjtmp = bj + bi[i];
1475:         for  (j=0; j<nz; j++) {
1476:           col        = bjtmp[j];
1477:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478:         }

1480:         /* U part */
1481:         nz    = bdiag[i]-bdiag[i+1];
1482:         bjtmp = bj + bdiag[i+1]+1;
1483:         for  (j=0; j<nz; j++) {
1484:           col        = bjtmp[j];
1485:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486:         }

1488:         /* load in initial (unfactored row) */
1489:         nz    = ai[r[i]+1] - ai[r[i]];
1490:         ajtmp = aj + ai[r[i]];
1491:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1492:         for (j=0; j<nz; j++) {
1493:           col        = ics[ajtmp[j]];
1494:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1495:         }
1496:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1497:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1499:         /* elimination */
1500:         bjtmp = bj + bi[i];
1501:         row   = *bjtmp++; /* pivot row */
1502:         nzL   = bi[i+1] - bi[i];
1503:         for (k=0; k < nzL; k++) {
1504:           pc1 = rtmp1 + row;
1505:           pc2 = rtmp2 + row;
1506:           pc3 = rtmp3 + row;
1507:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1508:             pv   = b->a + bdiag[row];
1509:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1510:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1512:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1513:             pv = b->a + bdiag[row+1]+1;
1514:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1515:             for (j=0; j<nz; j++) {
1516:               col         = pj[j];
1517:               rtmp1[col] -= mul1 * pv[j];
1518:               rtmp2[col] -= mul2 * pv[j];
1519:               rtmp3[col] -= mul3 * pv[j];
1520:             }
1521:             PetscLogFlops(3+6*nz);
1522:           }
1523:           row = *bjtmp++;
1524:         }

1526:         /* finished row i; check zero pivot, then stick row i into b->a */
1527:         rs = 0.0;
1528:         /* L part */
1529:         pc1 = b->a + bi[i];
1530:         pj  = b->j + bi[i];
1531:         nz  = bi[i+1] - bi[i];
1532:         for (j=0; j<nz; j++) {
1533:           col    = pj[j];
1534:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535:         }
1536:         /* U part */
1537:         pc1 = b->a + bdiag[i+1]+1;
1538:         pj  = b->j + bdiag[i+1]+1;
1539:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1540:         for (j=0; j<nz; j++) {
1541:           col    = pj[j];
1542:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543:         }

1545:         sctx.rs = rs;
1546:         sctx.pv = rtmp1[i];
1547:         MatPivotCheck(A,info,&sctx,i);
1548:         if (sctx.newshift) break;
1549:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1550:         *pc1 = 1.0/sctx.pv;

1552:         /* Now take care of 1st column of diagonal 3x3 block. */
1553:         pc2 = rtmp2 + i;
1554:         pc3 = rtmp3 + i;
1555:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1556:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1557:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1558:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1559:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1560:           for (j=0; j<nz; j++) {
1561:             col         = pj[j];
1562:             rtmp2[col] -= mul2 * rtmp1[col];
1563:             rtmp3[col] -= mul3 * rtmp1[col];
1564:           }
1565:           PetscLogFlops(2+4*nz);
1566:         }

1568:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1569:         rs = 0.0;
1570:         /* L part */
1571:         pc2 = b->a + bi[i+1];
1572:         pj  = b->j + bi[i+1];
1573:         nz  = bi[i+2] - bi[i+1];
1574:         for (j=0; j<nz; j++) {
1575:           col    = pj[j];
1576:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577:         }
1578:         /* U part */
1579:         pc2 = b->a + bdiag[i+2]+1;
1580:         pj  = b->j + bdiag[i+2]+1;
1581:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1582:         for (j=0; j<nz; j++) {
1583:           col    = pj[j];
1584:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585:         }

1587:         sctx.rs = rs;
1588:         sctx.pv = rtmp2[i+1];
1589:         MatPivotCheck(A,info,&sctx,i+1);
1590:         if (sctx.newshift) break;
1591:         pc2  = b->a + bdiag[i+1];
1592:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1594:         /* Now take care of 2nd column of diagonal 3x3 block. */
1595:         pc3 = rtmp3 + i+1;
1596:         if (*pc3 != 0.0) {
1597:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1598:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1599:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1600:           for (j=0; j<nz; j++) {
1601:             col         = pj[j];
1602:             rtmp3[col] -= mul3 * rtmp2[col];
1603:           }
1604:           PetscLogFlops(1+2*nz);
1605:         }

1607:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1608:         rs = 0.0;
1609:         /* L part */
1610:         pc3 = b->a + bi[i+2];
1611:         pj  = b->j + bi[i+2];
1612:         nz  = bi[i+3] - bi[i+2];
1613:         for (j=0; j<nz; j++) {
1614:           col    = pj[j];
1615:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616:         }
1617:         /* U part */
1618:         pc3 = b->a + bdiag[i+3]+1;
1619:         pj  = b->j + bdiag[i+3]+1;
1620:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1621:         for (j=0; j<nz; j++) {
1622:           col    = pj[j];
1623:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624:         }

1626:         sctx.rs = rs;
1627:         sctx.pv = rtmp3[i+2];
1628:         MatPivotCheck(A,info,&sctx,i+2);
1629:         if (sctx.newshift) break;
1630:         pc3  = b->a + bdiag[i+2];
1631:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1632:         break;
1633:       case 4:
1634:         /*----------*/
1635:         /* zero rtmp */
1636:         /* L part */
1637:         nz    = bi[i+1] - bi[i];
1638:         bjtmp = bj + bi[i];
1639:         for  (j=0; j<nz; j++) {
1640:           col        = bjtmp[j];
1641:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1642:         }

1644:         /* U part */
1645:         nz    = bdiag[i]-bdiag[i+1];
1646:         bjtmp = bj + bdiag[i+1]+1;
1647:         for  (j=0; j<nz; j++) {
1648:           col        = bjtmp[j];
1649:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1650:         }

1652:         /* load in initial (unfactored row) */
1653:         nz    = ai[r[i]+1] - ai[r[i]];
1654:         ajtmp = aj + ai[r[i]];
1655:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1656:         for (j=0; j<nz; j++) {
1657:           col        = ics[ajtmp[j]];
1658:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1659:         }
1660:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1661:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1663:         /* elimination */
1664:         bjtmp = bj + bi[i];
1665:         row   = *bjtmp++; /* pivot row */
1666:         nzL   = bi[i+1] - bi[i];
1667:         for (k=0; k < nzL; k++) {
1668:           pc1 = rtmp1 + row;
1669:           pc2 = rtmp2 + row;
1670:           pc3 = rtmp3 + row;
1671:           pc4 = rtmp4 + row;
1672:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1673:             pv   = b->a + bdiag[row];
1674:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1675:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1677:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1678:             pv = b->a + bdiag[row+1]+1;
1679:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1680:             for (j=0; j<nz; j++) {
1681:               col         = pj[j];
1682:               rtmp1[col] -= mul1 * pv[j];
1683:               rtmp2[col] -= mul2 * pv[j];
1684:               rtmp3[col] -= mul3 * pv[j];
1685:               rtmp4[col] -= mul4 * pv[j];
1686:             }
1687:             PetscLogFlops(4+8*nz);
1688:           }
1689:           row = *bjtmp++;
1690:         }

1692:         /* finished row i; check zero pivot, then stick row i into b->a */
1693:         rs = 0.0;
1694:         /* L part */
1695:         pc1 = b->a + bi[i];
1696:         pj  = b->j + bi[i];
1697:         nz  = bi[i+1] - bi[i];
1698:         for (j=0; j<nz; j++) {
1699:           col    = pj[j];
1700:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701:         }
1702:         /* U part */
1703:         pc1 = b->a + bdiag[i+1]+1;
1704:         pj  = b->j + bdiag[i+1]+1;
1705:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1706:         for (j=0; j<nz; j++) {
1707:           col    = pj[j];
1708:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709:         }

1711:         sctx.rs = rs;
1712:         sctx.pv = rtmp1[i];
1713:         MatPivotCheck(A,info,&sctx,i);
1714:         if (sctx.newshift) break;
1715:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1716:         *pc1 = 1.0/sctx.pv;

1718:         /* Now take care of 1st column of diagonal 4x4 block. */
1719:         pc2 = rtmp2 + i;
1720:         pc3 = rtmp3 + i;
1721:         pc4 = rtmp4 + i;
1722:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1724:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1725:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1726:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1727:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1728:           for (j=0; j<nz; j++) {
1729:             col         = pj[j];
1730:             rtmp2[col] -= mul2 * rtmp1[col];
1731:             rtmp3[col] -= mul3 * rtmp1[col];
1732:             rtmp4[col] -= mul4 * rtmp1[col];
1733:           }
1734:           PetscLogFlops(3+6*nz);
1735:         }

1737:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1738:         rs = 0.0;
1739:         /* L part */
1740:         pc2 = b->a + bi[i+1];
1741:         pj  = b->j + bi[i+1];
1742:         nz  = bi[i+2] - bi[i+1];
1743:         for (j=0; j<nz; j++) {
1744:           col    = pj[j];
1745:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746:         }
1747:         /* U part */
1748:         pc2 = b->a + bdiag[i+2]+1;
1749:         pj  = b->j + bdiag[i+2]+1;
1750:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1751:         for (j=0; j<nz; j++) {
1752:           col    = pj[j];
1753:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754:         }

1756:         sctx.rs = rs;
1757:         sctx.pv = rtmp2[i+1];
1758:         MatPivotCheck(A,info,&sctx,i+1);
1759:         if (sctx.newshift) break;
1760:         pc2  = b->a + bdiag[i+1];
1761:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1763:         /* Now take care of 2nd column of diagonal 4x4 block. */
1764:         pc3 = rtmp3 + i+1;
1765:         pc4 = rtmp4 + i+1;
1766:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1767:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1768:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1769:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1770:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1771:           for (j=0; j<nz; j++) {
1772:             col         = pj[j];
1773:             rtmp3[col] -= mul3 * rtmp2[col];
1774:             rtmp4[col] -= mul4 * rtmp2[col];
1775:           }
1776:           PetscLogFlops(4*nz);
1777:         }

1779:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1780:         rs = 0.0;
1781:         /* L part */
1782:         pc3 = b->a + bi[i+2];
1783:         pj  = b->j + bi[i+2];
1784:         nz  = bi[i+3] - bi[i+2];
1785:         for (j=0; j<nz; j++) {
1786:           col    = pj[j];
1787:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788:         }
1789:         /* U part */
1790:         pc3 = b->a + bdiag[i+3]+1;
1791:         pj  = b->j + bdiag[i+3]+1;
1792:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1793:         for (j=0; j<nz; j++) {
1794:           col    = pj[j];
1795:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796:         }

1798:         sctx.rs = rs;
1799:         sctx.pv = rtmp3[i+2];
1800:         MatPivotCheck(A,info,&sctx,i+2);
1801:         if (sctx.newshift) break;
1802:         pc3  = b->a + bdiag[i+2];
1803:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1805:         /* Now take care of 3rd column of diagonal 4x4 block. */
1806:         pc4 = rtmp4 + i+2;
1807:         if (*pc4 != 0.0) {
1808:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1810:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811:           for (j=0; j<nz; j++) {
1812:             col         = pj[j];
1813:             rtmp4[col] -= mul4 * rtmp3[col];
1814:           }
1815:           PetscLogFlops(1+2*nz);
1816:         }

1818:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819:         rs = 0.0;
1820:         /* L part */
1821:         pc4 = b->a + bi[i+3];
1822:         pj  = b->j + bi[i+3];
1823:         nz  = bi[i+4] - bi[i+3];
1824:         for (j=0; j<nz; j++) {
1825:           col    = pj[j];
1826:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827:         }
1828:         /* U part */
1829:         pc4 = b->a + bdiag[i+4]+1;
1830:         pj  = b->j + bdiag[i+4]+1;
1831:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832:         for (j=0; j<nz; j++) {
1833:           col    = pj[j];
1834:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835:         }

1837:         sctx.rs = rs;
1838:         sctx.pv = rtmp4[i+3];
1839:         MatPivotCheck(A,info,&sctx,i+3);
1840:         if (sctx.newshift) break;
1841:         pc4  = b->a + bdiag[i+3];
1842:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1843:         break;

1845:       default:
1846:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1847:       }
1848:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1849:       i += nodesz;                 /* Update the row */
1850:     }

1852:     /* MatPivotRefine() */
1853:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1854:       /*
1855:        * if no shift in this attempt & shifting & started shifting & can refine,
1856:        * then try lower shift
1857:        */
1858:       sctx.shift_hi       = sctx.shift_fraction;
1859:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1860:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1861:       sctx.newshift       = PETSC_TRUE;
1862:       sctx.nshift++;
1863:     }
1864:   } while (sctx.newshift);

1866:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1867:   PetscFree(tmp_vec2);
1868:   ISRestoreIndices(isicol,&ic);
1869:   ISRestoreIndices(isrow,&r);

1871:   if (b->inode.size) {
1872:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1873:   } else {
1874:     C->ops->solve           = MatSolve_SeqAIJ;
1875:   }
1876:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1877:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1878:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1880:   C->assembled              = PETSC_TRUE;
1881:   C->preallocated           = PETSC_TRUE;

1883:   PetscLogFlops(C->cmap->n);

1885:   /* MatShiftView(A,info,&sctx) */
1886:   if (sctx.nshift) {
1887:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1889:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1891:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1893:     }
1894:   }
1895:   return(0);
1896: }

/*
   MatLUFactorNumeric_SeqAIJ_Inode_inplace - Numeric LU factorization of A into the factor B,
   eliminating the rows in groups (inodes) of 1, 2 or 3 rows at a time.

   Input Parameters:
+  B    - factor matrix; its row starts (b->i), column indices (b->j), diagonal offsets (b->diag)
          and orderings (b->row, b->col, b->icol) are read, its values (b->a) are written
.  A    - matrix supplying the values (a->a, a->i, a->j) and the inode sizes (a->inode.size)
-  info - info->shifttype is read here; info is also handed to MatPivotCheck()

   Notes:
   Inodes of A with more than 3 rows are split in two before factoring.
   The diagonal entry of each factored row is stored inverted.
   Whenever MatPivotCheck() leaves sctx.newshift set, the whole factorization is redone from the
   first row, adding the current sctx.shift_amount to the diagonal entries loaded from A.
   SETERRQ is raised when A carries no inode sizes or when a node size other than 1, 2 or 3 is met.

   NOTE(review): in this listing ierr is declared but never assigned, i.e. the return codes of
   the Petsc and IS calls are not checked - confirm against the upstream source.
*/
1900: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1901: {
1902:   Mat             C     = B;
1903:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1904:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1905:   PetscErrorCode  ierr;
1906:   const PetscInt  *r,*ic,*c,*ics;
1907:   PetscInt        n   = A->rmap->n,*bi = b->i;
1908:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1909:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1910:   PetscInt        *ai = a->i,*aj = a->j;
1911:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912:   PetscScalar     mul1,mul2,mul3,tmp;
1913:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915:   PetscReal       rs=0.0;
1916:   FactorShiftCtx  sctx;

        /* defaults for the shift context; the positive-definite branch below overwrites some of them */
1919:   sctx.shift_top      = 0;
1920:   sctx.nshift_max     = 0;
1921:   sctx.shift_lo       = 0;
1922:   sctx.shift_hi       = 0;
1923:   sctx.shift_fraction = 0;

1925:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1926:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1927:     sctx.shift_top = 0;
1928:     for (i=0; i<n; i++) {
1929:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1930:       rs    = 0.0;
1931:       ajtmp = aj + ai[i];
1932:       rtmp1 = aa + ai[i];
1933:       nz    = ai[i+1] - ai[i];
1934:       for (j=0; j<nz; j++) {
1935:         if (*ajtmp != i) {
1936:           rs += PetscAbsScalar(*rtmp1++);
1937:         } else {
1938:           rs -= PetscRealPart(*rtmp1++);
1939:         }
1940:         ajtmp++;
1941:       }
1942:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1943:     }
          /* keep shift_top strictly positive, then enlarge it by 10% */
1944:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1945:     sctx.shift_top *= 1.1;
1946:     sctx.nshift_max = 5;
1947:     sctx.shift_lo   = 0.;
1948:     sctx.shift_hi   = 1.;
1949:   }
1950:   sctx.shift_amount = 0;
1951:   sctx.nshift       = 0;

        /* r comes from b->row, c from b->col, ic from b->icol; c is only acquired and restored,
           it is not referenced anywhere else in this routine */
1953:   ISGetIndices(isrow,&r);
1954:   ISGetIndices(iscol,&c);
1955:   ISGetIndices(isicol,&ic);
        /* one dense work row of length n for each of the (at most 3) rows of a node */
1956:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1957:   ics    = ic;

1959:   node_max = a->inode.node_count;
1960:   ns       = a->inode.size;
1961:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1963:   /* If max inode size > 3, split it into two inodes.*/
1964:   /* also map the inode sizes according to the ordering */
1965:   PetscMalloc1(n+1,&tmp_vec1);
        /* i walks the original nodes, j the (possibly longer) split list:
           a node of 4 rows becomes 2+2, a node of 5 rows becomes 2+3 */
1966:   for (i=0,j=0; i<node_max; ++i,++j) {
1967:     if (ns[i]>3) {
1968:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1969:       ++j;
1970:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971:     } else {
1972:       tmp_vec1[j] = ns[i];
1973:     }
1974:   }
1975:   /* Use the correct node_max */
1976:   node_max = j;

1978:   /* Now reorder the inode info based on mat re-ordering info */
1979:   /* First create a row -> inode_size_array_index map */
1980:   PetscMalloc1(n+1,&nsmap);
1981:   PetscMalloc1(node_max+1,&tmp_vec2);
        /* nsmap[row] = index into tmp_vec1 of the node that contains row */
1982:   for (i=0,row=0; i<node_max; i++) {
1983:     nodesz = tmp_vec1[i];
1984:     for (j=0; j<nodesz; j++,row++) {
1985:       nsmap[row] = i;
1986:     }
1987:   }
1988:   /* Using nsmap, create a reordered ns structure */
        /* j is the first reordered row of node i; its size is that of the node holding row r[j] of A */
1989:   for (i=0,j=0; i< node_max; i++) {
1990:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1991:     tmp_vec2[i] = nodesz;
1992:     j          += nodesz;
1993:   }
1994:   PetscFree(nsmap);
1995:   PetscFree(tmp_vec1);
1996:   /* Now use the correct ns */
1997:   ns = tmp_vec2;

        /* every pass of this do-loop is one complete factorization attempt; it is repeated
           for as long as MatPivotCheck() leaves sctx.newshift set */
1999:   do {
2000:     sctx.newshift = PETSC_FALSE;
2001:     /* Now loop over each block-row, and do the factorization */
2002:     for (i=0,row=0; i<node_max; i++) {
            /* row is the first row of node i; nz and bjtmp describe row `row` of the factor and
               are used for every row of the node */
2003:       nodesz = ns[i];
2004:       nz     = bi[row+1] - bi[row];
2005:       bjtmp  = bj + bi[row];

2007:       switch (nodesz) {
2008:       case 1:
              /* clear the work row at the nonzero positions of this factor row */
2009:         for  (j=0; j<nz; j++) {
2010:           idx         = bjtmp[j];
2011:           rtmp11[idx] = 0.0;
2012:         }

2014:         /* load in initial (unfactored row) */
2015:         idx    = r[row];
2016:         nz_tmp = ai[idx+1] - ai[idx];
2017:         ajtmp  = aj + ai[idx];
2018:         v1     = aa + ai[idx];

              /* scatter row r[row] of A into the work row, mapping each column through ics */
2020:         for (j=0; j<nz_tmp; j++) {
2021:           idx         = ics[ajtmp[j]];
2022:           rtmp11[idx] = v1[j];
2023:         }
              /* add the current shift where A's diagonal entry (r[row],r[row]) was scattered */
2024:         rtmp11[ics[r[row]]] += sctx.shift_amount;

              /* walk the column indices of this row; every index below row names a pivot row to
                 eliminate with.  NOTE(review): relies on the indices being sorted and on an index
                 >= row being present to stop the walk - confirm */
2026:         prow = *bjtmp++;
2027:         while (prow < row) {
2028:           pc1 = rtmp11 + prow;
2029:           if (*pc1 != 0.0) {
                  /* pv: stored diagonal value of row prow; pj: column indices following that
                     diagonal (nbj is b->j + 1) */
2030:             pv     = ba + bd[prow];
2031:             pj     = nbj + bd[prow];
2032:             mul1   = *pc1 * *pv++;
2033:             *pc1   = mul1;
                  /* number of entries of row prow that follow its diagonal */
2034:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2035:             PetscLogFlops(1+2*nz_tmp);
2036:             for (j=0; j<nz_tmp; j++) {
2037:               tmp          = pv[j];
2038:               idx          = pj[j];
2039:               rtmp11[idx] -= mul1 * tmp;
2040:             }
2041:           }
2042:           prow = *bjtmp++;
2043:         }
2044:         pj  = bj + bi[row];
2045:         pc1 = ba + bi[row];

              /* the pivot is recorded in sctx.pv and inverted in the work row before
                 MatPivotCheck() is called below */
2047:         sctx.pv     = rtmp11[row];
2048:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049:         rs          = 0.0;
              /* copy the work row into the factor; rs sums the magnitudes of the off-diagonal entries */
2050:         for (j=0; j<nz; j++) {
2051:           idx    = pj[j];
2052:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054:         }
2055:         sctx.rs = rs;
2056:         MatPivotCheck(A,info,&sctx,row);
2057:         if (sctx.newshift) goto endofwhile;
2058:         break;

2060:       case 2:
              /* clear both work rows at the nonzero positions of the node's first factor row */
2061:         for (j=0; j<nz; j++) {
2062:           idx         = bjtmp[j];
2063:           rtmp11[idx] = 0.0;
2064:           rtmp22[idx] = 0.0;
2065:         }

2067:         /* load in initial (unfactored row) */
2068:         idx    = r[row];
2069:         nz_tmp = ai[idx+1] - ai[idx];
2070:         ajtmp  = aj + ai[idx];
2071:         v1     = aa + ai[idx];
              /* v2 reads the row stored right after r[row] in A, using the first row's column
                 indices and length, i.e. both rows are taken to share one nonzero pattern */
2072:         v2     = aa + ai[idx+1];
2073:         for (j=0; j<nz_tmp; j++) {
2074:           idx         = ics[ajtmp[j]];
2075:           rtmp11[idx] = v1[j];
2076:           rtmp22[idx] = v2[j];
2077:         }
2078:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2079:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

              /* eliminate with every pivot row above the node, updating both work rows at once */
2081:         prow = *bjtmp++;
2082:         while (prow < row) {
2083:           pc1 = rtmp11 + prow;
2084:           pc2 = rtmp22 + prow;
2085:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2086:             pv   = ba + bd[prow];
2087:             pj   = nbj + bd[prow];
2088:             mul1 = *pc1 * *pv;
2089:             mul2 = *pc2 * *pv;
2090:             ++pv;
2091:             *pc1 = mul1;
2092:             *pc2 = mul2;

2094:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2095:             for (j=0; j<nz_tmp; j++) {
2096:               tmp          = pv[j];
2097:               idx          = pj[j];
2098:               rtmp11[idx] -= mul1 * tmp;
2099:               rtmp22[idx] -= mul2 * tmp;
2100:             }
2101:             PetscLogFlops(2+4*nz_tmp);
2102:           }
2103:           prow = *bjtmp++;
2104:         }

2106:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107:         pc1 = rtmp11 + prow;
2108:         pc2 = rtmp22 + prow;

              /* pivot check for the first row of the node, done on the work row before it is stored */
2110:         sctx.pv = *pc1;
2111:         pj      = bj + bi[prow];
2112:         rs      = 0.0;
2113:         for (j=0; j<nz; j++) {
2114:           idx = pj[j];
2115:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116:         }
2117:         sctx.rs = rs;
2118:         MatPivotCheck(A,info,&sctx,row);
2119:         if (sctx.newshift) goto endofwhile;

              /* eliminate the first row of the node from the second one */
2121:         if (*pc2 != 0.0) {
2122:           pj     = nbj + bd[prow];
2123:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124:           *pc2   = mul2;
2125:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2126:           for (j=0; j<nz_tmp; j++) {
2127:             idx          = pj[j];
2128:             tmp          = rtmp11[idx];
2129:             rtmp22[idx] -= mul2 * tmp;
2130:           }
2131:           PetscLogFlops(1+2*nz_tmp);
2132:         }

2134:         pj  = bj + bi[row];
2135:         pc1 = ba + bi[row];
2136:         pc2 = ba + bi[row+1];

              /* invert both diagonal entries in the work rows, then store both rows using the
                 column indices of the first one; rs is accumulated for the second row only */
2138:         sctx.pv       = rtmp22[row+1];
2139:         rs            = 0.0;
2140:         rtmp11[row]   = 1.0/rtmp11[row];
2141:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2142:         /* copy row entries from dense representation to sparse */
2143:         for (j=0; j<nz; j++) {
2144:           idx    = pj[j];
2145:           pc1[j] = rtmp11[idx];
2146:           pc2[j] = rtmp22[idx];
2147:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148:         }
2149:         sctx.rs = rs;
2150:         MatPivotCheck(A,info,&sctx,row+1);
2151:         if (sctx.newshift) goto endofwhile;
2152:         break;

2154:       case 3:
              /* clear the three work rows at the nonzero positions of the node's first factor row */
2155:         for  (j=0; j<nz; j++) {
2156:           idx         = bjtmp[j];
2157:           rtmp11[idx] = 0.0;
2158:           rtmp22[idx] = 0.0;
2159:           rtmp33[idx] = 0.0;
2160:         }
2161:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162:         idx    = r[row];
2163:         nz_tmp = ai[idx+1] - ai[idx];
2164:         ajtmp  = aj + ai[idx];
2165:         v1     = aa + ai[idx];
              /* v2, v3: the two rows stored after r[row] in A, read with the first row's pattern */
2166:         v2     = aa + ai[idx+1];
2167:         v3     = aa + ai[idx+2];
2168:         for (j=0; j<nz_tmp; j++) {
2169:           idx         = ics[ajtmp[j]];
2170:           rtmp11[idx] = v1[j];
2171:           rtmp22[idx] = v2[j];
2172:           rtmp33[idx] = v3[j];
2173:         }
2174:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2175:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2178:         /* loop over all pivot row blocks above this row block */
2179:         prow = *bjtmp++;
2180:         while (prow < row) {
2181:           pc1 = rtmp11 + prow;
2182:           pc2 = rtmp22 + prow;
2183:           pc3 = rtmp33 + prow;
2184:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185:             pv   = ba  + bd[prow];
2186:             pj   = nbj + bd[prow];
2187:             mul1 = *pc1 * *pv;
2188:             mul2 = *pc2 * *pv;
2189:             mul3 = *pc3 * *pv;
2190:             ++pv;
2191:             *pc1 = mul1;
2192:             *pc2 = mul2;
2193:             *pc3 = mul3;

2195:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2196:             /* update this row based on pivot row */
2197:             for (j=0; j<nz_tmp; j++) {
2198:               tmp          = pv[j];
2199:               idx          = pj[j];
2200:               rtmp11[idx] -= mul1 * tmp;
2201:               rtmp22[idx] -= mul2 * tmp;
2202:               rtmp33[idx] -= mul3 * tmp;
2203:             }
2204:             PetscLogFlops(3+6*nz_tmp);
2205:           }
2206:           prow = *bjtmp++;
2207:         }

2209:         /* Now take care of diagonal 3x3 block in this set of rows */
2210:         /* note: prow = row here */
2211:         pc1 = rtmp11 + prow;
2212:         pc2 = rtmp22 + prow;
2213:         pc3 = rtmp33 + prow;

              /* pivot check for the first row of the node */
2215:         sctx.pv = *pc1;
2216:         pj      = bj + bi[prow];
2217:         rs      = 0.0;
2218:         for (j=0; j<nz; j++) {
2219:           idx = pj[j];
2220:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221:         }
2222:         sctx.rs = rs;
2223:         MatPivotCheck(A,info,&sctx,row);
2224:         if (sctx.newshift) goto endofwhile;

              /* eliminate the first row from the second and third; the pivot is not inverted yet,
                 hence the divisions */
2226:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2227:           mul2   = (*pc2)/(*pc1);
2228:           mul3   = (*pc3)/(*pc1);
2229:           *pc2   = mul2;
2230:           *pc3   = mul3;
2231:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2232:           pj     = nbj + bd[prow];
2233:           for (j=0; j<nz_tmp; j++) {
2234:             idx          = pj[j];
2235:             tmp          = rtmp11[idx];
2236:             rtmp22[idx] -= mul2 * tmp;
2237:             rtmp33[idx] -= mul3 * tmp;
2238:           }
2239:           PetscLogFlops(2+4*nz_tmp);
2240:         }
              /* move on to the second row of the node (prow = row+1 from here on) */
2241:         ++prow;

2243:         pc2     = rtmp22 + prow;
2244:         pc3     = rtmp33 + prow;
2245:         sctx.pv = *pc2;
2246:         pj      = bj + bi[prow];
2247:         rs      = 0.0;
2248:         for (j=0; j<nz; j++) {
2249:           idx = pj[j];
2250:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251:         }
2252:         sctx.rs = rs;
2253:         MatPivotCheck(A,info,&sctx,row+1);
2254:         if (sctx.newshift) goto endofwhile;

              /* eliminate the second row from the third */
2256:         if (*pc3 != 0.0) {
2257:           mul3   = (*pc3)/(*pc2);
2258:           *pc3   = mul3;
2259:           pj     = nbj + bd[prow];
2260:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2261:           for (j=0; j<nz_tmp; j++) {
2262:             idx          = pj[j];
2263:             tmp          = rtmp22[idx];
2264:             rtmp33[idx] -= mul3 * tmp;
2265:           }
2266:           PetscLogFlops(1+2*nz_tmp);
2267:         }

2269:         pj  = bj + bi[row];
2270:         pc1 = ba + bi[row];
2271:         pc2 = ba + bi[row+1];
2272:         pc3 = ba + bi[row+2];

              /* invert the three diagonal entries, then store all rows with the first row's
                 column indices; rs is accumulated for the third row only */
2274:         sctx.pv       = rtmp33[row+2];
2275:         rs            = 0.0;
2276:         rtmp11[row]   = 1.0/rtmp11[row];
2277:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2278:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2279:         /* copy row entries from dense representation to sparse */
2280:         for (j=0; j<nz; j++) {
2281:           idx    = pj[j];
2282:           pc1[j] = rtmp11[idx];
2283:           pc2[j] = rtmp22[idx];
2284:           pc3[j] = rtmp33[idx];
2285:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286:         }

2288:         sctx.rs = rs;
2289:         MatPivotCheck(A,info,&sctx,row+2);
2290:         if (sctx.newshift) goto endofwhile;
2291:         break;

2293:       default:
              /* NOTE(review): this exit does not free the work arrays or restore the index sets
                 acquired above */
2294:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295:       }
2296:       row += nodesz;                 /* Update the row */
2297:     }
          /* jumped to as soon as a new shift is requested, abandoning the current attempt */
2298: endofwhile:;
2299:   } while (sctx.newshift);
2300:   PetscFree3(rtmp11,rtmp22,rtmp33);
2301:   PetscFree(tmp_vec2);
2302:   ISRestoreIndices(isicol,&ic);
2303:   ISRestoreIndices(isrow,&r);
2304:   ISRestoreIndices(iscol,&c);

        /* install the _inplace solve routines on the factor (B and C are the same matrix) */
2306:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2307:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2308:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2309:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2310:   C->assembled              = PETSC_TRUE;
2311:   C->preallocated           = PETSC_TRUE;
        /* report the shifts that were applied; only the positive-definite and nonzero shift
           types are reported here */
2312:   if (sctx.nshift) {
2313:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2314:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2315:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2316:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2317:     }
2318:   }
2319:   PetscLogFlops(C->cmap->n);
        /* NOTE(review): presumably (re)detects the inode structure of the factor - confirm */
2320:   MatSeqAIJCheckInode(C);
2321:   return(0);
2322: }


2325: /* ----------------------------------------------------------- */
/*
   MatSolve_SeqAIJ_Inode - Triangular solves with the factored matrix A, treating the rows of
   each inode (1 to 5 rows) together in unrolled loops.

   Input Parameters:
+  A  - factored matrix; a->i, a->j, a->a, a->diag, a->row, a->col, a->solve_work and the
        inode layout (a->inode.size, a->inode.node_count) are used
-  bb - right-hand side, read as b[r[row]] with r taken from a->row

   Output Parameter:
.  xx - solution, written as x[c[row]] with c taken from a->col

   Notes:
   A forward sweep over the entries addressed by a->i fills a->solve_work; a backward sweep over
   the entries addressed by a->diag then overwrites it and scatters the result into xx.
   The backward sweep multiplies by the value stored at a->diag[row] instead of dividing by it;
   the factorization earlier in this file that installs this routine as its solve stores
   1.0/pivot at that position.
   SETERRQ is raised when A has no inode sizes or when an inode size outside 1..5 is met.

   NOTE(review): in this listing ierr is declared but never assigned, i.e. the return codes of
   the Vec, IS and Petsc calls are not checked - confirm against the upstream source.
*/
2328: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2329: {
2330:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2331:   IS                iscol = a->col,isrow = a->row;
2332:   PetscErrorCode    ierr;
2333:   const PetscInt    *r,*c,*rout,*cout;
2334:   PetscInt          i,j,n = A->rmap->n;
2335:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2336:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2337:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2338:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2339:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2340:   const PetscScalar *b;

2343:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2344:   node_max = a->inode.node_count;
2345:   ns       = a->inode.size;     /* Node Size array */

2347:   VecGetArrayRead(bb,&b);
2348:   VecGetArray(xx,&x);
        /* work vector shared by both sweeps */
2349:   tmp  = a->solve_work;

2351:   ISGetIndices(isrow,&rout); r = rout;
2352:   ISGetIndices(iscol,&cout); c = cout;

2354:   /* forward solve the lower triangular */
        /* tmps is another name for tmp; aa, aj and ad alias a->a, a->j and a->diag */
2355:   tmps = tmp;
2356:   aa   = a_a;
2357:   aj   = a_j;
2358:   ad   = a->diag;

2360:   for (i = 0,row = 0; i< node_max; ++i) {
          /* v1, vi and nz describe the entries of the node's first row as delimited by a->i;
             vi is used as the column index list for every row of the node */
2361:     nsz = ns[i];
2362:     aii = ai[row];
2363:     v1  = aa + aii;
2364:     vi  = aj + aii;
2365:     nz  = ai[row+1]- ai[row];

2367:     if (i < node_max-1) {
2368:       /* Prefetch the indices for the next block */
2369:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2370:       /* Prefetch the data for the next block */
2371:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2372:     }

2374:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2375:     case 1:
2376:       sum1 = b[r[row]];
            /* entries are consumed two at a time; an odd leftover is handled after the loop */
2377:       for (j=0; j<nz-1; j+=2) {
2378:         i0    = vi[j];
2379:         i1    = vi[j+1];
2380:         tmp0  = tmps[i0];
2381:         tmp1  = tmps[i1];
2382:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2383:       }
2384:       if (j == nz-1) {
2385:         tmp0  = tmps[vi[j]];
2386:         sum1 -= v1[j]*tmp0;
2387:       }
2388:       tmp[row++]=sum1;
2389:       break;
2390:     case 2:
2391:       sum1 = b[r[row]];
2392:       sum2 = b[r[row+1]];
2393:       v2   = aa + ai[row+1];

2395:       for (j=0; j<nz-1; j+=2) {
2396:         i0    = vi[j];
2397:         i1    = vi[j+1];
2398:         tmp0  = tmps[i0];
2399:         tmp1  = tmps[i1];
2400:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2401:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2402:       }
2403:       if (j == nz-1) {
2404:         tmp0  = tmps[vi[j]];
2405:         sum1 -= v1[j] *tmp0;
2406:         sum2 -= v2[j] *tmp0;
2407:       }
            /* v2[nz] is the one extra entry of the second row; it multiplies the first row's result */
2408:       sum2     -= v2[nz] * sum1;
2409:       tmp[row++]=sum1;
2410:       tmp[row++]=sum2;
2411:       break;
2412:     case 3:
2413:       sum1 = b[r[row]];
2414:       sum2 = b[r[row+1]];
2415:       sum3 = b[r[row+2]];
2416:       v2   = aa + ai[row+1];
2417:       v3   = aa + ai[row+2];

2419:       for (j=0; j<nz-1; j+=2) {
2420:         i0    = vi[j];
2421:         i1    = vi[j+1];
2422:         tmp0  = tmps[i0];
2423:         tmp1  = tmps[i1];
2424:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427:       }
2428:       if (j == nz-1) {
2429:         tmp0  = tmps[vi[j]];
2430:         sum1 -= v1[j] *tmp0;
2431:         sum2 -= v2[j] *tmp0;
2432:         sum3 -= v3[j] *tmp0;
2433:       }
            /* entries nz, nz+1, ... of the later rows couple them to the earlier rows of the node */
2434:       sum2     -= v2[nz] * sum1;
2435:       sum3     -= v3[nz] * sum1;
2436:       sum3     -= v3[nz+1] * sum2;
2437:       tmp[row++]=sum1;
2438:       tmp[row++]=sum2;
2439:       tmp[row++]=sum3;
2440:       break;

2442:     case 4:
2443:       sum1 = b[r[row]];
2444:       sum2 = b[r[row+1]];
2445:       sum3 = b[r[row+2]];
2446:       sum4 = b[r[row+3]];
2447:       v2   = aa + ai[row+1];
2448:       v3   = aa + ai[row+2];
2449:       v4   = aa + ai[row+3];

2451:       for (j=0; j<nz-1; j+=2) {
2452:         i0    = vi[j];
2453:         i1    = vi[j+1];
2454:         tmp0  = tmps[i0];
2455:         tmp1  = tmps[i1];
2456:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2457:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2458:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2459:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2460:       }
2461:       if (j == nz-1) {
2462:         tmp0  = tmps[vi[j]];
2463:         sum1 -= v1[j] *tmp0;
2464:         sum2 -= v2[j] *tmp0;
2465:         sum3 -= v3[j] *tmp0;
2466:         sum4 -= v4[j] *tmp0;
2467:       }
2468:       sum2 -= v2[nz] * sum1;
2469:       sum3 -= v3[nz] * sum1;
2470:       sum4 -= v4[nz] * sum1;
2471:       sum3 -= v3[nz+1] * sum2;
2472:       sum4 -= v4[nz+1] * sum2;
2473:       sum4 -= v4[nz+2] * sum3;

2475:       tmp[row++]=sum1;
2476:       tmp[row++]=sum2;
2477:       tmp[row++]=sum3;
2478:       tmp[row++]=sum4;
2479:       break;
2480:     case 5:
2481:       sum1 = b[r[row]];
2482:       sum2 = b[r[row+1]];
2483:       sum3 = b[r[row+2]];
2484:       sum4 = b[r[row+3]];
2485:       sum5 = b[r[row+4]];
2486:       v2   = aa + ai[row+1];
2487:       v3   = aa + ai[row+2];
2488:       v4   = aa + ai[row+3];
2489:       v5   = aa + ai[row+4];

2491:       for (j=0; j<nz-1; j+=2) {
2492:         i0    = vi[j];
2493:         i1    = vi[j+1];
2494:         tmp0  = tmps[i0];
2495:         tmp1  = tmps[i1];
2496:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2497:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2498:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2499:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2500:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2501:       }
2502:       if (j == nz-1) {
2503:         tmp0  = tmps[vi[j]];
2504:         sum1 -= v1[j] *tmp0;
2505:         sum2 -= v2[j] *tmp0;
2506:         sum3 -= v3[j] *tmp0;
2507:         sum4 -= v4[j] *tmp0;
2508:         sum5 -= v5[j] *tmp0;
2509:       }

2511:       sum2 -= v2[nz] * sum1;
2512:       sum3 -= v3[nz] * sum1;
2513:       sum4 -= v4[nz] * sum1;
2514:       sum5 -= v5[nz] * sum1;
2515:       sum3 -= v3[nz+1] * sum2;
2516:       sum4 -= v4[nz+1] * sum2;
2517:       sum5 -= v5[nz+1] * sum2;
2518:       sum4 -= v4[nz+2] * sum3;
2519:       sum5 -= v5[nz+2] * sum3;
2520:       sum5 -= v5[nz+3] * sum4;

2522:       tmp[row++]=sum1;
2523:       tmp[row++]=sum2;
2524:       tmp[row++]=sum3;
2525:       tmp[row++]=sum4;
2526:       tmp[row++]=sum5;
2527:       break;
2528:     default:
            /* NOTE(review): this exit does not restore the arrays and index sets acquired above */
2529:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2530:     }
2531:   }
2532:   /* backward solve the upper triangular */
        /* nodes are visited last to first and row is the LAST row of the current node; the entries
           used for that row lie at ad[row+1]+1 .. ad[row]-1 with the diagonal value at ad[row]
           (so ad[n] is read when row = n-1) */
2533:   for (i=node_max -1,row = n-1; i>=0; i--) {
2534:     nsz = ns[i];
2535:     aii = ad[row+1] + 1;
2536:     v1  = aa + aii;
2537:     vi  = aj + aii;
2538:     nz  = ad[row]- ad[row+1] - 1;

2540:     if (i > 0) {
2541:       /* Prefetch the indices for the next block */
2542:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543:       /* Prefetch the data for the next block */
2544:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545:     }

2547:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2548:     case 1:
2549:       sum1 = tmp[row];

2551:       for (j=0; j<nz-1; j+=2) {
2552:         i0    = vi[j];
2553:         i1    = vi[j+1];
2554:         tmp0  = tmps[i0];
2555:         tmp1  = tmps[i1];
2556:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557:       }
2558:       if (j == nz-1) {
2559:         tmp0  = tmps[vi[j]];
2560:         sum1 -= v1[j]*tmp0;
2561:       }
            /* v1[nz] is the value at ad[row]; the result goes both to the work vector and to x */
2562:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2563:       break;
2564:     case 2:
2565:       sum1 = tmp[row];
2566:       sum2 = tmp[row-1];
            /* v2 starts right after the diagonal position of row: v2[0] couples row-1 to row,
               and v2[j+1] lines up with column vi[j] */
2567:       v2   = aa + ad[row] + 1;
2568:       for (j=0; j<nz-1; j+=2) {
2569:         i0    = vi[j];
2570:         i1    = vi[j+1];
2571:         tmp0  = tmps[i0];
2572:         tmp1  = tmps[i1];
2573:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2574:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2575:       }
2576:       if (j == nz-1) {
2577:         tmp0  = tmps[vi[j]];
2578:         sum1 -= v1[j]* tmp0;
2579:         sum2 -= v2[j+1]* tmp0;
2580:       }

            /* finish row, then use its value to finish row-1; v2[nz+1] is used as row-1's diagonal value */
2582:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2583:       sum2     -= v2[0] * tmp0;
2584:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2585:       break;
2586:     case 3:
2587:       sum1 = tmp[row];
2588:       sum2 = tmp[row -1];
2589:       sum3 = tmp[row -2];
            /* each earlier row of the node has one more leading entry than the row after it,
               hence the growing offsets j+1, j+2, ... below */
2590:       v2   = aa + ad[row] + 1;
2591:       v3   = aa + ad[row -1] + 1;
2592:       for (j=0; j<nz-1; j+=2) {
2593:         i0    = vi[j];
2594:         i1    = vi[j+1];
2595:         tmp0  = tmps[i0];
2596:         tmp1  = tmps[i1];
2597:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600:       }
2601:       if (j== nz-1) {
2602:         tmp0  = tmps[vi[j]];
2603:         sum1 -= v1[j] * tmp0;
2604:         sum2 -= v2[j+1] * tmp0;
2605:         sum3 -= v3[j+2] * tmp0;
2606:       }
2607:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2608:       sum2     -= v2[0]* tmp0;
2609:       sum3     -= v3[1] * tmp0;
2610:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2611:       sum3     -= v3[0]* tmp0;
2612:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2614:       break;
2615:     case 4:
2616:       sum1 = tmp[row];
2617:       sum2 = tmp[row -1];
2618:       sum3 = tmp[row -2];
2619:       sum4 = tmp[row -3];
2620:       v2   = aa + ad[row]+1;
2621:       v3   = aa + ad[row -1]+1;
2622:       v4   = aa + ad[row -2]+1;

2624:       for (j=0; j<nz-1; j+=2) {
2625:         i0    = vi[j];
2626:         i1    = vi[j+1];
2627:         tmp0  = tmps[i0];
2628:         tmp1  = tmps[i1];
2629:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2630:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633:       }
2634:       if (j== nz-1) {
2635:         tmp0  = tmps[vi[j]];
2636:         sum1 -= v1[j] * tmp0;
2637:         sum2 -= v2[j+1] * tmp0;
2638:         sum3 -= v3[j+2] * tmp0;
2639:         sum4 -= v4[j+3] * tmp0;
2640:       }

2642:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643:       sum2     -= v2[0] * tmp0;
2644:       sum3     -= v3[1] * tmp0;
2645:       sum4     -= v4[2] * tmp0;
2646:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2647:       sum3     -= v3[0] * tmp0;
2648:       sum4     -= v4[1] * tmp0;
2649:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650:       sum4     -= v4[0] * tmp0;
2651:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2652:       break;
2653:     case 5:
2654:       sum1 = tmp[row];
2655:       sum2 = tmp[row -1];
2656:       sum3 = tmp[row -2];
2657:       sum4 = tmp[row -3];
2658:       sum5 = tmp[row -4];
2659:       v2   = aa + ad[row]+1;
2660:       v3   = aa + ad[row -1]+1;
2661:       v4   = aa + ad[row -2]+1;
2662:       v5   = aa + ad[row -3]+1;
2663:       for (j=0; j<nz-1; j+=2) {
2664:         i0    = vi[j];
2665:         i1    = vi[j+1];
2666:         tmp0  = tmps[i0];
2667:         tmp1  = tmps[i1];
2668:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2669:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2670:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2671:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2672:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2673:       }
2674:       if (j==nz-1) {
2675:         tmp0  = tmps[vi[j]];
2676:         sum1 -= v1[j] * tmp0;
2677:         sum2 -= v2[j+1] * tmp0;
2678:         sum3 -= v3[j+2] * tmp0;
2679:         sum4 -= v4[j+3] * tmp0;
2680:         sum5 -= v5[j+4] * tmp0;
2681:       }

2683:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2684:       sum2     -= v2[0] * tmp0;
2685:       sum3     -= v3[1] * tmp0;
2686:       sum4     -= v4[2] * tmp0;
2687:       sum5     -= v5[3] * tmp0;
2688:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2689:       sum3     -= v3[0] * tmp0;
2690:       sum4     -= v4[1] * tmp0;
2691:       sum5     -= v5[2] * tmp0;
2692:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2693:       sum4     -= v4[0] * tmp0;
2694:       sum5     -= v5[1] * tmp0;
2695:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2696:       sum5     -= v5[0] * tmp0;
2697:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2698:       break;
2699:     default:
            /* NOTE(review): this exit does not restore the arrays and index sets acquired above */
2700:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2701:     }
2702:   }
2703:   ISRestoreIndices(isrow,&rout);
2704:   ISRestoreIndices(iscol,&cout);
2705:   VecRestoreArrayRead(bb,&b);
2706:   VecRestoreArray(xx,&x);
2707:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2708:   return(0);
2709: }


2712: /*
2713:      Makes a longer coloring[] array and calls the usual code with that
2714: */
2717: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2718: {
2719:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2720:   PetscErrorCode  ierr;
2721:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2722:   PetscInt        *colorused,i;
2723:   ISColoringValue *newcolor;

2726:   PetscMalloc1(n+1,&newcolor);
2727:   /* loop over inodes, marking a color for each column*/
2728:   row = 0;
2729:   for (i=0; i<m; i++) {
2730:     for (j=0; j<ns[i]; j++) {
2731:       newcolor[row++] = coloring[i] + j*ncolors;
2732:     }
2733:   }

2735:   /* eliminate unneeded colors */
2736:   PetscCalloc1(5*ncolors,&colorused);
2737:   for (i=0; i<n; i++) {
2738:     colorused[newcolor[i]] = 1;
2739:   }

2741:   for (i=1; i<5*ncolors; i++) {
2742:     colorused[i] += colorused[i-1];
2743:   }
2744:   ncolors = colorused[5*ncolors-1];
2745:   for (i=0; i<n; i++) {
2746:     newcolor[i] = colorused[newcolor[i]]-1;
2747:   }
2748:   PetscFree(colorused);
2749:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2750:   PetscFree(coloring);
2751:   return(0);
2752: }

2754: #include <petsc/private/kernels/blockinvert.h>

2758: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2759: {
2760:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2761:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2762:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2763:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2764:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2765:   const PetscScalar *xb, *b;
2766:   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2767:   PetscErrorCode    ierr;
2768:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2769:   PetscInt          sz,k,ipvt[5];
2770:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2773:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2774:   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2775:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2777:   if (!a->inode.ibdiagvalid) {
2778:     if (!a->inode.ibdiag) {
2779:       /* calculate space needed for diagonal blocks */
2780:       for (i=0; i<m; i++) {
2781:         cnt += sizes[i]*sizes[i];
2782:       }
2783:       a->inode.bdiagsize = cnt;

2785:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2786:     }

2788:     /* copy over the diagonal blocks and invert them */
2789:     ibdiag = a->inode.ibdiag;
2790:     bdiag  = a->inode.bdiag;
2791:     cnt    = 0;
2792:     for (i=0, row = 0; i<m; i++) {
2793:       for (j=0; j<sizes[i]; j++) {
2794:         for (k=0; k<sizes[i]; k++) {
2795:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796:         }
2797:       }
2798:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));

2800:       switch (sizes[i]) {
2801:       case 1:
2802:         /* Create matrix data structure */
2803:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2805:         break;
2806:       case 2:
2807:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2808:         break;
2809:       case 3:
2810:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2811:         break;
2812:       case 4:
2813:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2814:         break;
2815:       case 5:
2816:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2817:         break;
2818:       default:
2819:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820:       }
2821:       cnt += sizes[i]*sizes[i];
2822:       row += sizes[i];
2823:     }
2824:     a->inode.ibdiagvalid = PETSC_TRUE;
2825:   }
2826:   ibdiag = a->inode.ibdiag;
2827:   bdiag  = a->inode.bdiag;
2828:   t      = a->inode.ssor_work;

2830:   VecGetArray(xx,&x);
2831:   VecGetArrayRead(bb,&b);
2832:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2833:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2834:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2836:       for (i=0, row=0; i<m; i++) {
2837:         sz  = diag[row] - ii[row];
2838:         v1  = a->a + ii[row];
2839:         idx = a->j + ii[row];

2841:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2842:         switch (sizes[i]) {
2843:         case 1:

2845:           sum1 = b[row];
2846:           for (n = 0; n<sz-1; n+=2) {
2847:             i1    = idx[0];
2848:             i2    = idx[1];
2849:             idx  += 2;
2850:             tmp0  = x[i1];
2851:             tmp1  = x[i2];
2852:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2853:           }

2855:           if (n == sz-1) {
2856:             tmp0  = x[*idx];
2857:             sum1 -= *v1 * tmp0;
2858:           }
2859:           t[row]   = sum1;
2860:           x[row++] = sum1*(*ibdiag++);
2861:           break;
2862:         case 2:
2863:           v2   = a->a + ii[row+1];
2864:           sum1 = b[row];
2865:           sum2 = b[row+1];
2866:           for (n = 0; n<sz-1; n+=2) {
2867:             i1    = idx[0];
2868:             i2    = idx[1];
2869:             idx  += 2;
2870:             tmp0  = x[i1];
2871:             tmp1  = x[i2];
2872:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2873:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2874:           }

2876:           if (n == sz-1) {
2877:             tmp0  = x[*idx];
2878:             sum1 -= v1[0] * tmp0;
2879:             sum2 -= v2[0] * tmp0;
2880:           }
2881:           t[row]   = sum1;
2882:           t[row+1] = sum2;
2883:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2884:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2885:           ibdiag  += 4;
2886:           break;
2887:         case 3:
2888:           v2   = a->a + ii[row+1];
2889:           v3   = a->a + ii[row+2];
2890:           sum1 = b[row];
2891:           sum2 = b[row+1];
2892:           sum3 = b[row+2];
2893:           for (n = 0; n<sz-1; n+=2) {
2894:             i1    = idx[0];
2895:             i2    = idx[1];
2896:             idx  += 2;
2897:             tmp0  = x[i1];
2898:             tmp1  = x[i2];
2899:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2900:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2901:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2902:           }

2904:           if (n == sz-1) {
2905:             tmp0  = x[*idx];
2906:             sum1 -= v1[0] * tmp0;
2907:             sum2 -= v2[0] * tmp0;
2908:             sum3 -= v3[0] * tmp0;
2909:           }
2910:           t[row]   = sum1;
2911:           t[row+1] = sum2;
2912:           t[row+2] = sum3;
2913:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916:           ibdiag  += 9;
2917:           break;
2918:         case 4:
2919:           v2   = a->a + ii[row+1];
2920:           v3   = a->a + ii[row+2];
2921:           v4   = a->a + ii[row+3];
2922:           sum1 = b[row];
2923:           sum2 = b[row+1];
2924:           sum3 = b[row+2];
2925:           sum4 = b[row+3];
2926:           for (n = 0; n<sz-1; n+=2) {
2927:             i1    = idx[0];
2928:             i2    = idx[1];
2929:             idx  += 2;
2930:             tmp0  = x[i1];
2931:             tmp1  = x[i2];
2932:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936:           }

2938:           if (n == sz-1) {
2939:             tmp0  = x[*idx];
2940:             sum1 -= v1[0] * tmp0;
2941:             sum2 -= v2[0] * tmp0;
2942:             sum3 -= v3[0] * tmp0;
2943:             sum4 -= v4[0] * tmp0;
2944:           }
2945:           t[row]   = sum1;
2946:           t[row+1] = sum2;
2947:           t[row+2] = sum3;
2948:           t[row+3] = sum4;
2949:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2950:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2951:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2952:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2953:           ibdiag  += 16;
2954:           break;
2955:         case 5:
2956:           v2   = a->a + ii[row+1];
2957:           v3   = a->a + ii[row+2];
2958:           v4   = a->a + ii[row+3];
2959:           v5   = a->a + ii[row+4];
2960:           sum1 = b[row];
2961:           sum2 = b[row+1];
2962:           sum3 = b[row+2];
2963:           sum4 = b[row+3];
2964:           sum5 = b[row+4];
2965:           for (n = 0; n<sz-1; n+=2) {
2966:             i1    = idx[0];
2967:             i2    = idx[1];
2968:             idx  += 2;
2969:             tmp0  = x[i1];
2970:             tmp1  = x[i2];
2971:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2972:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2973:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2974:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2975:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2976:           }

2978:           if (n == sz-1) {
2979:             tmp0  = x[*idx];
2980:             sum1 -= v1[0] * tmp0;
2981:             sum2 -= v2[0] * tmp0;
2982:             sum3 -= v3[0] * tmp0;
2983:             sum4 -= v4[0] * tmp0;
2984:             sum5 -= v5[0] * tmp0;
2985:           }
2986:           t[row]   = sum1;
2987:           t[row+1] = sum2;
2988:           t[row+2] = sum3;
2989:           t[row+3] = sum4;
2990:           t[row+4] = sum5;
2991:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2992:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2993:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2994:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2995:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2996:           ibdiag  += 25;
2997:           break;
2998:         default:
2999:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3000:         }
3001:       }

3003:       xb   = t;
3004:       PetscLogFlops(a->nz);
3005:     } else xb = b;
3006:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3008:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3009:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3010:         ibdiag -= sizes[i]*sizes[i];
3011:         sz      = ii[row+1] - diag[row] - 1;
3012:         v1      = a->a + diag[row] + 1;
3013:         idx     = a->j + diag[row] + 1;

3015:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3016:         switch (sizes[i]) {
3017:         case 1:

3019:           sum1 = xb[row];
3020:           for (n = 0; n<sz-1; n+=2) {
3021:             i1    = idx[0];
3022:             i2    = idx[1];
3023:             idx  += 2;
3024:             tmp0  = x[i1];
3025:             tmp1  = x[i2];
3026:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3027:           }

3029:           if (n == sz-1) {
3030:             tmp0  = x[*idx];
3031:             sum1 -= *v1*tmp0;
3032:           }
3033:           x[row--] = sum1*(*ibdiag);
3034:           break;

3036:         case 2:

3038:           sum1 = xb[row];
3039:           sum2 = xb[row-1];
3040:           /* note that sum1 is associated with the second of the two rows */
3041:           v2 = a->a + diag[row-1] + 2;
3042:           for (n = 0; n<sz-1; n+=2) {
3043:             i1    = idx[0];
3044:             i2    = idx[1];
3045:             idx  += 2;
3046:             tmp0  = x[i1];
3047:             tmp1  = x[i2];
3048:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3049:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3050:           }

3052:           if (n == sz-1) {
3053:             tmp0  = x[*idx];
3054:             sum1 -= *v1*tmp0;
3055:             sum2 -= *v2*tmp0;
3056:           }
3057:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3058:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3059:           break;
3060:         case 3:

3062:           sum1 = xb[row];
3063:           sum2 = xb[row-1];
3064:           sum3 = xb[row-2];
3065:           v2   = a->a + diag[row-1] + 2;
3066:           v3   = a->a + diag[row-2] + 3;
3067:           for (n = 0; n<sz-1; n+=2) {
3068:             i1    = idx[0];
3069:             i2    = idx[1];
3070:             idx  += 2;
3071:             tmp0  = x[i1];
3072:             tmp1  = x[i2];
3073:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3074:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3075:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3076:           }

3078:           if (n == sz-1) {
3079:             tmp0  = x[*idx];
3080:             sum1 -= *v1*tmp0;
3081:             sum2 -= *v2*tmp0;
3082:             sum3 -= *v3*tmp0;
3083:           }
3084:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3085:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3086:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3087:           break;
3088:         case 4:

3090:           sum1 = xb[row];
3091:           sum2 = xb[row-1];
3092:           sum3 = xb[row-2];
3093:           sum4 = xb[row-3];
3094:           v2   = a->a + diag[row-1] + 2;
3095:           v3   = a->a + diag[row-2] + 3;
3096:           v4   = a->a + diag[row-3] + 4;
3097:           for (n = 0; n<sz-1; n+=2) {
3098:             i1    = idx[0];
3099:             i2    = idx[1];
3100:             idx  += 2;
3101:             tmp0  = x[i1];
3102:             tmp1  = x[i2];
3103:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107:           }

3109:           if (n == sz-1) {
3110:             tmp0  = x[*idx];
3111:             sum1 -= *v1*tmp0;
3112:             sum2 -= *v2*tmp0;
3113:             sum3 -= *v3*tmp0;
3114:             sum4 -= *v4*tmp0;
3115:           }
3116:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3117:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3118:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3119:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3120:           break;
3121:         case 5:

3123:           sum1 = xb[row];
3124:           sum2 = xb[row-1];
3125:           sum3 = xb[row-2];
3126:           sum4 = xb[row-3];
3127:           sum5 = xb[row-4];
3128:           v2   = a->a + diag[row-1] + 2;
3129:           v3   = a->a + diag[row-2] + 3;
3130:           v4   = a->a + diag[row-3] + 4;
3131:           v5   = a->a + diag[row-4] + 5;
3132:           for (n = 0; n<sz-1; n+=2) {
3133:             i1    = idx[0];
3134:             i2    = idx[1];
3135:             idx  += 2;
3136:             tmp0  = x[i1];
3137:             tmp1  = x[i2];
3138:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3139:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3140:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3141:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3142:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3143:           }

3145:           if (n == sz-1) {
3146:             tmp0  = x[*idx];
3147:             sum1 -= *v1*tmp0;
3148:             sum2 -= *v2*tmp0;
3149:             sum3 -= *v3*tmp0;
3150:             sum4 -= *v4*tmp0;
3151:             sum5 -= *v5*tmp0;
3152:           }
3153:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3154:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3155:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3156:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3157:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3158:           break;
3159:         default:
3160:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3161:         }
3162:       }

3164:       PetscLogFlops(a->nz);
3165:     }
3166:     its--;
3167:   }
3168:   while (its--) {

3170:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3171:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3172:            i<m;
3173:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3175:         sz  = diag[row] - ii[row];
3176:         v1  = a->a + ii[row];
3177:         idx = a->j + ii[row];
3178:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3179:         switch (sizes[i]) {
3180:         case 1:
3181:           sum1 = b[row];
3182:           for (n = 0; n<sz-1; n+=2) {
3183:             i1    = idx[0];
3184:             i2    = idx[1];
3185:             idx  += 2;
3186:             tmp0  = x[i1];
3187:             tmp1  = x[i2];
3188:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189:           }
3190:           if (n == sz-1) {
3191:             tmp0  = x[*idx++];
3192:             sum1 -= *v1 * tmp0;
3193:             v1++;
3194:           }
3195:           t[row]   = sum1;
3196:           sz      = ii[row+1] - diag[row] - 1;
3197:           idx     = a->j + diag[row] + 1;
3198:           v1 += 1;
3199:           for (n = 0; n<sz-1; n+=2) {
3200:             i1    = idx[0];
3201:             i2    = idx[1];
3202:             idx  += 2;
3203:             tmp0  = x[i1];
3204:             tmp1  = x[i2];
3205:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3206:           }
3207:           if (n == sz-1) {
3208:             tmp0  = x[*idx++];
3209:             sum1 -= *v1 * tmp0;
3210:           }
3211:           /* in MatSOR_SeqAIJ this line would be
3212:            *
3213:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3214:            *
3215:            * but omega == 1, so this becomes
3216:            *
3217:            * x[row] = sum1*(*ibdiag++);
3218:            *
3219:            */
3220:           x[row] = sum1*(*ibdiag);
3221:           break;
3222:         case 2:
3223:           v2   = a->a + ii[row+1];
3224:           sum1 = b[row];
3225:           sum2 = b[row+1];
3226:           for (n = 0; n<sz-1; n+=2) {
3227:             i1    = idx[0];
3228:             i2    = idx[1];
3229:             idx  += 2;
3230:             tmp0  = x[i1];
3231:             tmp1  = x[i2];
3232:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3233:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3234:           }
3235:           if (n == sz-1) {
3236:             tmp0  = x[*idx++];
3237:             sum1 -= v1[0] * tmp0;
3238:             sum2 -= v2[0] * tmp0;
3239:             v1++; v2++;
3240:           }
3241:           t[row]   = sum1;
3242:           t[row+1] = sum2;
3243:           sz      = ii[row+1] - diag[row] - 2;
3244:           idx     = a->j + diag[row] + 2;
3245:           v1 += 2;
3246:           v2 += 2;
3247:           for (n = 0; n<sz-1; n+=2) {
3248:             i1    = idx[0];
3249:             i2    = idx[1];
3250:             idx  += 2;
3251:             tmp0  = x[i1];
3252:             tmp1  = x[i2];
3253:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255:           }
3256:           if (n == sz-1) {
3257:             tmp0  = x[*idx];
3258:             sum1 -= v1[0] * tmp0;
3259:             sum2 -= v2[0] * tmp0;
3260:           }
3261:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3262:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3263:           break;
3264:         case 3:
3265:           v2   = a->a + ii[row+1];
3266:           v3   = a->a + ii[row+2];
3267:           sum1 = b[row];
3268:           sum2 = b[row+1];
3269:           sum3 = b[row+2];
3270:           for (n = 0; n<sz-1; n+=2) {
3271:             i1    = idx[0];
3272:             i2    = idx[1];
3273:             idx  += 2;
3274:             tmp0  = x[i1];
3275:             tmp1  = x[i2];
3276:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3277:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3278:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3279:           }
3280:           if (n == sz-1) {
3281:             tmp0  = x[*idx++];
3282:             sum1 -= v1[0] * tmp0;
3283:             sum2 -= v2[0] * tmp0;
3284:             sum3 -= v3[0] * tmp0;
3285:             v1++; v2++; v3++;
3286:           }
3287:           t[row]   = sum1;
3288:           t[row+1] = sum2;
3289:           t[row+2] = sum3;
3290:           sz      = ii[row+1] - diag[row] - 3;
3291:           idx     = a->j + diag[row] + 3;
3292:           v1 += 3;
3293:           v2 += 3;
3294:           v3 += 3;
3295:           for (n = 0; n<sz-1; n+=2) {
3296:             i1    = idx[0];
3297:             i2    = idx[1];
3298:             idx  += 2;
3299:             tmp0  = x[i1];
3300:             tmp1  = x[i2];
3301:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3302:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3303:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3304:           }
3305:           if (n == sz-1) {
3306:             tmp0  = x[*idx];
3307:             sum1 -= v1[0] * tmp0;
3308:             sum2 -= v2[0] * tmp0;
3309:             sum3 -= v3[0] * tmp0;
3310:           }
3311:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3312:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3313:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3314:           break;
3315:         case 4:
3316:           v2   = a->a + ii[row+1];
3317:           v3   = a->a + ii[row+2];
3318:           v4   = a->a + ii[row+3];
3319:           sum1 = b[row];
3320:           sum2 = b[row+1];
3321:           sum3 = b[row+2];
3322:           sum4 = b[row+3];
3323:           for (n = 0; n<sz-1; n+=2) {
3324:             i1    = idx[0];
3325:             i2    = idx[1];
3326:             idx  += 2;
3327:             tmp0  = x[i1];
3328:             tmp1  = x[i2];
3329:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333:           }
3334:           if (n == sz-1) {
3335:             tmp0  = x[*idx++];
3336:             sum1 -= v1[0] * tmp0;
3337:             sum2 -= v2[0] * tmp0;
3338:             sum3 -= v3[0] * tmp0;
3339:             sum4 -= v4[0] * tmp0;
3340:             v1++; v2++; v3++; v4++;
3341:           }
3342:           t[row]   = sum1;
3343:           t[row+1] = sum2;
3344:           t[row+2] = sum3;
3345:           t[row+3] = sum4;
3346:           sz      = ii[row+1] - diag[row] - 4;
3347:           idx     = a->j + diag[row] + 4;
3348:           v1 += 4;
3349:           v2 += 4;
3350:           v3 += 4;
3351:           v4 += 4;
3352:           for (n = 0; n<sz-1; n+=2) {
3353:             i1    = idx[0];
3354:             i2    = idx[1];
3355:             idx  += 2;
3356:             tmp0  = x[i1];
3357:             tmp1  = x[i2];
3358:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3359:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3360:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3361:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3362:           }
3363:           if (n == sz-1) {
3364:             tmp0  = x[*idx];
3365:             sum1 -= v1[0] * tmp0;
3366:             sum2 -= v2[0] * tmp0;
3367:             sum3 -= v3[0] * tmp0;
3368:             sum4 -= v4[0] * tmp0;
3369:           }
3370:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3371:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3372:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3373:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3374:           break;
3375:         case 5:
3376:           v2   = a->a + ii[row+1];
3377:           v3   = a->a + ii[row+2];
3378:           v4   = a->a + ii[row+3];
3379:           v5   = a->a + ii[row+4];
3380:           sum1 = b[row];
3381:           sum2 = b[row+1];
3382:           sum3 = b[row+2];
3383:           sum4 = b[row+3];
3384:           sum5 = b[row+4];
3385:           for (n = 0; n<sz-1; n+=2) {
3386:             i1    = idx[0];
3387:             i2    = idx[1];
3388:             idx  += 2;
3389:             tmp0  = x[i1];
3390:             tmp1  = x[i2];
3391:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3392:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3393:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3394:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3395:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3396:           }
3397:           if (n == sz-1) {
3398:             tmp0  = x[*idx++];
3399:             sum1 -= v1[0] * tmp0;
3400:             sum2 -= v2[0] * tmp0;
3401:             sum3 -= v3[0] * tmp0;
3402:             sum4 -= v4[0] * tmp0;
3403:             sum5 -= v5[0] * tmp0;
3404:             v1++; v2++; v3++; v4++; v5++;
3405:           }
3406:           t[row]   = sum1;
3407:           t[row+1] = sum2;
3408:           t[row+2] = sum3;
3409:           t[row+3] = sum4;
3410:           t[row+4] = sum5;
3411:           sz      = ii[row+1] - diag[row] - 5;
3412:           idx     = a->j + diag[row] + 5;
3413:           v1 += 5;
3414:           v2 += 5;
3415:           v3 += 5;
3416:           v4 += 5;
3417:           v5 += 5;
3418:           for (n = 0; n<sz-1; n+=2) {
3419:             i1    = idx[0];
3420:             i2    = idx[1];
3421:             idx  += 2;
3422:             tmp0  = x[i1];
3423:             tmp1  = x[i2];
3424:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3425:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3426:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3427:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3428:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3429:           }
3430:           if (n == sz-1) {
3431:             tmp0  = x[*idx];
3432:             sum1 -= v1[0] * tmp0;
3433:             sum2 -= v2[0] * tmp0;
3434:             sum3 -= v3[0] * tmp0;
3435:             sum4 -= v4[0] * tmp0;
3436:             sum5 -= v5[0] * tmp0;
3437:           }
3438:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3439:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3440:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3441:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3442:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3443:           break;
3444:         default:
3445:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3446:         }
3447:       }
3448:       xb   = t;
3449:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3450:     } else xb = b;

3452:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3454:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3455:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3456:         ibdiag -= sizes[i]*sizes[i];

3458:         /* set RHS */
3459:         if (xb == b) {
3460:           /* whole (old way) */
3461:           sz      = ii[row+1] - ii[row];
3462:           idx     = a->j + ii[row];
3463:           switch (sizes[i]) {
3464:           case 5:
3465:             v5      = a->a + ii[row-4];
3466:           case 4: /* fall through */
3467:             v4      = a->a + ii[row-3];
3468:           case 3:
3469:             v3      = a->a + ii[row-2];
3470:           case 2:
3471:             v2      = a->a + ii[row-1];
3472:           case 1:
3473:             v1      = a->a + ii[row];
3474:             break;
3475:           default:
3476:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3477:           }
3478:         } else {
3479:           /* upper, no diag */
3480:           sz      = ii[row+1] - diag[row] - 1;
3481:           idx     = a->j + diag[row] + 1;
3482:           switch (sizes[i]) {
3483:           case 5:
3484:             v5      = a->a + diag[row-4] + 5;
3485:           case 4: /* fall through */
3486:             v4      = a->a + diag[row-3] + 4;
3487:           case 3:
3488:             v3      = a->a + diag[row-2] + 3;
3489:           case 2:
3490:             v2      = a->a + diag[row-1] + 2;
3491:           case 1:
3492:             v1      = a->a + diag[row] + 1;
3493:           }
3494:         }
3495:         /* set sum */
3496:         switch (sizes[i]) {
3497:         case 5:
3498:           sum5 = xb[row-4];
3499:         case 4: /* fall through */
3500:           sum4 = xb[row-3];
3501:         case 3:
3502:           sum3 = xb[row-2];
3503:         case 2:
3504:           sum2 = xb[row-1];
3505:         case 1:
3506:           /* note that sum1 is associated with the last row */
3507:           sum1 = xb[row];
3508:         }
3509:         /* do sums */
3510:         for (n = 0; n<sz-1; n+=2) {
3511:             i1    = idx[0];
3512:             i2    = idx[1];
3513:             idx  += 2;
3514:             tmp0  = x[i1];
3515:             tmp1  = x[i2];
3516:             switch (sizes[i]) {
3517:             case 5:
3518:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3519:             case 4: /* fall through */
3520:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3521:             case 3:
3522:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3523:             case 2:
3524:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3525:             case 1:
3526:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3527:             }
3528:         }
3529:         /* ragged edge */
3530:         if (n == sz-1) {
3531:           tmp0  = x[*idx];
3532:           switch (sizes[i]) {
3533:           case 5:
3534:             sum5 -= *v5*tmp0;
3535:           case 4: /* fall through */
3536:             sum4 -= *v4*tmp0;
3537:           case 3:
3538:             sum3 -= *v3*tmp0;
3539:           case 2:
3540:             sum2 -= *v2*tmp0;
3541:           case 1:
3542:             sum1 -= *v1*tmp0;
3543:           }
3544:         }
3545:         /* update */
3546:         if (xb == b) {
3547:           /* whole (old way) w/ diag */
3548:           switch (sizes[i]) {
3549:           case 5:
3550:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555:             break;
3556:           case 4:
3557:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561:             break;
3562:           case 3:
3563:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566:             break;
3567:           case 2:
3568:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3569:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3570:             break;
3571:           case 1:
3572:             x[row--] += sum1*(*ibdiag);
3573:             break;
3574:           }
3575:         } else {
3576:           /* no diag so set =  */
3577:           switch (sizes[i]) {
3578:           case 5:
3579:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3580:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3581:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3582:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3583:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3584:             break;
3585:           case 4:
3586:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3587:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3588:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3589:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3590:             break;
3591:           case 3:
3592:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3593:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3594:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3595:             break;
3596:           case 2:
3597:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3598:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3599:             break;
3600:           case 1:
3601:             x[row--] = sum1*(*ibdiag);
3602:             break;
3603:           }
3604:         }
3605:       }
3606:       if (xb == b) {
3607:         PetscLogFlops(2.0*a->nz);
3608:       } else {
3609:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3610:       }
3611:     }
3612:   }
3613:   if (flag & SOR_EISENSTAT) {
3614:     /*
3615:           Apply  (U + D)^-1  where D is now the block diagonal
3616:     */
3617:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3618:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3619:       ibdiag -= sizes[i]*sizes[i];
3620:       sz      = ii[row+1] - diag[row] - 1;
3621:       v1      = a->a + diag[row] + 1;
3622:       idx     = a->j + diag[row] + 1;
3623:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3624:       switch (sizes[i]) {
3625:       case 1:

3627:         sum1 = b[row];
3628:         for (n = 0; n<sz-1; n+=2) {
3629:           i1    = idx[0];
3630:           i2    = idx[1];
3631:           idx  += 2;
3632:           tmp0  = x[i1];
3633:           tmp1  = x[i2];
3634:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3635:         }

3637:         if (n == sz-1) {
3638:           tmp0  = x[*idx];
3639:           sum1 -= *v1*tmp0;
3640:         }
3641:         x[row] = sum1*(*ibdiag);row--;
3642:         break;

3644:       case 2:

3646:         sum1 = b[row];
3647:         sum2 = b[row-1];
3648:         /* note that sum1 is associated with the second of the two rows */
3649:         v2 = a->a + diag[row-1] + 2;
3650:         for (n = 0; n<sz-1; n+=2) {
3651:           i1    = idx[0];
3652:           i2    = idx[1];
3653:           idx  += 2;
3654:           tmp0  = x[i1];
3655:           tmp1  = x[i2];
3656:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3657:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3658:         }

3660:         if (n == sz-1) {
3661:           tmp0  = x[*idx];
3662:           sum1 -= *v1*tmp0;
3663:           sum2 -= *v2*tmp0;
3664:         }
3665:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3666:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3667:         row     -= 2;
3668:         break;
3669:       case 3:

3671:         sum1 = b[row];
3672:         sum2 = b[row-1];
3673:         sum3 = b[row-2];
3674:         v2   = a->a + diag[row-1] + 2;
3675:         v3   = a->a + diag[row-2] + 3;
3676:         for (n = 0; n<sz-1; n+=2) {
3677:           i1    = idx[0];
3678:           i2    = idx[1];
3679:           idx  += 2;
3680:           tmp0  = x[i1];
3681:           tmp1  = x[i2];
3682:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3683:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3684:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3685:         }

3687:         if (n == sz-1) {
3688:           tmp0  = x[*idx];
3689:           sum1 -= *v1*tmp0;
3690:           sum2 -= *v2*tmp0;
3691:           sum3 -= *v3*tmp0;
3692:         }
3693:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3694:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3695:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3696:         row     -= 3;
3697:         break;
3698:       case 4:

3700:         sum1 = b[row];
3701:         sum2 = b[row-1];
3702:         sum3 = b[row-2];
3703:         sum4 = b[row-3];
3704:         v2   = a->a + diag[row-1] + 2;
3705:         v3   = a->a + diag[row-2] + 3;
3706:         v4   = a->a + diag[row-3] + 4;
3707:         for (n = 0; n<sz-1; n+=2) {
3708:           i1    = idx[0];
3709:           i2    = idx[1];
3710:           idx  += 2;
3711:           tmp0  = x[i1];
3712:           tmp1  = x[i2];
3713:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3714:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3715:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3716:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3717:         }

3719:         if (n == sz-1) {
3720:           tmp0  = x[*idx];
3721:           sum1 -= *v1*tmp0;
3722:           sum2 -= *v2*tmp0;
3723:           sum3 -= *v3*tmp0;
3724:           sum4 -= *v4*tmp0;
3725:         }
3726:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3727:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3728:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3729:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3730:         row     -= 4;
3731:         break;
3732:       case 5:

3734:         sum1 = b[row];
3735:         sum2 = b[row-1];
3736:         sum3 = b[row-2];
3737:         sum4 = b[row-3];
3738:         sum5 = b[row-4];
3739:         v2   = a->a + diag[row-1] + 2;
3740:         v3   = a->a + diag[row-2] + 3;
3741:         v4   = a->a + diag[row-3] + 4;
3742:         v5   = a->a + diag[row-4] + 5;
3743:         for (n = 0; n<sz-1; n+=2) {
3744:           i1    = idx[0];
3745:           i2    = idx[1];
3746:           idx  += 2;
3747:           tmp0  = x[i1];
3748:           tmp1  = x[i2];
3749:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3750:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3751:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3752:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3753:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3754:         }

3756:         if (n == sz-1) {
3757:           tmp0  = x[*idx];
3758:           sum1 -= *v1*tmp0;
3759:           sum2 -= *v2*tmp0;
3760:           sum3 -= *v3*tmp0;
3761:           sum4 -= *v4*tmp0;
3762:           sum5 -= *v5*tmp0;
3763:         }
3764:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3765:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3766:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3767:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3768:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3769:         row     -= 5;
3770:         break;
3771:       default:
3772:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3773:       }
3774:     }
3775:     PetscLogFlops(a->nz);

3777:     /*
3778:            t = b - D x    where D is the block diagonal
3779:     */
3780:     cnt = 0;
3781:     for (i=0, row=0; i<m; i++) {
3782:       switch (sizes[i]) {
3783:       case 1:
3784:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3785:         break;
3786:       case 2:
3787:         x1       = x[row]; x2 = x[row+1];
3788:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3789:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3790:         t[row]   = b[row] - tmp1;
3791:         t[row+1] = b[row+1] - tmp2; row += 2;
3792:         cnt     += 4;
3793:         break;
3794:       case 3:
3795:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3796:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3797:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3798:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3799:         t[row]   = b[row] - tmp1;
3800:         t[row+1] = b[row+1] - tmp2;
3801:         t[row+2] = b[row+2] - tmp3; row += 3;
3802:         cnt     += 9;
3803:         break;
3804:       case 4:
3805:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3806:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3807:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3808:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3809:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3810:         t[row]   = b[row] - tmp1;
3811:         t[row+1] = b[row+1] - tmp2;
3812:         t[row+2] = b[row+2] - tmp3;
3813:         t[row+3] = b[row+3] - tmp4; row += 4;
3814:         cnt     += 16;
3815:         break;
3816:       case 5:
3817:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3818:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3819:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3820:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3821:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3822:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3823:         t[row]   = b[row] - tmp1;
3824:         t[row+1] = b[row+1] - tmp2;
3825:         t[row+2] = b[row+2] - tmp3;
3826:         t[row+3] = b[row+3] - tmp4;
3827:         t[row+4] = b[row+4] - tmp5;row += 5;
3828:         cnt     += 25;
3829:         break;
3830:       default:
3831:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3832:       }
3833:     }
3834:     PetscLogFlops(m);



3838:     /*
3839:           Apply (L + D)^-1 where D is the block diagonal
3840:     */
3841:     for (i=0, row=0; i<m; i++) {
3842:       sz  = diag[row] - ii[row];
3843:       v1  = a->a + ii[row];
3844:       idx = a->j + ii[row];
3845:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3846:       switch (sizes[i]) {
3847:       case 1:

3849:         sum1 = t[row];
3850:         for (n = 0; n<sz-1; n+=2) {
3851:           i1    = idx[0];
3852:           i2    = idx[1];
3853:           idx  += 2;
3854:           tmp0  = t[i1];
3855:           tmp1  = t[i2];
3856:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3857:         }

3859:         if (n == sz-1) {
3860:           tmp0  = t[*idx];
3861:           sum1 -= *v1 * tmp0;
3862:         }
3863:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3864:         break;
3865:       case 2:
3866:         v2   = a->a + ii[row+1];
3867:         sum1 = t[row];
3868:         sum2 = t[row+1];
3869:         for (n = 0; n<sz-1; n+=2) {
3870:           i1    = idx[0];
3871:           i2    = idx[1];
3872:           idx  += 2;
3873:           tmp0  = t[i1];
3874:           tmp1  = t[i2];
3875:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3876:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3877:         }

3879:         if (n == sz-1) {
3880:           tmp0  = t[*idx];
3881:           sum1 -= v1[0] * tmp0;
3882:           sum2 -= v2[0] * tmp0;
3883:         }
3884:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3885:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3886:         ibdiag   += 4; row += 2;
3887:         break;
3888:       case 3:
3889:         v2   = a->a + ii[row+1];
3890:         v3   = a->a + ii[row+2];
3891:         sum1 = t[row];
3892:         sum2 = t[row+1];
3893:         sum3 = t[row+2];
3894:         for (n = 0; n<sz-1; n+=2) {
3895:           i1    = idx[0];
3896:           i2    = idx[1];
3897:           idx  += 2;
3898:           tmp0  = t[i1];
3899:           tmp1  = t[i2];
3900:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3901:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3902:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3903:         }

3905:         if (n == sz-1) {
3906:           tmp0  = t[*idx];
3907:           sum1 -= v1[0] * tmp0;
3908:           sum2 -= v2[0] * tmp0;
3909:           sum3 -= v3[0] * tmp0;
3910:         }
3911:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3912:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3913:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3914:         ibdiag   += 9; row += 3;
3915:         break;
3916:       case 4:
3917:         v2   = a->a + ii[row+1];
3918:         v3   = a->a + ii[row+2];
3919:         v4   = a->a + ii[row+3];
3920:         sum1 = t[row];
3921:         sum2 = t[row+1];
3922:         sum3 = t[row+2];
3923:         sum4 = t[row+3];
3924:         for (n = 0; n<sz-1; n+=2) {
3925:           i1    = idx[0];
3926:           i2    = idx[1];
3927:           idx  += 2;
3928:           tmp0  = t[i1];
3929:           tmp1  = t[i2];
3930:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934:         }

3936:         if (n == sz-1) {
3937:           tmp0  = t[*idx];
3938:           sum1 -= v1[0] * tmp0;
3939:           sum2 -= v2[0] * tmp0;
3940:           sum3 -= v3[0] * tmp0;
3941:           sum4 -= v4[0] * tmp0;
3942:         }
3943:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3944:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3945:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3946:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3947:         ibdiag   += 16; row += 4;
3948:         break;
3949:       case 5:
3950:         v2   = a->a + ii[row+1];
3951:         v3   = a->a + ii[row+2];
3952:         v4   = a->a + ii[row+3];
3953:         v5   = a->a + ii[row+4];
3954:         sum1 = t[row];
3955:         sum2 = t[row+1];
3956:         sum3 = t[row+2];
3957:         sum4 = t[row+3];
3958:         sum5 = t[row+4];
3959:         for (n = 0; n<sz-1; n+=2) {
3960:           i1    = idx[0];
3961:           i2    = idx[1];
3962:           idx  += 2;
3963:           tmp0  = t[i1];
3964:           tmp1  = t[i2];
3965:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3966:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3967:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3968:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3969:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3970:         }

3972:         if (n == sz-1) {
3973:           tmp0  = t[*idx];
3974:           sum1 -= v1[0] * tmp0;
3975:           sum2 -= v2[0] * tmp0;
3976:           sum3 -= v3[0] * tmp0;
3977:           sum4 -= v4[0] * tmp0;
3978:           sum5 -= v5[0] * tmp0;
3979:         }
3980:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3981:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3982:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3983:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3984:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3985:         ibdiag   += 25; row += 5;
3986:         break;
3987:       default:
3988:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3989:       }
3990:     }
3991:     PetscLogFlops(a->nz);
3992:   }
3993:   VecRestoreArray(xx,&x);
3994:   VecRestoreArrayRead(bb,&b);
3995:   return(0);
3996: }

4000: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4001: {
4002:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4003:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4004:   const MatScalar   *bdiag = a->inode.bdiag;
4005:   const PetscScalar *b;
4006:   PetscErrorCode    ierr;
4007:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4008:   const PetscInt    *sizes = a->inode.size;

4011:   VecGetArray(xx,&x);
4012:   VecGetArrayRead(bb,&b);
4013:   cnt  = 0;
4014:   for (i=0, row=0; i<m; i++) {
4015:     switch (sizes[i]) {
4016:     case 1:
4017:       x[row] = b[row]*bdiag[cnt++];row++;
4018:       break;
4019:     case 2:
4020:       x1       = b[row]; x2 = b[row+1];
4021:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4022:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4023:       x[row++] = tmp1;
4024:       x[row++] = tmp2;
4025:       cnt     += 4;
4026:       break;
4027:     case 3:
4028:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4029:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4030:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4031:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4032:       x[row++] = tmp1;
4033:       x[row++] = tmp2;
4034:       x[row++] = tmp3;
4035:       cnt     += 9;
4036:       break;
4037:     case 4:
4038:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4039:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4040:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4041:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4042:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4043:       x[row++] = tmp1;
4044:       x[row++] = tmp2;
4045:       x[row++] = tmp3;
4046:       x[row++] = tmp4;
4047:       cnt     += 16;
4048:       break;
4049:     case 5:
4050:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4051:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4052:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4053:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4054:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4055:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4056:       x[row++] = tmp1;
4057:       x[row++] = tmp2;
4058:       x[row++] = tmp3;
4059:       x[row++] = tmp4;
4060:       x[row++] = tmp5;
4061:       cnt     += 25;
4062:       break;
4063:     default:
4064:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4065:     }
4066:   }
4067:   PetscLogFlops(2*cnt);
4068:   VecRestoreArray(xx,&x);
4069:   VecRestoreArrayRead(bb,&b);
4070:   return(0);
4071: }

4073: /*
4074:     samestructure indicates that the matrix has not changed its nonzero structure so we
4075:     do not need to recompute the inodes
4076: */
4079: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4080: {
4081:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4083:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4084:   PetscBool      flag;
4085:   const PetscInt *idx,*idy,*ii;

4088:   if (!a->inode.use) return(0);
4089:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4091:   m = A->rmap->n;
4092:   if (a->inode.size) ns = a->inode.size;
4093:   else {
4094:     PetscMalloc1(m+1,&ns);
4095:   }

4097:   i          = 0;
4098:   node_count = 0;
4099:   idx        = a->j;
4100:   ii         = a->i;
4101:   while (i < m) {                /* For each row */
4102:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4103:     /* Limits the number of elements in a node to 'a->inode.limit' */
4104:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4105:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4106:       if (nzy != nzx) break;
4107:       idy += nzx;              /* Same nonzero pattern */
4108:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4109:       if (!flag) break;
4110:     }
4111:     ns[node_count++] = blk_size;
4112:     idx             += blk_size*nzx;
4113:     i                = j;
4114:   }
4115:   /* If not enough inodes found,, do not use inode version of the routines */
4116:   if (!m || node_count > .8*m) {
4117:     PetscFree(ns);

4119:     a->inode.node_count       = 0;
4120:     a->inode.size             = NULL;
4121:     a->inode.use              = PETSC_FALSE;
4122:     A->ops->mult              = MatMult_SeqAIJ;
4123:     A->ops->sor               = MatSOR_SeqAIJ;
4124:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4125:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4126:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4127:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4128:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4129:     A->ops->coloringpatch     = 0;
4130:     A->ops->multdiagonalblock = 0;

4132:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4133:   } else {
4134:     if (!A->factortype) {
4135:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4136:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4137:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4138:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139:       if (A->rmap->n == A->cmap->n) {
4140:         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4141:         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4142:         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4143:         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4144:         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4145:       }
4146:     } else {
4147:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4148:     }
4149:     a->inode.node_count = node_count;
4150:     a->inode.size       = ns;
4151:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4152:   }
4153:   a->inode.checked          = PETSC_TRUE;
4154:   a->inode.mat_nonzerostate = A->nonzerostate;
4155:   return(0);
4156: }

4160: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4161: {
4162:   Mat            B =*C;
4163:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4165:   PetscInt       m=A->rmap->n;

4168:   c->inode.use       = a->inode.use;
4169:   c->inode.limit     = a->inode.limit;
4170:   c->inode.max_limit = a->inode.max_limit;
4171:   if (a->inode.size) {
4172:     PetscMalloc1(m+1,&c->inode.size);
4173:     c->inode.node_count = a->inode.node_count;
4174:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4175:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4176:     if (!B->factortype) {
4177:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4178:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4179:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4180:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4181:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4182:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4183:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4184:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4185:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4186:     } else {
4187:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4188:     }
4189:   } else {
4190:     c->inode.size       = 0;
4191:     c->inode.node_count = 0;
4192:   }
4193:   c->inode.ibdiagvalid = PETSC_FALSE;
4194:   c->inode.ibdiag      = 0;
4195:   c->inode.bdiag       = 0;
4196:   return(0);
4197: }

4201: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4202: {
4203:   PetscInt       k;
4204:   const PetscInt *vi;

4207:   vi = aj + ai[row];
4208:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4209:   vi        = aj + adiag[row];
4210:   cols[nzl] = vi[0];
4211:   vi        = aj + adiag[row+1]+1;
4212:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4213:   return(0);
4214: }
4215: /*
4216:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4217:    Modified from MatSeqAIJCheckInode().

4219:    Input Parameters:
4220: .  Mat A - ILU or LU matrix factor

4222: */
4225: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4226: {
4227:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4229:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4230:   PetscInt       *cols1,*cols2,*ns;
4231:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4232:   PetscBool      flag;

4235:   if (!a->inode.use)    return(0);
4236:   if (a->inode.checked) return(0);

4238:   m = A->rmap->n;
4239:   if (a->inode.size) ns = a->inode.size;
4240:   else {
4241:     PetscMalloc1(m+1,&ns);
4242:   }

4244:   i          = 0;
4245:   node_count = 0;
4246:   PetscMalloc2(m,&cols1,m,&cols2);
4247:   while (i < m) {                /* For each row */
4248:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4249:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4250:     nzx  = nzl1 + nzu1 + 1;
4251:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4253:     /* Limits the number of elements in a node to 'a->inode.limit' */
4254:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4255:       nzl2 = ai[j+1] - ai[j];
4256:       nzu2 = adiag[j] - adiag[j+1] - 1;
4257:       nzy  = nzl2 + nzu2 + 1;
4258:       if (nzy != nzx) break;
4259:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4260:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4261:       if (!flag) break;
4262:     }
4263:     ns[node_count++] = blk_size;
4264:     i                = j;
4265:   }
4266:   PetscFree2(cols1,cols2);
4267:   /* If not enough inodes found,, do not use inode version of the routines */
4268:   if (!m || node_count > .8*m) {
4269:     PetscFree(ns);

4271:     a->inode.node_count = 0;
4272:     a->inode.size       = NULL;
4273:     a->inode.use        = PETSC_FALSE;

4275:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4276:   } else {
4277:     A->ops->mult              = 0;
4278:     A->ops->sor               = 0;
4279:     A->ops->multadd           = 0;
4280:     A->ops->getrowij          = 0;
4281:     A->ops->restorerowij      = 0;
4282:     A->ops->getcolumnij       = 0;
4283:     A->ops->restorecolumnij   = 0;
4284:     A->ops->coloringpatch     = 0;
4285:     A->ops->multdiagonalblock = 0;
4286:     a->inode.node_count       = node_count;
4287:     a->inode.size             = ns;

4289:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4290:   }
4291:   a->inode.checked = PETSC_TRUE;
4292:   return(0);
4293: }

4297: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4298: {
4299:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4302:   a->inode.ibdiagvalid = PETSC_FALSE;
4303:   return(0);
4304: }

4306: /*
4307:      This is really ugly. if inodes are used this replaces the
4308:   permutations with ones that correspond to rows/cols of the matrix
4309:   rather then inode blocks
4310: */
4313: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4314: {

4318:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4319:   return(0);
4320: }

4324: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4325: {
4326:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4328:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4329:   const PetscInt *ridx,*cidx;
4330:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4331:   PetscInt       nslim_col,*ns_col;
4332:   IS             ris = *rperm,cis = *cperm;

4335:   if (!a->inode.size) return(0); /* no inodes so return */
4336:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4338:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4339:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4340:   PetscMalloc2(m,&permr,n,&permc);

4342:   ISGetIndices(ris,&ridx);
4343:   ISGetIndices(cis,&cidx);

4345:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4346:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4348:   /* Construct the permutations for rows*/
4349:   for (i=0,row = 0; i<nslim_row; ++i) {
4350:     indx      = ridx[i];
4351:     start_val = tns[indx];
4352:     end_val   = tns[indx + 1];
4353:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4354:   }

4356:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4357:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4359:   /* Construct permutations for columns */
4360:   for (i=0,col=0; i<nslim_col; ++i) {
4361:     indx      = cidx[i];
4362:     start_val = tns[indx];
4363:     end_val   = tns[indx + 1];
4364:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4365:   }

4367:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4368:   ISSetPermutation(*rperm);
4369:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4370:   ISSetPermutation(*cperm);

4372:   ISRestoreIndices(ris,&ridx);
4373:   ISRestoreIndices(cis,&cidx);

4375:   PetscFree(ns_col);
4376:   PetscFree2(permr,permc);
4377:   ISDestroy(&cis);
4378:   ISDestroy(&ris);
4379:   PetscFree(tns);
4380:   return(0);
4381: }

4385: /*@C
4386:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4388:    Not Collective

4390:    Input Parameter:
4391: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4393:    Output Parameter:
4394: +  node_count - no of inodes present in the matrix.
4395: .  sizes      - an array of size node_count,with sizes of each inode.
4396: -  limit      - the max size used to generate the inodes.

4398:    Level: advanced

4400:    Notes: This routine returns some internal storage information
4401:    of the matrix, it is intended to be used by advanced users.
4402:    It should be called after the matrix is assembled.
4403:    The contents of the sizes[] array should not be changed.
4404:    NULL may be passed for information not requested.

4406: .keywords: matrix, seqaij, get, inode

4408: .seealso: MatGetInfo()
4409: @*/
4410: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4411: {
4412:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4415:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4416:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4417:   if (f) {
4418:     (*f)(A,node_count,sizes,limit);
4419:   }
4420:   return(0);
4421: }

4425: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4426: {
4427:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4430:   if (node_count) *node_count = a->inode.node_count;
4431:   if (sizes)      *sizes      = a->inode.size;
4432:   if (limit)      *limit      = a->inode.limit;
4433:   return(0);
4434: }