Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/inline/spops.h
 5:  #include src/inline/ilu.h
 6:  #include petscbt.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
 15:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT        table;

 19:   m     = a->mbs;
 20:   ai    = a->i;
 21:   aj    = a->j;
 22:   bs    = A->rmap.bs;

 24:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,table);
 27:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 28:   PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);
 34: 
 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n ; ++j){
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(is[i]);
 47: 
 48:     k = 0;
 49:     for (j=0; j<ov; j++){ /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end ; l++){
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++)
 64:         nidx2[j*bs+k] = nidx[j]*bs+k;
 65:     }
 66:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 67:   }
 68:   PetscBTDestroy(table);
 69:   PetscFree(nidx);
 70:   PetscFree(nidx2);
 71:   return(0);
 72: }

 76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   PetscInt       *irow,*icol,nrows,ncols,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscTruth     flag;

 89:   ISSorted(iscol,(PetscTruth*)&i);
 90:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 92:   ISGetIndices(isrow,&irow);
 93:   ISGetIndices(iscol,&icol);
 94:   ISGetLocalSize(isrow,&nrows);
 95:   ISGetLocalSize(iscol,&ncols);

 97:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 98:   ssmap = smap;
 99:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
101:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102:   /* determine lens of each row */
103:   for (i=0; i<nrows; i++) {
104:     kstart  = ai[irow[i]];
105:     kend    = kstart + a->ilen[irow[i]];
106:     lens[i] = 0;
107:       for (k=kstart; k<kend; k++) {
108:         if (ssmap[aj[k]]) {
109:           lens[i]++;
110:         }
111:       }
112:     }
113:   /* Create and fill new matrix */
114:   if (scall == MAT_REUSE_MATRIX) {
115:     c = (Mat_SeqBAIJ *)((*B)->data);

117:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119:     if (!flag) {
120:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121:     }
122:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123:     C = *B;
124:   } else {
125:     MatCreate(((PetscObject)A)->comm,&C);
126:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127:     MatSetType(C,((PetscObject)A)->type_name);
128:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129:   }
130:   c = (Mat_SeqBAIJ *)(C->data);
131:   for (i=0; i<nrows; i++) {
132:     row    = irow[i];
133:     kstart = ai[row];
134:     kend   = kstart + a->ilen[row];
135:     mat_i  = c->i[i];
136:     mat_j  = c->j + mat_i;
137:     mat_a  = c->a + mat_i*bs2;
138:     mat_ilen = c->ilen + i;
139:     for (k=kstart; k<kend; k++) {
140:       if ((tcol=ssmap[a->j[k]])) {
141:         *mat_j++ = tcol - 1;
142:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143:         mat_a   += bs2;
144:         (*mat_ilen)++;
145:       }
146:     }
147:   }
148: 
149:   /* Free work space */
150:   ISRestoreIndices(iscol,&icol);
151:   PetscFree(smap);
152:   PetscFree(lens);
153:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155: 
156:   ISRestoreIndices(isrow,&irow);
157:   *B = C;
158:   return(0);
159: }

163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
168:   PetscInt       *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->rmap.bs,count;

171:   ISGetIndices(isrow,&irow);
172:   ISGetIndices(iscol,&icol);
173:   ISGetLocalSize(isrow,&nrows);
174:   ISGetLocalSize(iscol,&ncols);
175: 
176:   /* Verify if the indices corespond to each element in a block 
177:    and form the IS with compressed IS */
178:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179:   iary = vary + a->mbs;
180:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182:   count = 0;
183:   for (i=0; i<a->mbs; i++) {
184:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185:     if (vary[i]==bs) iary[count++] = i;
186:   }
187:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188: 
189:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191:   count = 0;
192:   for (i=0; i<a->mbs; i++) {
193:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194:     if (vary[i]==bs) iary[count++] = i;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197:   ISRestoreIndices(isrow,&irow);
198:   ISRestoreIndices(iscol,&icol);
199:   PetscFree(vary);

201:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202:   ISDestroy(is1);
203:   ISDestroy(is2);
204:   return(0);
205: }

209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212:   PetscInt       i;

215:   if (scall == MAT_INITIAL_MATRIX) {
216:     PetscMalloc((n+1)*sizeof(Mat),B);
217:   }

219:   for (i=0; i<n; i++) {
220:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221:   }
222:   return(0);
223: }


226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229:  #include petscblaslapack.h

233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
236:   PetscScalar       *z,sum;
237:   const PetscScalar *x;
238:   const MatScalar   *v;
239:   PetscErrorCode    ierr;
240:   PetscInt          mbs,i,*idx,*ii,n,*ridx=PETSC_NULL,nonzerorow=0;
241:   PetscTruth        usecprow=a->compressedrow.use;

244:   VecGetArray(xx,(PetscScalar**)&x);
245:   VecGetArray(zz,&z);

247:   idx = a->j;
248:   v   = a->a;
249:   if (usecprow){
250:     mbs  = a->compressedrow.nrows;
251:     ii   = a->compressedrow.i;
252:     ridx = a->compressedrow.rindex;
253:   } else {
254:     mbs = a->mbs;
255:     ii  = a->i;
256:   }

258:   for (i=0; i<mbs; i++) {
259:     n    = ii[1] - ii[0]; ii++;
260:     sum  = 0.0;
261:     nonzerorow += (n>0);
262:     while (n--) sum += *v++ * x[*idx++];
263:     if (usecprow){
264:       z[ridx[i]] = sum;
265:     } else {
266:       z[i] = sum;
267:     }
268:   }
269:   VecRestoreArray(xx,(PetscScalar**)&x);
270:   VecRestoreArray(zz,&z);
271:   PetscLogFlops(2*a->nz - nonzerorow);
272:   return(0);
273: }

277: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
280:   PetscScalar       *z = 0,sum1,sum2,*zarray;
281:   const PetscScalar *x,*xb;
282:   PetscScalar       x1,x2;
283:   const MatScalar   *v;
284:   PetscErrorCode    ierr;
285:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
286:   PetscTruth        usecprow=a->compressedrow.use;

289:   VecGetArray(xx,(PetscScalar**)&x);
290:   VecGetArray(zz,&zarray);

292:   idx = a->j;
293:   v   = a->a;
294:   if (usecprow){
295:     mbs  = a->compressedrow.nrows;
296:     ii   = a->compressedrow.i;
297:     ridx = a->compressedrow.rindex;
298:   } else {
299:     mbs = a->mbs;
300:     ii  = a->i;
301:     z   = zarray;
302:   }

304:   for (i=0; i<mbs; i++) {
305:     n  = ii[1] - ii[0]; ii++;
306:     sum1 = 0.0; sum2 = 0.0;
307:     nonzerorow += (n>0);
308:     for (j=0; j<n; j++) {
309:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
310:       sum1 += v[0]*x1 + v[2]*x2;
311:       sum2 += v[1]*x1 + v[3]*x2;
312:       v += 4;
313:     }
314:     if (usecprow) z = zarray + 2*ridx[i];
315:     z[0] = sum1; z[1] = sum2;
316:     if (!usecprow) z += 2;
317:   }
318:   VecRestoreArray(xx,(PetscScalar**)&x);
319:   VecRestoreArray(zz,&zarray);
320:   PetscLogFlops(8*a->nz - 2*nonzerorow);
321:   return(0);
322: }

326: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
327: {
328:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
329:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
330:   const PetscScalar *x,*xb;
331:   const MatScalar   *v;
332:   PetscErrorCode    ierr;
333:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
334:   PetscTruth        usecprow=a->compressedrow.use;
335: 

337: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
338: #pragma disjoint(*v,*z,*xb)
339: #endif

342:   VecGetArray(xx,(PetscScalar**)&x);
343:   VecGetArray(zz,&zarray);

345:   idx = a->j;
346:   v   = a->a;
347:   if (usecprow){
348:     mbs  = a->compressedrow.nrows;
349:     ii   = a->compressedrow.i;
350:     ridx = a->compressedrow.rindex;
351:   } else {
352:     mbs = a->mbs;
353:     ii  = a->i;
354:     z   = zarray;
355:   }

357:   for (i=0; i<mbs; i++) {
358:     n  = ii[1] - ii[0]; ii++;
359:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
360:     nonzerorow += (n>0);
361:     for (j=0; j<n; j++) {
362:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
363:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
364:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
365:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
366:       v += 9;
367:     }
368:     if (usecprow) z = zarray + 3*ridx[i];
369:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
370:     if (!usecprow) z += 3;
371:   }
372:   VecRestoreArray(xx,(PetscScalar**)&x);
373:   VecRestoreArray(zz,&zarray);
374:   PetscLogFlops(18*a->nz - 3*nonzerorow);
375:   return(0);
376: }

380: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
381: {
382:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
383:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
384:   const PetscScalar *x,*xb;
385:   const MatScalar   *v;
386:   PetscErrorCode    ierr;
387:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
388:   PetscTruth        usecprow=a->compressedrow.use;

391:   VecGetArray(xx,(PetscScalar**)&x);
392:   VecGetArray(zz,&zarray);

394:   idx = a->j;
395:   v   = a->a;
396:   if (usecprow){
397:     mbs  = a->compressedrow.nrows;
398:     ii   = a->compressedrow.i;
399:     ridx = a->compressedrow.rindex;
400:   } else {
401:     mbs = a->mbs;
402:     ii  = a->i;
403:     z   = zarray;
404:   }

406:   for (i=0; i<mbs; i++) {
407:     n  = ii[1] - ii[0]; ii++;
408:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
409:     nonzerorow += (n>0);
410:     for (j=0; j<n; j++) {
411:       xb = x + 4*(*idx++);
412:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
413:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
414:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
415:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
416:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
417:       v += 16;
418:     }
419:     if (usecprow) z = zarray + 4*ridx[i];
420:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
421:     if (!usecprow) z += 4;
422:   }
423:   VecRestoreArray(xx,(PetscScalar**)&x);
424:   VecRestoreArray(zz,&zarray);
425:   PetscLogFlops(32*a->nz - 4*nonzerorow);
426:   return(0);
427: }

431: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
432: {
433:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
435:   const PetscScalar *xb,*x;
436:   const MatScalar   *v;
437:   PetscErrorCode    ierr;
438:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
439:   PetscTruth        usecprow=a->compressedrow.use;

442:   VecGetArray(xx,(PetscScalar**)&x);
443:   VecGetArray(zz,&zarray);

445:   idx = a->j;
446:   v   = a->a;
447:   if (usecprow){
448:     mbs  = a->compressedrow.nrows;
449:     ii   = a->compressedrow.i;
450:     ridx = a->compressedrow.rindex;
451:   } else {
452:     mbs = a->mbs;
453:     ii  = a->i;
454:     z   = zarray;
455:   }

457:   for (i=0; i<mbs; i++) {
458:     n  = ii[1] - ii[0]; ii++;
459:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
460:     nonzerorow += (n>0);
461:     for (j=0; j<n; j++) {
462:       xb = x + 5*(*idx++);
463:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
464:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
465:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
466:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
467:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
468:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
469:       v += 25;
470:     }
471:     if (usecprow) z = zarray + 5*ridx[i];
472:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
473:     if (!usecprow) z += 5;
474:   }
475:   VecRestoreArray(xx,(PetscScalar**)&x);
476:   VecRestoreArray(zz,&zarray);
477:   PetscLogFlops(50*a->nz - 5*nonzerorow);
478:   return(0);
479: }


484: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
485: {
486:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
487:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
488:   const PetscScalar *x,*xb;
489:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
490:   const MatScalar   *v;
491:   PetscErrorCode    ierr;
492:   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
493:   PetscTruth        usecprow=a->compressedrow.use;

496:   VecGetArray(xx,(PetscScalar**)&x);
497:   VecGetArray(zz,&zarray);

499:   idx = a->j;
500:   v   = a->a;
501:   if (usecprow){
502:     mbs  = a->compressedrow.nrows;
503:     ii   = a->compressedrow.i;
504:     ridx = a->compressedrow.rindex;
505:   } else {
506:     mbs = a->mbs;
507:     ii  = a->i;
508:     z   = zarray;
509:   }

511:   for (i=0; i<mbs; i++) {
512:     n  = ii[1] - ii[0]; ii++;
513:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
514:     nonzerorow += (n>0);
515:     for (j=0; j<n; j++) {
516:       xb = x + 6*(*idx++);
517:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
518:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
519:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
520:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
521:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
522:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
523:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
524:       v += 36;
525:     }
526:     if (usecprow) z = zarray + 6*ridx[i];
527:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
528:     if (!usecprow) z += 6;
529:   }

531:   VecRestoreArray(xx,(PetscScalar**)&x);
532:   VecRestoreArray(zz,&zarray);
533:   PetscLogFlops(72*a->nz - 6*nonzerorow);
534:   return(0);
535: }
538: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
539: {
540:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
541:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
542:   const PetscScalar *x,*xb;
543:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
544:   const MatScalar   *v;
545:   PetscErrorCode    ierr;
546:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
547:   PetscTruth        usecprow=a->compressedrow.use;

550:   VecGetArray(xx,(PetscScalar**)&x);
551:   VecGetArray(zz,&zarray);

553:   idx = a->j;
554:   v   = a->a;
555:   if (usecprow){
556:     mbs    = a->compressedrow.nrows;
557:     ii     = a->compressedrow.i;
558:     ridx = a->compressedrow.rindex;
559:   } else {
560:     mbs = a->mbs;
561:     ii  = a->i;
562:     z   = zarray;
563:   }

565:   for (i=0; i<mbs; i++) {
566:     n  = ii[1] - ii[0]; ii++;
567:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
568:     nonzerorow += (n>0);
569:     for (j=0; j<n; j++) {
570:       xb = x + 7*(*idx++);
571:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
572:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
573:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
574:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
575:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
576:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
577:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
578:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
579:       v += 49;
580:     }
581:     if (usecprow) z = zarray + 7*ridx[i];
582:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
583:     if (!usecprow) z += 7;
584:   }

586:   VecRestoreArray(xx,(PetscScalar**)&x);
587:   VecRestoreArray(zz,&zarray);
588:   PetscLogFlops(98*a->nz - 7*nonzerorow);
589:   return(0);
590: }

592: /*
593:     This will not work with MatScalar == float because it calls the BLAS
594: */
597: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
598: {
599:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
600:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
601:   MatScalar      *v;
603:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
604:   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
605:   PetscTruth     usecprow=a->compressedrow.use;

608:   VecGetArray(xx,&x);
609:   VecGetArray(zz,&zarray);

611:   idx = a->j;
612:   v   = a->a;
613:   if (usecprow){
614:     mbs  = a->compressedrow.nrows;
615:     ii   = a->compressedrow.i;
616:     ridx = a->compressedrow.rindex;
617:   } else {
618:     mbs = a->mbs;
619:     ii  = a->i;
620:     z   = zarray;
621:   }

623:   if (!a->mult_work) {
624:     k    = PetscMax(A->rmap.n,A->cmap.n);
625:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
626:   }
627:   work = a->mult_work;
628:   for (i=0; i<mbs; i++) {
629:     n     = ii[1] - ii[0]; ii++;
630:     ncols = n*bs;
631:     workt = work;
632:     nonzerorow += (n>0);
633:     for (j=0; j<n; j++) {
634:       xb = x + bs*(*idx++);
635:       for (k=0; k<bs; k++) workt[k] = xb[k];
636:       workt += bs;
637:     }
638:     if (usecprow) z = zarray + bs*ridx[i];
639:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
640:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
641:     v += n*bs2;
642:     if (!usecprow) z += bs;
643:   }
644:   VecRestoreArray(xx,&x);
645:   VecRestoreArray(zz,&zarray);
646:   PetscLogFlops(2*a->nz*bs2 - bs*nonzerorow);
647:   return(0);
648: }

653: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
654: {
655:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
656:   PetscScalar    *x,*y = 0,*z = 0,sum,*yarray,*zarray;
657:   MatScalar      *v;
659:   PetscInt       mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
660:   PetscTruth     usecprow=a->compressedrow.use;

663:   VecGetArray(xx,&x);
664:   VecGetArray(yy,&yarray);
665:   if (zz != yy) {
666:     VecGetArray(zz,&zarray);
667:   } else {
668:     zarray = yarray;
669:   }

671:   idx = a->j;
672:   v   = a->a;
673:   if (usecprow){
674:     if (zz != yy){
675:       PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
676:     }
677:     mbs  = a->compressedrow.nrows;
678:     ii   = a->compressedrow.i;
679:     ridx = a->compressedrow.rindex;
680:   } else {
681:     ii  = a->i;
682:     y   = yarray;
683:     z   = zarray;
684:   }

686:   for (i=0; i<mbs; i++) {
687:     n    = ii[1] - ii[0]; ii++;
688:     if (usecprow){
689:       z = zarray + ridx[i];
690:       y = yarray + ridx[i];
691:     }
692:     sum = y[0];
693:     while (n--) sum += *v++ * x[*idx++];
694:     z[0] = sum;
695:     if (!usecprow){
696:       z++; y++;
697:     }
698:   }
699:   VecRestoreArray(xx,&x);
700:   VecRestoreArray(yy,&yarray);
701:   if (zz != yy) {
702:     VecRestoreArray(zz,&zarray);
703:   }
704:   PetscLogFlops(2*a->nz);
705:   return(0);
706: }

710: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
711: {
712:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
713:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
714:   PetscScalar    x1,x2,*yarray,*zarray;
715:   MatScalar      *v;
717:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
718:   PetscTruth     usecprow=a->compressedrow.use;

721:   VecGetArray(xx,&x);
722:   VecGetArray(yy,&yarray);
723:   if (zz != yy) {
724:     VecGetArray(zz,&zarray);
725:   } else {
726:     zarray = yarray;
727:   }

729:   idx = a->j;
730:   v   = a->a;
731:   if (usecprow){
732:     if (zz != yy){
733:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
734:     }
735:     mbs  = a->compressedrow.nrows;
736:     ii   = a->compressedrow.i;
737:     ridx = a->compressedrow.rindex;
738:     if (zz != yy){
739:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
740:     }
741:   } else {
742:     ii  = a->i;
743:     y   = yarray;
744:     z   = zarray;
745:   }

747:   for (i=0; i<mbs; i++) {
748:     n  = ii[1] - ii[0]; ii++;
749:     if (usecprow){
750:       z = zarray + 2*ridx[i];
751:       y = yarray + 2*ridx[i];
752:     }
753:     sum1 = y[0]; sum2 = y[1];
754:     for (j=0; j<n; j++) {
755:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
756:       sum1 += v[0]*x1 + v[2]*x2;
757:       sum2 += v[1]*x1 + v[3]*x2;
758:       v += 4;
759:     }
760:     z[0] = sum1; z[1] = sum2;
761:     if (!usecprow){
762:       z += 2; y += 2;
763:     }
764:   }
765:   VecRestoreArray(xx,&x);
766:   VecRestoreArray(yy,&yarray);
767:   if (zz != yy) {
768:     VecRestoreArray(zz,&zarray);
769:   }
770:   PetscLogFlops(4*a->nz);
771:   return(0);
772: }

776: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
777: {
778:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
779:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
780:   MatScalar      *v;
782:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
783:   PetscTruth     usecprow=a->compressedrow.use;

786:   VecGetArray(xx,&x);
787:   VecGetArray(yy,&yarray);
788:   if (zz != yy) {
789:     VecGetArray(zz,&zarray);
790:   } else {
791:     zarray = yarray;
792:   }

794:   idx = a->j;
795:   v   = a->a;
796:   if (usecprow){
797:     if (zz != yy){
798:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
799:     }
800:     mbs  = a->compressedrow.nrows;
801:     ii   = a->compressedrow.i;
802:     ridx = a->compressedrow.rindex;
803:   } else {
804:     ii  = a->i;
805:     y   = yarray;
806:     z   = zarray;
807:   }

809:   for (i=0; i<mbs; i++) {
810:     n  = ii[1] - ii[0]; ii++;
811:     if (usecprow){
812:       z = zarray + 3*ridx[i];
813:       y = yarray + 3*ridx[i];
814:     }
815:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
816:     for (j=0; j<n; j++) {
817:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
818:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
819:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
820:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
821:       v += 9;
822:     }
823:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
824:     if (!usecprow){
825:       z += 3; y += 3;
826:     }
827:   }
828:   VecRestoreArray(xx,&x);
829:   VecRestoreArray(yy,&yarray);
830:   if (zz != yy) {
831:     VecRestoreArray(zz,&zarray);
832:   }
833:   PetscLogFlops(18*a->nz);
834:   return(0);
835: }

839: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
840: {
841:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
842:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
843:   MatScalar      *v;
845:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
846:   PetscTruth     usecprow=a->compressedrow.use;

849:   VecGetArray(xx,&x);
850:   VecGetArray(yy,&yarray);
851:   if (zz != yy) {
852:     VecGetArray(zz,&zarray);
853:   } else {
854:     zarray = yarray;
855:   }

857:   idx   = a->j;
858:   v     = a->a;
859:   if (usecprow){
860:     if (zz != yy){
861:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
862:     }
863:     mbs  = a->compressedrow.nrows;
864:     ii   = a->compressedrow.i;
865:     ridx = a->compressedrow.rindex;
866:   } else {
867:     ii  = a->i;
868:     y   = yarray;
869:     z   = zarray;
870:   }

872:   for (i=0; i<mbs; i++) {
873:     n  = ii[1] - ii[0]; ii++;
874:     if (usecprow){
875:       z = zarray + 4*ridx[i];
876:       y = yarray + 4*ridx[i];
877:     }
878:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
879:     for (j=0; j<n; j++) {
880:       xb = x + 4*(*idx++);
881:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
882:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
883:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
884:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
885:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
886:       v += 16;
887:     }
888:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
889:     if (!usecprow){
890:       z += 4; y += 4;
891:     }
892:   }
893:   VecRestoreArray(xx,&x);
894:   VecRestoreArray(yy,&yarray);
895:   if (zz != yy) {
896:     VecRestoreArray(zz,&zarray);
897:   }
898:   PetscLogFlops(32*a->nz);
899:   return(0);
900: }

904: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
905: {
906:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
907:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
908:   PetscScalar    *yarray,*zarray;
909:   MatScalar      *v;
911:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
912:   PetscTruth     usecprow=a->compressedrow.use;

915:   VecGetArray(xx,&x);
916:   VecGetArray(yy,&yarray);
917:   if (zz != yy) {
918:     VecGetArray(zz,&zarray);
919:   } else {
920:     zarray = yarray;
921:   }

923:   idx = a->j;
924:   v   = a->a;
925:   if (usecprow){
926:     if (zz != yy){
927:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
928:     }
929:     mbs  = a->compressedrow.nrows;
930:     ii   = a->compressedrow.i;
931:     ridx = a->compressedrow.rindex;
932:   } else {
933:     ii  = a->i;
934:     y   = yarray;
935:     z   = zarray;
936:   }

938:   for (i=0; i<mbs; i++) {
939:     n  = ii[1] - ii[0]; ii++;
940:     if (usecprow){
941:       z = zarray + 5*ridx[i];
942:       y = yarray + 5*ridx[i];
943:     }
944:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
945:     for (j=0; j<n; j++) {
946:       xb = x + 5*(*idx++);
947:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
948:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
949:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
950:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
951:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
952:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
953:       v += 25;
954:     }
955:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
956:     if (!usecprow){
957:       z += 5; y += 5;
958:     }
959:   }
960:   VecRestoreArray(xx,&x);
961:   VecRestoreArray(yy,&yarray);
962:   if (zz != yy) {
963:     VecRestoreArray(zz,&zarray);
964:   }
965:   PetscLogFlops(50*a->nz);
966:   return(0);
967: }
970: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
971: {
972:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
973:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
974:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
975:   MatScalar      *v;
977:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
978:   PetscTruth     usecprow=a->compressedrow.use;

981:   VecGetArray(xx,&x);
982:   VecGetArray(yy,&yarray);
983:   if (zz != yy) {
984:     VecGetArray(zz,&zarray);
985:   } else {
986:     zarray = yarray;
987:   }

989:   idx = a->j;
990:   v   = a->a;
991:   if (usecprow){
992:     if (zz != yy){
993:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
994:     }
995:     mbs  = a->compressedrow.nrows;
996:     ii   = a->compressedrow.i;
997:     ridx = a->compressedrow.rindex;
998:   } else {
999:     ii  = a->i;
1000:     y   = yarray;
1001:     z   = zarray;
1002:   }

1004:   for (i=0; i<mbs; i++) {
1005:     n  = ii[1] - ii[0]; ii++;
1006:     if (usecprow){
1007:       z = zarray + 6*ridx[i];
1008:       y = yarray + 6*ridx[i];
1009:     }
1010:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1011:     for (j=0; j<n; j++) {
1012:       xb = x + 6*(*idx++);
1013:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1014:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1015:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1016:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1017:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1018:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1019:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1020:       v += 36;
1021:     }
1022:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1023:     if (!usecprow){
1024:       z += 6; y += 6;
1025:     }
1026:   }
1027:   VecRestoreArray(xx,&x);
1028:   VecRestoreArray(yy,&yarray);
1029:   if (zz != yy) {
1030:     VecRestoreArray(zz,&zarray);
1031:   }
1032:   PetscLogFlops(72*a->nz);
1033:   return(0);
1034: }

1038: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1039: {
1040:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1041:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1042:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1043:   MatScalar      *v;
1045:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1046:   PetscTruth     usecprow=a->compressedrow.use;

1049:   VecGetArray(xx,&x);
1050:   VecGetArray(yy,&yarray);
1051:   if (zz != yy) {
1052:     VecGetArray(zz,&zarray);
1053:   } else {
1054:     zarray = yarray;
1055:   }

1057:   idx = a->j;
1058:   v   = a->a;
1059:   if (usecprow){
1060:     if (zz != yy){
1061:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1062:     }
1063:     mbs  = a->compressedrow.nrows;
1064:     ii   = a->compressedrow.i;
1065:     ridx = a->compressedrow.rindex;
1066:   } else {
1067:     ii  = a->i;
1068:     y   = yarray;
1069:     z   = zarray;
1070:   }

1072:   for (i=0; i<mbs; i++) {
1073:     n  = ii[1] - ii[0]; ii++;
1074:     if (usecprow){
1075:       z = zarray + 7*ridx[i];
1076:       y = yarray + 7*ridx[i];
1077:     }
1078:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1079:     for (j=0; j<n; j++) {
1080:       xb = x + 7*(*idx++);
1081:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1082:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1083:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1084:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1085:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1086:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1087:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1088:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1089:       v += 49;
1090:     }
1091:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1092:     if (!usecprow){
1093:       z += 7; y += 7;
1094:     }
1095:   }
1096:   VecRestoreArray(xx,&x);
1097:   VecRestoreArray(yy,&yarray);
1098:   if (zz != yy) {
1099:     VecRestoreArray(zz,&zarray);
1100:   }
1101:   PetscLogFlops(98*a->nz);
1102:   return(0);
1103: }

1107: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1108: {
1109:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1110:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1111:   MatScalar      *v;
1113:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
1114:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1115:   PetscTruth     usecprow=a->compressedrow.use;

1118:   VecCopy_Seq(yy,zz);
1119:   VecGetArray(xx,&x);
1120:   VecGetArray(zz,&zarray);

1122:   idx = a->j;
1123:   v   = a->a;
1124:   if (usecprow){
1125:     mbs    = a->compressedrow.nrows;
1126:     ii     = a->compressedrow.i;
1127:     ridx = a->compressedrow.rindex;
1128:   } else {
1129:     mbs = a->mbs;
1130:     ii  = a->i;
1131:     z   = zarray;
1132:   }

1134:   if (!a->mult_work) {
1135:     k    = PetscMax(A->rmap.n,A->cmap.n);
1136:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1137:   }
1138:   work = a->mult_work;
1139:   for (i=0; i<mbs; i++) {
1140:     n     = ii[1] - ii[0]; ii++;
1141:     ncols = n*bs;
1142:     workt = work;
1143:     for (j=0; j<n; j++) {
1144:       xb = x + bs*(*idx++);
1145:       for (k=0; k<bs; k++) workt[k] = xb[k];
1146:       workt += bs;
1147:     }
1148:     if (usecprow) z = zarray + bs*ridx[i];
1149:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1150:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1151:     v += n*bs2;
1152:     if (!usecprow){
1153:       z += bs;
1154:     }
1155:   }
1156:   VecRestoreArray(xx,&x);
1157:   VecRestoreArray(zz,&zarray);
1158:   PetscLogFlops(2*a->nz*bs2);
1159:   return(0);
1160: }

1164: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1165: {
1166:   PetscScalar    zero = 0.0;

1170:   VecSet(zz,zero);
1171:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1172:   return(0);
1173: }

1177: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1179: {
1180:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1181:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1182:   MatScalar         *v;
1183:   PetscErrorCode    ierr;
1184:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap.bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1185:   Mat_CompressedRow cprow = a->compressedrow;
1186:   PetscTruth        usecprow=cprow.use;

1189:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1190:   VecGetArray(xx,&x);
1191:   VecGetArray(zz,&z);

1193:   idx = a->j;
1194:   v   = a->a;
1195:   if (usecprow){
1196:     mbs  = cprow.nrows;
1197:     ii   = cprow.i;
1198:     ridx = cprow.rindex;
1199:   } else {
1200:     mbs=a->mbs;
1201:     ii = a->i;
1202:     xb = x;
1203:   }

1205:   switch (bs) {
1206:   case 1:
1207:     for (i=0; i<mbs; i++) {
1208:       if (usecprow) xb = x + ridx[i];
1209:       x1 = xb[0];
1210:       ib = idx + ii[0];
1211:       n  = ii[1] - ii[0]; ii++;
1212:       for (j=0; j<n; j++) {
1213:         rval    = ib[j];
1214:         z[rval] += *v * x1;
1215:         v++;
1216:       }
1217:       if (!usecprow) xb++;
1218:     }
1219:     break;
1220:   case 2:
1221:     for (i=0; i<mbs; i++) {
1222:       if (usecprow) xb = x + 2*ridx[i];
1223:       x1 = xb[0]; x2 = xb[1];
1224:       ib = idx + ii[0];
1225:       n  = ii[1] - ii[0]; ii++;
1226:       for (j=0; j<n; j++) {
1227:         rval      = ib[j]*2;
1228:         z[rval++] += v[0]*x1 + v[1]*x2;
1229:         z[rval++] += v[2]*x1 + v[3]*x2;
1230:         v  += 4;
1231:       }
1232:       if (!usecprow) xb += 2;
1233:     }
1234:     break;
1235:   case 3:
1236:     for (i=0; i<mbs; i++) {
1237:       if (usecprow) xb = x + 3*ridx[i];
1238:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1239:       ib = idx + ii[0];
1240:       n  = ii[1] - ii[0]; ii++;
1241:       for (j=0; j<n; j++) {
1242:         rval      = ib[j]*3;
1243:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1244:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1245:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1246:         v  += 9;
1247:       }
1248:       if (!usecprow) xb += 3;
1249:     }
1250:     break;
1251:   case 4:
1252:     for (i=0; i<mbs; i++) {
1253:       if (usecprow) xb = x + 4*ridx[i];
1254:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1255:       ib = idx + ii[0];
1256:       n  = ii[1] - ii[0]; ii++;
1257:       for (j=0; j<n; j++) {
1258:         rval      = ib[j]*4;
1259:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1260:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1261:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1262:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1263:         v  += 16;
1264:       }
1265:       if (!usecprow) xb += 4;
1266:     }
1267:     break;
1268:   case 5:
1269:     for (i=0; i<mbs; i++) {
1270:       if (usecprow) xb = x + 5*ridx[i];
1271:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1272:       x4 = xb[3]; x5 = xb[4];
1273:       ib = idx + ii[0];
1274:       n  = ii[1] - ii[0]; ii++;
1275:       for (j=0; j<n; j++) {
1276:         rval      = ib[j]*5;
1277:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1278:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1279:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1280:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1281:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1282:         v  += 25;
1283:       }
1284:       if (!usecprow) xb += 5;
1285:     }
1286:     break;
1287:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1288:       PetscInt     ncols,k;
1289:       PetscScalar  *work,*workt,*xtmp;

1291:       if (!a->mult_work) {
1292:         k = PetscMax(A->rmap.n,A->cmap.n);
1293:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1294:       }
1295:       work = a->mult_work;
1296:       xtmp = x;
1297:       for (i=0; i<mbs; i++) {
1298:         n     = ii[1] - ii[0]; ii++;
1299:         ncols = n*bs;
1300:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1301:         if (usecprow) {
1302:           xtmp = x + bs*ridx[i];
1303:         }
1304:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1305:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1306:         v += n*bs2;
1307:         if (!usecprow) xtmp += bs;
1308:         workt = work;
1309:         for (j=0; j<n; j++) {
1310:           zb = z + bs*(*idx++);
1311:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1312:           workt += bs;
1313:         }
1314:       }
1315:     }
1316:   }
1317:   VecRestoreArray(xx,&x);
1318:   VecRestoreArray(zz,&z);
1319:   PetscLogFlops(2*a->nz*a->bs2);
1320:   return(0);
1321: }

1325: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1326: {
1327:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)inA->data;
1328:   PetscInt     totalnz = a->bs2*a->nz;
1329:   PetscScalar  oalpha = alpha;
1331: #if defined(PETSC_USE_MAT_SINGLE)
1332:   PetscInt     i;
1333: #else
1334:   PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1335: #endif

1338: #if defined(PETSC_USE_MAT_SINGLE)
1339:   for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1340: #else
1341:   BLASscal_(&tnz,&oalpha,a->a,&one);
1342: #endif
1343:   PetscLogFlops(totalnz);
1344:   return(0);
1345: }

1349: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1350: {
1352:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1353:   MatScalar      *v = a->a;
1354:   PetscReal      sum = 0.0;
1355:   PetscInt       i,j,k,bs=A->rmap.bs,nz=a->nz,bs2=a->bs2,k1;

1358:   if (type == NORM_FROBENIUS) {
1359:     for (i=0; i< bs2*nz; i++) {
1360: #if defined(PETSC_USE_COMPLEX)
1361:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1362: #else
1363:       sum += (*v)*(*v); v++;
1364: #endif
1365:     }
1366:     *norm = sqrt(sum);
1367:   } else if (type == NORM_1) { /* maximum column sum */
1368:     PetscReal *tmp;
1369:     PetscInt  *bcol = a->j;
1370:     PetscMalloc((A->cmap.n+1)*sizeof(PetscReal),&tmp);
1371:     PetscMemzero(tmp,A->cmap.n*sizeof(PetscReal));
1372:     for (i=0; i<nz; i++){
1373:       for (j=0; j<bs; j++){
1374:         k1 = bs*(*bcol) + j; /* column index */
1375:         for (k=0; k<bs; k++){
1376:           tmp[k1] += PetscAbsScalar(*v); v++;
1377:         }
1378:       }
1379:       bcol++;
1380:     }
1381:     *norm = 0.0;
1382:     for (j=0; j<A->cmap.n; j++) {
1383:       if (tmp[j] > *norm) *norm = tmp[j];
1384:     }
1385:     PetscFree(tmp);
1386:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1387:     *norm = 0.0;
1388:     for (k=0; k<bs; k++) {
1389:       for (j=0; j<a->mbs; j++) {
1390:         v = a->a + bs2*a->i[j] + k;
1391:         sum = 0.0;
1392:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1393:           for (k1=0; k1<bs; k1++){
1394:             sum += PetscAbsScalar(*v);
1395:             v   += bs;
1396:           }
1397:         }
1398:         if (sum > *norm) *norm = sum;
1399:       }
1400:     }
1401:   } else {
1402:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1403:   }
1404:   return(0);
1405: }


1410: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1411: {
1412:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1416:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1417:   if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1418:     *flg = PETSC_FALSE;
1419:     return(0);
1420:   }
1421: 
1422:   /* if the a->i are the same */
1423:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1424:   if (!*flg) {
1425:     return(0);
1426:   }
1427: 
1428:   /* if a->j are the same */
1429:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1430:   if (!*flg) {
1431:     return(0);
1432:   }
1433:   /* if a->a are the same */
1434:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(B->rmap.bs)*sizeof(PetscScalar),flg);
1435:   return(0);
1436: 
1437: }

1441: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1442: {
1443:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1445:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1446:   PetscScalar    *x,zero = 0.0;
1447:   MatScalar      *aa,*aa_j;

1450:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1451:   bs   = A->rmap.bs;
1452:   aa   = a->a;
1453:   ai   = a->i;
1454:   aj   = a->j;
1455:   ambs = a->mbs;
1456:   bs2  = a->bs2;

1458:   VecSet(v,zero);
1459:   VecGetArray(v,&x);
1460:   VecGetLocalSize(v,&n);
1461:   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1462:   for (i=0; i<ambs; i++) {
1463:     for (j=ai[i]; j<ai[i+1]; j++) {
1464:       if (aj[j] == i) {
1465:         row  = i*bs;
1466:         aa_j = aa+j*bs2;
1467:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1468:         break;
1469:       }
1470:     }
1471:   }
1472:   VecRestoreArray(v,&x);
1473:   return(0);
1474: }

1478: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1479: {
1480:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1481:   PetscScalar    *l,*r,x,*li,*ri;
1482:   MatScalar      *aa,*v;
1484:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1487:   ai  = a->i;
1488:   aj  = a->j;
1489:   aa  = a->a;
1490:   m   = A->rmap.n;
1491:   n   = A->cmap.n;
1492:   bs  = A->rmap.bs;
1493:   mbs = a->mbs;
1494:   bs2 = a->bs2;
1495:   if (ll) {
1496:     VecGetArray(ll,&l);
1497:     VecGetLocalSize(ll,&lm);
1498:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1499:     for (i=0; i<mbs; i++) { /* for each block row */
1500:       M  = ai[i+1] - ai[i];
1501:       li = l + i*bs;
1502:       v  = aa + bs2*ai[i];
1503:       for (j=0; j<M; j++) { /* for each block */
1504:         for (k=0; k<bs2; k++) {
1505:           (*v++) *= li[k%bs];
1506:         }
1507:       }
1508:     }
1509:     VecRestoreArray(ll,&l);
1510:     PetscLogFlops(a->nz);
1511:   }
1512: 
1513:   if (rr) {
1514:     VecGetArray(rr,&r);
1515:     VecGetLocalSize(rr,&rn);
1516:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1517:     for (i=0; i<mbs; i++) { /* for each block row */
1518:       M  = ai[i+1] - ai[i];
1519:       v  = aa + bs2*ai[i];
1520:       for (j=0; j<M; j++) { /* for each block */
1521:         ri = r + bs*aj[ai[i]+j];
1522:         for (k=0; k<bs; k++) {
1523:           x = ri[k];
1524:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1525:         }
1526:       }
1527:     }
1528:     VecRestoreArray(rr,&r);
1529:     PetscLogFlops(a->nz);
1530:   }
1531:   return(0);
1532: }


1537: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1538: {
1539:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1542:   info->rows_global    = (double)A->rmap.N;
1543:   info->columns_global = (double)A->cmap.n;
1544:   info->rows_local     = (double)A->rmap.n;
1545:   info->columns_local  = (double)A->cmap.n;
1546:   info->block_size     = a->bs2;
1547:   info->nz_allocated   = a->maxnz;
1548:   info->nz_used        = a->bs2*a->nz;
1549:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1550:   info->assemblies   = A->num_ass;
1551:   info->mallocs      = a->reallocs;
1552:   info->memory       = ((PetscObject)A)->mem;
1553:   if (A->factor) {
1554:     info->fill_ratio_given  = A->info.fill_ratio_given;
1555:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1556:     info->factor_mallocs    = A->info.factor_mallocs;
1557:   } else {
1558:     info->fill_ratio_given  = 0;
1559:     info->fill_ratio_needed = 0;
1560:     info->factor_mallocs    = 0;
1561:   }
1562:   return(0);
1563: }


1568: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1569: {
1570:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1574:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1575:   return(0);
1576: }