Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/inline/spops.h
 5:  #include src/inline/ilu.h
 6:  #include petscbt.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
 15:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT        table;

 19:   m     = a->mbs;
 20:   ai    = a->i;
 21:   aj    = a->j;
 22:   bs    = A->rmap.bs;

 24:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,table);
 27:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 28:   PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);
 34: 
 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n ; ++j){
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(is[i]);
 47: 
 48:     k = 0;
 49:     for (j=0; j<ov; j++){ /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end ; l++){
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++)
 64:         nidx2[j*bs+k] = nidx[j]*bs+k;
 65:     }
 66:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 67:   }
 68:   PetscBTDestroy(table);
 69:   PetscFree(nidx);
 70:   PetscFree(nidx2);
 71:   return(0);
 72: }

 76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   PetscInt       *irow,*icol,nrows,ncols,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscTruth     flag;

 89:   ISSorted(iscol,(PetscTruth*)&i);
 90:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 92:   ISGetIndices(isrow,&irow);
 93:   ISGetIndices(iscol,&icol);
 94:   ISGetLocalSize(isrow,&nrows);
 95:   ISGetLocalSize(iscol,&ncols);

 97:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 98:   ssmap = smap;
 99:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
101:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102:   /* determine lens of each row */
103:   for (i=0; i<nrows; i++) {
104:     kstart  = ai[irow[i]];
105:     kend    = kstart + a->ilen[irow[i]];
106:     lens[i] = 0;
107:       for (k=kstart; k<kend; k++) {
108:         if (ssmap[aj[k]]) {
109:           lens[i]++;
110:         }
111:       }
112:     }
113:   /* Create and fill new matrix */
114:   if (scall == MAT_REUSE_MATRIX) {
115:     c = (Mat_SeqBAIJ *)((*B)->data);

117:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119:     if (!flag) {
120:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121:     }
122:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123:     C = *B;
124:   } else {
125:     MatCreate(((PetscObject)A)->comm,&C);
126:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127:     MatSetType(C,((PetscObject)A)->type_name);
128:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129:   }
130:   c = (Mat_SeqBAIJ *)(C->data);
131:   for (i=0; i<nrows; i++) {
132:     row    = irow[i];
133:     kstart = ai[row];
134:     kend   = kstart + a->ilen[row];
135:     mat_i  = c->i[i];
136:     mat_j  = c->j + mat_i;
137:     mat_a  = c->a + mat_i*bs2;
138:     mat_ilen = c->ilen + i;
139:     for (k=kstart; k<kend; k++) {
140:       if ((tcol=ssmap[a->j[k]])) {
141:         *mat_j++ = tcol - 1;
142:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143:         mat_a   += bs2;
144:         (*mat_ilen)++;
145:       }
146:     }
147:   }
148: 
149:   /* Free work space */
150:   ISRestoreIndices(iscol,&icol);
151:   PetscFree(smap);
152:   PetscFree(lens);
153:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155: 
156:   ISRestoreIndices(isrow,&irow);
157:   *B = C;
158:   return(0);
159: }

163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
168:   PetscInt       *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->rmap.bs,count;

171:   ISGetIndices(isrow,&irow);
172:   ISGetIndices(iscol,&icol);
173:   ISGetLocalSize(isrow,&nrows);
174:   ISGetLocalSize(iscol,&ncols);
175: 
176:   /* Verify if the indices corespond to each element in a block 
177:    and form the IS with compressed IS */
178:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179:   iary = vary + a->mbs;
180:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182:   count = 0;
183:   for (i=0; i<a->mbs; i++) {
184:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185:     if (vary[i]==bs) iary[count++] = i;
186:   }
187:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188: 
189:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191:   count = 0;
192:   for (i=0; i<a->mbs; i++) {
193:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194:     if (vary[i]==bs) iary[count++] = i;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197:   ISRestoreIndices(isrow,&irow);
198:   ISRestoreIndices(iscol,&icol);
199:   PetscFree(vary);

201:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202:   ISDestroy(is1);
203:   ISDestroy(is2);
204:   return(0);
205: }

209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212:   PetscInt       i;

215:   if (scall == MAT_INITIAL_MATRIX) {
216:     PetscMalloc((n+1)*sizeof(Mat),B);
217:   }

219:   for (i=0; i<n; i++) {
220:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221:   }
222:   return(0);
223: }


226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229:  #include petscblaslapack.h

233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
236:   PetscScalar    *x,*z,sum;
237:   MatScalar      *v;
239:   PetscInt       mbs,i,*idx,*ii,n,*ridx=PETSC_NULL,nonzerorow=0;
240:   PetscTruth     usecprow=a->compressedrow.use;

243:   VecGetArray(xx,&x);
244:   VecGetArray(zz,&z);

246:   idx = a->j;
247:   v   = a->a;
248:   if (usecprow){
249:     mbs  = a->compressedrow.nrows;
250:     ii   = a->compressedrow.i;
251:     ridx = a->compressedrow.rindex;
252:   } else {
253:     mbs = a->mbs;
254:     ii  = a->i;
255:   }

257:   for (i=0; i<mbs; i++) {
258:     n    = ii[1] - ii[0]; ii++;
259:     sum  = 0.0;
260:     nonzerorow += (n>0);
261:     while (n--) sum += *v++ * x[*idx++];
262:     if (usecprow){
263:       z[ridx[i]] = sum;
264:     } else {
265:       z[i] = sum;
266:     }
267:   }
268:   VecRestoreArray(xx,&x);
269:   VecRestoreArray(zz,&z);
270:   PetscLogFlops(2*a->nz - nonzerorow);
271:   return(0);
272: }

276: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
277: {
278:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
279:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,*zarray;
280:   PetscScalar    x1,x2;
281:   MatScalar      *v;
283:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
284:   PetscTruth     usecprow=a->compressedrow.use;

287:   VecGetArray(xx,&x);
288:   VecGetArray(zz,&zarray);

290:   idx = a->j;
291:   v   = a->a;
292:   if (usecprow){
293:     mbs  = a->compressedrow.nrows;
294:     ii   = a->compressedrow.i;
295:     ridx = a->compressedrow.rindex;
296:   } else {
297:     mbs = a->mbs;
298:     ii  = a->i;
299:     z   = zarray;
300:   }

302:   for (i=0; i<mbs; i++) {
303:     n  = ii[1] - ii[0]; ii++;
304:     sum1 = 0.0; sum2 = 0.0;
305:     nonzerorow += (n>0);
306:     for (j=0; j<n; j++) {
307:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
308:       sum1 += v[0]*x1 + v[2]*x2;
309:       sum2 += v[1]*x1 + v[3]*x2;
310:       v += 4;
311:     }
312:     if (usecprow) z = zarray + 2*ridx[i];
313:     z[0] = sum1; z[1] = sum2;
314:     if (!usecprow) z += 2;
315:   }
316:   VecRestoreArray(xx,&x);
317:   VecRestoreArray(zz,&zarray);
318:   PetscLogFlops(8*a->nz - 2*nonzerorow);
319:   return(0);
320: }

324: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
325: {
326:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
327:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*zarray;
328:   MatScalar      *v;
330:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
331:   PetscTruth     usecprow=a->compressedrow.use;
332: 

334: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
335: #pragma disjoint(*v,*z,*xb)
336: #endif

339:   VecGetArray(xx,&x);
340:   VecGetArray(zz,&zarray);

342:   idx = a->j;
343:   v   = a->a;
344:   if (usecprow){
345:     mbs  = a->compressedrow.nrows;
346:     ii   = a->compressedrow.i;
347:     ridx = a->compressedrow.rindex;
348:   } else {
349:     mbs = a->mbs;
350:     ii  = a->i;
351:     z   = zarray;
352:   }

354:   for (i=0; i<mbs; i++) {
355:     n  = ii[1] - ii[0]; ii++;
356:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
357:     nonzerorow += (n>0);
358:     for (j=0; j<n; j++) {
359:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
360:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
361:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
362:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
363:       v += 9;
364:     }
365:     if (usecprow) z = zarray + 3*ridx[i];
366:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
367:     if (!usecprow) z += 3;
368:   }
369:   VecRestoreArray(xx,&x);
370:   VecRestoreArray(zz,&zarray);
371:   PetscLogFlops(18*a->nz - 3*nonzerorow);
372:   return(0);
373: }

377: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
378: {
379:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
380:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
381:   MatScalar      *v;
383:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
384:   PetscTruth     usecprow=a->compressedrow.use;

387:   VecGetArray(xx,&x);
388:   VecGetArray(zz,&zarray);

390:   idx = a->j;
391:   v   = a->a;
392:   if (usecprow){
393:     mbs  = a->compressedrow.nrows;
394:     ii   = a->compressedrow.i;
395:     ridx = a->compressedrow.rindex;
396:   } else {
397:     mbs = a->mbs;
398:     ii  = a->i;
399:     z   = zarray;
400:   }

402:   for (i=0; i<mbs; i++) {
403:     n  = ii[1] - ii[0]; ii++;
404:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
405:     nonzerorow += (n>0);
406:     for (j=0; j<n; j++) {
407:       xb = x + 4*(*idx++);
408:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
409:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
410:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
411:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
412:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
413:       v += 16;
414:     }
415:     if (usecprow) z = zarray + 4*ridx[i];
416:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
417:     if (!usecprow) z += 4;
418:   }
419:   VecRestoreArray(xx,&x);
420:   VecRestoreArray(zz,&zarray);
421:   PetscLogFlops(32*a->nz - 4*nonzerorow);
422:   return(0);
423: }

427: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
428: {
429:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
430:   PetscScalar    sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z = 0,*x,*zarray;
431:   MatScalar      *v;
433:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
434:   PetscTruth     usecprow=a->compressedrow.use;

437:   VecGetArray(xx,&x);
438:   VecGetArray(zz,&zarray);

440:   idx = a->j;
441:   v   = a->a;
442:   if (usecprow){
443:     mbs  = a->compressedrow.nrows;
444:     ii   = a->compressedrow.i;
445:     ridx = a->compressedrow.rindex;
446:   } else {
447:     mbs = a->mbs;
448:     ii  = a->i;
449:     z   = zarray;
450:   }

452:   for (i=0; i<mbs; i++) {
453:     n  = ii[1] - ii[0]; ii++;
454:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
455:     nonzerorow += (n>0);
456:     for (j=0; j<n; j++) {
457:       xb = x + 5*(*idx++);
458:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
459:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
460:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
461:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
462:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
463:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
464:       v += 25;
465:     }
466:     if (usecprow) z = zarray + 5*ridx[i];
467:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
468:     if (!usecprow) z += 5;
469:   }
470:   VecRestoreArray(xx,&x);
471:   VecRestoreArray(zz,&zarray);
472:   PetscLogFlops(50*a->nz - 5*nonzerorow);
473:   return(0);
474: }


479: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
480: {
481:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
482:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
483:   PetscScalar    x1,x2,x3,x4,x5,x6,*zarray;
484:   MatScalar      *v;
486:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
487:   PetscTruth     usecprow=a->compressedrow.use;

490:   VecGetArray(xx,&x);
491:   VecGetArray(zz,&zarray);

493:   idx = a->j;
494:   v   = a->a;
495:   if (usecprow){
496:     mbs  = a->compressedrow.nrows;
497:     ii   = a->compressedrow.i;
498:     ridx = a->compressedrow.rindex;
499:   } else {
500:     mbs = a->mbs;
501:     ii  = a->i;
502:     z   = zarray;
503:   }

505:   for (i=0; i<mbs; i++) {
506:     n  = ii[1] - ii[0]; ii++;
507:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
508:     nonzerorow += (n>0);
509:     for (j=0; j<n; j++) {
510:       xb = x + 6*(*idx++);
511:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
512:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
513:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
514:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
515:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
516:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
517:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
518:       v += 36;
519:     }
520:     if (usecprow) z = zarray + 6*ridx[i];
521:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
522:     if (!usecprow) z += 6;
523:   }

525:   VecRestoreArray(xx,&x);
526:   VecRestoreArray(zz,&zarray);
527:   PetscLogFlops(72*a->nz - 6*nonzerorow);
528:   return(0);
529: }
532: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
533: {
534:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
535:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
536:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*zarray;
537:   MatScalar      *v;
539:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
540:   PetscTruth     usecprow=a->compressedrow.use;

543:   VecGetArray(xx,&x);
544:   VecGetArray(zz,&zarray);

546:   idx = a->j;
547:   v   = a->a;
548:   if (usecprow){
549:     mbs    = a->compressedrow.nrows;
550:     ii     = a->compressedrow.i;
551:     ridx = a->compressedrow.rindex;
552:   } else {
553:     mbs = a->mbs;
554:     ii  = a->i;
555:     z   = zarray;
556:   }

558:   for (i=0; i<mbs; i++) {
559:     n  = ii[1] - ii[0]; ii++;
560:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
561:     nonzerorow += (n>0);
562:     for (j=0; j<n; j++) {
563:       xb = x + 7*(*idx++);
564:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
565:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
566:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
567:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
568:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
569:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
570:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
571:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
572:       v += 49;
573:     }
574:     if (usecprow) z = zarray + 7*ridx[i];
575:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
576:     if (!usecprow) z += 7;
577:   }

579:   VecRestoreArray(xx,&x);
580:   VecRestoreArray(zz,&zarray);
581:   PetscLogFlops(98*a->nz - 7*nonzerorow);
582:   return(0);
583: }

585: /*
586:     This will not work with MatScalar == float because it calls the BLAS
587: */
590: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
591: {
592:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
593:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
594:   MatScalar      *v;
596:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
597:   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
598:   PetscTruth     usecprow=a->compressedrow.use;

601:   VecGetArray(xx,&x);
602:   VecGetArray(zz,&zarray);

604:   idx = a->j;
605:   v   = a->a;
606:   if (usecprow){
607:     mbs  = a->compressedrow.nrows;
608:     ii   = a->compressedrow.i;
609:     ridx = a->compressedrow.rindex;
610:   } else {
611:     mbs = a->mbs;
612:     ii  = a->i;
613:     z   = zarray;
614:   }

616:   if (!a->mult_work) {
617:     k    = PetscMax(A->rmap.n,A->cmap.n);
618:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
619:   }
620:   work = a->mult_work;
621:   for (i=0; i<mbs; i++) {
622:     n     = ii[1] - ii[0]; ii++;
623:     ncols = n*bs;
624:     workt = work;
625:     nonzerorow += (n>0);
626:     for (j=0; j<n; j++) {
627:       xb = x + bs*(*idx++);
628:       for (k=0; k<bs; k++) workt[k] = xb[k];
629:       workt += bs;
630:     }
631:     if (usecprow) z = zarray + bs*ridx[i];
632:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
633:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
634:     v += n*bs2;
635:     if (!usecprow) z += bs;
636:   }
637:   VecRestoreArray(xx,&x);
638:   VecRestoreArray(zz,&zarray);
639:   PetscLogFlops(2*a->nz*bs2 - bs*nonzerorow);
640:   return(0);
641: }

646: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
647: {
648:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
649:   PetscScalar    *x,*y = 0,*z = 0,sum,*yarray,*zarray;
650:   MatScalar      *v;
652:   PetscInt       mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
653:   PetscTruth     usecprow=a->compressedrow.use;

656:   VecGetArray(xx,&x);
657:   VecGetArray(yy,&yarray);
658:   if (zz != yy) {
659:     VecGetArray(zz,&zarray);
660:   } else {
661:     zarray = yarray;
662:   }

664:   idx = a->j;
665:   v   = a->a;
666:   if (usecprow){
667:     if (zz != yy){
668:       PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
669:     }
670:     mbs  = a->compressedrow.nrows;
671:     ii   = a->compressedrow.i;
672:     ridx = a->compressedrow.rindex;
673:   } else {
674:     ii  = a->i;
675:     y   = yarray;
676:     z   = zarray;
677:   }

679:   for (i=0; i<mbs; i++) {
680:     n    = ii[1] - ii[0]; ii++;
681:     if (usecprow){
682:       z = zarray + ridx[i];
683:       y = yarray + ridx[i];
684:     }
685:     sum = y[0];
686:     while (n--) sum += *v++ * x[*idx++];
687:     z[0] = sum;
688:     if (!usecprow){
689:       z++; y++;
690:     }
691:   }
692:   VecRestoreArray(xx,&x);
693:   VecRestoreArray(yy,&yarray);
694:   if (zz != yy) {
695:     VecRestoreArray(zz,&zarray);
696:   }
697:   PetscLogFlops(2*a->nz);
698:   return(0);
699: }

703: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
704: {
705:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
706:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
707:   PetscScalar    x1,x2,*yarray,*zarray;
708:   MatScalar      *v;
710:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
711:   PetscTruth     usecprow=a->compressedrow.use;

714:   VecGetArray(xx,&x);
715:   VecGetArray(yy,&yarray);
716:   if (zz != yy) {
717:     VecGetArray(zz,&zarray);
718:   } else {
719:     zarray = yarray;
720:   }

722:   idx = a->j;
723:   v   = a->a;
724:   if (usecprow){
725:     if (zz != yy){
726:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
727:     }
728:     mbs  = a->compressedrow.nrows;
729:     ii   = a->compressedrow.i;
730:     ridx = a->compressedrow.rindex;
731:     if (zz != yy){
732:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
733:     }
734:   } else {
735:     ii  = a->i;
736:     y   = yarray;
737:     z   = zarray;
738:   }

740:   for (i=0; i<mbs; i++) {
741:     n  = ii[1] - ii[0]; ii++;
742:     if (usecprow){
743:       z = zarray + 2*ridx[i];
744:       y = yarray + 2*ridx[i];
745:     }
746:     sum1 = y[0]; sum2 = y[1];
747:     for (j=0; j<n; j++) {
748:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
749:       sum1 += v[0]*x1 + v[2]*x2;
750:       sum2 += v[1]*x1 + v[3]*x2;
751:       v += 4;
752:     }
753:     z[0] = sum1; z[1] = sum2;
754:     if (!usecprow){
755:       z += 2; y += 2;
756:     }
757:   }
758:   VecRestoreArray(xx,&x);
759:   VecRestoreArray(yy,&yarray);
760:   if (zz != yy) {
761:     VecRestoreArray(zz,&zarray);
762:   }
763:   PetscLogFlops(4*a->nz);
764:   return(0);
765: }

769: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
770: {
771:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
772:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
773:   MatScalar      *v;
775:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
776:   PetscTruth     usecprow=a->compressedrow.use;

779:   VecGetArray(xx,&x);
780:   VecGetArray(yy,&yarray);
781:   if (zz != yy) {
782:     VecGetArray(zz,&zarray);
783:   } else {
784:     zarray = yarray;
785:   }

787:   idx = a->j;
788:   v   = a->a;
789:   if (usecprow){
790:     if (zz != yy){
791:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
792:     }
793:     mbs  = a->compressedrow.nrows;
794:     ii   = a->compressedrow.i;
795:     ridx = a->compressedrow.rindex;
796:   } else {
797:     ii  = a->i;
798:     y   = yarray;
799:     z   = zarray;
800:   }

802:   for (i=0; i<mbs; i++) {
803:     n  = ii[1] - ii[0]; ii++;
804:     if (usecprow){
805:       z = zarray + 3*ridx[i];
806:       y = yarray + 3*ridx[i];
807:     }
808:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
809:     for (j=0; j<n; j++) {
810:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
811:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
812:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
813:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
814:       v += 9;
815:     }
816:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
817:     if (!usecprow){
818:       z += 3; y += 3;
819:     }
820:   }
821:   VecRestoreArray(xx,&x);
822:   VecRestoreArray(yy,&yarray);
823:   if (zz != yy) {
824:     VecRestoreArray(zz,&zarray);
825:   }
826:   PetscLogFlops(18*a->nz);
827:   return(0);
828: }

832: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
833: {
834:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
835:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
836:   MatScalar      *v;
838:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
839:   PetscTruth     usecprow=a->compressedrow.use;

842:   VecGetArray(xx,&x);
843:   VecGetArray(yy,&yarray);
844:   if (zz != yy) {
845:     VecGetArray(zz,&zarray);
846:   } else {
847:     zarray = yarray;
848:   }

850:   idx   = a->j;
851:   v     = a->a;
852:   if (usecprow){
853:     if (zz != yy){
854:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
855:     }
856:     mbs  = a->compressedrow.nrows;
857:     ii   = a->compressedrow.i;
858:     ridx = a->compressedrow.rindex;
859:   } else {
860:     ii  = a->i;
861:     y   = yarray;
862:     z   = zarray;
863:   }

865:   for (i=0; i<mbs; i++) {
866:     n  = ii[1] - ii[0]; ii++;
867:     if (usecprow){
868:       z = zarray + 4*ridx[i];
869:       y = yarray + 4*ridx[i];
870:     }
871:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
872:     for (j=0; j<n; j++) {
873:       xb = x + 4*(*idx++);
874:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
875:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
876:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
877:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
878:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
879:       v += 16;
880:     }
881:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
882:     if (!usecprow){
883:       z += 4; y += 4;
884:     }
885:   }
886:   VecRestoreArray(xx,&x);
887:   VecRestoreArray(yy,&yarray);
888:   if (zz != yy) {
889:     VecRestoreArray(zz,&zarray);
890:   }
891:   PetscLogFlops(32*a->nz);
892:   return(0);
893: }

897: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
898: {
899:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
900:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
901:   PetscScalar    *yarray,*zarray;
902:   MatScalar      *v;
904:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
905:   PetscTruth     usecprow=a->compressedrow.use;

908:   VecGetArray(xx,&x);
909:   VecGetArray(yy,&yarray);
910:   if (zz != yy) {
911:     VecGetArray(zz,&zarray);
912:   } else {
913:     zarray = yarray;
914:   }

916:   idx = a->j;
917:   v   = a->a;
918:   if (usecprow){
919:     if (zz != yy){
920:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
921:     }
922:     mbs  = a->compressedrow.nrows;
923:     ii   = a->compressedrow.i;
924:     ridx = a->compressedrow.rindex;
925:   } else {
926:     ii  = a->i;
927:     y   = yarray;
928:     z   = zarray;
929:   }

931:   for (i=0; i<mbs; i++) {
932:     n  = ii[1] - ii[0]; ii++;
933:     if (usecprow){
934:       z = zarray + 5*ridx[i];
935:       y = yarray + 5*ridx[i];
936:     }
937:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
938:     for (j=0; j<n; j++) {
939:       xb = x + 5*(*idx++);
940:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
941:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
942:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
943:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
944:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
945:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
946:       v += 25;
947:     }
948:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
949:     if (!usecprow){
950:       z += 5; y += 5;
951:     }
952:   }
953:   VecRestoreArray(xx,&x);
954:   VecRestoreArray(yy,&yarray);
955:   if (zz != yy) {
956:     VecRestoreArray(zz,&zarray);
957:   }
958:   PetscLogFlops(50*a->nz);
959:   return(0);
960: }
963: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
964: {
965:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
966:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
967:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
968:   MatScalar      *v;
970:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
971:   PetscTruth     usecprow=a->compressedrow.use;

974:   VecGetArray(xx,&x);
975:   VecGetArray(yy,&yarray);
976:   if (zz != yy) {
977:     VecGetArray(zz,&zarray);
978:   } else {
979:     zarray = yarray;
980:   }

982:   idx = a->j;
983:   v   = a->a;
984:   if (usecprow){
985:     if (zz != yy){
986:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
987:     }
988:     mbs  = a->compressedrow.nrows;
989:     ii   = a->compressedrow.i;
990:     ridx = a->compressedrow.rindex;
991:   } else {
992:     ii  = a->i;
993:     y   = yarray;
994:     z   = zarray;
995:   }

997:   for (i=0; i<mbs; i++) {
998:     n  = ii[1] - ii[0]; ii++;
999:     if (usecprow){
1000:       z = zarray + 6*ridx[i];
1001:       y = yarray + 6*ridx[i];
1002:     }
1003:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1004:     for (j=0; j<n; j++) {
1005:       xb = x + 6*(*idx++);
1006:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1007:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1008:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1009:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1010:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1011:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1012:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1013:       v += 36;
1014:     }
1015:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1016:     if (!usecprow){
1017:       z += 6; y += 6;
1018:     }
1019:   }
1020:   VecRestoreArray(xx,&x);
1021:   VecRestoreArray(yy,&yarray);
1022:   if (zz != yy) {
1023:     VecRestoreArray(zz,&zarray);
1024:   }
1025:   PetscLogFlops(72*a->nz);
1026:   return(0);
1027: }

1031: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1032: {
1033:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1034:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1035:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1036:   MatScalar      *v;
1038:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1039:   PetscTruth     usecprow=a->compressedrow.use;

1042:   VecGetArray(xx,&x);
1043:   VecGetArray(yy,&yarray);
1044:   if (zz != yy) {
1045:     VecGetArray(zz,&zarray);
1046:   } else {
1047:     zarray = yarray;
1048:   }

1050:   idx = a->j;
1051:   v   = a->a;
1052:   if (usecprow){
1053:     if (zz != yy){
1054:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1055:     }
1056:     mbs  = a->compressedrow.nrows;
1057:     ii   = a->compressedrow.i;
1058:     ridx = a->compressedrow.rindex;
1059:   } else {
1060:     ii  = a->i;
1061:     y   = yarray;
1062:     z   = zarray;
1063:   }

1065:   for (i=0; i<mbs; i++) {
1066:     n  = ii[1] - ii[0]; ii++;
1067:     if (usecprow){
1068:       z = zarray + 7*ridx[i];
1069:       y = yarray + 7*ridx[i];
1070:     }
1071:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1072:     for (j=0; j<n; j++) {
1073:       xb = x + 7*(*idx++);
1074:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1075:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1076:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1077:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1078:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1079:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1080:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1081:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1082:       v += 49;
1083:     }
1084:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1085:     if (!usecprow){
1086:       z += 7; y += 7;
1087:     }
1088:   }
1089:   VecRestoreArray(xx,&x);
1090:   VecRestoreArray(yy,&yarray);
1091:   if (zz != yy) {
1092:     VecRestoreArray(zz,&zarray);
1093:   }
1094:   PetscLogFlops(98*a->nz);
1095:   return(0);
1096: }

1100: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1101: {
1102:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1103:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1104:   MatScalar      *v;
1106:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
1107:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1108:   PetscTruth     usecprow=a->compressedrow.use;

1111:   VecCopy_Seq(yy,zz);
1112:   VecGetArray(xx,&x);
1113:   VecGetArray(zz,&zarray);

1115:   idx = a->j;
1116:   v   = a->a;
1117:   if (usecprow){
1118:     mbs    = a->compressedrow.nrows;
1119:     ii     = a->compressedrow.i;
1120:     ridx = a->compressedrow.rindex;
1121:   } else {
1122:     mbs = a->mbs;
1123:     ii  = a->i;
1124:     z   = zarray;
1125:   }

1127:   if (!a->mult_work) {
1128:     k    = PetscMax(A->rmap.n,A->cmap.n);
1129:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1130:   }
1131:   work = a->mult_work;
1132:   for (i=0; i<mbs; i++) {
1133:     n     = ii[1] - ii[0]; ii++;
1134:     ncols = n*bs;
1135:     workt = work;
1136:     for (j=0; j<n; j++) {
1137:       xb = x + bs*(*idx++);
1138:       for (k=0; k<bs; k++) workt[k] = xb[k];
1139:       workt += bs;
1140:     }
1141:     if (usecprow) z = zarray + bs*ridx[i];
1142:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1143:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1144:     v += n*bs2;
1145:     if (!usecprow){
1146:       z += bs;
1147:     }
1148:   }
1149:   VecRestoreArray(xx,&x);
1150:   VecRestoreArray(zz,&zarray);
1151:   PetscLogFlops(2*a->nz*bs2);
1152:   return(0);
1153: }

1157: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1158: {
1159:   PetscScalar    zero = 0.0;

1163:   VecSet(zz,zero);
1164:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1165:   return(0);
1166: }

1170: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1172: {
1173:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1174:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1175:   MatScalar         *v;
1176:   PetscErrorCode    ierr;
1177:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap.bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1178:   Mat_CompressedRow cprow = a->compressedrow;
1179:   PetscTruth        usecprow=cprow.use;

1182:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1183:   VecGetArray(xx,&x);
1184:   VecGetArray(zz,&z);

1186:   idx = a->j;
1187:   v   = a->a;
1188:   if (usecprow){
1189:     mbs  = cprow.nrows;
1190:     ii   = cprow.i;
1191:     ridx = cprow.rindex;
1192:   } else {
1193:     mbs=a->mbs;
1194:     ii = a->i;
1195:     xb = x;
1196:   }

1198:   switch (bs) {
1199:   case 1:
1200:     for (i=0; i<mbs; i++) {
1201:       if (usecprow) xb = x + ridx[i];
1202:       x1 = xb[0];
1203:       ib = idx + ii[0];
1204:       n  = ii[1] - ii[0]; ii++;
1205:       for (j=0; j<n; j++) {
1206:         rval    = ib[j];
1207:         z[rval] += *v * x1;
1208:         v++;
1209:       }
1210:       if (!usecprow) xb++;
1211:     }
1212:     break;
1213:   case 2:
1214:     for (i=0; i<mbs; i++) {
1215:       if (usecprow) xb = x + 2*ridx[i];
1216:       x1 = xb[0]; x2 = xb[1];
1217:       ib = idx + ii[0];
1218:       n  = ii[1] - ii[0]; ii++;
1219:       for (j=0; j<n; j++) {
1220:         rval      = ib[j]*2;
1221:         z[rval++] += v[0]*x1 + v[1]*x2;
1222:         z[rval++] += v[2]*x1 + v[3]*x2;
1223:         v  += 4;
1224:       }
1225:       if (!usecprow) xb += 2;
1226:     }
1227:     break;
1228:   case 3:
1229:     for (i=0; i<mbs; i++) {
1230:       if (usecprow) xb = x + 3*ridx[i];
1231:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1232:       ib = idx + ii[0];
1233:       n  = ii[1] - ii[0]; ii++;
1234:       for (j=0; j<n; j++) {
1235:         rval      = ib[j]*3;
1236:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1237:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1238:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1239:         v  += 9;
1240:       }
1241:       if (!usecprow) xb += 3;
1242:     }
1243:     break;
1244:   case 4:
1245:     for (i=0; i<mbs; i++) {
1246:       if (usecprow) xb = x + 4*ridx[i];
1247:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1248:       ib = idx + ii[0];
1249:       n  = ii[1] - ii[0]; ii++;
1250:       for (j=0; j<n; j++) {
1251:         rval      = ib[j]*4;
1252:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1253:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1254:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1255:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1256:         v  += 16;
1257:       }
1258:       if (!usecprow) xb += 4;
1259:     }
1260:     break;
1261:   case 5:
1262:     for (i=0; i<mbs; i++) {
1263:       if (usecprow) xb = x + 5*ridx[i];
1264:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1265:       x4 = xb[3]; x5 = xb[4];
1266:       ib = idx + ii[0];
1267:       n  = ii[1] - ii[0]; ii++;
1268:       for (j=0; j<n; j++) {
1269:         rval      = ib[j]*5;
1270:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1271:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1272:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1273:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1274:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1275:         v  += 25;
1276:       }
1277:       if (!usecprow) xb += 5;
1278:     }
1279:     break;
1280:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1281:       PetscInt     ncols,k;
1282:       PetscScalar  *work,*workt,*xtmp;

1284:       if (!a->mult_work) {
1285:         k = PetscMax(A->rmap.n,A->cmap.n);
1286:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1287:       }
1288:       work = a->mult_work;
1289:       xtmp = x;
1290:       for (i=0; i<mbs; i++) {
1291:         n     = ii[1] - ii[0]; ii++;
1292:         ncols = n*bs;
1293:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1294:         if (usecprow) {
1295:           xtmp = x + bs*ridx[i];
1296:         }
1297:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1298:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1299:         v += n*bs2;
1300:         if (!usecprow) xtmp += bs;
1301:         workt = work;
1302:         for (j=0; j<n; j++) {
1303:           zb = z + bs*(*idx++);
1304:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1305:           workt += bs;
1306:         }
1307:       }
1308:     }
1309:   }
1310:   VecRestoreArray(xx,&x);
1311:   VecRestoreArray(zz,&z);
1312:   PetscLogFlops(2*a->nz*a->bs2);
1313:   return(0);
1314: }

1318: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1319: {
1320:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)inA->data;
1321:   PetscInt     totalnz = a->bs2*a->nz;
1322:   PetscScalar  oalpha = alpha;
1324: #if defined(PETSC_USE_MAT_SINGLE)
1325:   PetscInt     i;
1326: #else
1327:   PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1328: #endif

1331: #if defined(PETSC_USE_MAT_SINGLE)
1332:   for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1333: #else
1334:   BLASscal_(&tnz,&oalpha,a->a,&one);
1335: #endif
1336:   PetscLogFlops(totalnz);
1337:   return(0);
1338: }

1342: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1343: {
1345:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1346:   MatScalar      *v = a->a;
1347:   PetscReal      sum = 0.0;
1348:   PetscInt       i,j,k,bs=A->rmap.bs,nz=a->nz,bs2=a->bs2,k1;

1351:   if (type == NORM_FROBENIUS) {
1352:     for (i=0; i< bs2*nz; i++) {
1353: #if defined(PETSC_USE_COMPLEX)
1354:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1355: #else
1356:       sum += (*v)*(*v); v++;
1357: #endif
1358:     }
1359:     *norm = sqrt(sum);
1360:   } else if (type == NORM_1) { /* maximum column sum */
1361:     PetscReal *tmp;
1362:     PetscInt  *bcol = a->j;
1363:     PetscMalloc((A->cmap.n+1)*sizeof(PetscReal),&tmp);
1364:     PetscMemzero(tmp,A->cmap.n*sizeof(PetscReal));
1365:     for (i=0; i<nz; i++){
1366:       for (j=0; j<bs; j++){
1367:         k1 = bs*(*bcol) + j; /* column index */
1368:         for (k=0; k<bs; k++){
1369:           tmp[k1] += PetscAbsScalar(*v); v++;
1370:         }
1371:       }
1372:       bcol++;
1373:     }
1374:     *norm = 0.0;
1375:     for (j=0; j<A->cmap.n; j++) {
1376:       if (tmp[j] > *norm) *norm = tmp[j];
1377:     }
1378:     PetscFree(tmp);
1379:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1380:     *norm = 0.0;
1381:     for (k=0; k<bs; k++) {
1382:       for (j=0; j<a->mbs; j++) {
1383:         v = a->a + bs2*a->i[j] + k;
1384:         sum = 0.0;
1385:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1386:           for (k1=0; k1<bs; k1++){
1387:             sum += PetscAbsScalar(*v);
1388:             v   += bs;
1389:           }
1390:         }
1391:         if (sum > *norm) *norm = sum;
1392:       }
1393:     }
1394:   } else {
1395:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1396:   }
1397:   return(0);
1398: }


1403: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1404: {
1405:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1409:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1410:   if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1411:     *flg = PETSC_FALSE;
1412:     return(0);
1413:   }
1414: 
1415:   /* if the a->i are the same */
1416:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1417:   if (!*flg) {
1418:     return(0);
1419:   }
1420: 
1421:   /* if a->j are the same */
1422:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1423:   if (!*flg) {
1424:     return(0);
1425:   }
1426:   /* if a->a are the same */
1427:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(B->rmap.bs)*sizeof(PetscScalar),flg);
1428:   return(0);
1429: 
1430: }

1434: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1435: {
1436:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1438:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1439:   PetscScalar    *x,zero = 0.0;
1440:   MatScalar      *aa,*aa_j;

1443:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1444:   bs   = A->rmap.bs;
1445:   aa   = a->a;
1446:   ai   = a->i;
1447:   aj   = a->j;
1448:   ambs = a->mbs;
1449:   bs2  = a->bs2;

1451:   VecSet(v,zero);
1452:   VecGetArray(v,&x);
1453:   VecGetLocalSize(v,&n);
1454:   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1455:   for (i=0; i<ambs; i++) {
1456:     for (j=ai[i]; j<ai[i+1]; j++) {
1457:       if (aj[j] == i) {
1458:         row  = i*bs;
1459:         aa_j = aa+j*bs2;
1460:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1461:         break;
1462:       }
1463:     }
1464:   }
1465:   VecRestoreArray(v,&x);
1466:   return(0);
1467: }

1471: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1472: {
1473:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1474:   PetscScalar    *l,*r,x,*li,*ri;
1475:   MatScalar      *aa,*v;
1477:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1480:   ai  = a->i;
1481:   aj  = a->j;
1482:   aa  = a->a;
1483:   m   = A->rmap.n;
1484:   n   = A->cmap.n;
1485:   bs  = A->rmap.bs;
1486:   mbs = a->mbs;
1487:   bs2 = a->bs2;
1488:   if (ll) {
1489:     VecGetArray(ll,&l);
1490:     VecGetLocalSize(ll,&lm);
1491:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1492:     for (i=0; i<mbs; i++) { /* for each block row */
1493:       M  = ai[i+1] - ai[i];
1494:       li = l + i*bs;
1495:       v  = aa + bs2*ai[i];
1496:       for (j=0; j<M; j++) { /* for each block */
1497:         for (k=0; k<bs2; k++) {
1498:           (*v++) *= li[k%bs];
1499:         }
1500:       }
1501:     }
1502:     VecRestoreArray(ll,&l);
1503:     PetscLogFlops(a->nz);
1504:   }
1505: 
1506:   if (rr) {
1507:     VecGetArray(rr,&r);
1508:     VecGetLocalSize(rr,&rn);
1509:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1510:     for (i=0; i<mbs; i++) { /* for each block row */
1511:       M  = ai[i+1] - ai[i];
1512:       v  = aa + bs2*ai[i];
1513:       for (j=0; j<M; j++) { /* for each block */
1514:         ri = r + bs*aj[ai[i]+j];
1515:         for (k=0; k<bs; k++) {
1516:           x = ri[k];
1517:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1518:         }
1519:       }
1520:     }
1521:     VecRestoreArray(rr,&r);
1522:     PetscLogFlops(a->nz);
1523:   }
1524:   return(0);
1525: }


1530: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1531: {
1532:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1535:   info->rows_global    = (double)A->rmap.N;
1536:   info->columns_global = (double)A->cmap.n;
1537:   info->rows_local     = (double)A->rmap.n;
1538:   info->columns_local  = (double)A->cmap.n;
1539:   info->block_size     = a->bs2;
1540:   info->nz_allocated   = a->maxnz;
1541:   info->nz_used        = a->bs2*a->nz;
1542:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1543:   info->assemblies   = A->num_ass;
1544:   info->mallocs      = a->reallocs;
1545:   info->memory       = ((PetscObject)A)->mem;
1546:   if (A->factor) {
1547:     info->fill_ratio_given  = A->info.fill_ratio_given;
1548:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1549:     info->factor_mallocs    = A->info.factor_mallocs;
1550:   } else {
1551:     info->fill_ratio_given  = 0;
1552:     info->fill_ratio_needed = 0;
1553:     info->factor_mallocs    = 0;
1554:   }
1555:   return(0);
1556: }


1561: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1562: {
1563:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1567:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1568:   return(0);
1569: }